sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Files changed (141)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +41 -27
  4. sglang/bench_one_batch.py +60 -4
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +83 -71
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +46 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/load_config.py +1 -0
  13. sglang/srt/configs/model_config.py +1 -0
  14. sglang/srt/constrained/base_grammar_backend.py +21 -0
  15. sglang/srt/constrained/xgrammar_backend.py +8 -4
  16. sglang/srt/conversation.py +14 -1
  17. sglang/srt/distributed/__init__.py +3 -3
  18. sglang/srt/distributed/communication_op.py +2 -1
  19. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  21. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  22. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  23. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  24. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  25. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  26. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  27. sglang/srt/distributed/parallel_state.py +1 -1
  28. sglang/srt/distributed/utils.py +2 -1
  29. sglang/srt/entrypoints/engine.py +452 -0
  30. sglang/srt/entrypoints/http_server.py +603 -0
  31. sglang/srt/function_call_parser.py +494 -0
  32. sglang/srt/layers/activation.py +8 -8
  33. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  34. sglang/srt/layers/attention/triton_backend.py +4 -6
  35. sglang/srt/layers/attention/vision.py +204 -0
  36. sglang/srt/layers/dp_attention.py +71 -0
  37. sglang/srt/layers/layernorm.py +5 -5
  38. sglang/srt/layers/linear.py +65 -14
  39. sglang/srt/layers/logits_processor.py +49 -64
  40. sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  41. sglang/srt/layers/moe/fused_moe_native.py +84 -1
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  45. sglang/srt/layers/parameter.py +18 -8
  46. sglang/srt/layers/quantization/__init__.py +20 -23
  47. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  49. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang/srt/layers/quantization/fp8.py +10 -4
  57. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  58. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  59. sglang/srt/layers/radix_attention.py +2 -2
  60. sglang/srt/layers/rotary_embedding.py +1184 -31
  61. sglang/srt/layers/sampler.py +64 -6
  62. sglang/srt/layers/torchao_utils.py +12 -6
  63. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  64. sglang/srt/lora/lora.py +1 -9
  65. sglang/srt/managers/configure_logging.py +3 -0
  66. sglang/srt/managers/data_parallel_controller.py +79 -72
  67. sglang/srt/managers/detokenizer_manager.py +24 -6
  68. sglang/srt/managers/image_processor.py +158 -2
  69. sglang/srt/managers/io_struct.py +57 -3
  70. sglang/srt/managers/schedule_batch.py +78 -45
  71. sglang/srt/managers/schedule_policy.py +26 -12
  72. sglang/srt/managers/scheduler.py +326 -201
  73. sglang/srt/managers/session_controller.py +1 -0
  74. sglang/srt/managers/tokenizer_manager.py +210 -121
  75. sglang/srt/managers/tp_worker.py +6 -4
  76. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  77. sglang/srt/managers/utils.py +44 -0
  78. sglang/srt/mem_cache/memory_pool.py +10 -32
  79. sglang/srt/metrics/collector.py +15 -6
  80. sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  81. sglang/srt/model_executor/forward_batch_info.py +5 -7
  82. sglang/srt/model_executor/model_runner.py +44 -19
  83. sglang/srt/model_loader/loader.py +83 -6
  84. sglang/srt/model_loader/weight_utils.py +145 -6
  85. sglang/srt/models/baichuan.py +6 -6
  86. sglang/srt/models/chatglm.py +2 -2
  87. sglang/srt/models/commandr.py +17 -5
  88. sglang/srt/models/dbrx.py +13 -5
  89. sglang/srt/models/deepseek.py +3 -3
  90. sglang/srt/models/deepseek_v2.py +11 -11
  91. sglang/srt/models/exaone.py +2 -2
  92. sglang/srt/models/gemma.py +2 -2
  93. sglang/srt/models/gemma2.py +15 -25
  94. sglang/srt/models/gpt2.py +3 -5
  95. sglang/srt/models/gpt_bigcode.py +1 -1
  96. sglang/srt/models/granite.py +2 -2
  97. sglang/srt/models/grok.py +4 -3
  98. sglang/srt/models/internlm2.py +2 -2
  99. sglang/srt/models/llama.py +7 -5
  100. sglang/srt/models/minicpm.py +2 -2
  101. sglang/srt/models/minicpm3.py +9 -9
  102. sglang/srt/models/minicpmv.py +1238 -0
  103. sglang/srt/models/mixtral.py +3 -3
  104. sglang/srt/models/mixtral_quant.py +3 -3
  105. sglang/srt/models/mllama.py +2 -2
  106. sglang/srt/models/olmo.py +3 -3
  107. sglang/srt/models/olmo2.py +4 -4
  108. sglang/srt/models/olmoe.py +7 -13
  109. sglang/srt/models/phi3_small.py +2 -2
  110. sglang/srt/models/qwen.py +2 -2
  111. sglang/srt/models/qwen2.py +41 -4
  112. sglang/srt/models/qwen2_moe.py +3 -3
  113. sglang/srt/models/qwen2_vl.py +22 -122
  114. sglang/srt/models/stablelm.py +2 -2
  115. sglang/srt/models/torch_native_llama.py +20 -7
  116. sglang/srt/models/xverse.py +6 -6
  117. sglang/srt/models/xverse_moe.py +6 -6
  118. sglang/srt/openai_api/adapter.py +139 -37
  119. sglang/srt/openai_api/protocol.py +7 -4
  120. sglang/srt/sampling/custom_logit_processor.py +38 -0
  121. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  122. sglang/srt/sampling/sampling_batch_info.py +143 -18
  123. sglang/srt/sampling/sampling_params.py +3 -1
  124. sglang/srt/server.py +4 -1090
  125. sglang/srt/server_args.py +77 -15
  126. sglang/srt/speculative/eagle_utils.py +37 -15
  127. sglang/srt/speculative/eagle_worker.py +11 -13
  128. sglang/srt/utils.py +164 -129
  129. sglang/test/runners.py +8 -13
  130. sglang/test/test_programs.py +2 -1
  131. sglang/test/test_utils.py +83 -22
  132. sglang/utils.py +12 -2
  133. sglang/version.py +1 -1
  134. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
  135. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
  136. sglang/launch_server_llavavid.py +0 -25
  137. sglang/srt/constrained/__init__.py +0 -16
  138. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  139. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  140. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  141. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -452,6 +452,8 @@ def get_dataset(args, tokenizer):
  num_requests=args.num_prompts,
  tokenizer=tokenizer,
  fixed_output_len=args.sharegpt_output_len,
+ context_len=args.sharegpt_context_len,
+ apply_chat_template=args.apply_chat_template,
  )
  elif args.dataset_name == "random":
  input_requests = sample_random_requests(
@@ -464,11 +466,11 @@ def get_dataset(args, tokenizer):
  )
  elif args.dataset_name == "generated-shared-prefix":
  input_requests = sample_generated_shared_prefix_requests(
- num_groups=args.gen_num_groups,
- prompts_per_group=args.gen_prompts_per_group,
- system_prompt_len=args.gen_system_prompt_len,
- question_len=args.gen_question_len,
- output_len=args.gen_output_len,
+ num_groups=args.gsp_num_groups,
+ prompts_per_group=args.gsp_prompts_per_group,
+ system_prompt_len=args.gsp_system_prompt_len,
+ question_len=args.gsp_question_len,
+ output_len=args.gsp_output_len,
  tokenizer=tokenizer,
  )
  else:
@@ -516,6 +518,7 @@ class BenchmarkMetrics:
  median_e2e_latency_ms: float
  std_e2e_latency_ms: float
  p99_e2e_latency_ms: float
+ concurrency: float


  SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -560,6 +563,8 @@ def sample_sharegpt_requests(
  num_requests: int,
  tokenizer: PreTrainedTokenizerBase,
  fixed_output_len: Optional[int] = None,
+ context_len: Optional[int] = None,
+ apply_chat_template=False,
  ) -> List[Tuple[str, int, int]]:
  if fixed_output_len is not None and fixed_output_len < 4:
  raise ValueError("output_len too small")
@@ -590,6 +595,15 @@ def sample_sharegpt_requests(

  # Tokenize the prompts and completions.
  prompt = dataset[i][0]
+
+ if apply_chat_template:
+ prompt = tokenizer.apply_chat_template(
+ [{"role": "user", "content": prompt}],
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ prompt = prompt.replace(tokenizer.bos_token, "")
+
  prompt_token_ids = tokenizer.encode(prompt)
  completion = dataset[i][1]
  completion_token_ids = tokenizer.encode(completion)
@@ -597,14 +611,15 @@ def sample_sharegpt_requests(
  output_len = (
  len(completion_token_ids) if fixed_output_len is None else fixed_output_len
  )
- if prompt_len < 4 or output_len < 4:
+
+ if prompt_len < 2 or output_len < 2:
  # Prune too short sequences.
  continue
- if prompt_len > 1024 or (
- prompt_len + output_len > 2048 and fixed_output_len is None
- ):
+
+ if context_len and prompt_len + output_len > context_len:
  # Prune too long sequences.
  continue
+
  filtered_dataset.append((prompt, prompt_len, output_len))

  print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
@@ -706,8 +721,8 @@ def get_gen_prefix_cache_path(args, tokenizer):

  # Create a unique cache filename based on the generation parameters
  cache_key = (
- f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
- f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+ f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+ f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
  f"{tokenizer.__class__.__name__}.pkl"
  )
  return cache_dir / cache_key
@@ -877,6 +892,7 @@ def calculate_metrics(
  median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
  std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
  p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+ concurrency=np.sum(e2e_latencies) / dur_s,
  )

  return metrics, output_lens
@@ -1028,6 +1044,7 @@ async def benchmark(
  "Total token throughput (tok/s):", metrics.total_throughput
  )
  )
+ print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
  print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
  print(
  "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1059,13 +1076,24 @@ async def benchmark(
  and metrics.output_throughput is not None
  ):
  result = {
+ # Arguments
  "backend": args.backend,
  "dataset_name": args.dataset_name,
  "request_rate": request_rate,
  "max_concurrency": max_concurrency,
+ "sharegpt_output_len": args.sharegpt_output_len,
+ "random_input_len": args.random_input_len,
+ "random_output_len": args.random_output_len,
+ "random_range_ratio": args.random_range_ratio,
+ # Results
+ "duration": benchmark_duration,
+ "completed": metrics.completed,
  "total_input_tokens": metrics.total_input,
  "total_output_tokens": metrics.total_output,
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
+ "request_throughput": metrics.request_throughput,
+ "input_throughput": metrics.input_throughput,
+ "output_throughput": metrics.output_throughput,
  "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
  "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
  "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
@@ -1082,14 +1110,7 @@ async def benchmark(
  "median_itl_ms": metrics.median_itl_ms,
  "std_itl_ms": metrics.std_itl_ms,
  "p99_itl_ms": metrics.p99_itl_ms,
- "input_throughput": metrics.input_throughput,
- "output_throughput": metrics.output_throughput,
- "sharegpt_output_len": args.sharegpt_output_len,
- "random_input_len": args.random_input_len,
- "random_output_len": args.random_output_len,
- "random_range_ratio": args.random_range_ratio,
- "duration": benchmark_duration,
- "completed": metrics.completed,
+ "concurrency": metrics.concurrency,
  }
  else:
  print(f"Error running benchmark for request rate: {request_rate}")
@@ -1109,36 +1130,16 @@ async def benchmark(
  with open(output_file_name, "a") as file:
  file.write(json.dumps(result) + "\n")

- result = {
- "duration": benchmark_duration,
- "completed": metrics.completed,
- "total_input_tokens": metrics.total_input,
- "total_output_tokens": metrics.total_output,
- "total_output_tokens_retokenized": metrics.total_output_retokenized,
- "request_throughput": metrics.request_throughput,
- "input_throughput": metrics.input_throughput,
- "output_throughput": metrics.output_throughput,
- "mean_ttft_ms": metrics.mean_ttft_ms,
- "median_ttft_ms": metrics.median_ttft_ms,
- "std_ttft_ms": metrics.std_ttft_ms,
- "p99_ttft_ms": metrics.p99_ttft_ms,
- "mean_tpot_ms": metrics.mean_tpot_ms,
- "median_tpot_ms": metrics.median_tpot_ms,
- "std_tpot_ms": metrics.std_tpot_ms,
- "p99_tpot_ms": metrics.p99_tpot_ms,
- "mean_itl_ms": metrics.mean_itl_ms,
- "median_itl_ms": metrics.median_itl_ms,
- "std_itl_ms": metrics.std_itl_ms,
- "p99_itl_ms": metrics.p99_itl_ms,
- "input_lens": [output.prompt_len for output in outputs],
- "output_lens": output_lens,
- "ttfts": [output.ttft for output in outputs],
- "itls": [output.itl for output in outputs],
- "generated_texts": [output.generated_text for output in outputs],
- "errors": [output.error for output in outputs],
- "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
- "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
- }
+ result.update(
+ {
+ "input_lens": [output.prompt_len for output in outputs],
+ "output_lens": output_lens,
+ "ttfts": [output.ttft for output in outputs],
+ "itls": [output.itl for output in outputs],
+ "generated_texts": [output.generated_text for output in outputs],
+ "errors": [output.error for output in outputs],
+ }
+ )
  return result


@@ -1374,6 +1375,12 @@ if __name__ == "__main__":
  default=None,
  help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
  )
+ parser.add_argument(
+ "--sharegpt-context-len",
+ type=int,
+ default=None,
+ help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+ )
  parser.add_argument(
  "--random-input-len",
  type=int,
@@ -1413,7 +1420,6 @@ if __name__ == "__main__":
  "actual request rate may be lower than specified with --request-rate, "
  "if the server is not processing requests fast enough to keep up.",
  )
- parser.add_argument("--seed", type=int, default=1, help="The random seed.")
  parser.add_argument(
  "--multi",
  action="store_true",
@@ -1437,14 +1443,15 @@ if __name__ == "__main__":
  help="Disable streaming mode.",
  )
  parser.add_argument(
- "--disable-ignore-eos",
+ "--return-logprob",
  action="store_true",
- help="Disable ignoring EOS.",
+ help="Return logprob.",
  )
+ parser.add_argument("--seed", type=int, default=1, help="The random seed.")
  parser.add_argument(
- "--return-logprob",
+ "--disable-ignore-eos",
  action="store_true",
- help="Return logprob.",
+ help="Disable ignoring EOS.",
  )
  parser.add_argument(
  "--extra-request-body",
@@ -1453,49 +1460,54 @@ if __name__ == "__main__":
  help="Append given JSON object to the request payload. You can use this to specify"
  "additional generate params like sampling params.",
  )
+ parser.add_argument(
+ "--apply-chat-template",
+ action="store_true",
+ help="Apply chat template",
+ )
+ parser.add_argument(
+ "--profile",
+ action="store_true",
+ help="Use Torch Profiler. The endpoint must be launched with "
+ "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+ )
+ parser.add_argument(
+ "--lora-name",
+ type=str,
+ default=None,
+ help="The name of LoRA adapter",
+ )

  group = parser.add_argument_group("generated-shared-prefix dataset arguments")
  group.add_argument(
- "--gen-num-groups",
+ "--gsp-num-groups",
  type=int,
  default=64,
  help="Number of system prompt groups for generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-prompts-per-group",
+ "--gsp-prompts-per-group",
  type=int,
  default=16,
  help="Number of prompts per system prompt group for generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-system-prompt-len",
+ "--gsp-system-prompt-len",
  type=int,
  default=2048,
  help="Target length in tokens for system prompts in generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-question-len",
+ "--gsp-question-len",
  type=int,
  default=128,
  help="Target length in tokens for questions in generated-shared-prefix dataset",
  )
  group.add_argument(
- "--gen-output-len",
+ "--gsp-output-len",
  type=int,
  default=256,
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
  )
- parser.add_argument(
- "--profile",
- action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
- )
- parser.add_argument(
- "--lora-name",
- type=str,
- default=None,
- help="The name of LoRA adapter",
- )
  args = parser.parse_args()
  run_benchmark(args)
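
The hunks above rename the generated-shared-prefix flags from --gen-* to --gsp-* and add --sharegpt-context-len and --apply-chat-template. The sketch below is not part of the diff; it only illustrates how the new flags might be passed, and it assumes bench_serving can be run as the module sglang.bench_serving.

# Hedged sketch: exercising the 0.4.2 flags shown above. The module entry point and
# flag values are assumptions for illustration, not taken from the package itself.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "sharegpt",
        "--num-prompts", "200",
        "--sharegpt-context-len", "4096",  # new: prune requests longer than the model context
        "--apply-chat-template",           # new: wrap each prompt with the tokenizer's chat template
    ],
    check=True,
)
# For the generated-shared-prefix dataset, the group flags are now --gsp-num-groups,
# --gsp-prompts-per-group, --gsp-system-prompt-len, --gsp-question-len, --gsp-output-len.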
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -1,6 +1,11 @@
+ import atexit
  import json
+ import multiprocessing
  import warnings
- from typing import List, Optional
+ from typing import Dict, List, Optional, Union
+
+ import aiohttp
+ import requests

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
@@ -251,11 +256,12 @@ class RuntimeEndpoint(BaseBackend):
  }
  obj = self._generate_http_request(s, data)

- normalized_prompt_logprobs = [
- r["meta_info"]["normalized_prompt_logprob"] for r in obj
- ]
  input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
  output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
+ normalized_prompt_logprobs = [
+ compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
+ for r in obj
+ ]

  # Remove extra token if no token healing occurred
  for i in range(len(input_token_logprobs)):
@@ -319,3 +325,176 @@ class RuntimeEndpoint(BaseBackend):
  def _assert_success(self, res):
  if res.status_code != 200:
  raise RuntimeError(res.json())
+
+
+ def compute_normalized_prompt_logprobs(input_logprobs):
+ values = [x[0] for x in input_logprobs if x[0]]
+ return sum(values) / len(values)
+
+
+ class Runtime:
+ """
+ A wrapper for the HTTP server.
+ This is used for launching the server in a python program without
+ using the commond line interface.
+
+ It is mainly used for the frontend language.
+ You should use the Engine class if you want to do normal offline processing without the frontend language.
+ """
+
+ def __init__(
+ self,
+ log_level: str = "error",
+ *args,
+ **kwargs,
+ ):
+ """See the arguments in server_args.py::ServerArgs"""
+ # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+ # client code without installing SRT server and its dependency if they want.
+ from sglang.srt.entrypoints.http_server import launch_server
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import is_port_available
+
+ self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+
+ # Pre-allocate ports
+ for port in range(self.server_args.port, 40000):
+ if is_port_available(port):
+ break
+ self.server_args.port = port
+
+ self.url = self.server_args.url()
+ self.generate_url = self.url + "/generate"
+
+ # NOTE: We store pid instead of proc to fix some issues during __delete__
+ self.pid = None
+ pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
+
+ proc = multiprocessing.Process(
+ target=launch_server,
+ args=(self.server_args, pipe_writer),
+ )
+ proc.start()
+ pipe_writer.close()
+ self.pid = proc.pid
+
+ # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+ atexit.register(self.shutdown)
+
+ # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
+ try:
+ init_state = pipe_reader.recv()
+ except EOFError:
+ init_state = ""
+
+ if init_state != "ready":
+ self.shutdown()
+ raise RuntimeError(
+ "Initialization failed. Please see the error messages above."
+ )
+
+ self.endpoint = RuntimeEndpoint(self.url)
+
+ def shutdown(self):
+ from sglang.srt.utils import kill_process_tree
+
+ if self.pid is not None:
+ kill_process_tree(self.pid)
+ self.pid = None
+
+ def cache_prefix(self, prefix: str):
+ self.endpoint.cache_prefix(prefix)
+
+ def get_tokenizer(self):
+ from sglang.srt.hf_transformers_utils import get_tokenizer
+
+ return get_tokenizer(
+ self.server_args.tokenizer_path,
+ tokenizer_mode=self.server_args.tokenizer_mode,
+ trust_remote_code=self.server_args.trust_remote_code,
+ revision=self.server_args.revision,
+ )
+
+ async def async_generate(
+ self,
+ prompt: str,
+ sampling_params: Optional[Dict] = None,
+ ):
+ if self.server_args.skip_tokenizer_init:
+ json_data = {
+ "input_ids": prompt,
+ "sampling_params": sampling_params,
+ "stream": True,
+ }
+ else:
+ json_data = {
+ "text": prompt,
+ "sampling_params": sampling_params,
+ "stream": True,
+ }
+ pos = 0
+
+ timeout = aiohttp.ClientTimeout(total=3 * 3600)
+ async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
+ async with session.post(self.generate_url, json=json_data) as response:
+ async for chunk, _ in response.content.iter_chunks():
+ chunk = chunk.decode("utf-8")
+ if chunk and chunk.startswith("data:"):
+ if chunk == "data: [DONE]\n\n":
+ break
+ data = json.loads(chunk[5:].strip("\n"))
+ if "text" in data:
+ cur = data["text"][pos:]
+ if cur:
+ yield cur
+ pos += len(cur)
+ else:
+ yield data
+
+ add_request = async_generate
+
+ def generate(
+ self,
+ prompt: Union[str, List[str]],
+ sampling_params: Optional[Dict] = None,
+ return_logprob: Optional[Union[List[bool], bool]] = False,
+ logprob_start_len: Optional[Union[List[int], int]] = None,
+ top_logprobs_num: Optional[Union[List[int], int]] = None,
+ lora_path: Optional[List[Optional[str]]] = None,
+ ):
+ json_data = {
+ "text": prompt,
+ "sampling_params": sampling_params,
+ "return_logprob": return_logprob,
+ "logprob_start_len": logprob_start_len,
+ "top_logprobs_num": top_logprobs_num,
+ "lora_path": lora_path,
+ }
+ assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
+ response = requests.post(
+ self.url + "/generate",
+ json=json_data,
+ )
+ return json.dumps(response.json())
+
+ def encode(
+ self,
+ prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+ ):
+ json_data = {"text": prompt}
+ response = requests.post(self.url + "/encode", json=json_data)
+ return json.dumps(response.json())
+
+ async def get_server_info(self):
+ async with aiohttp.ClientSession() as session:
+ async with session.get(f"{self.url}/get_server_info") as response:
+ if response.status == 200:
+ return await response.json()
+ else:
+ error_data = await response.json()
+ raise RuntimeError(
+ f"Failed to get server info. {error_data['error']['message']}"
+ )
+
+ def __del__(self):
+ self.shutdown()
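
The new Runtime class above launches the HTTP server in a subprocess and proxies /generate and /encode. A minimal usage sketch follows; the model path and sampling parameters are assumptions for illustration, and generate() returns the server response serialized as a JSON string, as shown in the code above.

# Hedged sketch of driving the Runtime wrapper added in 0.4.2 (model path is hypothetical).
import json

from sglang.lang.backend.runtime_endpoint import Runtime

runtime = Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")
out = runtime.generate(
    "The capital of France is",
    sampling_params={"temperature": 0, "max_new_tokens": 16},
)
print(json.loads(out)["text"])  # generate() wraps the /generate response in json.dumps
runtime.shutdown()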
sglang/lang/chat_template.py CHANGED
@@ -88,7 +88,6 @@ register_chat_template(
  )
  )

-
  register_chat_template(
  ChatTemplate(
  name="claude",
@@ -101,7 +100,6 @@ register_chat_template(
  )
  )

-
  register_chat_template(
  ChatTemplate(
  name="chatml",
@@ -116,7 +114,6 @@ register_chat_template(
  )
  )

-
  register_chat_template(
  ChatTemplate(
  name="chatml-llava",
@@ -132,7 +129,6 @@ register_chat_template(
  )
  )

-
  # There is default system prompt for qwen
  # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
  # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
@@ -219,6 +215,21 @@ register_chat_template(
  )
  )

+ # https://huggingface.co/openbmb/MiniCPM-V-2_6
+ register_chat_template(
+ ChatTemplate(
+ name="minicpmv",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": ("", " "),
+ "user": ("user:", " "),
+ "assistant": ("assistant:", "</s>"),
+ },
+ stop_str=("<|im_end|>", "<|endoftext|>"),
+ image_token="(<image>./</image>)",
+ )
+ )
+
  # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
  register_chat_template(
  ChatTemplate(
@@ -343,6 +354,37 @@ register_chat_template(
  )


+ register_chat_template(
+ ChatTemplate(
+ name="deepseek-v3",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": (
+ "",
+ "",
+ ),
+ "user": (
+ "<|User|>",
+ "",
+ ),
+ "assistant": (
+ "<|Assistant|>",
+ "<|end▁of▁sentence|>",
+ ),
+ },
+ stop_str=("<|end▁of▁sentence|>",),
+ )
+ )
+
+
+ @register_chat_template_matching_function
+ def match_deepseek(model_path: str):
+ if (
+ "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
+ ) and "base" not in model_path.lower():
+ return get_chat_template("deepseek-v3")
+
+
  @register_chat_template_matching_function
  def match_dbrx(model_path: str):
  if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
sglang/launch_server.py CHANGED
@@ -3,7 +3,7 @@
  import os
  import sys

- from sglang.srt.server import launch_server
+ from sglang.srt.entrypoints.http_server import launch_server
  from sglang.srt.server_args import prepare_server_args
  from sglang.srt.utils import kill_process_tree
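
In 0.4.2, launch_server moves from sglang.srt.server to sglang.srt.entrypoints.http_server, as the hunk above shows. A minimal sketch of launching the server programmatically with the new import path follows; the ServerArgs fields shown here are assumptions for illustration.

# Hedged sketch: starting the HTTP server via the relocated entry point.
from sglang.srt.entrypoints.http_server import launch_server
from sglang.srt.server_args import ServerArgs

if __name__ == "__main__":
    server_args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model
        port=30000,
    )
    launch_server(server_args)  # blocks and serves /generate, /encode, etc.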