sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +92 -0
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +76 -15
  11. sglang/srt/constrained/__init__.py +18 -0
  12. sglang/srt/constrained/bnf_cache.py +61 -0
  13. sglang/srt/constrained/fsm_cache.py +10 -3
  14. sglang/srt/constrained/grammar.py +190 -0
  15. sglang/srt/hf_transformers_utils.py +20 -5
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
  18. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  19. sglang/srt/layers/fused_moe/fused_moe.py +4 -3
  20. sglang/srt/layers/fused_moe/layer.py +28 -0
  21. sglang/srt/layers/logits_processor.py +5 -5
  22. sglang/srt/layers/quantization/base_config.py +16 -1
  23. sglang/srt/layers/rotary_embedding.py +15 -48
  24. sglang/srt/layers/sampler.py +51 -39
  25. sglang/srt/layers/vocab_parallel_embedding.py +486 -0
  26. sglang/srt/managers/data_parallel_controller.py +8 -7
  27. sglang/srt/managers/detokenizer_manager.py +11 -9
  28. sglang/srt/managers/image_processor.py +4 -3
  29. sglang/srt/managers/io_struct.py +80 -78
  30. sglang/srt/managers/schedule_batch.py +46 -52
  31. sglang/srt/managers/schedule_policy.py +24 -13
  32. sglang/srt/managers/scheduler.py +145 -82
  33. sglang/srt/managers/tokenizer_manager.py +236 -334
  34. sglang/srt/managers/tp_worker.py +5 -5
  35. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  36. sglang/srt/mem_cache/flush_cache.py +1 -1
  37. sglang/srt/mem_cache/memory_pool.py +10 -3
  38. sglang/srt/model_executor/cuda_graph_runner.py +34 -23
  39. sglang/srt/model_executor/forward_batch_info.py +6 -9
  40. sglang/srt/model_executor/model_runner.py +10 -19
  41. sglang/srt/models/baichuan.py +4 -4
  42. sglang/srt/models/chatglm.py +4 -4
  43. sglang/srt/models/commandr.py +1 -1
  44. sglang/srt/models/dbrx.py +5 -5
  45. sglang/srt/models/deepseek.py +4 -4
  46. sglang/srt/models/deepseek_v2.py +4 -4
  47. sglang/srt/models/exaone.py +4 -4
  48. sglang/srt/models/gemma.py +1 -1
  49. sglang/srt/models/gemma2.py +1 -1
  50. sglang/srt/models/gpt2.py +287 -0
  51. sglang/srt/models/gpt_bigcode.py +1 -1
  52. sglang/srt/models/grok.py +4 -4
  53. sglang/srt/models/internlm2.py +4 -4
  54. sglang/srt/models/llama.py +15 -7
  55. sglang/srt/models/llama_embedding.py +2 -10
  56. sglang/srt/models/llama_reward.py +5 -0
  57. sglang/srt/models/minicpm.py +4 -4
  58. sglang/srt/models/minicpm3.py +4 -4
  59. sglang/srt/models/mixtral.py +7 -5
  60. sglang/srt/models/mixtral_quant.py +4 -4
  61. sglang/srt/models/mllama.py +5 -5
  62. sglang/srt/models/olmo.py +4 -4
  63. sglang/srt/models/olmoe.py +4 -4
  64. sglang/srt/models/qwen.py +4 -4
  65. sglang/srt/models/qwen2.py +4 -4
  66. sglang/srt/models/qwen2_moe.py +4 -4
  67. sglang/srt/models/qwen2_vl.py +4 -8
  68. sglang/srt/models/stablelm.py +4 -4
  69. sglang/srt/models/torch_native_llama.py +4 -4
  70. sglang/srt/models/xverse.py +4 -4
  71. sglang/srt/models/xverse_moe.py +4 -4
  72. sglang/srt/openai_api/adapter.py +52 -66
  73. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  74. sglang/srt/sampling/sampling_batch_info.py +7 -13
  75. sglang/srt/sampling/sampling_params.py +5 -7
  76. sglang/srt/server.py +41 -33
  77. sglang/srt/server_args.py +34 -5
  78. sglang/srt/utils.py +40 -56
  79. sglang/test/run_eval.py +2 -0
  80. sglang/test/runners.py +2 -1
  81. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  82. sglang/test/test_utils.py +151 -6
  83. sglang/utils.py +62 -1
  84. sglang/version.py +1 -1
  85. sglang-0.3.5.dist-info/METADATA +344 -0
  86. sglang-0.3.5.dist-info/RECORD +152 -0
  87. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
  88. sglang-0.3.4.post1.dist-info/METADATA +0 -900
  89. sglang-0.3.4.post1.dist-info/RECORD +0 -148
  90. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -63,6 +63,7 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    decode_log_interval: int = 40

     # Logging
     log_level: str = "info"
@@ -74,6 +75,7 @@ class ServerArgs:
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
+    watchdog_timeout: float = 600

     # Data parallelism
     dp_size: int = 1
@@ -102,6 +104,7 @@ class ServerArgs:
     # Kernel backend
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
+    grammar_backend: Optional[str] = "outlines"

     # Optimization/debug options
     disable_flashinfer: bool = False
@@ -118,7 +121,8 @@ class ServerArgs:
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
-    max_torch_compile_bs: int = 32
+    torch_compile_max_bs: int = 32
+    cuda_graph_max_bs: int = 160
     torchao_config: str = ""
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
@@ -427,6 +431,18 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )

         # Data parallelism
         parser.add_argument(
@@ -537,6 +553,13 @@ class ServerArgs:
             default=ServerArgs.sampling_backend,
             help="Choose the kernels for sampling layers.",
         )
+        parser.add_argument(
+            "--grammar-backend",
+            type=str,
+            choices=["xgrammar", "outlines"],
+            default=ServerArgs.grammar_backend,
+            help="Choose the backend for constrained decoding.",
+        )

         # Optimization/debug options
         parser.add_argument(
@@ -611,11 +634,17 @@ class ServerArgs:
             help="Optimize the model with torch.compile. Experimental feature.",
         )
         parser.add_argument(
-            "--max-torch-compile-bs",
+            "--torch-compile-max-bs",
             type=int,
-            default=ServerArgs.max_torch_compile_bs,
+            default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -712,11 +741,11 @@ class PortArgs:

     @staticmethod
     def init_new(server_args) -> "PortArgs":
-        port = server_args.port + 1
+        port = server_args.port + 42
         while True:
             if is_port_available(port):
                 break
-            port += 1
+            port += 42

         return PortArgs(
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
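For orientation, a minimal sketch of how the new and renamed options above could be set programmatically. This is not taken from the package; the model path is a placeholder, and only the fields shown in the diff are assumed to exist:

    # Hypothetical sketch: the ServerArgs fields added or renamed in 0.3.5.
    from sglang.srt.server_args import ServerArgs

    args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        grammar_backend="outlines",   # new; "outlines" (default) or "xgrammar"
        watchdog_timeout=600,         # new; crash instead of hanging on a stuck forward batch
        decode_log_interval=40,       # new; log decode batches every N steps
        torch_compile_max_bs=32,      # renamed from max_torch_compile_bs
        cuda_graph_max_bs=160,        # new; upper bound on the cuda graph batch size
    )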
sglang/srt/utils.py CHANGED
@@ -35,6 +35,7 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from torch import nn
@@ -203,56 +204,6 @@ def is_port_available(port):
     return False


-def is_multimodal_model(model_architectures):
-    if (
-        "LlavaLlamaForCausalLM" in model_architectures
-        or "LlavaQwenForCausalLM" in model_architectures
-        or "LlavaMistralForCausalLM" in model_architectures
-        or "LlavaVidForCausalLM" in model_architectures
-        or "MllamaForConditionalGeneration" in model_architectures
-        or "Qwen2VLForConditionalGeneration" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_attention_free_model(model_architectures):
-    return False
-
-
-def model_has_inner_state(model_architectures):
-    return False
-
-
-def is_embedding_model(model_architectures):
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return True
-    else:
-        return False
-
-
-def is_generation_model(model_architectures, is_embedding: bool = False):
-    # We have two ways to determine whether a model is a generative model.
-    # 1. Check the model architectue
-    # 2. check the `is_embedding` server args
-
-    if (
-        "LlamaEmbeddingModel" in model_architectures
-        or "MistralModel" in model_architectures
-        or "LlamaForSequenceClassification" in model_architectures
-        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
-    ):
-        return False
-    else:
-        return not is_embedding
-
-
 def decode_video_base64(video_base64):
     from PIL import Image

@@ -397,17 +348,26 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    kill_child_process(parent_process.pid, skip_pid=current_process.pid)
+    kill_child_process(
+        parent_process.pid, include_self=True, skip_pid=current_process.pid
+    )
+    try:
+        current_process.kill()
+    except psutil.NoSuchProcess:
+        pass


-def kill_child_process(pid, including_parent=True, skip_pid=None):
+def kill_child_process(pid=None, include_self=False, skip_pid=None):
     """Kill the process and all its children process."""
+    if pid is None:
+        pid = os.getpid()
+
     try:
-        parent = psutil.Process(pid)
+        itself = psutil.Process(pid)
     except psutil.NoSuchProcess:
         return

-    children = parent.children(recursive=True)
+    children = itself.children(recursive=True)
     for child in children:
         if child.pid == skip_pid:
             continue
@@ -416,9 +376,9 @@ def kill_child_process(pid, including_parent=True, skip_pid=None):
         except psutil.NoSuchProcess:
             pass

-    if including_parent:
+    if include_self:
         try:
-            parent.kill()
+            itself.kill()
         except psutil.NoSuchProcess:
             pass

@@ -720,3 +680,27 @@ def first_rank_print(*args, **kwargs):
         print(*args, **kwargs)
     else:
         pass
+
+
+def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint: str):
+    mem = psutil.virtual_memory()
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)
+    else:
+        buf_size = -1
+
+    socket = context.socket(socket_type)
+    if socket_type == zmq.PUSH:
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+        socket.connect(f"ipc://{endpoint}")
+    elif socket_type == zmq.PULL:
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+        socket.bind(f"ipc://{endpoint}")
+    else:
+        raise ValueError(f"Unsupported socket type: {socket_type}")
+
+    return socket
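A rough usage sketch for the reworked process-cleanup helper and the new ZMQ helper, based only on the signatures shown above; the IPC endpoint name and payload are illustrative:

    # Hypothetical sketch of get_zmq_socket / kill_child_process usage (0.3.5 signatures).
    import tempfile
    import zmq
    from sglang.srt.utils import get_zmq_socket, kill_child_process

    ctx = zmq.Context()
    endpoint = tempfile.NamedTemporaryFile(delete=False).name  # illustrative IPC path

    receiver = get_zmq_socket(ctx, zmq.PULL, endpoint)  # PULL binds ipc://{endpoint}
    sender = get_zmq_socket(ctx, zmq.PUSH, endpoint)    # PUSH connects to it

    sender.send_pyobj({"msg": "hello"})
    print(receiver.recv_pyobj())

    # New default behavior: with no pid, only the current process's children are
    # killed; pass include_self=True (formerly including_parent) to also kill pid.
    kill_child_process()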
sglang/test/run_eval.py CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
    parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
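The new temperature knob can also be supplied when calling run_eval programmatically, mirroring how run_mmlu_test further below builds its arguments; the base URL and model here are placeholders for a locally running server:

    # Sketch: run_eval with the new temperature argument (defaults to 0.0 if omitted).
    from types import SimpleNamespace
    from sglang.test.run_eval import run_eval

    args = SimpleNamespace(
        base_url="http://127.0.0.1:30000",          # placeholder server address
        model="meta-llama/Llama-3.1-8B-Instruct",   # placeholder model
        eval_name="mmlu",
        num_examples=64,
        num_threads=64,
        temperature=0.0,
    )
    metrics = run_eval(args)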
sglang/test/runners.py CHANGED
@@ -273,6 +273,7 @@ class SRTRunner:
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
         )
+        self.tokenizer = get_tokenizer(model_path)

     def forward(
         self,
@@ -366,7 +367,7 @@ class SRTRunner:
             return ModelOutput(embed_logits=logits)
         else:
             scores = [x["embedding"][0] for x in response]
-            return ModelOutput(scores=logits)
+            return ModelOutput(scores=scores)

     def __enter__(self):
         return self
sglang/test/srt/sampling/penaltylib/utils.py CHANGED
@@ -24,6 +24,7 @@ class MockSamplingParams:

 @dataclasses.dataclass
 class MockTokenizer:
     eos_token_id: int
+    additional_stop_token_ids: typing.Optional[typing.List[int]] = None


 @dataclasses.dataclass
sglang/test/test_utils.py CHANGED
@@ -3,9 +3,11 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, List, Optional
@@ -20,6 +22,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +403,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr: bool = False,
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +426,8 @@ def popen_launch_server(
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -493,7 +496,7 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
         )
         assert ret_code == 0
     except TimeoutError:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)
         time.sleep(5)
         print(
             f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
@@ -561,7 +564,7 @@ def run_bench_serving(
     try:
         res = run_benchmark(args)
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     assert res["completed"] == num_prompts
     return res
@@ -594,7 +597,7 @@ def run_bench_latency(model, other_args):
         lastline = output.split("\n")[-3]
         output_throughput = float(lastline.split(" ")[-2])
     finally:
-        kill_child_process(process.pid)
+        kill_child_process(process.pid, include_self=True)

     return output_throughput

@@ -631,3 +634,145 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_and_check_memory_leak(
+    workload_func,
+    disable_radix_cache,
+    enable_mixed_chunk,
+    enable_overlap,
+    chunked_prefill_size,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the workload
+    workload_func(base_url, model)
+
+    # Clean up everything
+    kill_child_process(process.pid, include_self=True)
+    kill_child_process(process.pid, include_self=True)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    assert not has_leak
+
+
+def run_mmlu_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    def workload_func(base_url, model):
+        # Run the eval
+        args = SimpleNamespace(
+            base_url=base_url,
+            model=model,
+            eval_name="mmlu",
+            num_examples=128,
+            num_threads=128,
+        )
+
+        try:
+            metrics = run_eval(args)
+            print(f"{metrics=}")
+            assert metrics["score"] >= 0.65
+        finally:
+            pass
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
+
+
+def run_mulit_request_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):

+    def workload_func(base_url, model):
+        def run_one(_):
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+
+            response = requests.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 8,
+                    },
+                },
+            )
+            ret = response.json()
+
+        with ThreadPoolExecutor(2) as executor:
+            list(executor.map(run_one, list(range(4))))
+
+    run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
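A short sketch of how these new test helpers are meant to be invoked, using only the signatures shown above; flag values are illustrative, and each call launches a local server on a random port:

    # Hypothetical usage of the memory-leak test helpers added in 0.3.5.
    from sglang.test.test_utils import run_mmlu_test, run_mulit_request_test

    # Launch a server, run MMLU as the workload, then scan the captured server
    # logs and assert that no "leak" message was printed.
    run_mmlu_test(disable_radix_cache=True, chunked_prefill_size=2048)

    # Same harness, but the workload is a small burst of concurrent /generate calls.
    run_mulit_request_test(enable_overlap=True)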
sglang/utils.py CHANGED
@@ -1,12 +1,15 @@
 """Common utilities."""

 import base64
+import gc
 import importlib
 import json
 import logging
 import os
 import signal
+import subprocess
 import sys
+import time
 import traceback
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor
@@ -16,6 +19,7 @@ from typing import Optional, Union

 import numpy as np
 import requests
+from IPython.display import HTML, display
 from tqdm import tqdm

 logger = logging.getLogger(__name__)
@@ -151,7 +155,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)

     frames = []
-    for i in range(total_frames):
+    for _ in range(total_frames):
         ret, frame = cap.read()
         if ret:
             frames.append(frame)
@@ -294,3 +298,60 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
             bar.update(len(chunk))

     return filename
+
+
+def execute_shell_command(command: str) -> subprocess.Popen:
+    """
+    Execute a shell command and return the process handle
+
+    Args:
+        command: Shell command as a string (can include \\ line continuations)
+    Returns:
+        subprocess.Popen: Process handle
+    """
+    # Replace \ newline with space and split
+    command = command.replace("\\\n", " ").replace("\\", " ")
+    parts = command.split()
+
+    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
+
+
+def wait_for_server(base_url: str, timeout: int = None) -> None:
+    """Wait for the server to be ready by polling the /v1/models endpoint.
+
+    Args:
+        base_url: The base URL of the server
+        timeout: Maximum time to wait in seconds. None means wait forever.
+    """
+    start_time = time.time()
+    while True:
+        try:
+            response = requests.get(
+                f"{base_url}/v1/models",
+                headers={"Authorization": "Bearer None"},
+            )
+            if response.status_code == 200:
+                time.sleep(5)
+                print_highlight(
+                    """\n
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                    """
+                )
+                break
+
+            if timeout and time.time() - start_time > timeout:
+                raise TimeoutError("Server did not become ready within timeout period")
+        except requests.exceptions.RequestException:
+            time.sleep(1)
+
+
+def terminate_process(process):
+    from sglang.srt.utils import kill_child_process
+    kill_child_process(process.pid, include_self=True)
+
+
+def print_highlight(html_content: str):
+    html_content = str(html_content).replace("\n", "<br>")
+    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
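A plausible notebook-style workflow tying these new helpers together; the launch command, model, and port are placeholders, and print_highlight assumes an IPython environment:

    # Hypothetical sketch using the new notebook helpers in sglang.utils.
    from sglang.utils import (
        execute_shell_command,
        print_highlight,
        terminate_process,
        wait_for_server,
    )

    server_process = execute_shell_command(
        "python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000"
    )
    wait_for_server("http://127.0.0.1:30000", timeout=600)
    print_highlight("Server is ready.")

    # ... issue requests against http://127.0.0.1:30000 ...

    terminate_process(server_process)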
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4.post1"
+__version__ = "0.3.5"