sglang 0.3.5__py3-none-any.whl → 0.3.5.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. sglang/bench_serving.py +113 -3
  2. sglang/srt/configs/model_config.py +5 -2
  3. sglang/srt/constrained/__init__.py +2 -66
  4. sglang/srt/constrained/base_grammar_backend.py +72 -0
  5. sglang/srt/constrained/outlines_backend.py +165 -0
  6. sglang/srt/constrained/outlines_jump_forward.py +182 -0
  7. sglang/srt/constrained/xgrammar_backend.py +114 -0
  8. sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
  9. sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
  10. sglang/srt/layers/fused_moe/fused_moe.py +23 -7
  11. sglang/srt/layers/quantization/base_config.py +4 -6
  12. sglang/srt/layers/vocab_parallel_embedding.py +216 -150
  13. sglang/srt/managers/io_struct.py +5 -3
  14. sglang/srt/managers/schedule_batch.py +14 -20
  15. sglang/srt/managers/scheduler.py +153 -94
  16. sglang/srt/managers/tokenizer_manager.py +81 -17
  17. sglang/srt/metrics/collector.py +211 -0
  18. sglang/srt/metrics/func_timer.py +108 -0
  19. sglang/srt/mm_utils.py +1 -1
  20. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  21. sglang/srt/model_executor/forward_batch_info.py +7 -3
  22. sglang/srt/model_executor/model_runner.py +2 -1
  23. sglang/srt/models/gemma2_reward.py +69 -0
  24. sglang/srt/models/gpt2.py +31 -37
  25. sglang/srt/models/internlm2_reward.py +62 -0
  26. sglang/srt/models/llama.py +11 -6
  27. sglang/srt/models/llama_reward.py +5 -26
  28. sglang/srt/models/qwen2_vl.py +5 -7
  29. sglang/srt/openai_api/adapter.py +6 -2
  30. sglang/srt/sampling/sampling_batch_info.py +2 -3
  31. sglang/srt/sampling/sampling_params.py +0 -14
  32. sglang/srt/server.py +58 -16
  33. sglang/srt/server_args.py +42 -22
  34. sglang/srt/utils.py +87 -0
  35. sglang/test/simple_eval_common.py +1 -1
  36. sglang/test/simple_eval_humaneval.py +2 -2
  37. sglang/test/simple_eval_mgsm.py +2 -2
  38. sglang/test/test_utils.py +18 -4
  39. sglang/utils.py +1 -0
  40. sglang/version.py +1 -1
  41. {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA +11 -7
  42. {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/RECORD +45 -42
  43. {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/WHEEL +1 -1
  44. sglang/srt/constrained/base_tool_cache.py +0 -65
  45. sglang/srt/constrained/bnf_cache.py +0 -61
  46. sglang/srt/constrained/fsm_cache.py +0 -95
  47. sglang/srt/constrained/grammar.py +0 -190
  48. sglang/srt/constrained/jump_forward.py +0 -203
  49. {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/LICENSE +0 -0
  50. {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server.py CHANGED
@@ -30,12 +30,11 @@ import time
  from http import HTTPStatus
  from typing import AsyncIterator, Dict, List, Optional, Union

- import orjson
-
  # Fix a bug of Python threading
  setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

  import aiohttp
+ import orjson
  import requests
  import uvicorn
  import uvloop
@@ -57,6 +56,7 @@ from sglang.srt.managers.io_struct import (
  )
  from sglang.srt.managers.scheduler import run_scheduler_process
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
+ from sglang.srt.metrics.func_timer import enable_func_timer, time_func_latency
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
      v1_batches,
@@ -74,12 +74,15 @@ from sglang.srt.openai_api.protocol import ModelCard, ModelList
  from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.utils import (
      add_api_key_middleware,
+     add_prometheus_middleware,
      assert_pkg_version,
      configure_logger,
+     delete_directory,
      is_port_available,
      kill_child_process,
      maybe_set_triton_cache_manager,
      prepare_model_and_tokenizer,
+     set_prometheus_multiproc_dir,
      set_ulimit,
  )
  from sglang.utils import get_exception_traceback
@@ -90,8 +93,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())


  app = FastAPI()
- tokenizer_manager: TokenizerManager = None
-
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -100,6 +101,10 @@ app.add_middleware(
      allow_headers=["*"],
  )

+ tokenizer_manager: TokenizerManager = None
+
+ ##### Native API endpoints #####
+

  @app.get("/health")
  async def health() -> Response:
@@ -110,9 +115,16 @@ async def health() -> Response:
  @app.get("/health_generate")
  async def health_generate(request: Request) -> Response:
      """Check the health of the inference server by generating one token."""
-     gri = GenerateReqInput(
-         text="s", sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-     )
+
+     if tokenizer_manager.is_generation:
+         gri = GenerateReqInput(
+             input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+         )
+     else:
+         gri = EmbeddingReqInput(
+             input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+         )
+
      try:
          async for _ in tokenizer_manager.generate_request(gri, request):
              break
@@ -185,6 +197,7 @@ async def get_memory_pool_size():


  @app.post("/update_weights")
+ @time_func_latency
  async def update_weights(obj: UpdateWeightReqInput, request: Request):
      """Update the weights inplace without re-launching the server."""
      success, message = await tokenizer_manager.update_weights(obj, request)
@@ -201,7 +214,7 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
      )


- # fastapi implicitly converts json in the request to obj (dataclass)
+ @time_func_latency
  async def generate_request(obj: GenerateReqInput, request: Request):
      """Handle a generate request."""
      if obj.stream:
@@ -234,10 +247,12 @@ async def generate_request(obj: GenerateReqInput, request: Request):
      )


+ # fastapi implicitly converts json in the request to obj (dataclass)
  app.post("/generate")(generate_request)
  app.put("/generate")(generate_request)


+ @time_func_latency
  async def encode_request(obj: EmbeddingReqInput, request: Request):
      """Handle an embedding request."""
      try:
@@ -253,7 +268,8 @@ app.post("/encode")(encode_request)
  app.put("/encode")(encode_request)


- async def judge_request(obj: EmbeddingReqInput, request: Request):
+ @time_func_latency
+ async def classify_request(obj: EmbeddingReqInput, request: Request):
      """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
      try:
          ret = await tokenizer_manager.generate_request(obj, request).__anext__()
@@ -264,21 +280,27 @@ async def judge_request(obj: EmbeddingReqInput, request: Request):
      )


- app.post("/judge")(judge_request)
- app.put("/judge")(judge_request)
+ app.post("/classify")(classify_request)
+ app.put("/classify")(classify_request)
+
+
+ ##### OpenAI-compatible API endpoints #####


  @app.post("/v1/completions")
+ @time_func_latency
  async def openai_v1_completions(raw_request: Request):
      return await v1_completions(tokenizer_manager, raw_request)


  @app.post("/v1/chat/completions")
+ @time_func_latency
  async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


  @app.post("/v1/embeddings", response_class=ORJSONResponse)
+ @time_func_latency
  async def openai_v1_embeddings(raw_request: Request):
      response = await v1_embeddings(tokenizer_manager, raw_request)
      return response
@@ -432,13 +454,17 @@ def launch_server(
      1. The HTTP server and Tokenizer Manager both run in the main process.
      2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library.
      """
-
      launch_engine(server_args=server_args)

      # Add api key authorization
      if server_args.api_key:
          add_api_key_middleware(app, server_args.api_key)

+     # add prometheus middleware
+     if server_args.enable_metrics:
+         add_prometheus_middleware(app)
+         enable_func_timer()
+
      # Send a warmup request
      t = threading.Thread(
          target=_wait_and_warmup, args=(server_args, pipe_finish_writer)
@@ -475,6 +501,10 @@ def _set_envs_and_config(server_args: ServerArgs):
      os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
      os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"

+     # Set prometheus env vars
+     if server_args.enable_metrics:
+         set_prometheus_multiproc_dir()
+
      # Set ulimit
      set_ulimit()

@@ -523,6 +553,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
          return

      model_info = res.json()
+
      # Send a warmup request
      request_name = "/generate" if model_info["is_generation"] else "/encode"
      max_new_tokens = 8 if model_info["is_generation"] else 1
@@ -560,6 +591,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
      if pipe_finish_writer is not None:
          pipe_finish_writer.send("ready")

+     if server_args.delete_ckpt_after_loading:
+         delete_directory(server_args.model_path)
+

  class Runtime:
      """
@@ -720,12 +754,12 @@ class Engine:

          # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
          atexit.register(self.shutdown)
-
+
          # runtime server default log level is log
          # offline engine works in scripts, so we set it to error

-         if 'log_level' not in kwargs:
-             kwargs['log_level'] = 'error'
+         if "log_level" not in kwargs:
+             kwargs["log_level"] = "error"

          server_args = ServerArgs(*args, **kwargs)
          launch_engine(server_args=server_args)
@@ -840,4 +874,12 @@ class Engine:
          else:
              return tokenizer_manager.tokenizer

-     # TODO (ByronHsu): encode
+     def encode(
+         self,
+         prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+     ):
+         obj = EmbeddingReqInput(text=prompt)
+
+         # get the current event loop
+         loop = asyncio.get_event_loop()
+         return loop.run_until_complete(encode_request(obj, None))
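
A minimal, hypothetical usage sketch of the new offline `Engine.encode` API added to sglang/srt/server.py above. The model name and the `is_embedding` keyword are illustrative assumptions, not something this diff prescribes; any embedding model and the matching ServerArgs fields would do.

    # Hedged sketch, not part of the release: exercise the new Engine.encode method.
    from sglang.srt.server import Engine

    engine = Engine(
        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",  # example embedding model
        is_embedding=True,  # assumed ServerArgs field for embedding mode
    )
    result = engine.encode("What is the capital of France?")
    print(result)
    engine.shutdown()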
sglang/srt/server_args.py CHANGED
@@ -63,25 +63,26 @@ class ServerArgs:
      stream_interval: int = 1
      random_seed: Optional[int] = None
      constrained_json_whitespace_pattern: Optional[str] = None
-     decode_log_interval: int = 40
+     watchdog_timeout: float = 300

      # Logging
      log_level: str = "info"
      log_level_http: Optional[str] = None
      log_requests: bool = False
      show_time_cost: bool = False
+     enable_metrics: bool = False
+     decode_log_interval: int = 40

-     # Other
+     # API related
      api_key: Optional[str] = None
      file_storage_pth: str = "SGLang_storage"
      enable_cache_report: bool = False
-     watchdog_timeout: float = 600

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

-     # Distributed args
+     # Multi-node distributed serving
      dist_init_addr: Optional[str] = None
      nnodes: int = 1
      node_rank: int = 0
@@ -110,7 +111,7 @@ class ServerArgs:
      disable_flashinfer: bool = False
      disable_flashinfer_sampling: bool = False
      disable_radix_cache: bool = False
-     disable_regex_jump_forward: bool = False
+     disable_jump_forward: bool = False
      disable_cuda_graph: bool = False
      disable_cuda_graph_padding: bool = False
      disable_disk_cache: bool = False
@@ -127,6 +128,7 @@ class ServerArgs:
      enable_p2p_check: bool = False
      triton_attention_reduce_in_fp32: bool = False
      num_continuous_decode_steps: int = 1
+     delete_ckpt_after_loading: bool = False

      def __post_init__(self):
          # Set missing default values
@@ -204,6 +206,7 @@ class ServerArgs:

      @staticmethod
      def add_cli_args(parser: argparse.ArgumentParser):
+         # Model and port args
          parser.add_argument(
              "--model-path",
              type=str,
@@ -323,6 +326,8 @@ class ServerArgs:
              action="store_true",
              help="Whether to use a CausalLM as an embedding model.",
          )
+
+         # Memory and scheduling
          parser.add_argument(
              "--mem-fraction-static",
              type=float,
@@ -367,6 +372,8 @@ class ServerArgs:
              default=ServerArgs.schedule_conservativeness,
              help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
          )
+
+         # Other runtime options
          parser.add_argument(
              "--tensor-parallel-size",
              "--tp-size",
@@ -392,6 +399,14 @@ class ServerArgs:
              default=ServerArgs.constrained_json_whitespace_pattern,
              help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
          )
+         parser.add_argument(
+             "--watchdog-timeout",
+             type=float,
+             default=ServerArgs.watchdog_timeout,
+             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+         )
+
+         # Logging
          parser.add_argument(
              "--log-level",
              type=str,
@@ -414,6 +429,19 @@ class ServerArgs:
              action="store_true",
              help="Show time cost of custom marks.",
          )
+         parser.add_argument(
+             "--enable-metrics",
+             action="store_true",
+             help="Enable log prometheus metrics.",
+         )
+         parser.add_argument(
+             "--decode-log-interval",
+             type=int,
+             default=ServerArgs.decode_log_interval,
+             help="The log interval of decode batch",
+         )
+
+         # API related
          parser.add_argument(
              "--api-key",
              type=str,
@@ -431,18 +459,6 @@ class ServerArgs:
              action="store_true",
              help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
          )
-         parser.add_argument(
-             "--watchdog-timeout",
-             type=float,
-             default=ServerArgs.watchdog_timeout,
-             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
-         )
-         parser.add_argument(
-             "--decode-log-interval",
-             type=int,
-             default=ServerArgs.decode_log_interval,
-             help="The log interval of decode batch"
-         )

          # Data parallelism
          parser.add_argument(
@@ -463,7 +479,7 @@
              ],
          )

-         # Multi-node distributed serving args
+         # Multi-node distributed serving
          parser.add_argument(
              "--dist-init-addr",
              "--nccl-init-addr",  # For backward compatbility. This will be removed in the future.
@@ -558,7 +574,7 @@ class ServerArgs:
              type=str,
              choices=["xgrammar", "outlines"],
              default=ServerArgs.grammar_backend,
-             help="Choose the backend for constrained decoding.",
+             help="Choose the backend for grammar-guided decoding.",
          )

          # Optimization/debug options
@@ -578,9 +594,9 @@ class ServerArgs:
              help="Disable RadixAttention for prefix caching.",
          )
          parser.add_argument(
-             "--disable-regex-jump-forward",
+             "--disable-jump-forward",
              action="store_true",
-             help="Disable regex jump-forward.",
+             help="Disable jump-forward for grammar-guided decoding.",
          )
          parser.add_argument(
              "--disable-cuda-graph",
@@ -600,7 +616,6 @@
          parser.add_argument(
              "--disable-custom-all-reduce",
              action="store_true",
-             default=False,
              help="Disable the custom all-reduce kernel and fall back to NCCL.",
          )
          parser.add_argument(
@@ -670,6 +685,11 @@
              "This can potentially increase throughput but may also increase time-to-first-token latency. "
              "The default value is 1, meaning only run one decoding step at a time.",
          )
+         parser.add_argument(
+             "--delete-ckpt-after-loading",
+             action="store_true",
+             help="Delete the model checkpoint after loading the model.",
+         )

      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
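
As a hedged illustration of how the options touched by this release fit together, here is a programmatic launch using field names taken from the ServerArgs diff above; the model path is an example and the exact launch_server call shape is an assumption.

    # Illustrative only: keyword names come from the ServerArgs dataclass above.
    from sglang.srt.server import launch_server
    from sglang.srt.server_args import ServerArgs

    server_args = ServerArgs(
        model_path="meta-llama/Llama-3.2-1B-Instruct",  # example model
        enable_metrics=True,              # new: expose Prometheus metrics at /metrics
        decode_log_interval=40,           # now grouped under Logging
        watchdog_timeout=300,             # default lowered from 600s to 300s
        disable_jump_forward=False,       # renamed from disable_regex_jump_forward
        delete_ckpt_after_loading=False,  # new: optionally remove the checkpoint after load
    )
    launch_server(server_args)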
sglang/srt/utils.py CHANGED
@@ -22,8 +22,12 @@ import logging
  import os
  import pickle
  import random
+ import re
  import resource
+ import shutil
+ import signal
  import socket
+ import tempfile
  import time
  import warnings
  from importlib.metadata import PackageNotFoundError, version
@@ -35,9 +39,11 @@ import psutil
  import requests
  import torch
  import torch.distributed as dist
+ import triton
  import zmq
  from fastapi.responses import ORJSONResponse
  from packaging import version as pkg_version
+ from starlette.routing import Mount
  from torch import nn
  from torch.profiler import ProfilerActivity, profile, record_function
  from triton.runtime.cache import (
@@ -379,6 +385,10 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
      if include_self:
          try:
              itself.kill()
+
+             # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes),
+             # so we send an additional signal to kill them.
+             itself.send_signal(signal.SIGINT)
          except psutil.NoSuchProcess:
              pass

@@ -704,3 +714,80 @@ def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint:
          raise ValueError(f"Unsupported socket type: {socket_type}")

      return socket
+
+
+ def dump_to_file(dirpath, name, value):
+     from vllm.distributed import get_tensor_model_parallel_rank
+
+     if get_tensor_model_parallel_rank() != 0:
+         return
+
+     os.makedirs(dirpath, exist_ok=True)
+     if value.dtype is torch.bfloat16:
+         value = value.float()
+     value = value.cpu().numpy()
+     output_filename = os.path.join(dirpath, f"pytorch_dump_{name}.npy")
+     logger.info(f"Dump a tensor to {output_filename}. Shape = {value.shape}")
+     np.save(output_filename, value)
+
+
+ def is_triton_3():
+     return triton.__version__.startswith("3.")
+
+
+ def maybe_torch_compile(*args, **kwargs):
+     """
+     torch.compile does not work for triton 2.2.0, which is needed in xlm1's jax.
+     Therefore, we disable it here.
+     """
+
+     def decorator(func):
+         if is_triton_3():
+             return torch.compile(*args, **kwargs)(func)
+         return func
+
+     return decorator
+
+
+ def delete_directory(dirpath):
+     try:
+         # This will remove the directory and all its contents
+         shutil.rmtree(dirpath)
+     except OSError as e:
+         print(f"Warning: {dirpath} : {e.strerror}")
+
+
+ # Temporary directory for prometheus multiprocess mode
+ # Cleaned up automatically when this object is garbage collected
+ prometheus_multiproc_dir: tempfile.TemporaryDirectory
+
+
+ def set_prometheus_multiproc_dir():
+     # Set prometheus multiprocess directory
+     # sglang uses prometheus multiprocess mode
+     # we need to set this before importing prometheus_client
+     # https://prometheus.github.io/client_python/multiprocess/
+     global prometheus_multiproc_dir
+
+     if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
+         logger.debug("User set PROMETHEUS_MULTIPROC_DIR detected.")
+         prometheus_multiproc_dir = tempfile.TemporaryDirectory(
+             dir=os.environ["PROMETHEUS_MULTIPROC_DIR"]
+         )
+     else:
+         prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+         os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
+     logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")
+
+
+ def add_prometheus_middleware(app):
+     # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+     from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
+
+     registry = CollectorRegistry()
+     multiprocess.MultiProcessCollector(registry)
+     metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
+
+     # Workaround for 307 Redirect for /metrics
+     metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
+     app.routes.append(metrics_route)
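
With the helpers above (set_prometheus_multiproc_dir plus add_prometheus_middleware), a server started with --enable-metrics exposes a Prometheus scrape endpoint under /metrics. A hedged check, assuming the default local address:

    # Illustrative only: the host/port are assumptions; the /metrics mount
    # comes from add_prometheus_middleware defined above.
    import requests

    text = requests.get("http://127.0.0.1:30000/metrics", timeout=5).text
    print(text[:500])  # Prometheus text exposition format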
sglang/test/simple_eval_common.py CHANGED
@@ -320,7 +320,7 @@ jinja_env = jinja2.Environment(
  _message_template = """
  <div class="message {{ role }}">
      <div class="role">
-         {{ role }}
+         {{ role }}
          {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
      </div>
      <div class="content">
sglang/test/simple_eval_humaneval.py CHANGED
@@ -2,8 +2,8 @@

  """
  HumanEval: Evaluating Large Language Models Trained on Code
- Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
- https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
+ Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
+ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
  """

  import random
sglang/test/simple_eval_mgsm.py CHANGED
@@ -1,10 +1,10 @@
  # Adapted from https://github.com/openai/simple-evals/

  """
- MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
+ MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
  Language Models are Multilingual Chain-of-Thought Reasoners
  Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
- https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
+ https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
  """

  import re
sglang/test/test_utils.py CHANGED
@@ -27,6 +27,8 @@ from sglang.utils import get_exception_traceback

  DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
@@ -441,7 +443,7 @@ def popen_launch_server(
                  "Content-Type": "application/json; charset=utf-8",
                  "Authorization": f"Bearer {api_key}",
              }
-             response = requests.get(f"{base_url}/v1/models", headers=headers)
+             response = requests.get(f"{base_url}/health_generate", headers=headers)
              if response.status_code == 200:
                  return process
          except requests.RequestException:
@@ -636,8 +638,8 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
      return rouge_l_scores


- STDOUT_FILENAME = "stdout.txt"
  STDERR_FILENAME = "stderr.txt"
+ STDOUT_FILENAME = "stdout.txt"


  def read_output(output_lines):
@@ -742,7 +744,13 @@ def run_mmlu_test(
          finally:
              pass

-     run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
+     run_and_check_memory_leak(
+         workload_func,
+         disable_radix_cache,
+         enable_mixed_chunk,
+         enable_overlap,
+         chunked_prefill_size,
+     )


  def run_mulit_request_test(
@@ -775,4 +783,10 @@ def run_mulit_request_test(
          with ThreadPoolExecutor(2) as executor:
              list(executor.map(run_one, list(range(4))))

-     run_and_check_memory_leak(workload_func, disable_radix_cache, enable_mixed_chunk, enable_overlap, chunked_prefill_size)
+     run_and_check_memory_leak(
+         workload_func,
+         disable_radix_cache,
+         enable_mixed_chunk,
+         enable_overlap,
+         chunked_prefill_size,
+     )
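
The test helper popen_launch_server now waits on /health_generate rather than /v1/models before declaring a launched server ready. The same readiness probe can be run by hand; this is a hedged sketch with an assumed default address:

    # Illustrative only: /health_generate is the endpoint shown in the server.py diff;
    # 127.0.0.1:30000 is an assumed default address.
    import requests

    resp = requests.get("http://127.0.0.1:30000/health_generate", timeout=5)
    print(resp.status_code)  # 200 once the server can produce a token (or an embedding)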
sglang/utils.py CHANGED
@@ -349,6 +349,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:

  def terminate_process(process):
      from sglang.srt.utils import kill_child_process
+
      kill_child_process(process.pid, include_self=True)


sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.5"
+ __version__ = "0.3.5.post1"
{sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.5
+ Version: 0.3.5.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
      Version 2.0, January 2004
@@ -256,13 +256,14 @@ Requires-Dist: interegular; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
  Requires-Dist: pillow; extra == "runtime-common"
+ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
  Requires-Dist: psutil; extra == "runtime-common"
  Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: zmq; extra == "runtime-common"
+ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Provides-Extra: srt
@@ -291,13 +292,14 @@ Requires-Dist: peft; extra == "test"
  [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
  [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
  [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)

  </div>

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+ [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:

  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

- ## Install
- See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+ ## Getting Started
+ Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+ Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)

  ## Backend: SGLang Runtime (SRT)
  See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
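
After upgrading, the bumped version from sglang/version.py can be checked directly; a small, hedged sanity check:

    # Expected to print "0.3.5.post1" once the new wheel is installed.
    import sglang

    print(sglang.__version__)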