sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +30 -4
  3. sglang/backend/litellm.py +2 -2
  4. sglang/backend/openai.py +26 -15
  5. sglang/backend/runtime_endpoint.py +18 -14
  6. sglang/bench_latency.py +317 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +41 -6
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +6 -2
  11. sglang/lang/ir.py +74 -28
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +14 -6
  15. sglang/srt/constrained/fsm_cache.py +6 -3
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +2 -0
  19. sglang/srt/hf_transformers_utils.py +68 -9
  20. sglang/srt/layers/extend_attention.py +2 -1
  21. sglang/srt/layers/fused_moe.py +280 -169
  22. sglang/srt/layers/logits_processor.py +106 -42
  23. sglang/srt/layers/radix_attention.py +53 -29
  24. sglang/srt/layers/token_attention.py +4 -1
  25. sglang/srt/managers/controller/dp_worker.py +6 -3
  26. sglang/srt/managers/controller/infer_batch.py +144 -69
  27. sglang/srt/managers/controller/manager_multi.py +5 -5
  28. sglang/srt/managers/controller/manager_single.py +9 -4
  29. sglang/srt/managers/controller/model_runner.py +167 -55
  30. sglang/srt/managers/controller/radix_cache.py +4 -0
  31. sglang/srt/managers/controller/schedule_heuristic.py +2 -0
  32. sglang/srt/managers/controller/tp_worker.py +156 -134
  33. sglang/srt/managers/detokenizer_manager.py +19 -21
  34. sglang/srt/managers/io_struct.py +11 -5
  35. sglang/srt/managers/tokenizer_manager.py +16 -14
  36. sglang/srt/model_config.py +89 -4
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +2 -2
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/gemma.py +5 -1
  41. sglang/srt/models/gemma2.py +436 -0
  42. sglang/srt/models/grok.py +204 -137
  43. sglang/srt/models/llama2.py +12 -5
  44. sglang/srt/models/llama_classification.py +107 -0
  45. sglang/srt/models/llava.py +11 -8
  46. sglang/srt/models/llavavid.py +1 -1
  47. sglang/srt/models/minicpm.py +373 -0
  48. sglang/srt/models/mixtral.py +164 -115
  49. sglang/srt/models/mixtral_quant.py +0 -1
  50. sglang/srt/models/qwen.py +1 -1
  51. sglang/srt/models/qwen2.py +1 -1
  52. sglang/srt/models/qwen2_moe.py +454 -0
  53. sglang/srt/models/stablelm.py +1 -1
  54. sglang/srt/models/yivl.py +2 -2
  55. sglang/srt/openai_api_adapter.py +35 -25
  56. sglang/srt/openai_protocol.py +2 -2
  57. sglang/srt/server.py +69 -19
  58. sglang/srt/server_args.py +76 -43
  59. sglang/srt/utils.py +177 -35
  60. sglang/test/test_programs.py +28 -10
  61. sglang/utils.py +4 -3
  62. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
  63. sglang-0.1.19.dist-info/RECORD +81 -0
  64. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
  65. sglang/srt/managers/router/infer_batch.py +0 -596
  66. sglang/srt/managers/router/manager.py +0 -82
  67. sglang/srt/managers/router/model_rpc.py +0 -818
  68. sglang/srt/managers/router/model_runner.py +0 -445
  69. sglang/srt/managers/router/radix_cache.py +0 -267
  70. sglang/srt/managers/router/scheduler.py +0 -59
  71. sglang-0.1.17.dist-info/RECORD +0 -81
  72. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  73. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -1,11 +1,13 @@
 """Common utilities."""
 
 import base64
-import multiprocessing
+import fcntl
 import logging
+import multiprocessing
 import os
 import random
 import socket
+import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
@@ -17,12 +19,11 @@ import requests
 import rpyc
 import torch
 import triton
-from rpyc.utils.server import ThreadedServer
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
+from rpyc.utils.server import ThreadedServer
 from starlette.middleware.base import BaseHTTPMiddleware
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -370,23 +371,7 @@ def load_image(image_file):
     return image, image_size
 
 
-def init_rpyc_service(service: rpyc.Service, port: int):
-    t = ThreadedServer(
-        service=service,
-        port=port,
-        protocol_config={
-            "allow_public_attrs": True,
-            "allow_pickle": True,
-            "sync_request_timeout": 3600
-        },
-    )
-    t.logger.setLevel(logging.WARN)
-    t.start()
-
-
-def connect_to_rpyc_service(port, host="localhost"):
-    time.sleep(1)
-
+def connect_rpyc_service(host, port):
     repeat_count = 0
     while repeat_count < 20:
         try:
@@ -396,26 +381,37 @@ def connect_to_rpyc_service(port, host="localhost"):
                 config={
                     "allow_public_attrs": True,
                     "allow_pickle": True,
-                    "sync_request_timeout": 3600
+                    "sync_request_timeout": 3600,
                 },
             )
             break
-        except ConnectionRefusedError:
+        except ConnectionRefusedError as e:
             time.sleep(1)
             repeat_count += 1
     if repeat_count == 20:
-        raise RuntimeError("init rpc env error!")
+        raise RuntimeError(f"Connect rpyc error: {e}")
 
     return con.root
 
 
-def start_rpyc_process(service: rpyc.Service, port: int):
-    # Return the proxy and the process
-    proc = multiprocessing.Process(target=init_rpyc_service, args=(service, port))
+def start_rpyc_service(service: rpyc.Service, port: int):
+    t = ThreadedServer(
+        service=service,
+        port=port,
+        protocol_config={
+            "allow_public_attrs": True,
+            "allow_pickle": True,
+            "sync_request_timeout": 3600,
+        },
+    )
+    t.logger.setLevel(logging.WARN)
+    t.start()
+
+
+def start_rpyc_service_process(service: rpyc.Service, port: int):
+    proc = multiprocessing.Process(target=start_rpyc_service, args=(service, port))
     proc.start()
-    proxy = connect_to_rpyc_service(port)
-    assert proc.is_alive()
-    return proxy, proc
+    return proc
 
 
 def suppress_other_loggers():
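Note: the refactor above splits the old `start_rpyc_process` (spawn, connect, and return a proxy) into `start_rpyc_service_process`, which only spawns, and `connect_rpyc_service`, which retries the connection. A minimal sketch of how a caller might compose the two, assuming a trivial `rpyc.Service` subclass (the service class, port, and assertion are illustrative, not from the diff):

```python
import rpyc

from sglang.srt.utils import connect_rpyc_service, start_rpyc_service_process


class EchoService(rpyc.Service):
    # rpyc exposes `exposed_*` methods to clients as `root.*`.
    def exposed_echo(self, x):
        return x


if __name__ == "__main__":
    proc = start_rpyc_service_process(service=EchoService, port=18861)
    root = connect_rpyc_service("localhost", 18861)  # retries for up to ~20s
    assert root.echo("ping") == "ping"
    proc.terminate()
```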
@@ -423,22 +419,25 @@ def suppress_other_loggers():
 
     vllm_default_logger.setLevel(logging.WARN)
     logging.getLogger("vllm.config").setLevel(logging.ERROR)
-    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
+    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)
 
 
-def assert_pkg_version(pkg: str, min_version: str):
+def assert_pkg_version(pkg: str, min_version: str, message: str):
     try:
         installed_version = version(pkg)
         if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
             raise Exception(
-                f"{pkg} is installed with version {installed_version} which "
-                f"is less than the minimum required version {min_version}"
+                f"{pkg} is installed with version {installed_version}, which "
+                f"is less than the minimum required version {min_version}. " + message
             )
     except PackageNotFoundError:
         raise Exception(
-            f"{pkg} with minimum required version {min_version} is not installed"
+            f"{pkg} with minimum required version {min_version} is not installed. "
            + message
         )
 
 
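Note: `assert_pkg_version` now requires a third `message` argument that is appended to the error text, so call sites can include a remediation hint. A hedged sketch of a call site (the argument strings are illustrative):

```python
from sglang.srt.utils import assert_pkg_version

# Raises with the extra hint if vllm is missing or older than 0.5.1.
assert_pkg_version(
    "vllm",
    "0.5.1",
    "Please upgrade it with `pip install --upgrade vllm`.",
)
```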
 
@@ -453,16 +452,75 @@ def kill_parent_process():
     os.kill(parent_process.pid, 9)
 
 
-def monkey_patch_vllm_p2p_access_check():
+def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     """
     Monkey patch the slow p2p access check in vllm.
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
+
     import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
 
     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
+def monkey_patch_vllm_dummy_weight_loader():
+    """
+    Monkey patch the dummy weight loader in vllm to call process_weights_after_loading.
+    """
+
+    from vllm.model_executor.model_loader.loader import (
+        CacheConfig,
+        DeviceConfig,
+        DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
+        MultiModalConfig,
+        _initialize_model,
+        initialize_dummy_weights,
+        nn,
+        set_default_torch_dtype,
+    )
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    lora_config,
+                    multimodal_config,
+                    cache_config,
+                )
+
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
+                    # FIXME: Remove this after Mixtral is updated
+                    # to use quant_method.
+                    if hasattr(module, "process_weights_after_loading"):
+                        module.process_weights_after_loading()
+
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+        return model.eval()
+
+    setattr(DummyModelLoader, "load_model", load_model)
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
@@ -482,3 +540,87 @@ class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
         response = await call_next(request)
         return response
 
+
+def get_ip_address(ifname):
+    """
+    Get the IP address of a network interface.
+
+    :param ifname: Name of the network interface (e.g., 'eth0')
+    :return: IP address of the network interface
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    ip_address = fcntl.ioctl(
+        s.fileno(),
+        0x8915,  # SIOCGIFADDR
+        struct.pack("256s", bytes(ifname[:15], "utf-8")),
+    )[20:24]
+    return socket.inet_ntoa(ip_address)
+
+
+def send_addrs_to_rank_0(model_port_args, server_args):
+    assert server_args.node_rank != 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+    ip_addr = [int(x) for x in ip_addr.split(".")]
+    addrs_tensor = torch.tensor(
+        ip_addr + model_port_args.model_tp_ports, dtype=torch.int
+    )
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+    dist.send(addrs_tensor, dst=0)
+    print(
+        f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
+    )
+
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+def receive_addrs(model_port_args, server_args):
+    assert server_args.node_rank == 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+
+    for src_rank in range(1, server_args.nnodes):
+        tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
+        dist.recv(tensor, src=src_rank)
+        ip = ".".join([str(x) for x in tensor[:4].tolist()])
+        ports = tensor[4:].tolist()
+        model_port_args.model_tp_ips[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = [ip] * num_tp_ports
+        model_port_args.model_tp_ports[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = ports
+        print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
+
+    dist.barrier()
+    dist.destroy_process_group()
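Note: the new multi-node handshake above packs each worker's IPv4 address as four integers followed by its tensor-parallel ports, then ships that tensor to rank 0 over a temporary gloo process group. A self-contained sketch of just the pack/unpack step (values are illustrative; no process group involved):

```python
import torch

# Pack: four IPv4 octets followed by this node's tensor-parallel ports.
ip = "10.0.0.42"
ports = [30011, 30012]
packed = torch.tensor([int(x) for x in ip.split(".")] + ports, dtype=torch.int)

# Unpack on the receiving side: the first 4 ints are the address, the rest are ports.
octets, recv_ports = packed[:4].tolist(), packed[4:].tolist()
assert ".".join(map(str, octets)) == ip and recv_ports == ports
```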
sglang/test/test_programs.py CHANGED
@@ -1,6 +1,4 @@
-"""
-This file contains the SGL programs used for unit testing.
-"""
+"""This file contains the SGL programs used for unit testing."""
 
 import json
 import re
@@ -358,16 +356,25 @@ def test_completion_speculative():
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s += "Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n")
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
-
     @sgl.function
     def gen_character_no_spec(s):
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s += "Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n")
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
     token_usage = sgl.global_config.default_backend.token_usage
@@ -380,7 +387,9 @@ def test_completion_speculative():
     gen_character_no_spec().sync()
     usage_with_no_spec = token_usage.prompt_tokens
 
-    assert usage_with_spec < usage_with_no_spec, f"{usage_with_spec} vs {usage_with_no_spec}"
+    assert (
+        usage_with_spec < usage_with_no_spec
+    ), f"{usage_with_spec} vs {usage_with_no_spec}"
 
 
 def test_chat_completion_speculative():
@@ -388,8 +397,17 @@ def test_chat_completion_speculative():
     def gen_character_spec(s):
         s += sgl.system("You are a helpful assistant.")
         s += sgl.user("Construct a character within the following format:")
-        s += sgl.assistant("Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n")
+        s += sgl.assistant(
+            "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        )
         s += sgl.user("Please generate new Name, Birthday and Job.\n")
-        s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
+        s += sgl.assistant(
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+            + "\nJob:"
+            + sgl.gen("job", stop="\n")
+        )
 
-    gen_character_spec().sync()
+    gen_character_spec().sync()
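Note: these tests check that speculative execution lowers prompt-token usage (`usage_with_spec < usage_with_no_spec`): the backend generates past the first stop string in one request and later `sgl.gen` calls are matched against that buffered text instead of re-sending the prompt. A hedged sketch of wiring such a program to a backend, assuming speculation is enabled via `num_api_spec_tokens` (that keyword value and the model name are illustrative):

```python
import sglang as sgl


@sgl.function(num_api_spec_tokens=64)
def gen_character(s):
    # With speculation enabled, these three gens can share one API request.
    s += "Name:" + sgl.gen("name", stop="\n")
    s += "\nBirthday:" + sgl.gen("birthday", stop="\n")
    s += "\nJob:" + sgl.gen("job", stop="\n")


sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
state = gen_character.run()
print(state["name"], state["birthday"], state["job"])
```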
sglang/utils.py CHANGED
@@ -15,7 +15,6 @@ from json import dumps
 import numpy as np
 import requests
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -255,8 +254,10 @@ def run_with_timeout(func, args=(), kwargs=None, timeout=None):
 
 def graceful_registry(sub_module_name):
     def graceful_shutdown(signum, frame):
-        logger.info(f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown...")
+        logger.info(
+            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
+        )
         if signum == signal.SIGTERM:
             logger.info(f"{sub_module_name} recive sigterm")
 
-    signal.signal(signal.SIGTERM, graceful_shutdown)
+    signal.signal(signal.SIGTERM, graceful_shutdown)
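Note: `graceful_registry` installs a SIGTERM handler so a manager process can terminate its workers cleanly. A minimal usage sketch (the worker name and loop are illustrative):

```python
from sglang.utils import graceful_registry


def worker_main():
    # Install the SIGTERM handler before entering the work loop.
    graceful_registry("detokenizer")
    while True:
        ...  # handle requests
```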
{sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -213,6 +213,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: all
 Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
@@ -220,30 +221,28 @@ Requires-Dist: sglang[anthropic] ; extra == 'all'
 Requires-Dist: sglang[litellm] ; extra == 'all'
 Provides-Extra: anthropic
 Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
-Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: litellm
 Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
-Requires-Dist: numpy ; extra == 'openai'
 Requires-Dist: tiktoken ; extra == 'openai'
 Provides-Extra: srt
 Requires-Dist: aiohttp ; extra == 'srt'
 Requires-Dist: fastapi ; extra == 'srt'
+Requires-Dist: hf-transfer ; extra == 'srt'
+Requires-Dist: huggingface-hub ; extra == 'srt'
+Requires-Dist: interegular ; extra == 'srt'
+Requires-Dist: packaging ; extra == 'srt'
+Requires-Dist: pillow ; extra == 'srt'
 Requires-Dist: psutil ; extra == 'srt'
+Requires-Dist: pydantic ; extra == 'srt'
 Requires-Dist: rpyc ; extra == 'srt'
 Requires-Dist: torch ; extra == 'srt'
-Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
+Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.4.3 ; extra == 'srt'
-Requires-Dist: interegular ; extra == 'srt'
-Requires-Dist: pydantic ; extra == 'srt'
-Requires-Dist: pillow ; extra == 'srt'
-Requires-Dist: packaging ; extra == 'srt'
-Requires-Dist: huggingface-hub ; extra == 'srt'
-Requires-Dist: hf-transfer ; extra == 'srt'
-Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone git@github.com:sgl-project/sglang.git
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-### Notes
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
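Note: for reference, a short sketch of the two arguments described in the tips above, run against a local SRT server (the endpoint URL, regex, and prompt are illustrative):

```python
import sglang as sgl


@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    # `choices` ranks the candidates by normalized log probability.
    s += "A (yes/no): " + sgl.gen("answer", choices=["yes", "no"])
    # `regex` constrains decoding via logit bias masking.
    s += "\nLoopback IP: " + sgl.gen("ip", regex=r"\d{1,3}(\.\d{1,3}){3}")


sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = qa.run(question="Is 127.0.0.1 a loopback address?")
print(state["answer"], state["ip"])
```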
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -610,16 +622,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +655,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
 
-Learn more [here](docs/benchmark_results.md).
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{zheng2023efficiently,
-      title={Efficiently Programming Large Language Models using SGLang},
-      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-      year={2023},
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
sglang-0.1.19.dist-info/RECORD ADDED
@@ -0,0 +1,81 @@
+sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
+sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
+sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
+sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
+sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
+sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
+sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
+sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
+sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
+sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
+sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
+sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
+sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
+sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
+sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
+sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
+sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
+sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
+sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
+sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
+sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
+sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
+sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.19.dist-info/RECORD,,
{sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (70.2.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 