sglang 0.1.17__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +4 -4
- sglang/backend/litellm.py +2 -2
- sglang/backend/openai.py +26 -15
- sglang/bench_latency.py +299 -0
- sglang/global_config.py +4 -1
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +1 -1
- sglang/lang/ir.py +15 -5
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +13 -6
- sglang/srt/constrained/fsm_cache.py +6 -3
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +2 -0
- sglang/srt/hf_transformers_utils.py +64 -9
- sglang/srt/layers/fused_moe.py +186 -89
- sglang/srt/layers/logits_processor.py +53 -25
- sglang/srt/layers/radix_attention.py +34 -7
- sglang/srt/managers/controller/dp_worker.py +6 -3
- sglang/srt/managers/controller/infer_batch.py +142 -67
- sglang/srt/managers/controller/manager_multi.py +5 -5
- sglang/srt/managers/controller/manager_single.py +8 -3
- sglang/srt/managers/controller/model_runner.py +154 -54
- sglang/srt/managers/controller/radix_cache.py +4 -0
- sglang/srt/managers/controller/schedule_heuristic.py +2 -0
- sglang/srt/managers/controller/tp_worker.py +140 -135
- sglang/srt/managers/detokenizer_manager.py +15 -19
- sglang/srt/managers/io_struct.py +10 -4
- sglang/srt/managers/tokenizer_manager.py +14 -13
- sglang/srt/model_config.py +83 -4
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/grok.py +204 -137
- sglang/srt/models/llama2.py +11 -4
- sglang/srt/models/llama_classification.py +104 -0
- sglang/srt/models/llava.py +11 -8
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/mixtral.py +164 -115
- sglang/srt/models/mixtral_quant.py +0 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/models/yivl.py +2 -2
- sglang/srt/openai_api_adapter.py +33 -23
- sglang/srt/openai_protocol.py +1 -1
- sglang/srt/server.py +60 -19
- sglang/srt/server_args.py +79 -44
- sglang/srt/utils.py +146 -37
- sglang/test/test_programs.py +28 -10
- sglang/utils.py +4 -3
- {sglang-0.1.17.dist-info → sglang-0.1.18.dist-info}/METADATA +29 -22
- sglang-0.1.18.dist-info/RECORD +78 -0
- {sglang-0.1.17.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
- sglang/srt/managers/router/infer_batch.py +0 -596
- sglang/srt/managers/router/manager.py +0 -82
- sglang/srt/managers/router/model_rpc.py +0 -818
- sglang/srt/managers/router/model_runner.py +0 -445
- sglang/srt/managers/router/radix_cache.py +0 -267
- sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17.dist-info/RECORD +0 -81
- {sglang-0.1.17.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
- {sglang-0.1.17.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
"""Common utilities."""
|
2
2
|
|
3
3
|
import base64
|
4
|
-
import
|
4
|
+
import fcntl
|
5
5
|
import logging
|
6
|
+
import multiprocessing
|
6
7
|
import os
|
7
8
|
import random
|
8
9
|
import socket
|
10
|
+
import struct
|
9
11
|
import time
|
10
12
|
from importlib.metadata import PackageNotFoundError, version
|
11
13
|
from io import BytesIO
|
@@ -17,12 +19,11 @@ import requests
|
|
17
19
|
import rpyc
|
18
20
|
import torch
|
19
21
|
import triton
|
20
|
-
from rpyc.utils.server import ThreadedServer
|
21
22
|
from fastapi.responses import JSONResponse
|
22
23
|
from packaging import version as pkg_version
|
24
|
+
from rpyc.utils.server import ThreadedServer
|
23
25
|
from starlette.middleware.base import BaseHTTPMiddleware
|
24
26
|
|
25
|
-
|
26
27
|
logger = logging.getLogger(__name__)
|
27
28
|
|
28
29
|
|
@@ -370,23 +371,7 @@ def load_image(image_file):
|
|
370
371
|
return image, image_size
|
371
372
|
|
372
373
|
|
373
|
-
def
|
374
|
-
t = ThreadedServer(
|
375
|
-
service=service,
|
376
|
-
port=port,
|
377
|
-
protocol_config={
|
378
|
-
"allow_public_attrs": True,
|
379
|
-
"allow_pickle": True,
|
380
|
-
"sync_request_timeout": 3600
|
381
|
-
},
|
382
|
-
)
|
383
|
-
t.logger.setLevel(logging.WARN)
|
384
|
-
t.start()
|
385
|
-
|
386
|
-
|
387
|
-
def connect_to_rpyc_service(port, host="localhost"):
|
388
|
-
time.sleep(1)
|
389
|
-
|
374
|
+
def connect_rpyc_service(host, port):
|
390
375
|
repeat_count = 0
|
391
376
|
while repeat_count < 20:
|
392
377
|
try:
|
@@ -396,26 +381,37 @@ def connect_to_rpyc_service(port, host="localhost"):
|
|
396
381
|
config={
|
397
382
|
"allow_public_attrs": True,
|
398
383
|
"allow_pickle": True,
|
399
|
-
"sync_request_timeout": 3600
|
384
|
+
"sync_request_timeout": 3600,
|
400
385
|
},
|
401
386
|
)
|
402
387
|
break
|
403
|
-
except ConnectionRefusedError:
|
388
|
+
except ConnectionRefusedError as e:
|
404
389
|
time.sleep(1)
|
405
390
|
repeat_count += 1
|
406
391
|
if repeat_count == 20:
|
407
|
-
raise RuntimeError("
|
392
|
+
raise RuntimeError(f"Connect rpyc error: {e}")
|
408
393
|
|
409
394
|
return con.root
|
410
395
|
|
411
396
|
|
412
|
-
def
|
413
|
-
|
414
|
-
|
397
|
+
def start_rpyc_service(service: rpyc.Service, port: int):
|
398
|
+
t = ThreadedServer(
|
399
|
+
service=service,
|
400
|
+
port=port,
|
401
|
+
protocol_config={
|
402
|
+
"allow_public_attrs": True,
|
403
|
+
"allow_pickle": True,
|
404
|
+
"sync_request_timeout": 3600,
|
405
|
+
},
|
406
|
+
)
|
407
|
+
t.logger.setLevel(logging.WARN)
|
408
|
+
t.start()
|
409
|
+
|
410
|
+
|
411
|
+
def start_rpyc_service_process(service: rpyc.Service, port: int):
|
412
|
+
proc = multiprocessing.Process(target=start_rpyc_service, args=(service, port))
|
415
413
|
proc.start()
|
416
|
-
|
417
|
-
assert proc.is_alive()
|
418
|
-
return proxy, proc
|
414
|
+
return proc
|
419
415
|
|
420
416
|
|
421
417
|
def suppress_other_loggers():
|
@@ -423,22 +419,26 @@ def suppress_other_loggers():
|
|
423
419
|
|
424
420
|
vllm_default_logger.setLevel(logging.WARN)
|
425
421
|
logging.getLogger("vllm.config").setLevel(logging.ERROR)
|
426
|
-
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
422
|
+
logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
|
423
|
+
logging.WARN
|
424
|
+
)
|
427
425
|
logging.getLogger("vllm.selector").setLevel(logging.WARN)
|
428
426
|
logging.getLogger("vllm.utils").setLevel(logging.WARN)
|
429
427
|
|
430
428
|
|
431
|
-
def assert_pkg_version(pkg: str, min_version: str):
|
429
|
+
def assert_pkg_version(pkg: str, min_version: str, message: str):
|
432
430
|
try:
|
433
431
|
installed_version = version(pkg)
|
434
432
|
if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
|
435
433
|
raise Exception(
|
436
|
-
f"{pkg} is installed with version {installed_version} which "
|
437
|
-
f"is less than the minimum required version {min_version}"
|
434
|
+
f"{pkg} is installed with version {installed_version}, which "
|
435
|
+
f"is less than the minimum required version {min_version}. " +
|
436
|
+
message
|
438
437
|
)
|
439
438
|
except PackageNotFoundError:
|
440
439
|
raise Exception(
|
441
|
-
f"{pkg} with minimum required version {min_version} is not installed"
|
440
|
+
f"{pkg} with minimum required version {min_version} is not installed. " +
|
441
|
+
message
|
442
442
|
)
|
443
443
|
|
444
444
|
|
@@ -453,14 +453,61 @@ def kill_parent_process():
|
|
453
453
|
os.kill(parent_process.pid, 9)
|
454
454
|
|
455
455
|
|
456
|
-
def monkey_patch_vllm_p2p_access_check():
|
456
|
+
def monkey_patch_vllm_p2p_access_check(gpu_id: int):
|
457
457
|
"""
|
458
458
|
Monkey patch the slow p2p access check in vllm.
|
459
459
|
NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
|
460
460
|
"""
|
461
|
-
import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
|
462
461
|
|
463
|
-
|
462
|
+
# TODO: need a better check than just dev str name match
|
463
|
+
# compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
|
464
|
+
device_name = torch.cuda.get_device_name(gpu_id)
|
465
|
+
if "RTX 40" not in device_name:
|
466
|
+
import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
|
467
|
+
|
468
|
+
setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
|
469
|
+
|
470
|
+
|
471
|
+
def monkey_patch_vllm_dummy_weight_loader():
|
472
|
+
"""
|
473
|
+
Monkey patch the dummy weight loader in vllm to call process_weights_after_loading.
|
474
|
+
"""
|
475
|
+
|
476
|
+
from vllm.model_executor.model_loader.loader import (
|
477
|
+
ModelConfig, DeviceConfig, LoRAConfig, VisionLanguageConfig,
|
478
|
+
ParallelConfig, SchedulerConfig, CacheConfig, nn,
|
479
|
+
set_default_torch_dtype, _initialize_model, initialize_dummy_weights,
|
480
|
+
DummyModelLoader
|
481
|
+
)
|
482
|
+
|
483
|
+
def load_model(self, *, model_config: ModelConfig,
|
484
|
+
device_config: DeviceConfig,
|
485
|
+
lora_config: Optional[LoRAConfig],
|
486
|
+
vision_language_config: Optional[VisionLanguageConfig],
|
487
|
+
parallel_config: ParallelConfig,
|
488
|
+
scheduler_config: SchedulerConfig,
|
489
|
+
cache_config: CacheConfig) -> nn.Module:
|
490
|
+
with set_default_torch_dtype(model_config.dtype):
|
491
|
+
with torch.device(device_config.device):
|
492
|
+
model = _initialize_model(model_config, self.load_config,
|
493
|
+
lora_config, vision_language_config,
|
494
|
+
cache_config)
|
495
|
+
|
496
|
+
for _, module in model.named_modules():
|
497
|
+
quant_method = getattr(module, "quant_method", None)
|
498
|
+
if quant_method is not None:
|
499
|
+
quant_method.process_weights_after_loading(module)
|
500
|
+
# FIXME: Remove this after Mixtral is updated
|
501
|
+
# to use quant_method.
|
502
|
+
if hasattr(module, "process_weights_after_loading"):
|
503
|
+
module.process_weights_after_loading()
|
504
|
+
|
505
|
+
# NOTE(woosuk): For accurate performance evaluation, we assign
|
506
|
+
# random values to the weights.
|
507
|
+
initialize_dummy_weights(model)
|
508
|
+
return model.eval()
|
509
|
+
|
510
|
+
setattr(DummyModelLoader, "load_model", load_model)
|
464
511
|
|
465
512
|
|
466
513
|
API_KEY_HEADER_NAME = "X-API-Key"
|
@@ -482,3 +529,65 @@ class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
|
|
482
529
|
response = await call_next(request)
|
483
530
|
return response
|
484
531
|
|
532
|
+
|
533
|
+
def get_ip_address(ifname):
|
534
|
+
"""
|
535
|
+
Get the IP address of a network interface.
|
536
|
+
|
537
|
+
:param ifname: Name of the network interface (e.g., 'eth0')
|
538
|
+
:return: IP address of the network interface
|
539
|
+
"""
|
540
|
+
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
541
|
+
ip_address = fcntl.ioctl(
|
542
|
+
s.fileno(),
|
543
|
+
0x8915, # SIOCGIFADDR
|
544
|
+
struct.pack('256s', bytes(ifname[:15], 'utf-8'))
|
545
|
+
)[20:24]
|
546
|
+
return socket.inet_ntoa(ip_address)
|
547
|
+
|
548
|
+
|
549
|
+
def send_addrs_to_rank_0(model_port_args, server_args):
|
550
|
+
assert server_args.node_rank != 0 and server_args.dp_size == 1
|
551
|
+
import torch.distributed as dist
|
552
|
+
|
553
|
+
ifname = os.environ.get("SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0"))
|
554
|
+
ip_addr = get_ip_address(ifname)
|
555
|
+
|
556
|
+
num_tp_ports = server_args.tp_size // server_args.nnodes
|
557
|
+
model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
|
558
|
+
ip_addr = [int(x) for x in ip_addr.split(".")]
|
559
|
+
addrs_tensor = torch.tensor(ip_addr + model_port_args.model_tp_ports, dtype=torch.int)
|
560
|
+
|
561
|
+
init_method = f"tcp://{server_args.nccl_init_addr}"
|
562
|
+
dist.init_process_group(backend="gloo", init_method=init_method, rank=server_args.node_rank, world_size=server_args.nnodes)
|
563
|
+
dist.send(addrs_tensor, dst=0)
|
564
|
+
print(f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}")
|
565
|
+
|
566
|
+
dist.barrier()
|
567
|
+
dist.destroy_process_group()
|
568
|
+
|
569
|
+
|
570
|
+
def receive_addrs(model_port_args, server_args):
|
571
|
+
assert server_args.node_rank == 0 and server_args.dp_size == 1
|
572
|
+
import torch.distributed as dist
|
573
|
+
|
574
|
+
ifname = os.environ.get("SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0"))
|
575
|
+
ip_addr = get_ip_address(ifname)
|
576
|
+
|
577
|
+
num_tp_ports = server_args.tp_size // server_args.nnodes
|
578
|
+
model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
|
579
|
+
|
580
|
+
init_method = f"tcp://{server_args.nccl_init_addr}"
|
581
|
+
dist.init_process_group(backend="gloo", init_method=init_method, rank=server_args.node_rank, world_size=server_args.nnodes)
|
582
|
+
|
583
|
+
for src_rank in range(1, server_args.nnodes):
|
584
|
+
tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
|
585
|
+
dist.recv(tensor, src=src_rank)
|
586
|
+
ip = ".".join([str(x) for x in tensor[:4].tolist()])
|
587
|
+
ports = tensor[4:].tolist()
|
588
|
+
model_port_args.model_tp_ips[num_tp_ports * src_rank: num_tp_ports * (src_rank + 1)] = [ip] * num_tp_ports
|
589
|
+
model_port_args.model_tp_ports[num_tp_ports * src_rank: num_tp_ports * (src_rank + 1)] = ports
|
590
|
+
print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
|
591
|
+
|
592
|
+
dist.barrier()
|
593
|
+
dist.destroy_process_group()
|
sglang/test/test_programs.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
"""
|
2
|
-
This file contains the SGL programs used for unit testing.
|
3
|
-
"""
|
1
|
+
"""This file contains the SGL programs used for unit testing."""
|
4
2
|
|
5
3
|
import json
|
6
4
|
import re
|
@@ -358,16 +356,25 @@ def test_completion_speculative():
|
|
358
356
|
s += "Construct a character within the following format:\n"
|
359
357
|
s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
|
360
358
|
s += "\nPlease generate new Name, Birthday and Job.\n"
|
361
|
-
s +=
|
359
|
+
s += (
|
360
|
+
"Name:"
|
361
|
+
+ sgl.gen("name", stop="\n")
|
362
|
+
+ "\nBirthday:"
|
363
|
+
+ sgl.gen("birthday", stop="\n")
|
364
|
+
)
|
362
365
|
s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
|
363
366
|
|
364
|
-
|
365
367
|
@sgl.function
|
366
368
|
def gen_character_no_spec(s):
|
367
369
|
s += "Construct a character within the following format:\n"
|
368
370
|
s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
|
369
371
|
s += "\nPlease generate new Name, Birthday and Job.\n"
|
370
|
-
s +=
|
372
|
+
s += (
|
373
|
+
"Name:"
|
374
|
+
+ sgl.gen("name", stop="\n")
|
375
|
+
+ "\nBirthday:"
|
376
|
+
+ sgl.gen("birthday", stop="\n")
|
377
|
+
)
|
371
378
|
s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
|
372
379
|
|
373
380
|
token_usage = sgl.global_config.default_backend.token_usage
|
@@ -380,7 +387,9 @@ def test_completion_speculative():
|
|
380
387
|
gen_character_no_spec().sync()
|
381
388
|
usage_with_no_spec = token_usage.prompt_tokens
|
382
389
|
|
383
|
-
assert
|
390
|
+
assert (
|
391
|
+
usage_with_spec < usage_with_no_spec
|
392
|
+
), f"{usage_with_spec} vs {usage_with_no_spec}"
|
384
393
|
|
385
394
|
|
386
395
|
def test_chat_completion_speculative():
|
@@ -388,8 +397,17 @@ def test_chat_completion_speculative():
|
|
388
397
|
def gen_character_spec(s):
|
389
398
|
s += sgl.system("You are a helpful assistant.")
|
390
399
|
s += sgl.user("Construct a character within the following format:")
|
391
|
-
s += sgl.assistant(
|
400
|
+
s += sgl.assistant(
|
401
|
+
"Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
|
402
|
+
)
|
392
403
|
s += sgl.user("Please generate new Name, Birthday and Job.\n")
|
393
|
-
s += sgl.assistant(
|
404
|
+
s += sgl.assistant(
|
405
|
+
"Name:"
|
406
|
+
+ sgl.gen("name", stop="\n")
|
407
|
+
+ "\nBirthday:"
|
408
|
+
+ sgl.gen("birthday", stop="\n")
|
409
|
+
+ "\nJob:"
|
410
|
+
+ sgl.gen("job", stop="\n")
|
411
|
+
)
|
394
412
|
|
395
|
-
gen_character_spec().sync()
|
413
|
+
gen_character_spec().sync()
|
sglang/utils.py
CHANGED
@@ -15,7 +15,6 @@ from json import dumps
|
|
15
15
|
import numpy as np
|
16
16
|
import requests
|
17
17
|
|
18
|
-
|
19
18
|
logger = logging.getLogger(__name__)
|
20
19
|
|
21
20
|
|
@@ -255,8 +254,10 @@ def run_with_timeout(func, args=(), kwargs=None, timeout=None):
|
|
255
254
|
|
256
255
|
def graceful_registry(sub_module_name):
|
257
256
|
def graceful_shutdown(signum, frame):
|
258
|
-
logger.info(
|
257
|
+
logger.info(
|
258
|
+
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
|
259
|
+
)
|
259
260
|
if signum == signal.SIGTERM:
|
260
261
|
logger.info(f"{sub_module_name} recive sigterm")
|
261
262
|
|
262
|
-
signal.signal(signal.SIGTERM, graceful_shutdown)
|
263
|
+
signal.signal(signal.SIGTERM, graceful_shutdown)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: A structured generation language for LLMs.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -213,6 +213,7 @@ Description-Content-Type: text/markdown
|
|
213
213
|
License-File: LICENSE
|
214
214
|
Requires-Dist: requests
|
215
215
|
Requires-Dist: tqdm
|
216
|
+
Requires-Dist: numpy
|
216
217
|
Provides-Extra: all
|
217
218
|
Requires-Dist: sglang[srt] ; extra == 'all'
|
218
219
|
Requires-Dist: sglang[openai] ; extra == 'all'
|
@@ -220,30 +221,28 @@ Requires-Dist: sglang[anthropic] ; extra == 'all'
|
|
220
221
|
Requires-Dist: sglang[litellm] ; extra == 'all'
|
221
222
|
Provides-Extra: anthropic
|
222
223
|
Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
|
223
|
-
Requires-Dist: numpy ; extra == 'anthropic'
|
224
224
|
Provides-Extra: litellm
|
225
225
|
Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
|
226
226
|
Provides-Extra: openai
|
227
227
|
Requires-Dist: openai >=1.0 ; extra == 'openai'
|
228
|
-
Requires-Dist: numpy ; extra == 'openai'
|
229
228
|
Requires-Dist: tiktoken ; extra == 'openai'
|
230
229
|
Provides-Extra: srt
|
231
230
|
Requires-Dist: aiohttp ; extra == 'srt'
|
232
231
|
Requires-Dist: fastapi ; extra == 'srt'
|
232
|
+
Requires-Dist: hf-transfer ; extra == 'srt'
|
233
|
+
Requires-Dist: huggingface-hub ; extra == 'srt'
|
234
|
+
Requires-Dist: interegular ; extra == 'srt'
|
235
|
+
Requires-Dist: packaging ; extra == 'srt'
|
236
|
+
Requires-Dist: pillow ; extra == 'srt'
|
233
237
|
Requires-Dist: psutil ; extra == 'srt'
|
238
|
+
Requires-Dist: pydantic ; extra == 'srt'
|
234
239
|
Requires-Dist: rpyc ; extra == 'srt'
|
235
240
|
Requires-Dist: torch ; extra == 'srt'
|
236
|
-
Requires-Dist: uvloop ; extra == 'srt'
|
237
241
|
Requires-Dist: uvicorn ; extra == 'srt'
|
242
|
+
Requires-Dist: uvloop ; extra == 'srt'
|
238
243
|
Requires-Dist: zmq ; extra == 'srt'
|
239
|
-
Requires-Dist: vllm ==0.
|
240
|
-
Requires-Dist:
|
241
|
-
Requires-Dist: pydantic ; extra == 'srt'
|
242
|
-
Requires-Dist: pillow ; extra == 'srt'
|
243
|
-
Requires-Dist: packaging ; extra == 'srt'
|
244
|
-
Requires-Dist: huggingface-hub ; extra == 'srt'
|
245
|
-
Requires-Dist: hf-transfer ; extra == 'srt'
|
246
|
-
Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
|
244
|
+
Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
|
245
|
+
Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
|
247
246
|
|
248
247
|
<div align="center">
|
249
248
|
<img src="assets/logo.png" alt="logo" width="400"></img>
|
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
|
|
257
256
|
It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
|
258
257
|
|
259
258
|
The core features include:
|
260
|
-
- **
|
261
|
-
- **
|
259
|
+
- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
|
260
|
+
- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
|
262
261
|
|
263
262
|
## News
|
264
263
|
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
@@ -279,19 +278,27 @@ The core features include:
|
|
279
278
|
### Method 1: With pip
|
280
279
|
```
|
281
280
|
pip install "sglang[all]"
|
281
|
+
|
282
|
+
# Install FlashInfer CUDA kernels
|
283
|
+
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
282
284
|
```
|
283
285
|
|
284
286
|
### Method 2: From source
|
285
287
|
```
|
286
|
-
git clone
|
288
|
+
git clone https://github.com/sgl-project/sglang.git
|
287
289
|
cd sglang
|
288
290
|
|
289
291
|
pip install --upgrade pip
|
290
292
|
pip install -e "python[all]"
|
293
|
+
|
294
|
+
# Install FlashInfer CUDA kernels
|
295
|
+
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
291
296
|
```
|
292
297
|
|
293
298
|
### Notes
|
294
|
-
- If you
|
299
|
+
- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
|
300
|
+
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
301
|
+
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
295
302
|
|
296
303
|
## Quick Start
|
297
304
|
The example below shows how to use sglang to answer a multi-turn question.
|
@@ -610,7 +617,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
|
|
610
617
|
```
|
611
618
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
|
612
619
|
```
|
613
|
-
- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
|
614
620
|
- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
|
615
621
|
|
616
622
|
### Supported Models
|
@@ -643,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
|
|
643
649
|
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
|
644
650
|

|
645
651
|
|
646
|
-
Learn more [
|
652
|
+
- Learn more about the above [results](docs/benchmark_results.md).
|
653
|
+
- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
|
647
654
|
|
648
655
|
## Roadmap
|
649
656
|
https://github.com/sgl-project/sglang/issues/157
|
650
657
|
|
651
658
|
## Citation And Acknowledgment
|
652
659
|
```
|
653
|
-
@misc{
|
654
|
-
title={
|
655
|
-
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
|
656
|
-
year={
|
660
|
+
@misc{zheng2024sglang,
|
661
|
+
title={SGLang: Efficient Execution of Structured Language Model Programs},
|
662
|
+
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
|
663
|
+
year={2024},
|
657
664
|
eprint={2312.07104},
|
658
665
|
archivePrefix={arXiv},
|
659
666
|
primaryClass={cs.AI}
|
@@ -0,0 +1,78 @@
|
|
1
|
+
sglang/__init__.py,sha256=PhkN9MopSdHLXHG9_7l5JB-awRDI9CdR6Qht1vWA9C8,1116
|
2
|
+
sglang/api.py,sha256=92oqUgVeKq9B9If2A8LHzEhPicZK5Rq3rKUShwPAq0E,4579
|
3
|
+
sglang/bench_latency.py,sha256=MNxmVCwBM7ZWFYSFy2m-y8MmEWNWvZO2gUBbuMyWSBI,10018
|
4
|
+
sglang/global_config.py,sha256=xMX7JqPgDRwtvcbULkwHJ-bfysNefEN42V3BGss9mlo,1425
|
5
|
+
sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
|
6
|
+
sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
|
7
|
+
sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
|
8
|
+
sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
|
10
|
+
sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
|
11
|
+
sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
|
12
|
+
sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
|
13
|
+
sglang/backend/runtime_endpoint.py,sha256=8NyWgMvhzUcA5VEsPLo1AacZ_UPVSnpxpzt6vYdVQSU,8871
|
14
|
+
sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
|
15
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
|
17
|
+
sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
|
18
|
+
sglang/lang/interpreter.py,sha256=MMdvli-75ySiKiULlsnoVmb8oEu5bvSkYz8GRdtZoVk,29494
|
19
|
+
sglang/lang/ir.py,sha256=KZxXVva2r1UihYOVWRKcU_zILMx05oWV2yLy3SeZfnA,14603
|
20
|
+
sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
|
21
|
+
sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
|
22
|
+
sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
|
23
|
+
sglang/srt/hf_transformers_utils.py,sha256=P6eXfGwH-OeU6hDrlGYL5GACcTNPdYOimpKZ0ZBZUao,10683
|
24
|
+
sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
|
25
|
+
sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
|
26
|
+
sglang/srt/model_config.py,sha256=eGt8hTtipSTqp-AsB-Cl4wfZDb14CTcOtIz-iXgaVk8,4997
|
27
|
+
sglang/srt/openai_api_adapter.py,sha256=pqGP0bON-wEZOnZyo85gzrO9MSzeIkHh5xqhpN5RkyY,15120
|
28
|
+
sglang/srt/openai_protocol.py,sha256=CNJOMr3PJvoRGI2TIh9t8f_4wYTtT0EF8kzsrYsASYY,5350
|
29
|
+
sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
|
30
|
+
sglang/srt/server.py,sha256=742w8gn0GgE7w3EwgPhq7MYabaVxcdPpMAovEE6-DaU,13112
|
31
|
+
sglang/srt/server_args.py,sha256=j0-Aj8sHQ-zgumd4w0IaezRqDdjDC6MMMG5M8zzITVw,12166
|
32
|
+
sglang/srt/utils.py,sha256=V2C4fb93oKS4D3lezlRgHkD7MQDNBZlIy_4ZTNzAC9E,19423
|
33
|
+
sglang/srt/constrained/__init__.py,sha256=Q-XnKFChC9q6WDCnJKAKAuXzKHHg4QoFlYODge8ZKCs,1504
|
34
|
+
sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
|
35
|
+
sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
|
36
|
+
sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
|
37
|
+
sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
|
38
|
+
sglang/srt/layers/extend_attention.py,sha256=JUYuYSAhfbgOXrwIK5YHJCXPq54a6IZ7vQrze-3VvMQ,12955
|
39
|
+
sglang/srt/layers/fused_moe.py,sha256=M_cTHMNSoD-wdh6XjzHseuq3zsdqOmECWxNeEVJklu4,22257
|
40
|
+
sglang/srt/layers/logits_processor.py,sha256=t-bZIcGj70KKf2Jcor9K7Va1NsBlDVNrQ4Ktlq0lUlU,8506
|
41
|
+
sglang/srt/layers/radix_attention.py,sha256=XsHFf7myNKZwyt3qB5LEXAttTKMY9OP3M3t5CZnyu3g,6911
|
42
|
+
sglang/srt/layers/token_attention.py,sha256=rVbPlFpmLoU3nx3qtK2YZdynDxfvMKtQNTPeKi0KNP0,8823
|
43
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=tOjURt-XQofPblnGECoJfoRSMPoWFVAH99R05hXeYNw,3353
|
44
|
+
sglang/srt/managers/io_struct.py,sha256=O1cz6hDV6BjXbZ0-tk6VaDNjYFuMBUOGswbG3H_GliY,4532
|
45
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=TswupFKrlXAvUM5-1eT2cR6uNJoQVivp2MQkEFu4axQ,14848
|
46
|
+
sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
|
47
|
+
sglang/srt/managers/controller/infer_batch.py,sha256=-Q17Pk_Mmccobxly7UM8wCC6dYKJ4zmjplMboN1q8b0,25700
|
48
|
+
sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
|
49
|
+
sglang/srt/managers/controller/manager_single.py,sha256=OIm_BjbDaEcYmpb_E_7wv0xfOlb2le0zXjPMqf1pU9U,3468
|
50
|
+
sglang/srt/managers/controller/model_runner.py,sha256=HjOHp_Rtdm7OnMmhtnSwPWPmEYHDpnt5LjeKbiYb6mo,21718
|
51
|
+
sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
|
52
|
+
sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
|
53
|
+
sglang/srt/managers/controller/tp_worker.py,sha256=VNVQ0oqPGllC00cZCxHB-0LqudxgS74jf-it2zDHzTA,31411
|
54
|
+
sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
|
55
|
+
sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
|
56
|
+
sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
|
57
|
+
sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
|
58
|
+
sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
|
59
|
+
sglang/srt/models/llama2.py,sha256=7aPPSLABRIy7_iy4YvFHV7Beqc2I1-Vc1obSbsgzNzY,12190
|
60
|
+
sglang/srt/models/llama_classification.py,sha256=LrPRFB-Yd2haZADNY3uIusbajQwcZNQrOCTd92L2vS0,4304
|
61
|
+
sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
|
62
|
+
sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
|
63
|
+
sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
|
64
|
+
sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
|
65
|
+
sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
|
66
|
+
sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
|
67
|
+
sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
|
68
|
+
sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
|
69
|
+
sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
|
70
|
+
sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
|
71
|
+
sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
|
72
|
+
sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
|
73
|
+
sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
|
74
|
+
sglang-0.1.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
75
|
+
sglang-0.1.18.dist-info/METADATA,sha256=tDdBZo2qvH8wWC4faXxfryjh7-6frEsBnH0vJ_ia1w4,29752
|
76
|
+
sglang-0.1.18.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
77
|
+
sglang-0.1.18.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
78
|
+
sglang-0.1.18.dist-info/RECORD,,
|