sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -2
- sglang/api.py +30 -4
- sglang/backend/litellm.py +2 -2
- sglang/backend/openai.py +26 -15
- sglang/backend/runtime_endpoint.py +18 -14
- sglang/bench_latency.py +317 -0
- sglang/global_config.py +5 -1
- sglang/lang/chat_template.py +41 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +6 -2
- sglang/lang/ir.py +74 -28
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +6 -3
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +2 -0
- sglang/srt/hf_transformers_utils.py +68 -9
- sglang/srt/layers/extend_attention.py +2 -1
- sglang/srt/layers/fused_moe.py +280 -169
- sglang/srt/layers/logits_processor.py +106 -42
- sglang/srt/layers/radix_attention.py +53 -29
- sglang/srt/layers/token_attention.py +4 -1
- sglang/srt/managers/controller/dp_worker.py +6 -3
- sglang/srt/managers/controller/infer_batch.py +144 -69
- sglang/srt/managers/controller/manager_multi.py +5 -5
- sglang/srt/managers/controller/manager_single.py +9 -4
- sglang/srt/managers/controller/model_runner.py +167 -55
- sglang/srt/managers/controller/radix_cache.py +4 -0
- sglang/srt/managers/controller/schedule_heuristic.py +2 -0
- sglang/srt/managers/controller/tp_worker.py +156 -134
- sglang/srt/managers/detokenizer_manager.py +19 -21
- sglang/srt/managers/io_struct.py +11 -5
- sglang/srt/managers/tokenizer_manager.py +16 -14
- sglang/srt/model_config.py +89 -4
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +204 -137
- sglang/srt/models/llama2.py +12 -5
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +11 -8
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +373 -0
- sglang/srt/models/mixtral.py +164 -115
- sglang/srt/models/mixtral_quant.py +0 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +454 -0
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/models/yivl.py +2 -2
- sglang/srt/openai_api_adapter.py +35 -25
- sglang/srt/openai_protocol.py +2 -2
- sglang/srt/server.py +69 -19
- sglang/srt/server_args.py +76 -43
- sglang/srt/utils.py +177 -35
- sglang/test/test_programs.py +28 -10
- sglang/utils.py +4 -3
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
- sglang-0.1.19.dist-info/RECORD +81 -0
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
- sglang/srt/managers/router/infer_batch.py +0 -596
- sglang/srt/managers/router/manager.py +0 -82
- sglang/srt/managers/router/model_rpc.py +0 -818
- sglang/srt/managers/router/model_runner.py +0 -445
- sglang/srt/managers/router/radix_cache.py +0 -267
- sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17.dist-info/RECORD +0 -81
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -1,11 +1,13 @@
 """Common utilities."""
 
 import base64
-import
+import fcntl
 import logging
+import multiprocessing
 import os
 import random
 import socket
+import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
@@ -17,12 +19,11 @@ import requests
 import rpyc
 import torch
 import triton
-from rpyc.utils.server import ThreadedServer
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
+from rpyc.utils.server import ThreadedServer
 from starlette.middleware.base import BaseHTTPMiddleware
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -370,23 +371,7 @@ def load_image(image_file):
     return image, image_size
 
 
-def
-    t = ThreadedServer(
-        service=service,
-        port=port,
-        protocol_config={
-            "allow_public_attrs": True,
-            "allow_pickle": True,
-            "sync_request_timeout": 3600
-        },
-    )
-    t.logger.setLevel(logging.WARN)
-    t.start()
-
-
-def connect_to_rpyc_service(port, host="localhost"):
-    time.sleep(1)
-
+def connect_rpyc_service(host, port):
     repeat_count = 0
     while repeat_count < 20:
         try:
@@ -396,26 +381,37 @@ def connect_to_rpyc_service(port, host="localhost"):
                 config={
                     "allow_public_attrs": True,
                     "allow_pickle": True,
-                    "sync_request_timeout": 3600
+                    "sync_request_timeout": 3600,
                 },
             )
             break
-        except ConnectionRefusedError:
+        except ConnectionRefusedError as e:
             time.sleep(1)
         repeat_count += 1
     if repeat_count == 20:
-        raise RuntimeError("
+        raise RuntimeError(f"Connect rpyc error: {e}")
 
     return con.root
 
 
-def
-
-
+def start_rpyc_service(service: rpyc.Service, port: int):
+    t = ThreadedServer(
+        service=service,
+        port=port,
+        protocol_config={
+            "allow_public_attrs": True,
+            "allow_pickle": True,
+            "sync_request_timeout": 3600,
+        },
+    )
+    t.logger.setLevel(logging.WARN)
+    t.start()
+
+
+def start_rpyc_service_process(service: rpyc.Service, port: int):
+    proc = multiprocessing.Process(target=start_rpyc_service, args=(service, port))
     proc.start()
-
-    assert proc.is_alive()
-    return proxy, proc
+    return proc
 
 
 def suppress_other_loggers():
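The rpyc helpers were reorganized here: the client side is now `connect_rpyc_service(host, port)` (previously `connect_to_rpyc_service(port, host="localhost")`), and the launcher `start_rpyc_service_process` returns the bare `multiprocessing.Process` instead of a `(proxy, proc)` tuple. A minimal sketch of how the new pair fits together; the `EchoService` class and the port are hypothetical, used only to illustrate the API:

```python
import rpyc

from sglang.srt.utils import connect_rpyc_service, start_rpyc_service_process


class EchoService(rpyc.Service):
    # Hypothetical service, not part of sglang.
    def exposed_echo(self, x):
        return x


# Launch the service in a child process, then retry-connect from the parent.
proc = start_rpyc_service_process(EchoService, port=18861)
root = connect_rpyc_service("localhost", 18861)
print(root.echo("ping"))  # -> "ping"
proc.terminate()
```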
@@ -423,22 +419,25 @@
 
     vllm_default_logger.setLevel(logging.WARN)
     logging.getLogger("vllm.config").setLevel(logging.ERROR)
-    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
+    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)
 
 
-def assert_pkg_version(pkg: str, min_version: str):
+def assert_pkg_version(pkg: str, min_version: str, message: str):
     try:
         installed_version = version(pkg)
         if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
             raise Exception(
-                f"{pkg} is installed with version {installed_version} which "
-                f"is less than the minimum required version {min_version}"
+                f"{pkg} is installed with version {installed_version}, which "
+                f"is less than the minimum required version {min_version}. " + message
             )
     except PackageNotFoundError:
         raise Exception(
-            f"{pkg} with minimum required version {min_version} is not installed"
+            f"{pkg} with minimum required version {min_version} is not installed. "
+            + message
         )
 
 
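`assert_pkg_version` gained a required `message` parameter that is appended to both failure branches, letting call sites suggest a remedy. A sketch of a call under the new signature; the package name, version, and hint text below are illustrative:

```python
from sglang.srt.utils import assert_pkg_version

# Raises if flashinfer is missing or older than the minimum version,
# with the hint appended to the error message.
assert_pkg_version(
    "flashinfer",
    "0.0.8",
    "Please reinstall it following https://docs.flashinfer.ai/installation.html.",
)
```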
@@ -453,16 +452,75 @@ def kill_parent_process():
     os.kill(parent_process.pid, 9)
 
 
-def monkey_patch_vllm_p2p_access_check():
+def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     """
     Monkey patch the slow p2p access check in vllm.
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
+
     import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
 
     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
+def monkey_patch_vllm_dummy_weight_loader():
+    """
+    Monkey patch the dummy weight loader in vllm to call process_weights_after_loading.
+    """
+
+    from vllm.model_executor.model_loader.loader import (
+        CacheConfig,
+        DeviceConfig,
+        DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
+        MultiModalConfig,
+        _initialize_model,
+        initialize_dummy_weights,
+        nn,
+        set_default_torch_dtype,
+    )
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    lora_config,
+                    multimodal_config,
+                    cache_config,
+                )
+
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
+                    # FIXME: Remove this after Mixtral is updated
+                    # to use quant_method.
+                    if hasattr(module, "process_weights_after_loading"):
+                        module.process_weights_after_loading()
+
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+        return model.eval()
+
+    setattr(DummyModelLoader, "load_model", load_model)
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
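The new `monkey_patch_vllm_dummy_weight_loader` swaps in a `load_model` that still runs each module's `process_weights_after_loading` hook, so quantized layers are initialized correctly even when weights are random. A sketch of where it would be invoked; calling it before vllm constructs its model loader is an assumption based on this file alone:

```python
from sglang.srt.utils import monkey_patch_vllm_dummy_weight_loader

# Apply the patch before vllm builds its model loader, e.g. when the
# server is launched with dummy (randomly initialized) weights for
# benchmarking.
monkey_patch_vllm_dummy_weight_loader()
```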
@@ -482,3 +540,87 @@ class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
         response = await call_next(request)
         return response
 
+
+def get_ip_address(ifname):
+    """
+    Get the IP address of a network interface.
+
+    :param ifname: Name of the network interface (e.g., 'eth0')
+    :return: IP address of the network interface
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    ip_address = fcntl.ioctl(
+        s.fileno(),
+        0x8915,  # SIOCGIFADDR
+        struct.pack("256s", bytes(ifname[:15], "utf-8")),
+    )[20:24]
+    return socket.inet_ntoa(ip_address)
+
+
+def send_addrs_to_rank_0(model_port_args, server_args):
+    assert server_args.node_rank != 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+    ip_addr = [int(x) for x in ip_addr.split(".")]
+    addrs_tensor = torch.tensor(
+        ip_addr + model_port_args.model_tp_ports, dtype=torch.int
+    )
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+    dist.send(addrs_tensor, dst=0)
+    print(
+        f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
+    )
+
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+def receive_addrs(model_port_args, server_args):
+    assert server_args.node_rank == 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+
+    for src_rank in range(1, server_args.nnodes):
+        tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
+        dist.recv(tensor, src=src_rank)
+        ip = ".".join([str(x) for x in tensor[:4].tolist()])
+        ports = tensor[4:].tolist()
+        model_port_args.model_tp_ips[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = [ip] * num_tp_ports
+        model_port_args.model_tp_ports[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = ports
+        print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
+
+    dist.barrier()
+    dist.destroy_process_group()
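`send_addrs_to_rank_0` and `receive_addrs` bootstrap multi-node tensor parallelism by shipping each node's interface IP and TP ports to rank 0 over a temporary `gloo` process group. The wire format is just an int tensor: four IPv4 octets followed by the port list. A self-contained sketch of that encoding and decoding; the address and ports below are made up:

```python
import torch

# Encode: 4 IPv4 octets followed by the node's tensor-parallel ports.
ip_addr = "10.0.0.2"
tp_ports = [30011, 30012]
octets = [int(x) for x in ip_addr.split(".")]
addrs_tensor = torch.tensor(octets + tp_ports, dtype=torch.int)

# Decode (what rank 0 does after dist.recv into a 4 + num_tp_ports tensor).
ip = ".".join(str(x) for x in addrs_tensor[:4].tolist())
ports = addrs_tensor[4:].tolist()
assert ip == ip_addr and ports == tp_ports
```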
sglang/test/test_programs.py
CHANGED
@@ -1,6 +1,4 @@
-"""
-This file contains the SGL programs used for unit testing.
-"""
+"""This file contains the SGL programs used for unit testing."""
 
 import json
 import re
@@ -358,16 +356,25 @@ def test_completion_speculative():
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s +=
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
-
     @sgl.function
     def gen_character_no_spec(s):
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s +=
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
     token_usage = sgl.global_config.default_backend.token_usage
@@ -380,7 +387,9 @@ def test_completion_speculative():
     gen_character_no_spec().sync()
     usage_with_no_spec = token_usage.prompt_tokens
 
-    assert
+    assert (
+        usage_with_spec < usage_with_no_spec
+    ), f"{usage_with_spec} vs {usage_with_no_spec}"
 
 
 def test_chat_completion_speculative():
@@ -388,8 +397,17 @@ def test_chat_completion_speculative():
     def gen_character_spec(s):
         s += sgl.system("You are a helpful assistant.")
         s += sgl.user("Construct a character within the following format:")
-        s += sgl.assistant(
+        s += sgl.assistant(
+            "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        )
         s += sgl.user("Please generate new Name, Birthday and Job.\n")
-        s += sgl.assistant(
+        s += sgl.assistant(
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+            + "\nJob:"
+            + sgl.gen("job", stop="\n")
+        )
 
-    gen_character_spec().sync()
+    gen_character_spec().sync()
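Both speculative tests follow the same shape: run a program once with speculative execution and once without, then assert that the speculative variant consumed fewer prompt tokens, since consecutive `sgl.gen` calls are batched into a single request and split on the stop strings afterwards. A sketch of the pattern; the `num_api_spec_tokens` decorator argument is an assumption about how `gen_character_spec` is declared, since its decorator does not appear in the hunks above:

```python
import sglang as sgl


# Speculative variant: "name" and "birthday" are produced by one backend
# call and separated on the stop strings afterwards.
@sgl.function(num_api_spec_tokens=64)
def gen_character_spec(s):
    s += "Name:" + sgl.gen("name", stop="\n")
    s += "\nBirthday:" + sgl.gen("birthday", stop="\n")
```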
sglang/utils.py
CHANGED
@@ -15,7 +15,6 @@ from json import dumps
 import numpy as np
 import requests
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -255,8 +254,10 @@ def run_with_timeout(func, args=(), kwargs=None, timeout=None):
 
 def graceful_registry(sub_module_name):
     def graceful_shutdown(signum, frame):
-        logger.info(
+        logger.info(
+            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
+        )
         if signum == signal.SIGTERM:
             logger.info(f"{sub_module_name} recive sigterm")
 
-    signal.signal(signal.SIGTERM, graceful_shutdown)
+    signal.signal(signal.SIGTERM, graceful_shutdown)
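`graceful_registry` installs a SIGTERM handler that logs the shutdown before the process exits; this hunk only reflows the log call and the handler registration. A sketch of how a worker would register it; the module name string is illustrative:

```python
from sglang.utils import graceful_registry

# Typically called at the top of a subprocess entry point so that a
# SIGTERM from the parent is logged before shutdown.
graceful_registry("detokenizer_manager")
```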
{sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
@@ -213,6 +213,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: all
 Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
@@ -220,30 +221,28 @@ Requires-Dist: sglang[anthropic] ; extra == 'all'
 Requires-Dist: sglang[litellm] ; extra == 'all'
 Provides-Extra: anthropic
 Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
-Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: litellm
 Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
-Requires-Dist: numpy ; extra == 'openai'
 Requires-Dist: tiktoken ; extra == 'openai'
 Provides-Extra: srt
 Requires-Dist: aiohttp ; extra == 'srt'
 Requires-Dist: fastapi ; extra == 'srt'
+Requires-Dist: hf-transfer ; extra == 'srt'
+Requires-Dist: huggingface-hub ; extra == 'srt'
+Requires-Dist: interegular ; extra == 'srt'
+Requires-Dist: packaging ; extra == 'srt'
+Requires-Dist: pillow ; extra == 'srt'
 Requires-Dist: psutil ; extra == 'srt'
+Requires-Dist: pydantic ; extra == 'srt'
 Requires-Dist: rpyc ; extra == 'srt'
 Requires-Dist: torch ; extra == 'srt'
-Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
+Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.
-Requires-Dist:
-Requires-Dist: pydantic ; extra == 'srt'
-Requires-Dist: pillow ; extra == 'srt'
-Requires-Dist: packaging ; extra == 'srt'
-Requires-Dist: huggingface-hub ; extra == 'srt'
-Requires-Dist: hf-transfer ; extra == 'srt'
-Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
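The updated tips clarify that `choices` ranks options by token-length normalized log probabilities and that `regex` constraints work at any sampling temperature. A short illustration of both arguments to `sgl.gen`; the program itself is made up:

```python
import sglang as sgl


@sgl.function
def classify(s, statement):
    s += statement
    # choices: each option is scored by its normalized log probability.
    s += " Answer: " + sgl.gen("answer", choices=["yes", "no"])
    # regex: logit-bias masking keeps the decoded text inside the pattern.
    s += " Year: " + sgl.gen("year", regex=r"\d{4}")
```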
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -610,16 +622,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +655,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
sglang-0.1.19.dist-info/RECORD
ADDED
@@ -0,0 +1,81 @@
+sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
+sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
+sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
+sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
+sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
+sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
+sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
+sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
+sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
+sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
+sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
+sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
+sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
+sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
+sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
+sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
+sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
+sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
+sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
+sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
+sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
+sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
+sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.19.dist-info/RECORD,,