sglang 0.3.5__py3-none-any.whl → 0.3.5.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +113 -3
- sglang/srt/configs/model_config.py +5 -2
- sglang/srt/constrained/__init__.py +2 -66
- sglang/srt/constrained/base_grammar_backend.py +72 -0
- sglang/srt/constrained/outlines_backend.py +165 -0
- sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang/srt/constrained/xgrammar_backend.py +114 -0
- sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- sglang/srt/layers/quantization/base_config.py +4 -6
- sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- sglang/srt/managers/io_struct.py +5 -3
- sglang/srt/managers/schedule_batch.py +14 -20
- sglang/srt/managers/scheduler.py +153 -94
- sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang/srt/metrics/collector.py +211 -0
- sglang/srt/metrics/func_timer.py +108 -0
- sglang/srt/mm_utils.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/forward_batch_info.py +7 -3
- sglang/srt/model_executor/model_runner.py +2 -1
- sglang/srt/models/gemma2_reward.py +69 -0
- sglang/srt/models/gpt2.py +31 -37
- sglang/srt/models/internlm2_reward.py +62 -0
- sglang/srt/models/llama.py +11 -6
- sglang/srt/models/llama_reward.py +5 -26
- sglang/srt/models/qwen2_vl.py +5 -7
- sglang/srt/openai_api/adapter.py +6 -2
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/sampling/sampling_params.py +0 -14
- sglang/srt/server.py +58 -16
- sglang/srt/server_args.py +42 -22
- sglang/srt/utils.py +87 -0
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_mgsm.py +2 -2
- sglang/test/test_utils.py +18 -4
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA +11 -7
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/RECORD +45 -42
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/WHEEL +1 -1
- sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang/srt/constrained/bnf_cache.py +0 -61
- sglang/srt/constrained/fsm_cache.py +0 -95
- sglang/srt/constrained/grammar.py +0 -190
- sglang/srt/constrained/jump_forward.py +0 -203
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server.py
CHANGED
```diff
@@ -30,12 +30,11 @@ import time
 from http import HTTPStatus
 from typing import AsyncIterator, Dict, List, Optional, Union
 
-import orjson
-
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
 import aiohttp
+import orjson
 import requests
 import uvicorn
 import uvloop
@@ -57,6 +56,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.metrics.func_timer import enable_func_timer, time_func_latency
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
@@ -74,12 +74,15 @@ from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
+    add_prometheus_middleware,
     assert_pkg_version,
     configure_logger,
+    delete_directory,
     is_port_available,
     kill_child_process,
     maybe_set_triton_cache_manager,
     prepare_model_and_tokenizer,
+    set_prometheus_multiproc_dir,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -90,8 +93,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
 
 app = FastAPI()
-tokenizer_manager: TokenizerManager = None
-
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -100,6 +101,10 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+tokenizer_manager: TokenizerManager = None
+
+##### Native API endpoints #####
+
 
 @app.get("/health")
 async def health() -> Response:
@@ -110,9 +115,16 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
-
-
-
+
+    if tokenizer_manager.is_generation:
+        gri = GenerateReqInput(
+            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+        )
+    else:
+        gri = EmbeddingReqInput(
+            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
+        )
+
     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
             break
@@ -185,6 +197,7 @@ async def get_memory_pool_size():
 
 
 @app.post("/update_weights")
+@time_func_latency
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
     success, message = await tokenizer_manager.update_weights(obj, request)
@@ -201,7 +214,7 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
     )
 
 
-
+@time_func_latency
 async def generate_request(obj: GenerateReqInput, request: Request):
     """Handle a generate request."""
     if obj.stream:
@@ -234,10 +247,12 @@ async def generate_request(obj: GenerateReqInput, request: Request):
     )
 
 
+# fastapi implicitly converts json in the request to obj (dataclass)
 app.post("/generate")(generate_request)
 app.put("/generate")(generate_request)
 
 
+@time_func_latency
 async def encode_request(obj: EmbeddingReqInput, request: Request):
     """Handle an embedding request."""
     try:
@@ -253,7 +268,8 @@ app.post("/encode")(encode_request)
 app.put("/encode")(encode_request)
 
 
-async def judge_request(obj: EmbeddingReqInput, request: Request):
+@time_func_latency
+async def classify_request(obj: EmbeddingReqInput, request: Request):
     """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
     try:
         ret = await tokenizer_manager.generate_request(obj, request).__anext__()
@@ -264,21 +280,27 @@ async def judge_request(obj: EmbeddingReqInput, request: Request):
     )
 
 
-app.post("/
-app.put("/
+app.post("/classify")(classify_request)
+app.put("/classify")(classify_request)
+
+
+##### OpenAI-compatible API endpoints #####
 
 
 @app.post("/v1/completions")
+@time_func_latency
 async def openai_v1_completions(raw_request: Request):
     return await v1_completions(tokenizer_manager, raw_request)
 
 
 @app.post("/v1/chat/completions")
+@time_func_latency
 async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
 @app.post("/v1/embeddings", response_class=ORJSONResponse)
+@time_func_latency
 async def openai_v1_embeddings(raw_request: Request):
     response = await v1_embeddings(tokenizer_manager, raw_request)
     return response
@@ -432,13 +454,17 @@ def launch_server(
     1. The HTTP server and Tokenizer Manager both run in the main process.
     2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library.
     """
-
     launch_engine(server_args=server_args)
 
     # Add api key authorization
     if server_args.api_key:
         add_api_key_middleware(app, server_args.api_key)
 
+    # add prometheus middleware
+    if server_args.enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
     # Send a warmup request
     t = threading.Thread(
         target=_wait_and_warmup, args=(server_args, pipe_finish_writer)
@@ -475,6 +501,10 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
 
+    # Set prometheus env vars
+    if server_args.enable_metrics:
+        set_prometheus_multiproc_dir()
+
     # Set ulimit
     set_ulimit()
 
@@ -523,6 +553,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
         return
 
     model_info = res.json()
+
     # Send a warmup request
     request_name = "/generate" if model_info["is_generation"] else "/encode"
     max_new_tokens = 8 if model_info["is_generation"] else 1
@@ -560,6 +591,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
     if pipe_finish_writer is not None:
         pipe_finish_writer.send("ready")
 
+    if server_args.delete_ckpt_after_loading:
+        delete_directory(server_args.model_path)
+
 
 class Runtime:
     """
@@ -720,12 +754,12 @@ class Engine:
 
         # before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
         atexit.register(self.shutdown)
-
+
         # runtime server default log level is log
         # offline engine works in scripts, so we set it to error
 
-        if
-        kwargs[
+        if "log_level" not in kwargs:
+            kwargs["log_level"] = "error"
 
         server_args = ServerArgs(*args, **kwargs)
         launch_engine(server_args=server_args)
@@ -840,4 +874,12 @@ class Engine:
         else:
            return tokenizer_manager.tokenizer
 
-
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        obj = EmbeddingReqInput(text=prompt)
+
+        # get the current event loop
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(encode_request(obj, None))
```
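Taken together, the changes above add an offline `Engine.encode()` helper and register the reward-model handler at `/classify`. A minimal usage sketch under stated assumptions: the model name, the default local port 30000, and the JSON payload shape are illustrative guesses based on `EmbeddingReqInput(text=...)` in the hunk, not something this diff pins down.

```python
import requests

from sglang.srt.server import Engine

# Offline engine: Engine forwards its **kwargs to ServerArgs, so model_path is accepted.
# The model name is a placeholder; an embedding/reward model may need extra ServerArgs fields.
engine = Engine(model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct")
print(engine.encode("SGLang is a fast serving framework."))

# HTTP server: the reward-model handler is now exposed at /classify (POST or PUT).
# Assumes a separately launched server on the (assumed) default local port.
resp = requests.post(
    "http://127.0.0.1:30000/classify",
    json={"text": "SGLang is a fast serving framework."},
)
print(resp.json())
```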
sglang/srt/server_args.py
CHANGED
```diff
@@ -63,25 +63,26 @@ class ServerArgs:
     stream_interval: int = 1
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
-
+    watchdog_timeout: float = 300
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
     show_time_cost: bool = False
+    enable_metrics: bool = False
+    decode_log_interval: int = 40
 
-    #
+    # API related
     api_key: Optional[str] = None
     file_storage_pth: str = "SGLang_storage"
     enable_cache_report: bool = False
-    watchdog_timeout: float = 600
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
-    #
+    # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
     node_rank: int = 0
@@ -110,7 +111,7 @@ class ServerArgs:
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
-
+    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
@@ -127,6 +128,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     num_continuous_decode_steps: int = 1
+    delete_ckpt_after_loading: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -204,6 +206,7 @@ class ServerArgs:
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+        # Model and port args
         parser.add_argument(
             "--model-path",
             type=str,
@@ -323,6 +326,8 @@ class ServerArgs:
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+
+        # Memory and scheduling
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -367,6 +372,8 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
+
+        # Other runtime options
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -392,6 +399,14 @@ class ServerArgs:
             default=ServerArgs.constrained_json_whitespace_pattern,
             help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
         )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+
+        # Logging
         parser.add_argument(
             "--log-level",
             type=str,
@@ -414,6 +429,19 @@ class ServerArgs:
             action="store_true",
             help="Show time cost of custom marks.",
         )
+        parser.add_argument(
+            "--enable-metrics",
+            action="store_true",
+            help="Enable log prometheus metrics.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch",
+        )
+
+        # API related
         parser.add_argument(
             "--api-key",
             type=str,
@@ -431,18 +459,6 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
        )
-        parser.add_argument(
-            "--watchdog-timeout",
-            type=float,
-            default=ServerArgs.watchdog_timeout,
-            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
-        )
-        parser.add_argument(
-            "--decode-log-interval",
-            type=int,
-            default=ServerArgs.decode_log_interval,
-            help="The log interval of decode batch"
-        )
 
         # Data parallelism
         parser.add_argument(
@@ -463,7 +479,7 @@ class ServerArgs:
             ],
         )
 
-        # Multi-node distributed serving
+        # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
             "--nccl-init-addr",  # For backward compatbility. This will be removed in the future.
@@ -558,7 +574,7 @@ class ServerArgs:
             type=str,
             choices=["xgrammar", "outlines"],
             default=ServerArgs.grammar_backend,
-            help="Choose the backend for
+            help="Choose the backend for grammar-guided decoding.",
         )
 
         # Optimization/debug options
@@ -578,9 +594,9 @@ class ServerArgs:
             help="Disable RadixAttention for prefix caching.",
         )
         parser.add_argument(
-            "--disable-
+            "--disable-jump-forward",
             action="store_true",
-            help="Disable
+            help="Disable jump-forward for grammar-guided decoding.",
         )
         parser.add_argument(
             "--disable-cuda-graph",
@@ -600,7 +616,6 @@ class ServerArgs:
         parser.add_argument(
             "--disable-custom-all-reduce",
             action="store_true",
-            default=False,
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
         parser.add_argument(
@@ -670,6 +685,11 @@ class ServerArgs:
             "This can potentially increase throughput but may also increase time-to-first-token latency. "
             "The default value is 1, meaning only run one decoding step at a time.",
         )
+        parser.add_argument(
+            "--delete-ckpt-after-loading",
+            action="store_true",
+            help="Delete the model checkpoint after loading the model.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
```
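For reference, the flags introduced or moved above map one-to-one onto `ServerArgs` fields, so they can also be set programmatically. A sketch only; the model path is a placeholder, and passing the args object straight to `launch_server` is an assumption based on the hunks above rather than a documented API.

```python
from sglang.srt.server import launch_server
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(
    model_path="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model
    enable_metrics=True,              # new: expose Prometheus metrics at /metrics
    decode_log_interval=40,           # new: how often decode-batch stats are logged
    watchdog_timeout=300,             # default dropped from 600 s to 300 s
    delete_ckpt_after_loading=False,  # new: remove the checkpoint dir after load
)
launch_server(server_args)
```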
sglang/srt/utils.py
CHANGED
```diff
@@ -22,8 +22,12 @@ import logging
 import os
 import pickle
 import random
+import re
 import resource
+import shutil
+import signal
 import socket
+import tempfile
 import time
 import warnings
 from importlib.metadata import PackageNotFoundError, version
@@ -35,9 +39,11 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
+import triton
 import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
+from starlette.routing import Mount
 from torch import nn
 from torch.profiler import ProfilerActivity, profile, record_function
 from triton.runtime.cache import (
@@ -379,6 +385,10 @@ def kill_child_process(pid=None, include_self=False, skip_pid=None):
     if include_self:
         try:
             itself.kill()
+
+            # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes),
+            # so we send an additional signal to kill them.
+            itself.send_signal(signal.SIGINT)
         except psutil.NoSuchProcess:
             pass
 
@@ -704,3 +714,80 @@ def get_zmq_socket(context: zmq.Context, socket_type: zmq.SocketType, endpoint:
         raise ValueError(f"Unsupported socket type: {socket_type}")
 
     return socket
+
+
+def dump_to_file(dirpath, name, value):
+    from vllm.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() != 0:
+        return
+
+    os.makedirs(dirpath, exist_ok=True)
+    if value.dtype is torch.bfloat16:
+        value = value.float()
+    value = value.cpu().numpy()
+    output_filename = os.path.join(dirpath, f"pytorch_dump_{name}.npy")
+    logger.info(f"Dump a tensor to {output_filename}. Shape = {value.shape}")
+    np.save(output_filename, value)
+
+
+def is_triton_3():
+    return triton.__version__.startswith("3.")
+
+
+def maybe_torch_compile(*args, **kwargs):
+    """
+    torch.compile does not work for triton 2.2.0, which is needed in xlm1's jax.
+    Therefore, we disable it here.
+    """
+
+    def decorator(func):
+        if is_triton_3():
+            return torch.compile(*args, **kwargs)(func)
+        return func
+
+    return decorator
+
+
+def delete_directory(dirpath):
+    try:
+        # This will remove the directory and all its contents
+        shutil.rmtree(dirpath)
+    except OSError as e:
+        print(f"Warning: {dirpath} : {e.strerror}")
+
+
+# Temporary directory for prometheus multiprocess mode
+# Cleaned up automatically when this object is garbage collected
+prometheus_multiproc_dir: tempfile.TemporaryDirectory
+
+
+def set_prometheus_multiproc_dir():
+    # Set prometheus multiprocess directory
+    # sglang uses prometheus multiprocess mode
+    # we need to set this before importing prometheus_client
+    # https://prometheus.github.io/client_python/multiprocess/
+    global prometheus_multiproc_dir
+
+    if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
+        logger.debug("User set PROMETHEUS_MULTIPROC_DIR detected.")
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory(
+            dir=os.environ["PROMETHEUS_MULTIPROC_DIR"]
+        )
+    else:
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
+    logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")
+
+
+def add_prometheus_middleware(app):
+    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
+
+    registry = CollectorRegistry()
+    multiprocess.MultiProcessCollector(registry)
+    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
+
+    # Workaround for 307 Redirect for /metrics
+    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
+    app.routes.append(metrics_route)
```
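The comments in the new helpers spell out an ordering constraint: the multiprocess directory must be chosen before anything imports `prometheus_client`, and only then is the `/metrics` mount attached. A small sketch of that wiring on a bare FastAPI app, assuming a single local uvicorn process (host and port are illustrative):

```python
from fastapi import FastAPI

from sglang.srt.utils import add_prometheus_middleware, set_prometheus_multiproc_dir

# 1) Pick PROMETHEUS_MULTIPROC_DIR before prometheus_client is imported anywhere.
set_prometheus_multiproc_dir()

# 2) Mount /metrics backed by a multiprocess CollectorRegistry.
app = FastAPI()
add_prometheus_middleware(app)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="127.0.0.1", port=8000)
```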
sglang/test/simple_eval_common.py
CHANGED
```diff
@@ -320,7 +320,7 @@ jinja_env = jinja2.Environment(
 _message_template = """
 <div class="message {{ role }}">
     <div class="role">
-    {{ role }}
+    {{ role }}
     {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
     </div>
     <div class="content">
```
sglang/test/simple_eval_humaneval.py
CHANGED
```diff
@@ -2,8 +2,8 @@
 
 """
 HumanEval: Evaluating Large Language Models Trained on Code
-Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
-https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
+Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
+https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """
 
 import random
```
sglang/test/simple_eval_mgsm.py
CHANGED
```diff
@@ -1,10 +1,10 @@
 # Adapted from https://github.com/openai/simple-evals/
 
 """
-MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
+MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
 Language Models are Multilingual Chain-of-Thought Reasoners
 Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
-https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
+https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
 """
 
 import re
```
sglang/test/test_utils.py
CHANGED
```diff
@@ -27,6 +27,8 @@ from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
@@ -441,7 +443,7 @@ def popen_launch_server(
                 "Content-Type": "application/json; charset=utf-8",
                 "Authorization": f"Bearer {api_key}",
             }
-            response = requests.get(f"{base_url}/
+            response = requests.get(f"{base_url}/health_generate", headers=headers)
            if response.status_code == 200:
                 return process
         except requests.RequestException:
@@ -636,8 +638,8 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
     return rouge_l_scores
 
 
-STDOUT_FILENAME = "stdout.txt"
 STDERR_FILENAME = "stderr.txt"
+STDOUT_FILENAME = "stdout.txt"
 
 
 def read_output(output_lines):
@@ -742,7 +744,13 @@ def run_mmlu_test(
     finally:
         pass
 
-    run_and_check_memory_leak(
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        enable_overlap,
+        chunked_prefill_size,
+    )
 
 
 def run_mulit_request_test(
@@ -775,4 +783,10 @@ def run_mulit_request_test(
     with ThreadPoolExecutor(2) as executor:
         list(executor.map(run_one, list(range(4))))
 
-    run_and_check_memory_leak(
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        enable_overlap,
+        chunked_prefill_size,
+    )
```
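The readiness probe in `popen_launch_server` now polls `/health_generate`, which makes the server actually generate (or embed) one token rather than just answer HTTP. The same check can be run by hand; a sketch, assuming a server is already listening on the default local port (the URL is illustrative):

```python
import requests

base_url = "http://127.0.0.1:30000"  # assumed default port; adjust to your server

# A 200 here means the model completed a one-token request, not merely that HTTP is up.
resp = requests.get(f"{base_url}/health_generate", timeout=30)
print(resp.status_code)
```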
sglang/utils.py
CHANGED
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.3.5"
+__version__ = "0.3.5.post1"
```
{sglang-0.3.5.dist-info → sglang-0.3.5.post1.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5
+Version: 0.3.5.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -256,13 +256,14 @@ Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
@@ -291,13 +292,14 @@ Requires-Dist: peft; extra == "test"
 [](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [](https://github.com/sgl-project/sglang/issues)
 [](https://github.com/sgl-project/sglang/issues)
+[-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-##
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
```