sglang 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/launch_server_llavavid.py +26 -0
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/hf_transformers_utils.py +0 -149
- sglang/srt/layers/activation.py +93 -11
- sglang/srt/layers/layernorm.py +47 -4
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +15 -68
- sglang/srt/managers/io_struct.py +5 -4
- sglang/srt/managers/schedule_batch.py +20 -25
- sglang/srt/managers/tokenizer_manager.py +74 -61
- sglang/srt/managers/tp_worker.py +49 -43
- sglang/srt/model_executor/cuda_graph_runner.py +17 -31
- sglang/srt/model_executor/forward_batch_info.py +9 -26
- sglang/srt/model_executor/model_runner.py +20 -17
- sglang/srt/models/chatglm.py +13 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/gemma.py +3 -7
- sglang/srt/models/gemma2.py +2 -56
- sglang/srt/models/gpt_bigcode.py +2 -6
- sglang/srt/models/grok.py +10 -8
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama2.py +6 -11
- sglang/srt/models/llama_classification.py +2 -6
- sglang/srt/models/llama_embedding.py +3 -4
- sglang/srt/models/llava.py +69 -91
- sglang/srt/models/llavavid.py +40 -86
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/mixtral.py +1 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +2 -5
- sglang/srt/models/qwen2.py +5 -10
- sglang/srt/models/qwen2_moe.py +21 -24
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/yivl.py +2 -7
- sglang/srt/openai_api/adapter.py +85 -4
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -74
- sglang/srt/sampling/sampling_params.py +4 -0
- sglang/srt/server.py +11 -4
- sglang/srt/utils.py +18 -33
- sglang/test/runners.py +2 -2
- sglang/test/test_layernorm.py +53 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/METADATA +11 -5
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/RECORD +52 -51
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/WHEEL +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/adapter.py
CHANGED
@@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
         end_point = batch_storage[batch_id].endpoint
         file_request_list = []
         all_requests = []
+        request_ids = []
         for line in lines:
             request_data = json.loads(line)
             file_request_list.append(request_data)
             body = request_data["body"]
+            request_ids.append(request_data["custom_id"])
 
             # Although streaming is supported for standalone completions, it is not supported in
             # batch mode (multiple completions in single request).
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 all_requests.append(ChatCompletionRequest(**body))
             elif end_point == "/v1/completions":
                 all_requests.append(CompletionRequest(**body))
+
         if end_point == "/v1/chat/completions":
             adapted_request, request = v1_chat_generate_request(
-                all_requests, tokenizer_manager
+                all_requests, tokenizer_manager, request_ids=request_ids
             )
         elif end_point == "/v1/completions":
-            adapted_request, request = v1_generate_request(
+            adapted_request, request = v1_generate_request(
+                all_requests, request_ids=request_ids
+            )
+
         try:
             ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
             if not isinstance(ret, list):
@@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
                 }
                 all_ret.append(response_json)
                 completed_requests += 1
+
         # Write results to a new file
         output_file_id = f"backend_result_file-{uuid.uuid4()}"
         global storage_dir
@@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
     return batch_response
 
 
+async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+    # Retrieve the batch job from the in-memory storage
+    batch_response = batch_storage.get(batch_id)
+    if batch_response is None:
+        raise HTTPException(status_code=404, detail="Batch not found")
+
+    # Only do cancal when status is "validating" or "in_progress"
+    if batch_response.status in ["validating", "in_progress"]:
+        # Start cancelling the batch asynchronously
+        asyncio.create_task(
+            cancel_batch(
+                tokenizer_manager=tokenizer_manager,
+                batch_id=batch_id,
+                input_file_id=batch_response.input_file_id,
+            )
+        )
+
+        # Update batch status to "cancelling"
+        batch_response.status = "cancelling"
+
+        return batch_response
+    else:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Current status is {batch_response.status}, no need to cancel",
+        )
+
+
+async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+    try:
+        # Update the batch status to "cancelling"
+        batch_storage[batch_id].status = "cancelling"
+
+        # Retrieve the input file content
+        input_file_request = file_id_request.get(input_file_id)
+        if not input_file_request:
+            raise ValueError("Input file not found")
+
+        # Parse the JSONL file and process each request
+        input_file_path = file_id_storage.get(input_file_id)
+        with open(input_file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        file_request_list = []
+        request_ids = []
+        for line in lines:
+            request_data = json.loads(line)
+            file_request_list.append(request_data)
+            request_ids.append(request_data["custom_id"])
+
+        # Cancel requests by request_ids
+        for rid in request_ids:
+            tokenizer_manager.abort_request(rid=rid)
+
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "cancelled"
+
+    except Exception as e:
+        logger.error("error in SGLang:", e)
+        # Update batch status to "failed"
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "failed"
+        retrieve_batch.failed_at = int(time.time())
+        retrieve_batch.errors = {"message": str(e)}
+
+
 async def v1_retrieve_file(file_id: str):
     # Retrieve the batch job from the in-memory storage
     file_response = file_id_response.get(file_id)
@@ -392,7 +465,9 @@ async def v1_retrieve_file_content(file_id: str):
     return StreamingResponse(iter_file(), media_type="application/octet-stream")
 
 
-def v1_generate_request(
+def v1_generate_request(
+    all_requests: List[CompletionRequest], request_ids: List[str] = None
+):
     prompts = []
     sampling_params_list = []
     return_logprobs = []
@@ -434,6 +509,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
                 "ignore_eos": request.ignore_eos,
             }
@@ -463,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
         logprob_start_len=logprob_start_lens,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
+        rid=request_ids,
     )
 
     if len(all_requests) == 1:
@@ -745,7 +822,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
 
 
 def v1_chat_generate_request(
-    all_requests: List[ChatCompletionRequest],
+    all_requests: List[ChatCompletionRequest],
+    tokenizer_manager,
+    request_ids: List[str] = None,
 ):
     input_ids = []
     sampling_params_list = []
@@ -802,6 +881,7 @@ def v1_chat_generate_request(
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
@@ -832,6 +912,7 @@ def v1_chat_generate_request(
         top_logprobs_num=top_logprobs_nums,
         stream=all_requests[0].stream,
        return_text_in_logprobs=True,
+        rid=request_ids,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
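Note: the `v1_cancel_batch` handler added above is wired to a new `POST /v1/batches/{batch_id}/cancel` route in `sglang/srt/server.py` (see that diff below). A minimal client-side sketch, assuming a locally running server on port 30000 and a previously created batch id (both placeholders):

```python
import requests

BASE_URL = "http://localhost:30000"  # assumed local SGLang server
batch_id = "batch_abc123"            # hypothetical id returned when the batch was created

# Ask the server to cancel; it marks the batch "cancelling" and aborts the
# underlying requests (by their custom_id) in a background task.
resp = requests.post(f"{BASE_URL}/v1/batches/{batch_id}/cancel")
print(resp.status_code, resp.json())

# Poll the retrieve endpoint until the batch reaches "cancelled" (or "failed").
status = requests.get(f"{BASE_URL}/v1/batches/{batch_id}").json()
print(status.get("status"))
```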
sglang/srt/openai_api/protocol.py
CHANGED
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
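Note: `json_schema` joins `regex` as an SRT-only extra field on both request models; it flows through `v1_generate_request`/`v1_chat_generate_request` (above) into `SamplingParams` (below). A hedged sketch of passing it from a client, assuming a local server and the `openai` Python package (`extra_body` carries non-standard fields):

```python
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")  # assumed local server

schema = json.dumps(
    {
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    }
)

# json_schema is ignored by OpenAI itself, so send it via extra_body.
response = client.chat.completions.create(
    model="default",  # model name is server-dependent; "default" is an assumption
    messages=[{"role": "user", "content": "Describe a fictional person as JSON."}],
    extra_body={"json_schema": schema},
)
print(response.choices[0].message.content)
```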
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -21,63 +21,10 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-
-    # Dispatch in CUDA graph
-    need_min_p_sampling: bool = False
-
-    # Bias Tensors
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None
 
-    # Penalizer
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
-    linear_penalties: torch.Tensor = None
-    scaling_penalties: torch.Tensor = None
-
-    def has_bias(self):
-        return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
-        )
-
-    @classmethod
-    def dummy_one(cls, max_bs: int, vocab_size: int):
-        ret = cls(vocab_size=vocab_size)
-        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
-        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
-        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
-        return ret
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
-            return SamplingBatchInfo(
-                vocab_size=self.vocab_size,
-                temperatures=self.temperatures[key],
-                top_ps=self.top_ps[key],
-                top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
-            )
-        else:
-            raise NotImplementedError
-
-    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
-
-        self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
-        self.temperatures[:bs] = other.temperatures
-        self.top_ps[:bs] = other.top_ps
-        self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
-
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -98,7 +45,6 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
-        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
 
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -126,25 +72,6 @@ class SamplingBatchInfo:
 
         return ret
 
-    def prepare_penalties(self):
-        self.scaling_penalties = None
-        self.linear_penalties = None
-
-        for penalizer in self.penalizer_orchestrator.penalizers.values():
-            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
-                if penalizer.is_prepared():
-                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
-            else:
-                if penalizer.is_prepared():
-                    if self.linear_penalties is None:
-                        bs = self.penalizer_orchestrator.batch.batch_size()
-                        self.linear_penalties = torch.zeros(
-                            (bs, self.vocab_size),
-                            dtype=torch.float32,
-                            device="cuda",
-                        )
-                    self.linear_penalties = penalizer.apply(self.linear_penalties)
-
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -39,6 +39,7 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
@@ -56,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema
 
         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
@@ -106,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot be both set.")
 
     def normalize(self, tokenizer):
         # Process stop strings
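Note: the check added above makes `regex` and `json_schema` mutually exclusive. A small sketch of the intended behavior; the import path comes from the file listing, and the assumption that the check runs in the params' `verify()` step is inferred from the surrounding hunk context:

```python
from sglang.srt.sampling.sampling_params import SamplingParams

# Either structured-output constraint on its own is accepted.
SamplingParams(json_schema='{"type": "object"}')
SamplingParams(regex=r"(yes|no)")

# Supplying both should trip the new check during validation.
params = SamplingParams(regex=r"(yes|no)", json_schema='{"type": "object"}')
try:
    params.verify()  # assumed validation entry point, per the hunk context
except ValueError as e:
    print(e)  # "regex and json_schema cannot be both set."
```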
sglang/srt/server.py
CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
+    v1_cancel_batch,
     v1_chat_completions,
     v1_completions,
     v1_delete_file,
@@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request):
     return await v1_batches(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/batches/{batch_id}/cancel")
+async def cancel_batches(batch_id: str):
+    # https://platform.openai.com/docs/api-reference/batch/cancel
+    return await v1_cancel_batch(tokenizer_manager, batch_id)
+
+
 @app.get("/v1/batches/{batch_id}")
 async def retrieve_batch(batch_id: str):
     return await v1_retrieve_batch(batch_id)
@@ -328,12 +335,12 @@ def launch_server(
     pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
 
     if server_args.dp_size == 1:
-
+        start_controller_process = start_controller_process_single
     else:
-
+        start_controller_process = start_controller_process_multi
 
     proc_controller = mp.Process(
-        target=
+        target=start_controller_process,
         args=(server_args, port_args, pipe_controller_writer, model_overide_args),
     )
     proc_controller.start()
@@ -414,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.6",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
sglang/srt/utils.py
CHANGED
@@ -26,7 +26,7 @@ import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
-from typing import List, Optional
+from typing import List, Optional, Union
 
 import numpy as np
 import psutil
@@ -193,35 +193,16 @@ def allocate_init_ports(
     return ret_ports[0], ret_ports[1:num_ports_needed]
 
 
-def
-
-
-
-
-
-
-
-
-
-        logit_bias[t_id] = -1e5
-
-    return logit_bias
-
-
-def is_multimodal_model(model):
-    from sglang.srt.model_config import ModelConfig
-
-    if isinstance(model, str):
-        model = model.lower()
-        return "llava" in model or "yi-vl" in model or "llava-next" in model
-
-    if isinstance(model, ModelConfig):
-        model_path = model.path.lower()
-        return (
-            "llava" in model_path or "yi-vl" in model_path or "llava-next" in model_path
-        )
-
-    raise ValueError("unrecognized type")
+def is_multimodal_model(model_architectures):
+    if (
+        "LlavaLlamaForCausalLM" in model_architectures
+        or "LlavaQwenForCausalLM" in model_architectures
+        or "LlavaMistralForCausalLM" in model_architectures
+        or "LlavaVidForCausalLM" in model_architectures
+    ):
+        return True
+    else:
+        return False
 
 
 def is_generation_model(model_architectures, is_embedding: bool = False):
@@ -317,12 +298,14 @@ def decode_video_base64(video_base64):
     )  # Return an empty array and size tuple if no frames were found
 
 
-def load_image(image_file):
+def load_image(image_file: Union[str, bytes]):
     from PIL import Image
 
     image = image_size = None
 
-    if image_file
+    if isinstance(image_file, bytes):
+        image = Image.open(BytesIO(image_file))
+    elif image_file.startswith("http://") or image_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
         response = requests.get(image_file, timeout=timeout)
         image = Image.open(BytesIO(response.content))
@@ -334,8 +317,10 @@ def load_image(image_file):
     elif image_file.startswith("video:"):
         image_file = image_file.replace("video:", "")
         image, image_size = decode_video_base64(image_file)
-
+    elif isinstance(image_file, str):
         image = Image.open(BytesIO(base64.b64decode(image_file)))
+    else:
+        raise ValueError(f"Invalid image: {image}")
 
     return image, image_size
 
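Note: with the new signature, `load_image` accepts raw `bytes` in addition to URL and base64/video strings, and rejects anything else. A hedged usage sketch (file name and URL are placeholders):

```python
from sglang.srt.utils import load_image

# Raw bytes (e.g. an uploaded file) hit the new isinstance(..., bytes) branch.
with open("cat.jpg", "rb") as f:
    image, image_size = load_image(f.read())

# URLs still go through the requests-based branch.
image, image_size = load_image("https://example.com/cat.jpg")
```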
sglang/test/runners.py
CHANGED
@@ -30,7 +30,7 @@ DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
-    "The capital of the United
+    "The capital of the United Kingdom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
 ]
@@ -180,7 +180,7 @@ class SRTRunner:
                 tp_size=tp_size,
                 dtype=get_dtype_str(torch_dtype),
                 port=port,
-                mem_fraction_static=0.
+                mem_fraction_static=0.7,
                 trust_remote_code=False,
                 is_embedding=not self.is_generation,
             )
sglang/test/test_layernorm.py
CHANGED
@@ -3,7 +3,7 @@ import unittest
 
 import torch
 
-from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
 
 
 class TestRMSNorm(unittest.TestCase):
@@ -56,5 +56,57 @@ class TestRMSNorm(unittest.TestCase):
                 self._run_rms_norm_test(*params)
 
 
+class TestGemmaRMSNorm(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 4096]
+    HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+    ADD_RESIDUAL = [False, True]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gemma_rms_norm_test(
+        self, num_tokens, hidden_size, add_residual, dtype, seed
+    ):
+        torch.manual_seed(seed)
+
+        layer = GemmaRMSNorm(hidden_size).to(dtype=dtype)
+        layer.weight.data.normal_(mean=1.0, std=0.1)
+        scale = 1 / (2 * hidden_size)
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+        residual = torch.randn_like(x) * scale if add_residual else None
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x, residual)
+            out = layer(x, residual)
+
+        if add_residual:
+            self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3))
+            self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3))
+        else:
+            self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3))
+
+    def test_gemma_rms_norm(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.HIDDEN_SIZES,
+            self.ADD_RESIDUAL,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                hidden_size=params[1],
+                add_residual=params[2],
+                dtype=params[3],
+                seed=params[4],
+            ):
+                self._run_gemma_rms_norm_test(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14"
+__version__ = "0.2.14.post2"
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14
+Version: 0.2.14.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -339,6 +339,7 @@ docker run --gpus all \
 ### Method 4: Using docker compose
 
 <details>
+<summary>More</summary>
 
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
@@ -350,6 +351,7 @@ docker run --gpus all \
 ### Method 5: Run on Kubernetes or Clouds with SkyPilot
 
 <details>
+<summary>More</summary>
 
 To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
 
@@ -389,7 +391,7 @@ sky status --endpoint 30000 sglang
 
 
 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -494,7 +496,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
-  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
@@ -518,6 +520,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 #### Use Models From ModelScope
 <details>
+<summary>More</summary>
 
 To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
@@ -532,6 +535,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen
 
 #### Run Llama 3.1 405B
 <details>
+<summary>More</summary>
 
 ```bash
 # Run 405B (fp8) on a single node
@@ -549,7 +553,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```