sglang 0.2.14__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/layers/activation.py +83 -7
- sglang/srt/layers/layernorm.py +0 -3
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +15 -68
- sglang/srt/managers/schedule_batch.py +15 -20
- sglang/srt/managers/tp_worker.py +40 -33
- sglang/srt/model_executor/cuda_graph_runner.py +17 -31
- sglang/srt/model_executor/forward_batch_info.py +1 -8
- sglang/srt/model_executor/model_runner.py +5 -11
- sglang/srt/models/chatglm.py +12 -4
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +2 -6
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama2.py +3 -7
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/mixtral.py +1 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +2 -5
- sglang/srt/models/qwen2.py +2 -6
- sglang/srt/models/qwen2_moe.py +14 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/openai_api/adapter.py +85 -4
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -74
- sglang/srt/sampling/sampling_params.py +4 -0
- sglang/srt/server.py +8 -1
- sglang/test/runners.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +10 -4
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/RECORD +42 -42
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/mixtral.py
CHANGED
@@ -41,7 +41,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -300,7 +299,6 @@ class MixtralForCausalLM(nn.Module):
         self.model = MixtralModel(config, quant_config=quant_config, prefix="model")
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     def forward(
         self,
@@ -310,11 +308,9 @@ class MixtralForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/mixtral_quant.py
CHANGED
@@ -45,7 +45,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -334,7 +333,6 @@ class QuantMixtralForCausalLM(nn.Module):
         self.model = MixtralModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -345,11 +343,9 @@ class QuantMixtralForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
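The same refactor repeats in the qwen, qwen2, stablelm and, with small variations, qwen2_moe diffs below: each model drops its per-model `Sampler`, and `forward` now returns the `LogitsProcessor` output directly, with token sampling handled outside the model (see the `sampler.py`, `model_runner.py`, and `tp_worker.py` entries in the file list). A toy, self-contained sketch of the new control flow, using stand-in classes rather than the real sglang modules:

```python
# Toy stand-ins (not the real sglang classes) illustrating the control-flow change:
# the model returns logits; sampling is applied by the caller/runtime.
import torch


class ToyLogitsProcessor:
    def __call__(self, hidden_states: torch.Tensor, lm_head_weight: torch.Tensor) -> torch.Tensor:
        # Project hidden states to vocabulary logits.
        return hidden_states @ lm_head_weight.t()


class ToyCausalLM:
    def __init__(self, hidden_size: int = 8, vocab_size: int = 32):
        self.lm_head_weight = torch.randn(vocab_size, hidden_size)
        self.logits_processor = ToyLogitsProcessor()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 0.2.14.post1 pattern: no self.sampler; return the logits-processor output.
        return self.logits_processor(hidden_states, self.lm_head_weight)


def sample_greedy(logits: torch.Tensor) -> torch.Tensor:
    # In the real runtime this step is done by the shared sampler, outside the model.
    return logits.argmax(dim=-1)


model = ToyCausalLM()
hidden = torch.randn(4, 8)                    # batch of 4 "last hidden states"
next_tokens = sample_greedy(model.forward(hidden))
print(next_tokens.shape)                      # torch.Size([4])
```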
sglang/srt/models/qwen.py
CHANGED
@@ -39,7 +39,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -252,7 +251,6 @@ class QWenLMHeadModel(nn.Module):
         vocab_size = ((config.vocab_size + 63) // 64) * 64
         self.lm_head = ParallelLMHead(vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -262,11 +260,10 @@ class QWenLMHeadModel(nn.Module):
         input_metadata: InputMetadata,
     ):
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        next_tokens = self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
+        return next_tokens

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/qwen2.py
CHANGED
@@ -38,9 +38,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata

 Qwen2Config = None
@@ -277,7 +276,6 @@ class Qwen2ForCausalLM(nn.Module):
         self.model = Qwen2Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

     @torch.no_grad()
@@ -291,11 +289,9 @@ class Qwen2ForCausalLM(nn.Module):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
         if not get_embedding:
-            logits_output = self.logits_processor(
+            return self.logits_processor(
                 input_ids, hidden_states, self.lm_head.weight, input_metadata
             )
-            sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-            return sample_output, logits_output
         else:
             return self.pooler(hidden_states, input_metadata)

sglang/srt/models/qwen2_moe.py
CHANGED
@@ -35,8 +35,10 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -47,7 +49,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -365,7 +366,6 @@ class Qwen2MoeForCausalLM(nn.Module):
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -376,11 +376,20 @@ class Qwen2MoeForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
+
+    def compute_logits(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        logits = self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+        return logits

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/stablelm.py
CHANGED
@@ -40,7 +40,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -250,7 +249,6 @@ class StableLmForCausalLM(nn.Module):
         self.model = StableLMEpochModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()

     @torch.no_grad()
     def forward(
@@ -261,11 +259,9 @@ class StableLmForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/openai_api/adapter.py
CHANGED
@@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
     end_point = batch_storage[batch_id].endpoint
     file_request_list = []
     all_requests = []
+    request_ids = []
     for line in lines:
         request_data = json.loads(line)
         file_request_list.append(request_data)
         body = request_data["body"]
+        request_ids.append(request_data["custom_id"])

         # Although streaming is supported for standalone completions, it is not supported in
         # batch mode (multiple completions in single request).
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             all_requests.append(ChatCompletionRequest(**body))
         elif end_point == "/v1/completions":
             all_requests.append(CompletionRequest(**body))
+
     if end_point == "/v1/chat/completions":
         adapted_request, request = v1_chat_generate_request(
-            all_requests, tokenizer_manager
+            all_requests, tokenizer_manager, request_ids=request_ids
         )
     elif end_point == "/v1/completions":
-        adapted_request, request = v1_generate_request(all_requests)
+        adapted_request, request = v1_generate_request(
+            all_requests, request_ids=request_ids
+        )
+
     try:
         ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
         if not isinstance(ret, list):
@@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             }
             all_ret.append(response_json)
             completed_requests += 1
+
         # Write results to a new file
         output_file_id = f"backend_result_file-{uuid.uuid4()}"
         global storage_dir
@@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
     return batch_response


+async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+    # Retrieve the batch job from the in-memory storage
+    batch_response = batch_storage.get(batch_id)
+    if batch_response is None:
+        raise HTTPException(status_code=404, detail="Batch not found")
+
+    # Only do cancal when status is "validating" or "in_progress"
+    if batch_response.status in ["validating", "in_progress"]:
+        # Start cancelling the batch asynchronously
+        asyncio.create_task(
+            cancel_batch(
+                tokenizer_manager=tokenizer_manager,
+                batch_id=batch_id,
+                input_file_id=batch_response.input_file_id,
+            )
+        )
+
+        # Update batch status to "cancelling"
+        batch_response.status = "cancelling"
+
+        return batch_response
+    else:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Current status is {batch_response.status}, no need to cancel",
+        )
+
+
+async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+    try:
+        # Update the batch status to "cancelling"
+        batch_storage[batch_id].status = "cancelling"
+
+        # Retrieve the input file content
+        input_file_request = file_id_request.get(input_file_id)
+        if not input_file_request:
+            raise ValueError("Input file not found")
+
+        # Parse the JSONL file and process each request
+        input_file_path = file_id_storage.get(input_file_id)
+        with open(input_file_path, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        file_request_list = []
+        request_ids = []
+        for line in lines:
+            request_data = json.loads(line)
+            file_request_list.append(request_data)
+            request_ids.append(request_data["custom_id"])
+
+        # Cancel requests by request_ids
+        for rid in request_ids:
+            tokenizer_manager.abort_request(rid=rid)
+
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "cancelled"
+
+    except Exception as e:
+        logger.error("error in SGLang:", e)
+        # Update batch status to "failed"
+        retrieve_batch = batch_storage[batch_id]
+        retrieve_batch.status = "failed"
+        retrieve_batch.failed_at = int(time.time())
+        retrieve_batch.errors = {"message": str(e)}
+
+
 async def v1_retrieve_file(file_id: str):
     # Retrieve the batch job from the in-memory storage
     file_response = file_id_response.get(file_id)
@@ -392,7 +465,9 @@ async def v1_retrieve_file_content(file_id: str):
     return StreamingResponse(iter_file(), media_type="application/octet-stream")


-def v1_generate_request(all_requests: List[CompletionRequest]):
+def v1_generate_request(
+    all_requests: List[CompletionRequest], request_ids: List[str] = None
+):
     prompts = []
     sampling_params_list = []
     return_logprobs = []
@@ -434,6 +509,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
                 "ignore_eos": request.ignore_eos,
             }
@@ -463,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
         logprob_start_len=logprob_start_lens,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
+        rid=request_ids,
     )

     if len(all_requests) == 1:
@@ -745,7 +822,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):


 def v1_chat_generate_request(
-    all_requests: List[ChatCompletionRequest], tokenizer_manager
+    all_requests: List[ChatCompletionRequest],
+    tokenizer_manager,
+    request_ids: List[str] = None,
 ):
     input_ids = []
     sampling_params_list = []
@@ -802,6 +881,7 @@ def v1_chat_generate_request(
                 "frequency_penalty": request.frequency_penalty,
                 "repetition_penalty": request.repetition_penalty,
                 "regex": request.regex,
+                "json_schema": request.json_schema,
                 "n": request.n,
             }
         )
@@ -832,6 +912,7 @@ def v1_chat_generate_request(
         top_logprobs_num=top_logprobs_nums,
         stream=all_requests[0].stream,
         return_text_in_logprobs=True,
+        rid=request_ids,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
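The request-id plumbing above keys each batch entry off its `custom_id`, which is what `cancel_batch` later aborts via `tokenizer_manager.abort_request(rid=...)`. A hypothetical input line for the batch endpoint, following the OpenAI batch-input format (the model name is a placeholder):

```python
# Hypothetical .jsonl input line; process_batch() collects request_data["custom_id"]
# into request_ids and forwards them as `rid`, and cancel_batch() aborts those rids.
import json

line = {
    "custom_id": "request-1",          # becomes the rid used for cancellation
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",   # placeholder
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 16,
    },
}
print(json.dumps(line))
```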
sglang/srt/openai_api/protocol.py
CHANGED
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     ignore_eos: Optional[bool] = False
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
+    json_schema: Optional[str] = None
     min_tokens: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
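Because `json_schema` is an SRT-only extra field on both request models, a client can pass it straight through the OpenAI-compatible HTTP API. A minimal sketch, assuming a local sglang server on port 30000 and a placeholder model name:

```python
# Sketch of a /v1/chat/completions call using the new `json_schema` extra parameter.
import requests

schema = '{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}'

resp = requests.post(
    "http://127.0.0.1:30000/v1/chat/completions",
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder
        "messages": [{"role": "user", "content": "Reply as a JSON object."}],
        "json_schema": schema,   # SRT-only extra; plain OpenAI endpoints ignore it
        "max_tokens": 64,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```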
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -21,63 +21,10 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-
-    # Dispatch in CUDA graph
-    need_min_p_sampling: bool = False
-
-    # Bias Tensors
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None

-    # Penalizer
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
-    linear_penalties: torch.Tensor = None
-    scaling_penalties: torch.Tensor = None
-
-    def has_bias(self):
-        return (
-            self.logit_bias is not None
-            or self.vocab_mask is not None
-            or self.linear_penalties is not None
-            or self.scaling_penalties is not None
-        )
-
-    @classmethod
-    def dummy_one(cls, max_bs: int, vocab_size: int):
-        ret = cls(vocab_size=vocab_size)
-        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
-        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
-        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
-        ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
-        return ret
-
-    def __getitem__(self, key):
-        if isinstance(key, slice):
-            # NOTE: We do not use cuda graph when there is bias tensors
-            assert not self.has_bias()
-            return SamplingBatchInfo(
-                vocab_size=self.vocab_size,
-                temperatures=self.temperatures[key],
-                top_ps=self.top_ps[key],
-                top_ks=self.top_ks[key],
-                min_ps=self.min_ps[key],
-                need_min_p_sampling=self.need_min_p_sampling,
-            )
-        else:
-            raise NotImplementedError
-
-    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
-        # NOTE: We do not use cuda graph when there is bias tensors
-        assert not self.has_bias()
-
-        self.vocab_size = other.vocab_size
-        self.need_min_p_sampling = other.need_min_p_sampling
-
-        self.temperatures[:bs] = other.temperatures
-        self.top_ps[:bs] = other.top_ps
-        self.top_ks[:bs] = other.top_ks
-        self.min_ps[:bs] = other.min_ps
-
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -98,7 +45,6 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
-        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -126,25 +72,6 @@ class SamplingBatchInfo:

         return ret

-    def prepare_penalties(self):
-        self.scaling_penalties = None
-        self.linear_penalties = None
-
-        for penalizer in self.penalizer_orchestrator.penalizers.values():
-            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
-                if penalizer.is_prepared():
-                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
-            else:
-                if penalizer.is_prepared():
-                    if self.linear_penalties is None:
-                        bs = self.penalizer_orchestrator.batch.batch_size()
-                        self.linear_penalties = torch.zeros(
-                            (bs, self.vocab_size),
-                            dtype=torch.float32,
-                            device="cuda",
-                        )
-                    self.linear_penalties = penalizer.apply(self.linear_penalties)
-
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -39,6 +39,7 @@ class SamplingParams:
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
+        json_schema: Optional[str] = None,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p
@@ -56,6 +57,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
+        self.json_schema = json_schema

         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
@@ -106,6 +108,8 @@ class SamplingParams:
                 f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
                 f"{self.min_new_tokens}."
             )
+        if self.regex is not None and self.json_schema is not None:
+            raise ValueError("regex and json_schema cannot be both set.")

     def normalize(self, tokenizer):
         # Process stop strings
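On the engine side, the new keyword ends up on `SamplingParams`, and the added check makes `regex` and `json_schema` mutually exclusive. A minimal sketch, assuming the keyword defaults shown in the diff above:

```python
from sglang.srt.sampling.sampling_params import SamplingParams

schema = '{"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}'

# New in 0.2.14.post1: a JSON schema can be attached to the sampling params.
params = SamplingParams(temperature=0.0, json_schema=schema)

# Setting both `regex` and `json_schema` on the same request is rejected by the
# validation added above ("regex and json_schema cannot be both set.").
```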
sglang/srt/server.py
CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
     v1_batches,
+    v1_cancel_batch,
     v1_chat_completions,
     v1_completions,
     v1_delete_file,
@@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request):
     return await v1_batches(tokenizer_manager, raw_request)


+@app.post("/v1/batches/{batch_id}/cancel")
+async def cancel_batches(batch_id: str):
+    # https://platform.openai.com/docs/api-reference/batch/cancel
+    return await v1_cancel_batch(tokenizer_manager, batch_id)
+
+
 @app.get("/v1/batches/{batch_id}")
 async def retrieve_batch(batch_id: str):
     return await v1_retrieve_batch(batch_id)
@@ -414,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.6",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
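A hedged sketch of exercising the new cancel route, assuming a local sglang server on port 30000 and a batch id obtained from an earlier `POST /v1/batches` call:

```python
# Cancel a running batch via the new OpenAI-compatible route, then poll its status.
import requests

base = "http://127.0.0.1:30000"
batch_id = "batch_abc123"  # placeholder; use the id returned when the batch was created

cancel = requests.post(f"{base}/v1/batches/{batch_id}/cancel", timeout=30)
print(cancel.json().get("status"))    # expected: "cancelling"

status = requests.get(f"{base}/v1/batches/{batch_id}", timeout=30)
print(status.json().get("status"))    # eventually: "cancelled"
```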
sglang/test/runners.py
CHANGED
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.14"
+__version__ = "0.2.14.post1"
{sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14
+Version: 0.2.14.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -339,6 +339,7 @@ docker run --gpus all \
 ### Method 4: Using docker compose

 <details>
+<summary>More</summary>

 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
@@ -350,6 +351,7 @@ docker run --gpus all \
 ### Method 5: Run on Kubernetes or Clouds with SkyPilot

 <details>
+<summary>More</summary>

 To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).

@@ -389,7 +391,7 @@ sky status --endpoint 30000 sglang


 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)
@@ -518,6 +520,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec

 #### Use Models From ModelScope
 <details>
+<summary>More</summary>

 To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
@@ -532,6 +535,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen

 #### Run Llama 3.1 405B
 <details>
+<summary>More</summary>

 ```bash
 # Run 405B (fp8) on a single node
@@ -549,7 +553,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

 ### Benchmark Performance

-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```