sglang 0.2.14__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (42)
  1. sglang/srt/constrained/fsm_cache.py +11 -2
  2. sglang/srt/constrained/jump_forward.py +1 -0
  3. sglang/srt/layers/activation.py +83 -7
  4. sglang/srt/layers/layernorm.py +0 -3
  5. sglang/srt/layers/logits_processor.py +4 -4
  6. sglang/srt/layers/sampler.py +15 -68
  7. sglang/srt/managers/schedule_batch.py +15 -20
  8. sglang/srt/managers/tp_worker.py +40 -33
  9. sglang/srt/model_executor/cuda_graph_runner.py +17 -31
  10. sglang/srt/model_executor/forward_batch_info.py +1 -8
  11. sglang/srt/model_executor/model_runner.py +5 -11
  12. sglang/srt/models/chatglm.py +12 -4
  13. sglang/srt/models/commandr.py +1 -5
  14. sglang/srt/models/dbrx.py +1 -5
  15. sglang/srt/models/deepseek.py +1 -5
  16. sglang/srt/models/deepseek_v2.py +1 -5
  17. sglang/srt/models/gemma.py +1 -5
  18. sglang/srt/models/gemma2.py +1 -5
  19. sglang/srt/models/gpt_bigcode.py +2 -6
  20. sglang/srt/models/grok.py +1 -5
  21. sglang/srt/models/internlm2.py +1 -5
  22. sglang/srt/models/llama2.py +3 -7
  23. sglang/srt/models/llama_classification.py +2 -2
  24. sglang/srt/models/minicpm.py +1 -5
  25. sglang/srt/models/mixtral.py +1 -5
  26. sglang/srt/models/mixtral_quant.py +1 -5
  27. sglang/srt/models/qwen.py +2 -5
  28. sglang/srt/models/qwen2.py +2 -6
  29. sglang/srt/models/qwen2_moe.py +14 -5
  30. sglang/srt/models/stablelm.py +1 -5
  31. sglang/srt/openai_api/adapter.py +85 -4
  32. sglang/srt/openai_api/protocol.py +2 -0
  33. sglang/srt/sampling/sampling_batch_info.py +1 -74
  34. sglang/srt/sampling/sampling_params.py +4 -0
  35. sglang/srt/server.py +8 -1
  36. sglang/test/runners.py +1 -1
  37. sglang/version.py +1 -1
  38. {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +10 -4
  39. {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/RECORD +42 -42
  40. {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
  41. {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
  42. {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/mixtral.py CHANGED
@@ -41,7 +41,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -300,7 +299,6 @@ class MixtralForCausalLM(nn.Module):
  self.model = MixtralModel(config, quant_config=quant_config, prefix="model")
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()

  def forward(
  self,
@@ -310,11 +308,9 @@ class MixtralForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- logits_output = self.logits_processor(
+ return self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
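This hunk, like the matching ones in the other model files below, drops the per-model `Sampler` and has `forward` return the `LogitsProcessor` output directly; the companion changes to `sglang/srt/layers/sampler.py` and `sglang/srt/managers/tp_worker.py` in the file list suggest sampling now happens once, outside the model. A minimal sketch of a caller under that assumption; the function name and exact argument shapes are illustrative, not the actual post-release API.

```python
# Hypothetical sketch: sampling decoupled from the model's forward pass.
# run_decode_step is a placeholder name; only the call pattern (forward
# returns logits, a batch-level sampler picks tokens) is implied by this diff.
def run_decode_step(model, sampler, input_ids, positions, input_metadata):
    # Models now return the logits-processor output directly ...
    logits_output = model(input_ids, positions, input_metadata)
    # ... and a sampler owned by the runner (not the model) turns the logits
    # into next-token ids, mirroring the call that was removed here.
    next_token_ids = sampler(logits_output, input_metadata.sampling_info)
    return next_token_ids, logits_output
```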
sglang/srt/models/mixtral_quant.py CHANGED
@@ -45,7 +45,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -334,7 +333,6 @@ class QuantMixtralForCausalLM(nn.Module):
  self.model = MixtralModel(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -345,11 +343,9 @@ class QuantMixtralForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- logits_output = self.logits_processor(
+ return self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/qwen.py CHANGED
@@ -39,7 +39,6 @@ from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -252,7 +251,6 @@ class QWenLMHeadModel(nn.Module):
  vocab_size = ((config.vocab_size + 63) // 64) * 64
  self.lm_head = ParallelLMHead(vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -262,11 +260,10 @@
  input_metadata: InputMetadata,
  ):
  hidden_states = self.transformer(input_ids, positions, input_metadata)
- logits_output = self.logits_processor(
+ next_tokens = self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output
+ return next_tokens

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/qwen2.py CHANGED
@@ -38,9 +38,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
- from sglang.srt.layers.pooler import Pooler, PoolingType
+ from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

  Qwen2Config = None
@@ -277,7 +276,6 @@ class Qwen2ForCausalLM(nn.Module):
  self.model = Qwen2Model(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()
  self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

  @torch.no_grad()
@@ -291,11 +289,9 @@ class Qwen2ForCausalLM(nn.Module):
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
  if not get_embedding:
- logits_output = self.logits_processor(
+ return self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output
  else:
  return self.pooler(hidden_states, input_metadata)

sglang/srt/models/qwen2_moe.py CHANGED
@@ -35,8 +35,10 @@ from vllm.model_executor.layers.linear import (
  ReplicatedLinear,
  RowParallelLinear,
  )
+ from vllm.model_executor.layers.logits_processor import LogitsProcessor
  from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.layers.sampler import Sampler
  from vllm.model_executor.layers.vocab_parallel_embedding import (
  ParallelLMHead,
  VocabParallelEmbedding,
@@ -47,7 +49,6 @@ from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -365,7 +366,6 @@ class Qwen2MoeForCausalLM(nn.Module):
  config.vocab_size, config.hidden_size, quant_config=quant_config
  )
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -376,11 +376,20 @@ class Qwen2MoeForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- logits_output = self.logits_processor(
+ return self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output
+
+ def compute_logits(
+ self,
+ input_ids: torch.Tensor,
+ hidden_states: torch.Tensor,
+ input_metadata: InputMetadata,
+ ) -> torch.Tensor:
+ logits = self.logits_processor(
+ input_ids, hidden_states, self.lm_head.weight, input_metadata
+ )
+ return logits

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/models/stablelm.py CHANGED
@@ -40,7 +40,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.layers.sampler import Sampler
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


@@ -250,7 +249,6 @@ class StableLmForCausalLM(nn.Module):
  self.model = StableLMEpochModel(config, quant_config=quant_config)
  self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
  self.logits_processor = LogitsProcessor(config)
- self.sampler = Sampler()

  @torch.no_grad()
  def forward(
@@ -261,11 +259,9 @@ class StableLmForCausalLM(nn.Module):
  input_embeds: torch.Tensor = None,
  ) -> torch.Tensor:
  hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
- logits_output = self.logits_processor(
+ return self.logits_processor(
  input_ids, hidden_states, self.lm_head.weight, input_metadata
  )
- sample_output = self.sampler(logits_output, input_metadata.sampling_info)
- return sample_output, logits_output

  def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
  stacked_params_mapping = [
sglang/srt/openai_api/adapter.py CHANGED
@@ -275,10 +275,12 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  end_point = batch_storage[batch_id].endpoint
  file_request_list = []
  all_requests = []
+ request_ids = []
  for line in lines:
  request_data = json.loads(line)
  file_request_list.append(request_data)
  body = request_data["body"]
+ request_ids.append(request_data["custom_id"])

  # Although streaming is supported for standalone completions, it is not supported in
  # batch mode (multiple completions in single request).
@@ -289,12 +291,16 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  all_requests.append(ChatCompletionRequest(**body))
  elif end_point == "/v1/completions":
  all_requests.append(CompletionRequest(**body))
+
  if end_point == "/v1/chat/completions":
  adapted_request, request = v1_chat_generate_request(
- all_requests, tokenizer_manager
+ all_requests, tokenizer_manager, request_ids=request_ids
  )
  elif end_point == "/v1/completions":
- adapted_request, request = v1_generate_request(all_requests)
+ adapted_request, request = v1_generate_request(
+ all_requests, request_ids=request_ids
+ )
+
  try:
  ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
  if not isinstance(ret, list):
@@ -326,6 +332,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
  }
  all_ret.append(response_json)
  completed_requests += 1
+
  # Write results to a new file
  output_file_id = f"backend_result_file-{uuid.uuid4()}"
  global storage_dir
@@ -372,6 +379,72 @@ async def v1_retrieve_batch(batch_id: str):
  return batch_response


+ async def v1_cancel_batch(tokenizer_manager, batch_id: str):
+ # Retrieve the batch job from the in-memory storage
+ batch_response = batch_storage.get(batch_id)
+ if batch_response is None:
+ raise HTTPException(status_code=404, detail="Batch not found")
+
+ # Only do cancal when status is "validating" or "in_progress"
+ if batch_response.status in ["validating", "in_progress"]:
+ # Start cancelling the batch asynchronously
+ asyncio.create_task(
+ cancel_batch(
+ tokenizer_manager=tokenizer_manager,
+ batch_id=batch_id,
+ input_file_id=batch_response.input_file_id,
+ )
+ )
+
+ # Update batch status to "cancelling"
+ batch_response.status = "cancelling"
+
+ return batch_response
+ else:
+ raise HTTPException(
+ status_code=500,
+ detail=f"Current status is {batch_response.status}, no need to cancel",
+ )
+
+
+ async def cancel_batch(tokenizer_manager, batch_id: str, input_file_id: str):
+ try:
+ # Update the batch status to "cancelling"
+ batch_storage[batch_id].status = "cancelling"
+
+ # Retrieve the input file content
+ input_file_request = file_id_request.get(input_file_id)
+ if not input_file_request:
+ raise ValueError("Input file not found")
+
+ # Parse the JSONL file and process each request
+ input_file_path = file_id_storage.get(input_file_id)
+ with open(input_file_path, "r", encoding="utf-8") as f:
+ lines = f.readlines()
+
+ file_request_list = []
+ request_ids = []
+ for line in lines:
+ request_data = json.loads(line)
+ file_request_list.append(request_data)
+ request_ids.append(request_data["custom_id"])
+
+ # Cancel requests by request_ids
+ for rid in request_ids:
+ tokenizer_manager.abort_request(rid=rid)
+
+ retrieve_batch = batch_storage[batch_id]
+ retrieve_batch.status = "cancelled"
+
+ except Exception as e:
+ logger.error("error in SGLang:", e)
+ # Update batch status to "failed"
+ retrieve_batch = batch_storage[batch_id]
+ retrieve_batch.status = "failed"
+ retrieve_batch.failed_at = int(time.time())
+ retrieve_batch.errors = {"message": str(e)}
+
+
  async def v1_retrieve_file(file_id: str):
  # Retrieve the batch job from the in-memory storage
  file_response = file_id_response.get(file_id)
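Together with the `/v1/batches/{batch_id}/cancel` route added in `sglang/srt/server.py` further down, `v1_cancel_batch` lets standard OpenAI clients cancel a batch while it is still "validating" or "in_progress". A hedged usage sketch; it assumes a local sglang server on port 30000 and an existing batch id, both placeholders.

```python
# Sketch: cancelling a batch through the OpenAI-compatible API.
# The base_url, api_key, and batch id below are placeholders.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

batch_id = "batch_abc123"  # id returned earlier by client.batches.create(...)
cancelled = client.batches.cancel(batch_id)

# The server flips the batch to "cancelling", aborts its requests by
# custom_id in the background, then marks it "cancelled".
print(cancelled.status)
```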
@@ -392,7 +465,9 @@ async def v1_retrieve_file_content(file_id: str):
  return StreamingResponse(iter_file(), media_type="application/octet-stream")


- def v1_generate_request(all_requests: List[CompletionRequest]):
+ def v1_generate_request(
+ all_requests: List[CompletionRequest], request_ids: List[str] = None
+ ):
  prompts = []
  sampling_params_list = []
  return_logprobs = []
@@ -434,6 +509,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
+ "json_schema": request.json_schema,
  "n": request.n,
  "ignore_eos": request.ignore_eos,
  }
@@ -463,6 +539,7 @@ def v1_generate_request(all_requests: List[CompletionRequest]):
  logprob_start_len=logprob_start_lens,
  return_text_in_logprobs=True,
  stream=all_requests[0].stream,
+ rid=request_ids,
  )

  if len(all_requests) == 1:
@@ -745,7 +822,9 @@ async def v1_completions(tokenizer_manager, raw_request: Request):


  def v1_chat_generate_request(
- all_requests: List[ChatCompletionRequest], tokenizer_manager
+ all_requests: List[ChatCompletionRequest],
+ tokenizer_manager,
+ request_ids: List[str] = None,
  ):
  input_ids = []
  sampling_params_list = []
@@ -802,6 +881,7 @@ def v1_chat_generate_request(
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
+ "json_schema": request.json_schema,
  "n": request.n,
  }
  )
@@ -832,6 +912,7 @@
  top_logprobs_num=top_logprobs_nums,
  stream=all_requests[0].stream,
  return_text_in_logprobs=True,
+ rid=request_ids,
  )
  if len(all_requests) == 1:
  return adapted_request, all_requests[0]
sglang/srt/openai_api/protocol.py CHANGED
@@ -161,6 +161,7 @@ class CompletionRequest(BaseModel):

  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
  regex: Optional[str] = None
+ json_schema: Optional[str] = None
  ignore_eos: Optional[bool] = False
  min_tokens: Optional[int] = 0
  repetition_penalty: Optional[float] = 1.0
@@ -262,6 +263,7 @@ class ChatCompletionRequest(BaseModel):

  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
  regex: Optional[str] = None
+ json_schema: Optional[str] = None
  min_tokens: Optional[int] = 0
  repetition_penalty: Optional[float] = 1.0
  stop_token_ids: Optional[List[int]] = Field(default_factory=list)
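`json_schema` joins `regex` as an SRT-only constraint field on both request models, and the adapter changes above forward it into the internal sampling parameters. A sketch of how a client might send it through the OpenAI-compatible endpoint; the server address, model name, and schema are placeholders, and `extra_body` is the openai-python mechanism for passing fields the upstream API does not define.

```python
# Sketch: requesting schema-constrained output via the new "json_schema" field.
# base_url, model, and the schema itself are placeholders.
import json
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

schema = json.dumps({
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
})

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Describe a person as JSON."}],
    extra_body={"json_schema": schema},  # SRT-only field; ignored by OpenAI itself
)
print(resp.choices[0].message.content)
```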
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -21,63 +21,10 @@ class SamplingBatchInfo:
  top_ps: torch.Tensor = None
  top_ks: torch.Tensor = None
  min_ps: torch.Tensor = None
-
- # Dispatch in CUDA graph
- need_min_p_sampling: bool = False
-
- # Bias Tensors
+ penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
  logit_bias: torch.Tensor = None
  vocab_mask: torch.Tensor = None

- # Penalizer
- penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
- linear_penalties: torch.Tensor = None
- scaling_penalties: torch.Tensor = None
-
- def has_bias(self):
- return (
- self.logit_bias is not None
- or self.vocab_mask is not None
- or self.linear_penalties is not None
- or self.scaling_penalties is not None
- )
-
- @classmethod
- def dummy_one(cls, max_bs: int, vocab_size: int):
- ret = cls(vocab_size=vocab_size)
- ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
- ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
- ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
- ret.min_ps = torch.zeros((max_bs,), dtype=torch.float, device="cuda")
- return ret
-
- def __getitem__(self, key):
- if isinstance(key, slice):
- # NOTE: We do not use cuda graph when there is bias tensors
- assert not self.has_bias()
- return SamplingBatchInfo(
- vocab_size=self.vocab_size,
- temperatures=self.temperatures[key],
- top_ps=self.top_ps[key],
- top_ks=self.top_ks[key],
- min_ps=self.min_ps[key],
- need_min_p_sampling=self.need_min_p_sampling,
- )
- else:
- raise NotImplementedError
-
- def inplace_assign(self, bs: int, other: SamplingBatchInfo):
- # NOTE: We do not use cuda graph when there is bias tensors
- assert not self.has_bias()
-
- self.vocab_size = other.vocab_size
- self.need_min_p_sampling = other.need_min_p_sampling
-
- self.temperatures[:bs] = other.temperatures
- self.top_ps[:bs] = other.top_ps
- self.top_ks[:bs] = other.top_ks
- self.min_ps[:bs] = other.min_ps
-
  @classmethod
  def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
  device = "cuda"
@@ -98,7 +45,6 @@ class SamplingBatchInfo:
  ret.min_ps = torch.tensor(
  [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
  )
- ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)

  # Each penalizers will do nothing if they evaluate themselves as not required by looking at
  # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -126,25 +72,6 @@ class SamplingBatchInfo:

  return ret

- def prepare_penalties(self):
- self.scaling_penalties = None
- self.linear_penalties = None
-
- for penalizer in self.penalizer_orchestrator.penalizers.values():
- if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
- if penalizer.is_prepared():
- self.scaling_penalties = penalizer.cumulated_repetition_penalties
- else:
- if penalizer.is_prepared():
- if self.linear_penalties is None:
- bs = self.penalizer_orchestrator.batch.batch_size()
- self.linear_penalties = torch.zeros(
- (bs, self.vocab_size),
- dtype=torch.float32,
- device="cuda",
- )
- self.linear_penalties = penalizer.apply(self.linear_penalties)
-
  def update_regex_vocab_mask(self, batch: ScheduleBatch):
  bs, reqs = batch.batch_size(), batch.reqs
  device = "cuda"
sglang/srt/sampling/sampling_params.py CHANGED
@@ -39,6 +39,7 @@ class SamplingParams:
  spaces_between_special_tokens: bool = True,
  regex: Optional[str] = None,
  n: int = 1,
+ json_schema: Optional[str] = None,
  ) -> None:
  self.temperature = temperature
  self.top_p = top_p
@@ -56,6 +57,7 @@
  self.spaces_between_special_tokens = spaces_between_special_tokens
  self.regex = regex
  self.n = n
+ self.json_schema = json_schema

  # Process some special cases
  if self.temperature < _SAMPLING_EPS:
@@ -106,6 +108,8 @@
  f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
+ if self.regex is not None and self.json_schema is not None:
+ raise ValueError("regex and json_schema cannot be both set.")

  def normalize(self, tokenizer):
  # Process stop strings
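The new argument makes `json_schema` a first-class sampling parameter, and the added check makes it mutually exclusive with `regex`. A small sketch of the resulting behaviour; it assumes the check runs in the class's existing validation step (the `verify()` name is inferred from the surrounding context, not shown in this hunk).

```python
# Sketch: json_schema on SamplingParams; regex and json_schema are mutually
# exclusive. The verify() entry point is assumed from context.
from sglang.srt.sampling.sampling_params import SamplingParams

params = SamplingParams(json_schema='{"type": "object"}')  # schema-only: fine

conflicting = SamplingParams(regex=r"\d+", json_schema='{"type": "object"}')
try:
    conflicting.verify()
except ValueError as e:
    print(e)  # "regex and json_schema cannot be both set."
```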
sglang/srt/server.py CHANGED
@@ -59,6 +59,7 @@ from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
  load_chat_template_for_openai_api,
  v1_batches,
+ v1_cancel_batch,
  v1_chat_completions,
  v1_completions,
  v1_delete_file,
@@ -246,6 +247,12 @@ async def openai_v1_batches(raw_request: Request):
  return await v1_batches(tokenizer_manager, raw_request)


+ @app.post("/v1/batches/{batch_id}/cancel")
+ async def cancel_batches(batch_id: str):
+ # https://platform.openai.com/docs/api-reference/batch/cancel
+ return await v1_cancel_batch(tokenizer_manager, batch_id)
+
+
  @app.get("/v1/batches/{batch_id}")
  async def retrieve_batch(batch_id: str):
  return await v1_retrieve_batch(batch_id)
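The new route mirrors OpenAI's batch-cancel endpoint and simply forwards to `v1_cancel_batch` in the adapter. For completeness, the same operation can be exercised without the OpenAI SDK; a sketch, with host, port, and batch id as placeholders:

```python
# Sketch: calling the new cancel route directly over HTTP.
import requests

batch_id = "batch_abc123"  # placeholder id from an earlier POST /v1/batches
resp = requests.post(f"http://127.0.0.1:30000/v1/batches/{batch_id}/cancel")
print(resp.status_code, resp.json().get("status"))  # expected: 200 "cancelling"
```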
@@ -414,7 +421,7 @@ def _set_envs_and_config(server_args: ServerArgs):
  if not server_args.disable_flashinfer:
  assert_pkg_version(
  "flashinfer",
- "0.1.5",
+ "0.1.6",
  "Please uninstall the old version and "
  "reinstall the latest version by following the instructions "
  "at https://docs.flashinfer.ai/installation.html.",
sglang/test/runners.py CHANGED
@@ -180,7 +180,7 @@ class SRTRunner:
  tp_size=tp_size,
  dtype=get_dtype_str(torch_dtype),
  port=port,
- mem_fraction_static=0.69,
+ mem_fraction_static=0.7,
  trust_remote_code=False,
  is_embedding=not self.is_generation,
  )
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.14"
+ __version__ = "0.2.14.post1"
{sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.14
+ Version: 0.2.14.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -339,6 +339,7 @@ docker run --gpus all \
  ### Method 4: Using docker compose

  <details>
+ <summary>More</summary>

  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
@@ -350,6 +351,7 @@ docker run --gpus all \
  ### Method 5: Run on Kubernetes or Clouds with SkyPilot

  <details>
+ <summary>More</summary>

  To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).

@@ -389,7 +391,7 @@ sky status --endpoint 30000 sglang


  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -518,6 +520,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec

  #### Use Models From ModelScope
  <details>
+ <summary>More</summary>

  To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
@@ -532,6 +535,7 @@ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen

  #### Run Llama 3.1 405B
  <details>
+ <summary>More</summary>

  ```bash
  # Run 405B (fp8) on a single node
@@ -549,7 +553,9 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+ Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+ A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```