sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py CHANGED
@@ -13,6 +13,7 @@
 # ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
+import logging
 import time
 import uuid
 from dataclasses import dataclass
@@ -37,6 +38,10 @@ from pydantic import (
 )
 from typing_extensions import Literal
 
+from sglang.utils import convert_json_schema_to_str
+
+logger = logging.getLogger(__name__)
+
 DEFAULT_MODEL_NAME = "default"
 
 
@@ -445,8 +450,8 @@ class ChatCompletionRequest(BaseModel):
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
     stream_options: Optional[StreamOptions] = None
-    temperature: float = 0.7
-    top_p: float = 1.0
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
     user: Optional[str] = None
     tools: Optional[List[Tool]] = Field(default=None, examples=[None])
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -461,6 +466,47 @@ class ChatCompletionRequest(BaseModel):
         "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    min_tokens: int = 0
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    continue_final_message: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # OpenAI/SGLang default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
     @model_validator(mode="before")
     @classmethod
     def set_tool_choice_default(cls, values):
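As a quick orientation for the fields added above, here is a hedged client-side sketch. It assumes a server at http://localhost:30000 and the official openai Python client's extra_body passthrough; only the field names (top_k, repetition_penalty, ...) come from this diff, everything else is illustrative.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# SRT-only parameters are not part of the OpenAI SDK signature, so they are
# passed via extra_body and populate the ChatCompletionRequest fields above.
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three colors."}],
    extra_body={"top_k": 20, "repetition_penalty": 1.05},
)
print(resp.choices[0].message.content)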
@@ -531,37 +577,81 @@ class ChatCompletionRequest(BaseModel):
 
         return values
 
-    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-    top_k: int = -1
-    min_p: float = 0.0
-    min_tokens: int = 0
-    regex: Optional[str] = None
-    ebnf: Optional[str] = None
-    repetition_penalty: float = 1.0
-    stop_token_ids: Optional[List[int]] = None
-    no_stop_trim: bool = False
-    ignore_eos: bool = False
-    continue_final_message: bool = False
-    skip_special_tokens: bool = True
-    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
-    session_params: Optional[Dict] = None
-    separate_reasoning: bool = True
-    stream_reasoning: bool = True
-    chat_template_kwargs: Optional[Dict] = None
+    def to_sampling_params(
+        self,
+        stop: List[str],
+        model_generation_config: Dict[str, Any],
+        tool_call_constraint: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert request to sampling parameters.
+        Priority: user value > model generation_config > OpenAI defaults
+        """
+
+        def get_param(param_name: str):
+            value = getattr(self, param_name)
+            if value is None:
+                return model_generation_config.get(
+                    param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
+                )
+            return value
+
+        sampling_params = {
+            "temperature": get_param("temperature"),
+            "max_new_tokens": self.max_tokens or self.max_completion_tokens,
+            "min_new_tokens": self.min_tokens,
+            "stop": stop,
+            "stop_token_ids": self.stop_token_ids,
+            "top_p": get_param("top_p"),
+            "top_k": get_param("top_k"),
+            "min_p": get_param("min_p"),
+            "presence_penalty": self.presence_penalty,
+            "frequency_penalty": self.frequency_penalty,
+            "repetition_penalty": get_param("repetition_penalty"),
+            "regex": self.regex,
+            "ebnf": self.ebnf,
+            "n": self.n,
+            "no_stop_trim": self.no_stop_trim,
+            "ignore_eos": self.ignore_eos,
+            "skip_special_tokens": self.skip_special_tokens,
+            "logit_bias": self.logit_bias,
+        }
 
-    # For request id
-    rid: Optional[Union[List[str], str]] = None
-    # Extra key for classifying the request (e.g. cache_salt)
-    extra_key: Optional[Union[List[str], str]] = None
-    # Cache salt for request caching
-    cache_salt: Optional[Union[List[str], str]] = None
-    # Priority for the request
-    priority: Optional[int] = None
+        if self.response_format and self.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                self.response_format.json_schema.schema_
+            )
+        elif self.response_format and self.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif self.response_format and self.response_format.type == "structural_tag":
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                self.response_format.model_dump(by_alias=True)
+            )
 
-    # For PD disaggregation
-    bootstrap_host: Optional[Union[List[str], str]] = None
-    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
-    bootstrap_room: Optional[Union[List[int], int]] = None
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            elif constraint_type == "json_schema":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+
+        return sampling_params
 
 
 class ChatMessage(BaseModel):
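A hedged sketch of the new resolution order (explicit request value, then the model's generation_config, then _DEFAULT_SAMPLING_PARAMS); only to_sampling_params and its arguments come from this diff, the request contents below are illustrative.

from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest(
    model="default",
    messages=[{"role": "user", "content": "hi"}],
    # temperature and top_p deliberately left unset (None)
)
params = req.to_sampling_params(
    stop=[],
    model_generation_config={"temperature": 0.6},  # e.g. taken from generation_config.json
)
assert params["temperature"] == 0.6  # model-provided default beats the 1.0 fallback
assert params["top_p"] == 1.0        # neither user nor model set it, so the OpenAI default applies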
@@ -711,12 +801,50 @@ class RerankResponse(BaseModel):
     meta_info: Optional[dict] = None
 
 
+class TokenizeRequest(BaseModel):
+    """Request schema for the /tokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    prompt: Union[str, List[str]]
+    add_special_tokens: bool = Field(
+        default=True,
+        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
+    )
+
+
+class TokenizeResponse(BaseModel):
+    """Response schema for the /tokenize endpoint."""
+
+    tokens: Union[List[int], List[List[int]]]
+    count: Union[int, List[int]]
+    max_model_len: int
+
+
+class DetokenizeRequest(BaseModel):
+    """Request schema for the /detokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    tokens: Union[List[int], List[List[int]]]
+    skip_special_tokens: bool = Field(
+        default=True,
+        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
+    )
+
+
+class DetokenizeResponse(BaseModel):
+    """Response schema for the /detokenize endpoint."""
+
+    text: Union[str, List[str]]
+
+
 OpenAIServingRequest = Union[
     ChatCompletionRequest,
     CompletionRequest,
     EmbeddingRequest,
     ScoringRequest,
     V1RerankReqInput,
+    TokenizeRequest,
+    DetokenizeRequest,
 ]
 
 
sglang/srt/entrypoints/openai/serving_chat.py CHANGED
@@ -44,7 +44,6 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.utils import convert_json_schema_to_str
 
 if TYPE_CHECKING:
     from sglang.srt.managers.template_manager import TemplateManager
@@ -66,6 +65,15 @@ class OpenAIServingChat(OpenAIServingBase):
         self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
         self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if self.default_sampling_params:
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )
+
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
 
@@ -137,10 +145,10 @@ class OpenAIServingChat(OpenAIServingBase):
         processed_messages = self._process_messages(request, is_multimodal)
 
         # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
         )
 
         # Handle single vs multiple requests
@@ -410,72 +418,6 @@
             stop=stop,
         )
 
-    def _build_sampling_params(
-        self,
-        request: ChatCompletionRequest,
-        stop: List[str],
-        tool_call_constraint: Optional[Any],
-    ) -> Dict[str, Any]:
-        """Build sampling parameters for the request"""
-
-        sampling_params = {
-            "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
-            "min_new_tokens": request.min_tokens,
-            "stop": stop,
-            "stop_token_ids": request.stop_token_ids,
-            "top_p": request.top_p,
-            "top_k": request.top_k,
-            "min_p": request.min_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "repetition_penalty": request.repetition_penalty,
-            "regex": request.regex,
-            "ebnf": request.ebnf,
-            "n": request.n,
-            "no_stop_trim": request.no_stop_trim,
-            "ignore_eos": request.ignore_eos,
-            "skip_special_tokens": request.skip_special_tokens,
-            "logit_bias": request.logit_bias,
-        }
-
-        if request.response_format and request.response_format.type == "json_schema":
-            sampling_params["json_schema"] = convert_json_schema_to_str(
-                request.response_format.json_schema.schema_
-            )
-        elif request.response_format and request.response_format.type == "json_object":
-            sampling_params["json_schema"] = '{"type": "object"}'
-        elif (
-            request.response_format and request.response_format.type == "structural_tag"
-        ):
-            sampling_params["structural_tag"] = convert_json_schema_to_str(
-                request.response_format.model_dump(by_alias=True)
-            )
-
-        # Check if there are already existing output constraints
-        has_existing_constraints = (
-            sampling_params.get("regex")
-            or sampling_params.get("ebnf")
-            or sampling_params.get("structural_tag")
-            or sampling_params.get("json_schema")
-        )
-
-        if tool_call_constraint and has_existing_constraints:
-            logger.warning("Constrained decoding is not compatible with tool calls.")
-        elif tool_call_constraint:
-            constraint_type, constraint_value = tool_call_constraint
-            if constraint_type == "structural_tag":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value.model_dump(by_alias=True)
-                )
-            elif constraint_type == "json_schema":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value
-                )
-            else:
-                sampling_params[constraint_type] = constraint_value
-        return sampling_params
-
     async def _handle_streaming_request(
         self,
         adapted_request: GenerateReqInput,
sglang/srt/entrypoints/openai/serving_tokenize.py ADDED
@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+
+
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
sglang/srt/environ.py CHANGED
@@ -128,6 +128,10 @@ class Envs:
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
 
+    # Test: pd-disaggregation
+    SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
+    SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
+
     # Model Parallel
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
 
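The two new entries only declare defaults for the PD-disaggregation test helpers; as a hedged illustration, the backing environment variables would be exported before the test process imports sglang, for example:

import os

# Illustrative values only: the variable names come from this diff, while the
# accepted backend names and device-list format are assumptions of the test harness.
os.environ["SGLANG_TEST_PD_DISAGG_BACKEND"] = "nixl"
os.environ["SGLANG_TEST_PD_DISAGG_DEVICES"] = "0,1,2,3"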
sglang/srt/function_call/function_call_parser.py CHANGED
@@ -35,17 +35,19 @@ class FunctionCallParser:
     """
 
     ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
-        "llama3": Llama32Detector,
-        "qwen25": Qwen25Detector,
-        "mistral": MistralDetector,
         "deepseekv3": DeepSeekV3Detector,
         "deepseekv31": DeepSeekV31Detector,
-        "pythonic": PythonicDetector,
+        "glm": Glm4MoeDetector,
+        "glm45": Glm4MoeDetector,
+        "gpt-oss": GptOssDetector,
         "kimi_k2": KimiK2Detector,
+        "llama3": Llama32Detector,
+        "mistral": MistralDetector,
+        "pythonic": PythonicDetector,
+        "qwen": Qwen25Detector,
+        "qwen25": Qwen25Detector,
         "qwen3_coder": Qwen3CoderDetector,
-        "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
-        "gpt-oss": GptOssDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
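The table is re-sorted alphabetically and gains a "qwen" alias alongside "qwen25", plus a "glm" alias for "glm45". A hedged usage sketch follows; the constructor signature is taken from the context line above, and the empty tool list is purely illustrative.

from sglang.srt.function_call.function_call_parser import FunctionCallParser

tools = []  # illustrative: normally the Tool definitions from the incoming request
parser = FunctionCallParser(tools=tools, tool_call_parser="qwen")  # resolves to Qwen25Detector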