sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -2
- sglang/bench_serving.py +224 -127
- sglang/compile_deep_gemm.py +3 -0
- sglang/launch_server.py +0 -14
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/falcon_h1.py +12 -58
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +68 -31
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +11 -43
- sglang/srt/disaggregation/decode.py +7 -18
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/nixl/conn.py +55 -23
- sglang/srt/disaggregation/prefill.py +17 -32
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/grpc_request_manager.py +10 -23
- sglang/srt/entrypoints/grpc_server.py +220 -80
- sglang/srt/entrypoints/http_server.py +49 -1
- sglang/srt/entrypoints/openai/protocol.py +159 -31
- sglang/srt/entrypoints/openai/serving_chat.py +13 -71
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +4 -0
- sglang/srt/function_call/function_call_parser.py +8 -6
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
- sglang/srt/layers/attention/attention_registry.py +31 -22
- sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
- sglang/srt/layers/attention/flashattention_backend.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +223 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/triton_backend.py +1 -1
- sglang/srt/layers/logits_processor.py +136 -6
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
- sglang/srt/layers/moe/ep_moe/layer.py +8 -286
- sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/utils.py +7 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/w4afp8.py +2 -16
- sglang/srt/lora/lora_manager.py +0 -8
- sglang/srt/managers/overlap_utils.py +18 -16
- sglang/srt/managers/schedule_batch.py +119 -90
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +213 -126
- sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
- sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
- sglang/srt/managers/tokenizer_manager.py +270 -53
- sglang/srt/managers/tp_worker.py +39 -28
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +162 -68
- sglang/srt/mem_cache/radix_cache.py +8 -3
- sglang/srt/mem_cache/swa_radix_cache.py +70 -14
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -18
- sglang/srt/model_executor/model_runner.py +55 -51
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +187 -6
- sglang/srt/model_loader/weight_utils.py +3 -0
- sglang/srt/models/falcon_h1.py +11 -9
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +11 -1
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/utils.py +5 -1
- sglang/srt/sampling/sampling_batch_info.py +11 -9
- sglang/srt/server_args.py +100 -33
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_utils.py +0 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils/common.py +18 -0
- sglang/srt/utils/hf_transformers_utils.py +2 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +40 -0
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +18 -2
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +63 -0
- sglang/test/test_utils.py +32 -11
- sglang/version.py +1 -1
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py CHANGED

@@ -13,6 +13,7 @@
 # ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
+import logging
 import time
 import uuid
 from dataclasses import dataclass
@@ -37,6 +38,10 @@ from pydantic import (
 )
 from typing_extensions import Literal
 
+from sglang.utils import convert_json_schema_to_str
+
+logger = logging.getLogger(__name__)
+
 DEFAULT_MODEL_NAME = "default"
 
 
@@ -445,8 +450,8 @@ class ChatCompletionRequest(BaseModel):
     stop: Optional[Union[str, List[str]]] = None
     stream: bool = False
     stream_options: Optional[StreamOptions] = None
-    temperature: float = 1.0
-    top_p: float = 1.0
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
     user: Optional[str] = None
     tools: Optional[List[Tool]] = Field(default=None, examples=[None])
     tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -461,6 +466,47 @@ class ChatCompletionRequest(BaseModel):
         "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    min_tokens: int = 0
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    continue_final_message: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # OpenAI/SGLang default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
     @model_validator(mode="before")
     @classmethod
     def set_tool_choice_default(cls, values):
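For reference, a chat request that exercises a few of the SRT-only extras added above might look like the sketch below. The field names come from this diff; the values are only examples, and OpenAI clients can simply omit these fields.

```python
# Illustrative ChatCompletionRequest payload using SRT-only extras (values are examples).
payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "List three primes."}],
    # SRT-only extras, ignored by OpenAI models:
    "top_k": 20,
    "min_tokens": 4,
    "regex": r"\d+(, \d+)*",
    "skip_special_tokens": True,
}
```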
@@ -531,37 +577,81 @@ class ChatCompletionRequest(BaseModel):
 
         return values
 
-… (17 removed lines not preserved in this diff view)
+    def to_sampling_params(
+        self,
+        stop: List[str],
+        model_generation_config: Dict[str, Any],
+        tool_call_constraint: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert request to sampling parameters.
+        Priority: user value > model generation_config > OpenAI defaults
+        """
+
+        def get_param(param_name: str):
+            value = getattr(self, param_name)
+            if value is None:
+                return model_generation_config.get(
+                    param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
+                )
+            return value
+
+        sampling_params = {
+            "temperature": get_param("temperature"),
+            "max_new_tokens": self.max_tokens or self.max_completion_tokens,
+            "min_new_tokens": self.min_tokens,
+            "stop": stop,
+            "stop_token_ids": self.stop_token_ids,
+            "top_p": get_param("top_p"),
+            "top_k": get_param("top_k"),
+            "min_p": get_param("min_p"),
+            "presence_penalty": self.presence_penalty,
+            "frequency_penalty": self.frequency_penalty,
+            "repetition_penalty": get_param("repetition_penalty"),
+            "regex": self.regex,
+            "ebnf": self.ebnf,
+            "n": self.n,
+            "no_stop_trim": self.no_stop_trim,
+            "ignore_eos": self.ignore_eos,
+            "skip_special_tokens": self.skip_special_tokens,
+            "logit_bias": self.logit_bias,
+        }
 
-… (8 removed lines not preserved in this diff view)
+        if self.response_format and self.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                self.response_format.json_schema.schema_
+            )
+        elif self.response_format and self.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif self.response_format and self.response_format.type == "structural_tag":
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                self.response_format.model_dump(by_alias=True)
+            )
 
-… (4 removed lines not preserved in this diff view)
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            elif constraint_type == "json_schema":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+
+        return sampling_params
 
 
 class ChatMessage(BaseModel):
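The docstring above defines the resolution order implemented by get_param: an explicit request value wins, then the model's generation_config, then the hard-coded OpenAI-style defaults. A minimal, self-contained sketch of that fallback (names and values here are illustrative, not part of the diff):

```python
# Three-level fallback mirroring get_param(): request value -> generation_config -> default.
DEFAULTS = {"temperature": 1.0, "top_p": 1.0, "top_k": -1, "min_p": 0.0, "repetition_penalty": 1.0}

def resolve(request_value, name, generation_config):
    # Explicit request value takes priority; None means "not set by the caller".
    if request_value is not None:
        return request_value
    return generation_config.get(name, DEFAULTS[name])

generation_config = {"temperature": 0.6}  # e.g. shipped in the model's generation_config.json
print(resolve(None, "temperature", generation_config))  # 0.6  (model-provided default)
print(resolve(0.2, "temperature", generation_config))   # 0.2  (explicit request value wins)
print(resolve(None, "top_k", generation_config))        # -1   (falls through to the library default)
```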
@@ -711,12 +801,50 @@ class RerankResponse(BaseModel):
     meta_info: Optional[dict] = None
 
 
+class TokenizeRequest(BaseModel):
+    """Request schema for the /tokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    prompt: Union[str, List[str]]
+    add_special_tokens: bool = Field(
+        default=True,
+        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
+    )
+
+
+class TokenizeResponse(BaseModel):
+    """Response schema for the /tokenize endpoint."""
+
+    tokens: Union[List[int], List[List[int]]]
+    count: Union[int, List[int]]
+    max_model_len: int
+
+
+class DetokenizeRequest(BaseModel):
+    """Request schema for the /detokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    tokens: Union[List[int], List[List[int]]]
+    skip_special_tokens: bool = Field(
+        default=True,
+        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
+    )
+
+
+class DetokenizeResponse(BaseModel):
+    """Response schema for the /detokenize endpoint."""
+
+    text: Union[str, List[str]]
+
+
 OpenAIServingRequest = Union[
     ChatCompletionRequest,
     CompletionRequest,
     EmbeddingRequest,
     ScoringRequest,
     V1RerankReqInput,
+    TokenizeRequest,
+    DetokenizeRequest,
 ]
 
 
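Assuming the server registers these schemas at /v1/tokenize and /v1/detokenize (as the handler docstrings in serving_tokenize.py below state; the host and port here are illustrative), a round trip looks roughly like this:

```python
# Hypothetical client round trip against the new tokenize/detokenize routes.
import requests

base = "http://localhost:30000"  # illustrative SGLang server address

tok = requests.post(
    f"{base}/v1/tokenize",
    json={"model": "default", "prompt": "Hello world", "add_special_tokens": True},
).json()
# TokenizeResponse shape: {"tokens": [...], "count": <int>, "max_model_len": <int>}

detok = requests.post(
    f"{base}/v1/detokenize",
    json={"model": "default", "tokens": tok["tokens"], "skip_special_tokens": True},
).json()
# DetokenizeResponse shape: {"text": "Hello world"}
```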
sglang/srt/entrypoints/openai/serving_chat.py CHANGED

@@ -44,7 +44,6 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.utils import convert_json_schema_to_str
 
 if TYPE_CHECKING:
     from sglang.srt.managers.template_manager import TemplateManager
@@ -66,6 +65,15 @@ class OpenAIServingChat(OpenAIServingBase):
         self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
         self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if self.default_sampling_params:
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )
+
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
 
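get_default_sampling_params() lives in sglang.srt.configs.model_config (changed in this release but not shown in this hunk). The sketch below only illustrates the kind of mapping it is expected to produce from a model's generation_config.json; it is an assumption for orientation, not the actual implementation:

```python
# Assumed behavior: pick the sampling-related fields out of generation_config.json.
generation_config_json = {"temperature": 0.6, "top_p": 0.95, "top_k": 20, "do_sample": True}

default_sampling_params = {
    k: generation_config_json[k]
    for k in ("temperature", "top_p", "top_k", "min_p", "repetition_penalty")
    if k in generation_config_json
}
print(default_sampling_params)  # {'temperature': 0.6, 'top_p': 0.95, 'top_k': 20}
```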
@@ -137,10 +145,10 @@ class OpenAIServingChat(OpenAIServingBase):
         processed_messages = self._process_messages(request, is_multimodal)
 
         # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
         )
 
         # Handle single vs multiple requests
@@ -410,72 +418,6 @@ class OpenAIServingChat(OpenAIServingBase):
             stop=stop,
         )
 
-    def _build_sampling_params(
-        self,
-        request: ChatCompletionRequest,
-        stop: List[str],
-        tool_call_constraint: Optional[Any],
-    ) -> Dict[str, Any]:
-        """Build sampling parameters for the request"""
-
-        sampling_params = {
-            "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
-            "min_new_tokens": request.min_tokens,
-            "stop": stop,
-            "stop_token_ids": request.stop_token_ids,
-            "top_p": request.top_p,
-            "top_k": request.top_k,
-            "min_p": request.min_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "repetition_penalty": request.repetition_penalty,
-            "regex": request.regex,
-            "ebnf": request.ebnf,
-            "n": request.n,
-            "no_stop_trim": request.no_stop_trim,
-            "ignore_eos": request.ignore_eos,
-            "skip_special_tokens": request.skip_special_tokens,
-            "logit_bias": request.logit_bias,
-        }
-
-        if request.response_format and request.response_format.type == "json_schema":
-            sampling_params["json_schema"] = convert_json_schema_to_str(
-                request.response_format.json_schema.schema_
-            )
-        elif request.response_format and request.response_format.type == "json_object":
-            sampling_params["json_schema"] = '{"type": "object"}'
-        elif (
-            request.response_format and request.response_format.type == "structural_tag"
-        ):
-            sampling_params["structural_tag"] = convert_json_schema_to_str(
-                request.response_format.model_dump(by_alias=True)
-            )
-
-        # Check if there are already existing output constraints
-        has_existing_constraints = (
-            sampling_params.get("regex")
-            or sampling_params.get("ebnf")
-            or sampling_params.get("structural_tag")
-            or sampling_params.get("json_schema")
-        )
-
-        if tool_call_constraint and has_existing_constraints:
-            logger.warning("Constrained decoding is not compatible with tool calls.")
-        elif tool_call_constraint:
-            constraint_type, constraint_value = tool_call_constraint
-            if constraint_type == "structural_tag":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value.model_dump(by_alias=True)
-                )
-            elif constraint_type == "json_schema":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value
-                )
-            else:
-                sampling_params[constraint_type] = constraint_value
-        return sampling_params
-
     async def _handle_streaming_request(
         self,
         adapted_request: GenerateReqInput,
sglang/srt/entrypoints/openai/serving_tokenize.py ADDED

@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+
+
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
sglang/srt/environ.py CHANGED

@@ -128,6 +128,10 @@ class Envs:
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
 
+    # Test: pd-disaggregation
+    SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
+    SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
+
     # Model Parallel
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
 
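The two new knobs are test-only settings for the PD-disaggregation suite. A hedged example of overriding them before a test run; the value format for SGLANG_TEST_PD_DISAGG_DEVICES is assumed, not specified by this diff:

```python
import os

os.environ["SGLANG_TEST_PD_DISAGG_BACKEND"] = "mooncake"  # matches the EnvStr default above
os.environ["SGLANG_TEST_PD_DISAGG_DEVICES"] = "0,1,2,3"   # assumed comma-separated device list
```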
sglang/srt/function_call/function_call_parser.py CHANGED

@@ -35,17 +35,19 @@ class FunctionCallParser:
     """
 
     ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
-        "llama3": Llama32Detector,
-        "qwen25": Qwen25Detector,
-        "mistral": MistralDetector,
         "deepseekv3": DeepSeekV3Detector,
         "deepseekv31": DeepSeekV31Detector,
-        "pythonic": PythonicDetector,
+        "glm": Glm4MoeDetector,
+        "glm45": Glm4MoeDetector,
+        "gpt-oss": GptOssDetector,
         "kimi_k2": KimiK2Detector,
+        "llama3": Llama32Detector,
+        "mistral": MistralDetector,
+        "pythonic": PythonicDetector,
+        "qwen": Qwen25Detector,
+        "qwen25": Qwen25Detector,
         "qwen3_coder": Qwen3CoderDetector,
-        "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
-        "gpt-oss": GptOssDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
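The table above is keyed by the server's tool_call_parser argument; after this change "qwen" is accepted as an alias for "qwen25", and "glm" alongside "glm45", both mapping to the same detector classes. A quick check, assuming sglang 0.5.3.post1 is installed:

```python
from sglang.srt.function_call.function_call_parser import FunctionCallParser

enum = FunctionCallParser.ToolCallParserEnum
assert enum["qwen"] is enum["qwen25"]   # both resolve to Qwen25Detector
assert enum["glm"] is enum["glm45"]     # both resolve to Glm4MoeDetector
print(sorted(enum))                     # all accepted tool_call_parser names
```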
|