sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +24 -3
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +85 -54
- sglang/srt/entrypoints/openai/protocol.py +4 -1
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +36 -16
- sglang/srt/entrypoints/openai/serving_completions.py +12 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +6 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +147 -47
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +64 -40
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +158 -160
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +187 -39
- sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +259 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/hicache_storage.py +3 -23
- sglang/srt/mem_cache/hiradix_cache.py +103 -43
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +105 -46
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +493 -76
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +59 -2
- sglang/srt/model_executor/model_runner.py +356 -29
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +109 -15
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +1 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +56 -12
- sglang/srt/models/qwen3_moe.py +1 -1
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +276 -35
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +43 -4
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +34 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +11 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
- sglang/srt/disaggregation/launch_lb.py +0 -118
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -27,10 +27,12 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union

 import setproctitle

+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -47,11 +49,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -77,6 +75,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -84,6 +83,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -96,8 +96,9 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
     MultiTokenizerManager,
-
+    MultiTokenizerRouter,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
@@ -127,7 +128,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[
+        TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
+    ]
     template_manager: TemplateManager
     scheduler_info: Dict

@@ -140,21 +143,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -162,11 +150,15 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")

     # Read configuration from shared memory
-
-
-
-
-
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"

     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -192,18 +184,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args


 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-
-
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -681,6 +684,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )


+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
@@ -1168,7 +1203,6 @@ def launch_server(
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     if server_args.tokenizer_worker_num > 1:
-        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
         port_args = PortArgs.init_new(server_args)
         port_args.tokenizer_worker_ipc_name = (
             f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1211,16 @@ def launch_server(
             server_args=server_args, port_args=port_args
         )
     else:
-        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
         )

+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1191,12 +1230,10 @@ def launch_server(
     )

     if server_args.tokenizer_worker_num > 1:
-
-
-
-
-            scheduler_info,
-        )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1233,6 +1270,9 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
+
+            monkey_patch_uvicorn_multiprocessing()
+
             uvicorn.run(
                 "sglang.srt.entrypoints.http_server:app",
                 host=server_args.host,
@@ -1243,6 +1283,7 @@ def launch_server(
                 workers=server_args.tokenizer_worker_num,
             )
         else:
+            app.is_single_tokenizer_mode = True
             uvicorn.run(
                 app,
                 host=server_args.host,
@@ -1253,10 +1294,8 @@ def launch_server(
             )
     finally:
         if server_args.tokenizer_worker_num > 1:
-
-
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
            warmup_thread.join()

@@ -1405,13 +1444,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
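
For orientation, a minimal client sketch against the two new remote-instance weight endpoints added above. The request payload schema (InitWeightsSendGroupForRemoteInstanceReqInput, SendWeightsToRemoteInstanceReqInput) lives in sglang/srt/managers/io_struct.py and is not visible in this hunk, so the payload below is left as a placeholder; the response handling ({"success": ..., "message": ...} with HTTP 200 or 400) follows directly from the handlers shown.

```python
# Hypothetical client sketch for the two new endpoints above.
# The payload fields are defined in io_struct.py and are not shown in this
# diff, so `payload` is a placeholder dict.
import requests

BASE_URL = "http://localhost:30000"  # assumed server address


def post_weight_op(route: str, payload: dict) -> str:
    # Both endpoints return {"success": bool, "message": str}, 200 on success
    # and 400 otherwise (see the handlers in the hunk above).
    resp = requests.post(f"{BASE_URL}{route}", json=payload)
    body = resp.json()
    if not body["success"]:
        raise RuntimeError(f"{route} failed: {body['message']}")
    return body["message"]


# Usage (payload fields intentionally omitted; see io_struct.py):
# post_weight_op("/init_weights_send_group_for_remote_instance", {...})
# post_weight_op("/send_weights_to_remote_instance", {...})
```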

sglang/srt/entrypoints/openai/protocol.py CHANGED

@@ -229,6 +229,9 @@ class CompletionRequest(BaseModel):
     # For request id
     rid: Optional[Union[List[str], str]] = None

+    # For customer metric labels
+    customer_labels: Optional[Dict[str, str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
@@ -447,7 +450,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )

     @model_validator(mode="before")
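
A hypothetical request body exercising the new customer_labels field on CompletionRequest (model name and label values are made up). Whether these labels are also picked up from the request body, or only from the metrics header handled in serving_base.py below, is not visible in this hunk.

```python
# Sketch of a /v1/completions payload that includes the new optional
# customer_labels map; the model name and labels are illustrative only.
import requests

payload = {
    "model": "qwen/qwen2.5-7b-instruct",
    "prompt": "Hello",
    "max_tokens": 8,
    "customer_labels": {"team": "search", "env": "prod"},
}
resp = requests.post("http://localhost:30000/v1/completions", json=payload)
print(resp.json())
```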

sglang/srt/entrypoints/openai/serving_base.py CHANGED

@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):

     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            else None
+        )

     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):

         # Convert to internal format
         adapted_request, processed_request = self._convert_to_internal_request(
-            request
+            request, raw_request
         )

         # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -77,6 +90,7 @@ class OpenAIServingBase(ABC):
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
@@ -150,3 +164,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_customer_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        customer_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            customer_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return customer_labels

sglang/srt/entrypoints/openai/serving_chat.py CHANGED

@@ -1,9 +1,11 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -33,13 +35,15 @@ from sglang.srt.entrypoints.openai.utils import (
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -91,6 +96,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +128,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +149,7 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )

         return adapted_request, request
@@ -172,10 +182,11 @@ class OpenAIServingChat(OpenAIServingBase):
             ]
         else:
             tools = [item.function.model_dump() for item in request.tools]
-
-
-
-
+        if self.tool_call_parser:
+            parser = FunctionCallParser(request.tools, self.tool_call_parser)
+            tool_call_constraint = parser.get_structure_constraint(
+                request.tool_choice
+            )

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +548,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 yield f"data: {chunk.model_dump_json()}\n\n"

             # Handle tool calls
-            if
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
                 async for chunk in self._process_tool_call_stream(
                     index,
                     delta,
@@ -727,10 +742,13 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle tool calls
         tool_calls = None
-        if
-
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools,
+                text, request.tools, finish_reason
             )

         choice_data = ChatCompletionResponseChoice(
@@ -824,11 +842,10 @@ class OpenAIServingChat(OpenAIServingBase):
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +855,10 @@ class OpenAIServingChat(OpenAIServingBase):
             tool_calls = []
             for call_info in call_info_list:
                 # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if
+                if (
+                    self.tool_call_parser == "kimi_k2"
+                    and call_info.name is not None
+                ):
                     tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
                 else:
                     tool_id = f"call_{uuid.uuid4().hex[:24]}"
@@ -933,7 +953,7 @@ class OpenAIServingChat(OpenAIServingBase):
         if index not in parser_dict:
             parser_dict[index] = FunctionCallParser(
                 tools=request.tools,
-                tool_call_parser=self.
+                tool_call_parser=self.tool_call_parser,
             )
         parser = parser_dict[index]

@@ -962,7 +982,7 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                if self.
+                if self.tool_call_parser == "kimi_k2":
                     # Align with Kimi-K2 format: functions.{name}:{index}
                     tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
                 else:
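
The serving_chat.py hunks above cache server_args.tool_call_parser on the handler and keep the Kimi-K2 tool-call id convention. A small sketch of that id scheme (the helper name is made up for illustration):

```python
import uuid


# Sketch of the tool_call_id convention encoded in the hunks above.
def make_tool_call_id(tool_call_parser: str, name: str, tool_index: int) -> str:
    if tool_call_parser == "kimi_k2" and name is not None:
        # Kimi-K2 expects ids shaped like functions.{name}:{index}
        return f"functions.{name}:{tool_index}"
    # Other parsers get an OpenAI-style random id
    return f"call_{uuid.uuid4().hex[:24]}"


print(make_tool_call_id("kimi_k2", "get_weather", 0))  # functions.get_weather:0
print(make_tool_call_id("llama3", "get_weather", 0))   # call_<24 hex chars>
```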

sglang/srt/entrypoints/openai/serving_completions.py CHANGED

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.code_completion_parser import (
     generate_completion_prompt_from_request,
 )
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -55,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -85,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}

+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -99,6 +107,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )

         return adapted_request, request

sglang/srt/entrypoints/openai/serving_embedding.py CHANGED

@@ -1,4 +1,6 @@
-from
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse
@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_embedding_convs

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+

 class OpenAIServingEmbedding(OpenAIServingBase):
     """Handler for v1/embeddings requests"""
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input

sglang/srt/entrypoints/openai/serving_rerank.py CHANGED

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None

     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document

sglang/srt/entrypoints/openai/serving_responses.py CHANGED

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations

 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union

 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


sglang/srt/entrypoints/openai/serving_score.py CHANGED

@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
     ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager

sglang/srt/eplb/eplb_manager.py CHANGED

@@ -55,7 +55,7 @@ class EPLBManager:
         enable_timing = self._rebalance_layers_per_chunk is None

         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_start = time.time()

         dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@ class EPLBManager:

         msg = f"[EPLBManager] rebalance end"
         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_end = time.time()
             msg += f" time={time_end - time_start:.3f}s"
         logger.info(msg)
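
The eplb_manager.py hunks swap a device-specific synchronize for torch.get_device_module().synchronize(), so rebalance timing also works on non-CUDA backends. A minimal timing sketch of the same pattern, assuming a PyTorch build that provides torch.get_device_module():

```python
import time

import torch


# Minimal timing sketch using the device-agnostic synchronize from the hunks
# above (assumes torch.get_device_module() is available in the installed
# PyTorch; on CPU-only builds the synchronize is effectively a no-op).
def timed(fn):
    torch.get_device_module().synchronize()
    start = time.time()
    out = fn()
    torch.get_device_module().synchronize()
    return out, time.time() - start


x = torch.randn(1024, 1024)
_, elapsed = timed(lambda: x @ x)
print(f"matmul took {elapsed:.3f}s")
```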