sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registries. It is provided for informational purposes only and reflects the changes between the two package versions.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +67 -43
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +88 -53
- sglang/srt/entrypoints/openai/protocol.py +7 -4
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +39 -19
- sglang/srt/entrypoints/openai/serving_completions.py +15 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -7
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +182 -49
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +68 -41
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +200 -199
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +191 -139
- sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +260 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +18 -33
- sglang/srt/mem_cache/hiradix_cache.py +108 -48
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +121 -57
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +502 -77
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +75 -19
- sglang/srt/model_executor/model_runner.py +357 -30
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +346 -48
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +11 -2
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +60 -13
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +40 -9
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +355 -37
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +197 -112
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +46 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +12 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0

--- sglang/srt/entrypoints/http_server.py (0.5.2rc1)
+++ sglang/srt/entrypoints/http_server.py (0.5.3rc0)
@@ -27,7 +27,11 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
+
+import setproctitle
+
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -45,11 +49,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -75,6 +75,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -82,6 +83,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -94,15 +96,16 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
     MultiTokenizerManager,
-
+    MultiTokenizerRouter,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
@@ -125,7 +128,9 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[
+        TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
+    ]
     template_manager: TemplateManager
     scheduler_info: Dict
 
@@ -138,21 +143,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -160,11 +150,15 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
 
     # Read configuration from shared memory
-
-
-
-
-
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
 
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -190,18 +184,29 @@ async def init_multi_tokenizer() -> ServerArgs:
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-
-
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
     fast_api_app.warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
@@ -679,6 +684,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )
 
 
+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
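
Both new endpoints answer with {"success": bool, "message": str}, HTTP 200 on success and 400 otherwise. A hypothetical client-side sketch (the empty payloads are placeholders, not the real schema, which is defined by InitWeightsSendGroupForRemoteInstanceReqInput and SendWeightsToRemoteInstanceReqInput in sglang/srt/managers/io_struct.py):

    # Hypothetical sketch; payload fields and server address are assumptions.
    import requests

    BASE = "http://localhost:30000"

    for route in (
        "/init_weights_send_group_for_remote_instance",
        "/send_weights_to_remote_instance",
    ):
        resp = requests.post(f"{BASE}{route}", json={})  # fill in real fields
        body = resp.json()
        print(route, resp.status_code, body["success"], body["message"])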
@@ -1178,6 +1215,12 @@ def launch_server(
         server_args=server_args,
     )
 
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
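
For reference, a minimal sketch of the two tracing calls this hunk wires in (both calls appear verbatim above; the OTLP endpoint value is an assumption, not a shipped default):

    from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info

    process_tracing_init("http://localhost:4317", "sglang")  # endpoint, service name
    trace_set_thread_info("Tokenizer")  # label this thread in emitted traces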
@@ -1187,12 +1230,10 @@ def launch_server(
     )
 
     if server_args.tokenizer_worker_num > 1:
-
-
-
-
-            scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1229,6 +1270,9 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
+
+        monkey_patch_uvicorn_multiprocessing()
+
         uvicorn.run(
             "sglang.srt.entrypoints.http_server:app",
             host=server_args.host,
@@ -1239,6 +1283,7 @@ def launch_server(
             workers=server_args.tokenizer_worker_num,
         )
     else:
+        app.is_single_tokenizer_mode = True
         uvicorn.run(
             app,
             host=server_args.host,
@@ -1249,10 +1294,8 @@ def launch_server(
         )
     finally:
         if server_args.tokenizer_worker_num > 1:
-
-
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()
 
@@ -1401,13 +1444,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()

--- sglang/srt/entrypoints/openai/protocol.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/protocol.py (0.5.3rc0)
@@ -229,6 +229,9 @@ class CompletionRequest(BaseModel):
     # For request id
     rid: Optional[Union[List[str], str]] = None
 
+    # For customer metric labels
+    customer_labels: Optional[Dict[str, str]] = None
+
     @field_validator("max_tokens")
     @classmethod
     def validate_max_tokens_positive(cls, v):
@@ -447,7 +450,7 @@ class ChatCompletionRequest(BaseModel):
         description="Constrains effort on reasoning for reasoning models. "
         "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
         "result in faster responses and fewer tokens used on reasoning in a response. "
-        "Currently only supported for OpenAI models.",
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
     )
 
     @model_validator(mode="before")
@@ -542,9 +545,9 @@ class ChatCompletionRequest(BaseModel):
     rid: Optional[Union[List[str], str]] = None
 
     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
 
 class ChatMessage(BaseModel):
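
A hedged sketch of what the widened types allow: scalars still describe a single request, while lists carry one bootstrap entry per item of a batched PD-disaggregation request (hostnames, ports, and room ids below are invented):

    payload = {
        "bootstrap_host": ["prefill-0.local", "prefill-1.local"],  # List[str] or str
        "bootstrap_port": [8998, None],  # List[Optional[int]] or int
        "bootstrap_room": [1001, 1002],  # List[int] or int
    }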

--- sglang/srt/entrypoints/openai/serving_base.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_base.py (0.5.3rc0)
@@ -1,15 +1,20 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__name__)
 
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
 
     def __init__(self, tokenizer_manager: TokenizerManager):
         self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            else None
+        )
 
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
 
         # Convert to internal format
         adapted_request, processed_request = self._convert_to_internal_request(
-            request
+            request, raw_request
         )
 
         # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -77,6 +90,7 @@ class OpenAIServingBase(ABC):
     def _convert_to_internal_request(
         self,
         request: OpenAIServingRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
         """Convert OpenAI request to internal format"""
         pass
@@ -150,3 +164,32 @@ class OpenAIServingBase(ABC):
             code=status_code,
         )
         return json.dumps({"error": error.model_dump()})
+
+    def extract_customer_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        customer_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                json.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            customer_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return customer_labels
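
End to end, only labels on the server-side allow-list survive. A hedged client sketch, assuming the server was configured with tokenizer_metrics_custom_labels_header set to "x-customer-labels" and with "team" and "env" in tokenizer_metrics_allowed_customer_labels (all values here are invented):

    import json
    import requests

    headers = {"x-customer-labels": json.dumps({"team": "search", "env": "prod"})}
    resp = requests.post(
        "http://localhost:30000/v1/completions",
        json={"model": "default", "prompt": "Hello", "max_tokens": 8},
        headers=headers,
    )
    print(resp.json())  # labels outside the allow-list are silently dropped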

--- sglang/srt/entrypoints/openai/serving_chat.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_chat.py (0.5.3rc0)
@@ -1,14 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -33,13 +34,16 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.jinja_template_utils import process_content_for_template_format
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -91,6 +96,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +128,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +149,7 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )
 
         return adapted_request, request
@@ -172,10 +182,11 @@ class OpenAIServingChat(OpenAIServingBase):
             ]
         else:
             tools = [item.function.model_dump() for item in request.tools]
-
-
-
-
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +548,11 @@ class OpenAIServingChat(OpenAIServingBase):
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
                 # Handle tool calls
-                if
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -727,10 +742,13 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle tool calls
         tool_calls = None
-        if
-
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools,
+                text, request.tools, finish_reason
             )
 
         choice_data = ChatCompletionResponseChoice(
@@ -824,11 +842,10 @@ class OpenAIServingChat(OpenAIServingBase):
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +855,10 @@ class OpenAIServingChat(OpenAIServingBase):
         tool_calls = []
         for call_info in call_info_list:
             # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-            if
+            if (
+                self.tool_call_parser == "kimi_k2"
+                and call_info.name is not None
+            ):
                 tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
             else:
                 tool_id = f"call_{uuid.uuid4().hex[:24]}"
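
For illustration, the two tool_call_id styles produced above (tool name and index are invented):

    import uuid

    name, index = "get_weather", 0
    kimi_k2_id = f"functions.{name}:{index}"      # Kimi-K2-aligned id
    default_id = f"call_{uuid.uuid4().hex[:24]}"  # default random id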
@@ -933,7 +953,7 @@ class OpenAIServingChat(OpenAIServingBase):
             if index not in parser_dict:
                 parser_dict[index] = FunctionCallParser(
                     tools=request.tools,
-                    tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
+                    tool_call_parser=self.tool_call_parser,
                 )
             parser = parser_dict[index]
@@ -962,7 +982,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 # Tool call ID should be generated only once per tool call
                 if call_item.name:
                     # First chunk: include ID and function name
-                    if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
+                    if self.tool_call_parser == "kimi_k2":
                         # Align with Kimi-K2 format: functions.{name}:{index}
                         tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
                     else:

--- sglang/srt/entrypoints/openai/serving_completions.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_completions.py (0.5.3rc0)
@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.code_completion_parser import generate_completion_prompt_from_request
 from sglang.srt.entrypoints.openai.protocol import (
     CompletionRequest,
     CompletionResponse,
@@ -21,10 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -83,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}
 
+        # Extract customer labels from raw request headers
+        customer_labels = self.extract_customer_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -97,6 +107,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            customer_labels=customer_labels,
         )
 
         return adapted_request, request

--- sglang/srt/entrypoints/openai/serving_embedding.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_embedding.py (0.5.3rc0)
@@ -1,9 +1,10 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 
-from sglang.srt.conversation import generate_embedding_convs
 from sglang.srt.entrypoints.openai.protocol import (
     EmbeddingObject,
     EmbeddingRequest,
@@ -14,8 +15,11 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.conversation import generate_embedding_convs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 
 class OpenAIServingEmbedding(OpenAIServingBase):
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input

--- sglang/srt/entrypoints/openai/serving_rerank.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_rerank.py (0.5.3rc0)
@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None
 
     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document

--- sglang/srt/entrypoints/openai/serving_responses.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_responses.py (0.5.3rc0)
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations
 
 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
 
 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 

--- sglang/srt/entrypoints/openai/serving_score.py (0.5.2rc1)
+++ sglang/srt/entrypoints/openai/serving_score.py (0.5.3rc0)
@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
    ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager