sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +67 -43
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +88 -53
- sglang/srt/entrypoints/openai/protocol.py +7 -4
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +39 -19
- sglang/srt/entrypoints/openai/serving_completions.py +15 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -7
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +182 -49
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +68 -41
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +200 -199
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +191 -139
- sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +260 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +18 -33
- sglang/srt/mem_cache/hiradix_cache.py +108 -48
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +121 -57
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +502 -77
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +75 -19
- sglang/srt/model_executor/model_runner.py +357 -30
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +346 -48
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +11 -2
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +60 -13
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +40 -9
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +355 -37
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +197 -112
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +46 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +12 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,552 @@
|
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
14
|
+
"""package for sglang requests tracing"""
|
15
|
+
|
16
|
+
from __future__ import annotations
|
17
|
+
|
18
|
+
import ctypes
|
19
|
+
import logging
|
20
|
+
import os
|
21
|
+
import random
|
22
|
+
import threading
|
23
|
+
import time
|
24
|
+
import uuid
|
25
|
+
from dataclasses import dataclass
|
26
|
+
from typing import Any, Dict, List, Optional
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
opentelemetry_imported = False
|
30
|
+
tracing_enabled = False
|
31
|
+
|
32
|
+
try:
|
33
|
+
from opentelemetry import context, propagate, trace
|
34
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
35
|
+
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
36
|
+
from opentelemetry.sdk.trace import TracerProvider, id_generator
|
37
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
38
|
+
|
39
|
+
opentelemetry_imported = True
|
40
|
+
except ImportError:
|
41
|
+
|
42
|
+
class id_generator:
|
43
|
+
class IdGenerator:
|
44
|
+
pass
|
45
|
+
|
46
|
+
logger.info("opentelemetry package is not installed, tracing disabled")
|
47
|
+
|
48
|
+
|
49
|
+
@dataclass
class SglangTraceThreadInfo:
    """Static identity of one traced thread.

    Instances are created by ``trace_set_thread_info()`` and stored in the
    module-level ``threads_info`` registry.
    """

    host_id: str
    # Native thread id (threading.get_native_id()), used as the registry key.
    pid: int
    thread_label: str
    # Both ranks are optional: trace_set_thread_info() defaults them to None.
    tp_rank: Optional[int]
    dp_rank: Optional[int]
    tracer: trace.Tracer
|
57
|
+
|
58
|
+
|
59
|
+
@dataclass
class SglangTraceSliceContext:
    """One open slice on a thread's slice stack.

    ``span`` starts as None and is filled in by ``trace_slice_start()`` once
    the span has been created.
    """

    slice_name: str
    span: Optional[trace.span.Span] = None
    # An anonymous slice gets its real name only when trace_slice_end() runs.
    anonymous: bool = False
|
65
|
+
|
66
|
+
|
67
|
+
@dataclass
class SglangTraceThreadContext:
    """Per-request tracing state of a single thread."""

    thread_info: SglangTraceThreadInfo
    # Stack of currently open slices; the last element is the innermost one.
    cur_slice_stack: List[SglangTraceSliceContext]
    thread_span: Optional[trace.span.Span] = None
    # Span context of the most recently finished slice; the next slice that is
    # started links back to it as its predecessor.
    last_span_context: Optional[trace.span.SpanContext] = None
|
74
|
+
|
75
|
+
|
76
|
+
@dataclass
class SglangTraceReqContext:
    """Tracing state of one request inside the current process."""

    rid: str
    start_time_ns: int
    # Keyed by native thread id.
    threads_context: Dict[int, SglangTraceThreadContext]
    bootstrap_room: Optional[int] = None

    # True when this context is a replica of the one owned by the main process.
    # A replica keeps only root_span_context; its root_span stays None.
    is_copy: bool = False
    root_span: Optional[trace.span.Span] = None
    root_span_context: Optional[context.Context] = None
|
88
|
+
|
89
|
+
|
90
|
+
@dataclass
class SglangTracePropagateContext:
    """Serializable trace context handed from one process to another.

    Carries the request's root span context plus, optionally, the span
    context of the slice that the receiving side should link back to.
    """

    root_span_context: context.Context
    prev_span_context: Optional[trace.span.SpanContext]

    def to_dict(self):
        """Serialize into a plain dict with ``root_span`` and ``prev_span`` keys.

        ``prev_span`` is the string ``"None"`` when there is no previous span.
        """
        carrier: dict[str, str] = {}
        # Inject from the explicit context instead of context.attach(): attach
        # without a matching detach would leave the request's root context
        # installed as the calling thread's current context.
        propagate.inject(carrier, context=self.root_span_context)

        if self.prev_span_context:
            prev_span = {
                "span_id": self.prev_span_context.span_id,
                "trace_id": self.prev_span_context.trace_id,
            }
        else:
            prev_span = "None"

        return {"root_span": carrier, "prev_span": prev_span}

    @classmethod
    def instance_from_dict(cls, d):
        """Rebuild an instance from ``to_dict()`` output.

        Returns None when either the ``root_span`` or ``prev_span`` key is
        missing.
        """
        if "root_span" not in d or "prev_span" not in d:
            return None

        root_span_context = propagate.extract(d["root_span"])

        prev_span = d["prev_span"]
        if prev_span == "None":
            prev_span_context = None
        else:
            prev_span_context = trace.span.SpanContext(
                trace_id=prev_span["trace_id"],
                span_id=prev_span["span_id"],
                is_remote=True,
            )

        return cls(root_span_context, prev_span_context)
|
129
|
+
|
130
|
+
|
131
|
+
class SglangTraceCustomIdGenerator(id_generator.IdGenerator):
    """
    The default IdGenerator may produce duplicate trace IDs across multiple TP scheduler processes,
    hence a custom IdGenerator is implemented.

    Each instance owns a private PRNG seeded from OS entropy, so processes
    that start at the same instant still get independent ID streams.
    """

    def __init__(self):
        super().__init__()
        # Seed from os.urandom rather than time.time(): a wall-clock seed can
        # collide between processes launched together.
        self.local_random = random.Random(int.from_bytes(os.urandom(16), "big"))

    def _nonzero_64_bits(self) -> int:
        """Return a random 64-bit integer, never 0 (0 is the invalid ID)."""
        value = self.local_random.getrandbits(64)
        while value == 0:
            value = self.local_random.getrandbits(64)
        return value

    def generate_trace_id(self) -> int:
        """Return a new non-zero trace ID (64 random bits)."""
        return self._nonzero_64_bits()

    def generate_span_id(self) -> int:
        """Return a new non-zero span ID (64 random bits)."""
        return self._nonzero_64_bits()
|
147
|
+
|
148
|
+
|
149
|
+
# global variables
# Registry of traced threads, keyed by native thread id.
threads_info: Dict[int, SglangTraceThreadInfo] = {}
# Registry of in-flight requests, keyed by request id string.
reqs_context: Dict[str, SglangTraceReqContext] = {}


def __get_cur_time_ns():
    # Default wall-clock source in nanoseconds; process_tracing_init() may
    # rebind this name to a time.time_ns based implementation.
    return int(time.time() * 1e9)
|
154
|
+
|
155
|
+
|
156
|
+
def __get_host_id() -> str:
    """
    In distributed tracing systems, obtain a unique node identifier
    and inject it into all subsequently generated spans
    to prevent PID conflicts between threads on different nodes.

    Resolution order: the contents of ``/etc/machine-id``, then the hex form
    of ``uuid.getnode()``, then the literal ``"unknown"``.
    """
    try:
        with open("/etc/machine-id", "r") as f:
            machine_id = f.read().strip()
    except (OSError, ValueError):
        # Missing or unreadable file (ValueError covers undecodable bytes):
        # fall back to the next identifier source.
        machine_id = ""

    # An empty file is not a usable identifier either.
    if machine_id:
        return machine_id

    mac = uuid.getnode()
    if mac != 0:
        return uuid.UUID(int=mac).hex

    return "unknown"
|
174
|
+
|
175
|
+
|
176
|
+
# Should be called by each tracked process.
|
177
|
+
def process_tracing_init(otlp_endpoint, server_name):
    """Set up the OpenTelemetry tracer provider for the current process.

    Tracing stays disabled when the opentelemetry package could not be
    imported or when building the provider/exporter raises; in the latter
    case the error is logged and the function returns normally.

    Args:
        otlp_endpoint: Endpoint passed to the OTLP gRPC span exporter
            (insecure channel).
        server_name: Value recorded as the resource's service name.
    """
    global tracing_enabled
    global __get_cur_time_ns
    if not opentelemetry_imported:
        tracing_enabled = False
        return

    try:
        resource = Resource.create(
            attributes={
                SERVICE_NAME: server_name,
            }
        )
        tracer_provider = TracerProvider(
            resource=resource, id_generator=SglangTraceCustomIdGenerator()
        )

        processor = BatchSpanProcessor(
            OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
        )
        tracer_provider.add_span_processor(processor)
        trace.set_tracer_provider(tracer_provider)
    except Exception as e:
        # Best effort: a broken tracing setup must not take the server down.
        logger.error(f"initialize opentelemetry error: {e}")
        logger.warning("please set correct otlp endpoint")
        tracing_enabled = False
        return

    # Prefer the integer nanosecond clock when the runtime provides it.
    if hasattr(time, "time_ns"):
        __get_cur_time_ns = lambda: int(time.time_ns())

    tracing_enabled = True
|
209
|
+
|
210
|
+
|
211
|
+
# Should be called by each tracked thread.
|
212
|
+
def trace_set_thread_info(
    thread_label: str, tp_rank: Optional[int] = None, dp_rank: Optional[int] = None
):
    """Register the calling thread in ``threads_info``.

    Does nothing when tracing is disabled or the thread is already
    registered; the first registration wins.
    """
    if not tracing_enabled:
        return

    native_id = threading.get_native_id()
    if native_id in threads_info:
        return

    info = SglangTraceThreadInfo(
        host_id=__get_host_id(),
        pid=native_id,
        thread_label=thread_label,
        tp_rank=tp_rank,
        dp_rank=dp_rank,
        tracer=trace.get_tracer("sglang server"),
    )
    threads_info[native_id] = info
|
230
|
+
|
231
|
+
|
232
|
+
def __create_thread_context(pid, req_span_context, ts: Optional[int] = None):
    """Build a thread context and start its thread-level span.

    Args:
        pid: Native thread id used to look up the thread's registered info.
        req_span_context: Context the thread span is started under.
        ts: Span start time in nanoseconds; the current time when falsy.

    Returns:
        A SglangTraceThreadContext whose ``thread_span`` is already started.
    """
    # Threads that never registered themselves are labelled "unknown".
    if pid not in threads_info:
        trace_set_thread_info("unknown")

    info = threads_info[pid]
    has_tp_rank = info.tp_rank is not None

    # Span name: "<label>[ [TP n] ](host:<8 chars> | pid:<id>)".
    name_parts = [f"{info.thread_label}"]
    if has_tp_rank:
        name_parts.append(f" [TP {info.tp_rank}] ")
    name_parts.append(f"(host:{info.host_id[:8]} | pid:{pid})")
    span_name = "".join(name_parts)

    ts = ts or __get_cur_time_ns()
    thread_span = info.tracer.start_span(
        name=span_name,
        start_time=ts,
        context=req_span_context,
    )

    if has_tp_rank:
        thread_span.set_attributes({"tp_rank": info.tp_rank})

    thread_span.set_attributes(
        {
            "host_id": info.host_id,
            "pid": info.pid,
            "thread_label": info.thread_label,
        }
    )

    return SglangTraceThreadContext(
        thread_info=info,
        cur_slice_stack=[],
        thread_span=thread_span,
    )
|
265
|
+
|
266
|
+
|
267
|
+
def trace_get_proc_propagate_context(rid) -> Optional[Dict[str, Any]]:
    """Export the trace context of request ``rid`` for another process.

    Returns:
        The dict produced by ``SglangTracePropagateContext.to_dict()``, or
        None when tracing is disabled, the request is unknown, or it has no
        root span context.
    """
    if not tracing_enabled:
        return None

    rid = str(rid)
    req_context = reqs_context.get(rid)
    if req_context is None or not req_context.root_span_context:
        return None

    prev_span_context = None
    # The calling thread may have no context for this request yet; in that
    # case only the root context is propagated instead of raising KeyError.
    thread_context = req_context.threads_context.get(threading.get_native_id())
    if thread_context is not None:
        if thread_context.cur_slice_stack:
            # Link to the outermost slice currently open on this thread.
            cur_slice_info = thread_context.cur_slice_stack[0]
            prev_span_context = cur_slice_info.span.get_span_context()
        elif thread_context.last_span_context:
            prev_span_context = thread_context.last_span_context

    trace_context = SglangTracePropagateContext(
        req_context.root_span_context, prev_span_context
    )
    return trace_context.to_dict()
|
288
|
+
|
289
|
+
|
290
|
+
def trace_set_proc_propagate_context(rid, trace_context: Optional[Dict[str, Any]]):
    """Import a trace context exported by another process.

    Creates a replica request context (``is_copy=True``) if the request is
    not yet known here, then creates a thread context for the calling thread
    unless it already has one. No-op when tracing is disabled or the
    supplied dict is empty or cannot be decoded.
    """
    if not (tracing_enabled and trace_context):
        return

    propagated = SglangTracePropagateContext.instance_from_dict(trace_context)
    if not propagated:
        return

    rid = str(rid)
    # Create a copy of the request context
    if rid not in reqs_context:
        reqs_context[rid] = SglangTraceReqContext(
            rid=rid,
            start_time_ns=__get_cur_time_ns(),
            threads_context={},
            root_span_context=propagated.root_span_context,
            is_copy=True,
        )
    req_context = reqs_context[rid]

    native_id = threading.get_native_id()
    if native_id in req_context.threads_context:
        return

    # Create new thread context and remember which span preceded it.
    new_thread_context = __create_thread_context(
        native_id,
        propagated.root_span_context,
        req_context.start_time_ns,
    )
    req_context.threads_context[native_id] = new_thread_context
    new_thread_context.last_span_context = propagated.prev_span_context
|
326
|
+
|
327
|
+
|
328
|
+
def trace_req_start(
    rid: str,
    bootstrap_room: Optional[int] = None,
    ts: Optional[int] = None,
):
    """Begin tracing a request: create its root span and thread span.

    No-op when tracing is disabled or the calling thread has not been
    registered through ``trace_set_thread_info()``.

    Args:
        rid: Request id; converted to ``str`` before use.
        bootstrap_room: Optional value recorded as a root-span attribute.
        ts: Start time in nanoseconds; the current time when falsy.
    """
    if not tracing_enabled:
        return

    rid = str(rid)

    ts = ts or __get_cur_time_ns()

    pid = threading.get_native_id()
    if pid not in threads_info:
        return

    # create req context and root span
    req_context = SglangTraceReqContext(
        rid=rid,
        start_time_ns=ts,
        threads_context={},
        bootstrap_room=bootstrap_room,
        is_copy=False,
    )
    reqs_context[rid] = req_context

    # Drop the worker_id added by MultiTokenizer
    orig_rid = rid.split("_")[-1]
    tracer = threads_info[pid].tracer
    root_span = tracer.start_span(
        name=f"Req {orig_rid[:8]}",
        start_time=ts,
    )

    root_span.set_attributes(
        {
            "rid": rid,
            # Compare against None explicitly so that room 0 is reported
            # as 0 rather than as "None".
            "bootstrap_room": (
                bootstrap_room if bootstrap_room is not None else "None"
            ),
        }
    )

    req_context.root_span = root_span
    req_context.root_span_context = trace.set_span_in_context(root_span)

    # create thread context and thread span
    req_context.threads_context[pid] = __create_thread_context(
        pid,
        req_context.root_span_context,
        ts,
    )
|
377
|
+
|
378
|
+
|
379
|
+
def trace_req_finish(
    rid: str, ts: Optional[int] = None, attrs: Optional[Dict[str, Any]] = None
):
    """Finish tracing a request and drop its context.

    Ends every thread span still attached to the request, then the root
    span if this process owns one, and removes the request from
    ``reqs_context``. No-op when tracing is disabled or the request is
    unknown.

    Args:
        rid: Request id; converted to ``str`` before use.
        ts: End time in nanoseconds; the current time when falsy.
        attrs: Optional attributes set on the root span before it ends.
    """
    if not tracing_enabled:
        return

    rid = str(rid)
    if rid not in reqs_context:
        return

    req_context = reqs_context[rid]
    ts = ts or __get_cur_time_ns()

    # End all unclosed thread spans.
    for thread_context in req_context.threads_context.values():
        thread_context.thread_span.end(end_time=ts)

    # Replica contexts (is_copy=True) carry no root span of their own.
    root_span = req_context.root_span
    if root_span is not None:
        if attrs:
            root_span.set_attributes(attrs)
        root_span.end(end_time=ts)

    del reqs_context[rid]
|
402
|
+
|
403
|
+
|
404
|
+
def trace_slice_start(
    name: str,
    rid: str,
    ts: Optional[int] = None,
    anonymous: bool = False,
):
    """Open a new slice span for ``rid`` on the calling thread.

    The span is parented to the innermost open slice, or to the thread span
    when no slice is open. A slice opened at the outermost level is linked
    to the previously finished slice, if any. No-op when tracing is
    disabled or the request/thread has no context.
    """
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    threads_context = reqs_context[rid].threads_context
    native_id = threading.get_native_id()
    if native_id not in threads_context:
        return
    thread_context = threads_context[native_id]

    ts = ts or __get_cur_time_ns()

    slice_stack = thread_context.cur_slice_stack
    if slice_stack:
        # Nested slice: child of the innermost open slice, no predecessor link.
        parent_span = slice_stack[-1].span
        prev_span_context = None
    else:
        # Outermost slice: child of the thread span, linked to its predecessor.
        parent_span = thread_context.thread_span
        prev_span_context = thread_context.last_span_context or None

    new_span = thread_context.thread_info.tracer.start_span(
        name=name,
        start_time=ts,
        context=trace.set_span_in_context(parent_span),
    )

    if prev_span_context:
        new_span.add_link(prev_span_context)

    slice_stack.append(
        SglangTraceSliceContext(
            slice_name=name,
            span=new_span,
            anonymous=anonymous,
        )
    )
|
451
|
+
|
452
|
+
|
453
|
+
def trace_slice_end(
    name: str,
    rid: str,
    ts: Optional[int] = None,
    attrs: Optional[Dict[str, Any]] = None,
    auto_next_anon: bool = False,
    thread_finish_flag: bool = False,
):
    """Close the innermost open slice of ``rid`` on the calling thread.

    Args:
        name: Expected slice name. An anonymous slice is renamed to it; for
            a named slice a mismatch marks the span as ERROR and is logged.
        rid: Request id; converted to ``str`` before use.
        ts: End time in nanoseconds; the current time when falsy.
        attrs: Optional attributes set on the span before it ends.
        auto_next_anon: When True, immediately open an anonymous slice
            starting at ``ts``.
        thread_finish_flag: When True, also end the thread span and release
            the thread context (and the request context if it is a replica
            with no threads left); ``auto_next_anon`` is then ignored.
    """
    rid = str(rid)
    if not tracing_enabled or rid not in reqs_context:
        return

    req_context = reqs_context[rid]
    native_id = threading.get_native_id()
    if native_id not in req_context.threads_context:
        return
    thread_context = req_context.threads_context[native_id]

    slice_stack = thread_context.cur_slice_stack
    if not slice_stack:
        logger.warning(f"No matching with the SLICE_START event{name} is required.")
        return

    ts = ts or __get_cur_time_ns()
    current = slice_stack[-1]
    span = current.span

    if current.anonymous:
        span.update_name(name)
    elif current.slice_name != name:
        span.set_status(trace.Status(trace.StatusCode.ERROR))
        logger.warning(f"Slice name mismatch: {name} != {current.slice_name}")

    if attrs:
        span.set_attributes(attrs)

    span.end(end_time=ts)

    slice_stack.pop()
    if not slice_stack:
        # Outermost slice finished: remember it as the next slice's predecessor.
        thread_context.last_span_context = span.get_span_context()

    # If this is the last slice in the thread,
    # release the thread context and check whether to release the request context.
    if thread_finish_flag:
        thread_context.thread_span.end(end_time=ts)
        del req_context.threads_context[native_id]
        if req_context.is_copy and not req_context.threads_context:
            del reqs_context[rid]
        return

    if auto_next_anon:
        trace_slice_start("", rid, ts, True)


# alias
trace_slice = trace_slice_end
|
511
|
+
|
512
|
+
|
513
|
+
# Add event to the current slice on the same thread with the same rid.
|
514
|
+
def trace_event(name: str, rid: str, ts: Optional[int] = None):
    """Attach a named event to the innermost open slice of request ``rid``.

    The slice is looked up in the context registered for the calling thread
    (keyed by ``threading.get_native_id()``). The call is a silent no-op when
    tracing is disabled or when no context exists for ``rid`` or for the
    calling thread; a warning is logged when that thread has no open slice.

    Args:
        name: Event name recorded on the span.
        rid: Request id. It is converted with ``str()`` before any lookup.
        ts: Timestamp forwarded to the span as ``timestamp=``. A falsy value
            (``None`` or ``0``) is replaced by ``__get_cur_time_ns()``.
    """
    if not tracing_enabled:
        return

    # Normalize the id *before* the membership test (the same order used by
    # trace_slice_end). Testing first made every non-string rid miss the
    # lookup, so its events were silently dropped.
    rid = str(rid)
    if rid not in reqs_context:
        return

    threads_context = reqs_context[rid].threads_context
    pid = threading.get_native_id()
    if pid not in threads_context:
        return

    slice_stack = threads_context[pid].cur_slice_stack
    if not slice_stack:
        logger.warning("No slice is currently being traced.")
        return

    ts = ts or __get_cur_time_ns()

    # The top of the stack is the most recently started, still-open slice.
    slice_stack[-1].span.add_event(name=name, timestamp=ts)
|
533
|
+
|
534
|
+
|
535
|
+
# Add attrs to the current slice on the same thread with the same rid.
|
536
|
+
def trace_slice_add_attr(rid: str, attrs: Dict[str, Any]):
    """Set attributes on the innermost open slice of request ``rid``.

    The slice is looked up in the context registered for the calling thread
    (keyed by ``threading.get_native_id()``). The call is a silent no-op when
    tracing is disabled or when no context exists for ``rid`` or for the
    calling thread; a warning is logged when that thread has no open slice.

    Args:
        rid: Request id. It is converted with ``str()`` before any lookup.
        attrs: Mapping passed unchanged to the span's ``set_attributes``.
    """
    if not tracing_enabled:
        return

    # Normalize the id *before* the membership test (the same order used by
    # trace_slice_end). Testing first made every non-string rid miss the
    # lookup, so its attributes were silently dropped.
    rid = str(rid)
    if rid not in reqs_context:
        return

    threads_context = reqs_context[rid].threads_context
    pid = threading.get_native_id()
    if pid not in threads_context:
        return

    slice_stack = threads_context[pid].cur_slice_stack
    if not slice_stack:
        logger.warning("No slice is currently being traced.")
        return

    # The top of the stack is the most recently started, still-open slice.
    slice_stack[-1].span.set_attributes(attrs)
|
sglang/srt/utils.py
CHANGED
@@ -15,6 +15,7 @@
|
|
15
15
|
|
16
16
|
from __future__ import annotations
|
17
17
|
|
18
|
+
import argparse
|
18
19
|
import asyncio
|
19
20
|
import builtins
|
20
21
|
import ctypes
|
@@ -230,8 +231,16 @@ except:
|
|
230
231
|
is_intel_amx_backend_available = False
|
231
232
|
|
232
233
|
|
234
|
+
try:
    # Probe AMX tile support once, at import time. The call was moved here
    # out of cpu_has_amx_support() to support torch compile.
    is_amx_tile_supported = torch._C._cpu._is_amx_tile_supported()
except Exception:
    # Any failure of the probe is treated as "not supported". Catching
    # Exception rather than using a bare `except:` lets KeyboardInterrupt
    # and SystemExit propagate during import.
    is_amx_tile_supported = False
|
240
|
+
|
241
|
+
|
233
242
|
def cpu_has_amx_support():
|
234
|
-
return
|
243
|
+
return is_amx_tile_supported and is_intel_amx_backend_available
|
235
244
|
|
236
245
|
|
237
246
|
def use_intel_amx_backend(layer):
|
@@ -426,7 +435,9 @@ def get_available_gpu_memory(
|
|
426
435
|
|
427
436
|
elif device == "cpu":
|
428
437
|
# TODO: rename the variables in the current function to be not GPU specific
|
429
|
-
|
438
|
+
total_free_memory = psutil.virtual_memory().available
|
439
|
+
n_numa_node: int = len(get_cpu_ids_by_node())
|
440
|
+
free_gpu_memory = round(total_free_memory / n_numa_node, 3)
|
430
441
|
elif device == "npu":
|
431
442
|
num_gpus = torch.npu.device_count()
|
432
443
|
assert gpu_id < num_gpus
|
@@ -1149,7 +1160,7 @@ def pytorch_profile(name, func, *args, data_size=-1):
|
|
1149
1160
|
|
1150
1161
|
def get_zmq_socket(
|
1151
1162
|
context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
|
1152
|
-
):
|
1163
|
+
) -> zmq.Socket:
|
1153
1164
|
mem = psutil.virtual_memory()
|
1154
1165
|
total_mem = mem.total / 1024**3
|
1155
1166
|
available_mem = mem.available / 1024**3
|
@@ -1421,6 +1432,7 @@ def init_custom_process_group(
|
|
1421
1432
|
store=None,
|
1422
1433
|
group_name=None,
|
1423
1434
|
pg_options=None,
|
1435
|
+
device_id=None,
|
1424
1436
|
):
|
1425
1437
|
from torch.distributed.distributed_c10d import (
|
1426
1438
|
Backend,
|
@@ -1474,6 +1486,7 @@ def init_custom_process_group(
|
|
1474
1486
|
group_name=group_name,
|
1475
1487
|
**{pg_options_param_name: pg_options},
|
1476
1488
|
timeout=timeout,
|
1489
|
+
device_id=device_id,
|
1477
1490
|
)
|
1478
1491
|
|
1479
1492
|
_world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
|
@@ -2900,6 +2913,18 @@ def mxfp_supported():
|
|
2900
2913
|
return False
|
2901
2914
|
|
2902
2915
|
|
2916
|
+
@lru_cache(maxsize=1)
def is_gfx95_supported():
    """
    Returns whether device 0 is an AMD GPU whose GCN architecture name
    contains "gfx95".

    Always False when ``torch.version.hip`` is falsy (i.e. not a HIP build of
    torch). The result is memoized by ``lru_cache``, so the device properties
    are queried at most once per process.
    """
    if not torch.version.hip:
        return False

    gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
    return "gfx95" in gcn_arch
|
2926
|
+
|
2927
|
+
|
2903
2928
|
# LoRA-related constants and utilities
|
2904
2929
|
SUPPORTED_LORA_TARGET_MODULES = [
|
2905
2930
|
"q_proj",
|
@@ -3015,3 +3040,21 @@ def check_cuda_result(raw_output):
|
|
3015
3040
|
raise Exception(f"CUDA error: {err}")
|
3016
3041
|
|
3017
3042
|
return results
|
3043
|
+
|
3044
|
+
|
3045
|
+
def numa_bind_to_node(node: int):
    """Bind the caller to NUMA node ``node`` using libnuma.

    Calls ``numa_run_on_node(node)`` followed by ``numa_set_localalloc()``
    (per numa(3): restrict execution to the node's CPUs, then prefer memory
    allocation on the local node).

    Args:
        node: Index of the NUMA node to run on.

    Raises:
        OSError: If libnuma cannot be loaded.
        SystemError: If libnuma reports that NUMA is unavailable, or if
            ``numa_run_on_node`` fails for ``node``.
    """
    # Function-scope import so this block does not rely on the file having
    # imported the `ctypes.util` submodule.
    import ctypes.util

    # Prefer whatever the system linker resolves (typically the versioned
    # runtime library such as libnuma.so.1); the bare "libnuma.so" name is
    # often only installed by development packages. Fall back to the original
    # name so the previous behavior is kept when resolution fails.
    libnuma = ctypes.CDLL(ctypes.util.find_library("numa") or "libnuma.so")
    if libnuma.numa_available() < 0:
        raise SystemError("numa not available on this system")

    # numa_run_on_node returns 0 on success and -1 on error; do not continue
    # as if the bind had succeeded.
    if libnuma.numa_run_on_node(ctypes.c_int(node)) != 0:
        raise SystemError(f"failed to bind to numa node {node}")

    # numa_set_localalloc returns void; tell ctypes not to read a result.
    libnuma.numa_set_localalloc.restype = None
    libnuma.numa_set_localalloc()
|
3052
|
+
|
3053
|
+
|
3054
|
+
def json_list_type(value):
    """argparse ``type=`` converter that parses ``value`` as a JSON list.

    Args:
        value: Command-line string expected to contain a JSON array.

    Returns:
        The decoded Python list.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not valid JSON, or is
            valid JSON that does not decode to a list (object, number,
            string, ``null``, ...).
    """
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        parsed = None

    # Valid JSON is not enough: the name and the error message both promise
    # a list, so reject every other JSON value with the same message.
    if not isinstance(parsed, list):
        raise argparse.ArgumentTypeError(
            f"Invalid JSON list: {value}. Please provide a valid JSON list."
        )
    return parsed
|
sglang/srt/weight_sync/utils.py
CHANGED
@@ -6,7 +6,7 @@ from torch.distributed.device_mesh import DeviceMesh
|
|
6
6
|
from torch.distributed.tensor import DTensor
|
7
7
|
|
8
8
|
from sglang.srt.entrypoints.engine import Engine
|
9
|
-
from sglang.srt.managers.
|
9
|
+
from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput
|
10
10
|
from sglang.srt.model_executor.model_runner import LocalSerializedTensor
|
11
11
|
from sglang.srt.utils import MultiprocessingSerializer
|
12
12
|
|