sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py
ADDED
@@ -0,0 +1,174 @@
import torch
import triton
import triton.language as tl

from sglang.srt.lora.utils import LoRABatchInfo
from sglang.srt.utils import cached_triton_kernel


@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
@triton.jit
def _chunked_lora_shrink_kernel(
    # Pointers to matrices
    x,
    weights,
    output,
    # Information on sequence lengths, ranks and weight id
    seg_indptr,
    weight_indices,
    lora_ranks,
    permutation,
    num_segs,
    # Meta parameters
    N: tl.constexpr,  # num_slices * r
    K: tl.constexpr,  # input_dim
    NUM_SLICES: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """
    Computes a chunked SGMV for LoRA shrink operations.

    The kernel ensures that output[seg_start:seg_start + seg_len, :rank * num_slices]
    stores the product of the input `x` and the LoRA weights for the corresponding
    sequence. This implies that when rank is 0, the kernel is essentially a no-op,
    as output[seg_start:seg_start + seg_len, :0] is trivially correct (empty).

    Args:
        x (torch.Tensor): The input activations tensor of shape `(s, K)`, where `s`
            is the sum of all sequence lengths in the batch.
        weights (torch.Tensor): The LoRA A weights for all available adapters,
            with shape `(num_lora, N, K)` where N = num_slices * r.
        output (torch.Tensor): The output tensor of shape `(s, N)`.
    """
    x_stride_1: tl.constexpr = 1
    x_stride_0: tl.constexpr = K

    w_stride_0: tl.constexpr = N * K
    w_stride_1: tl.constexpr = K
    w_stride_2: tl.constexpr = 1

    output_stride_0: tl.constexpr = N
    output_stride_1: tl.constexpr = 1

    pid_s = tl.program_id(1)
    if pid_s >= num_segs:
        return

    pid_n = tl.program_id(0)

    # Current block computes sequence with batch_id,
    # which starts from row seg_start of x with length seg_len
    w_index = tl.load(weight_indices + pid_s)
    rank = tl.load(lora_ranks + w_index)

    # If rank is 0, this kernel becomes a no-op as the output is always trivially correct.
    if rank == 0:
        return

    seg_start = tl.load(seg_indptr + pid_s)
    seg_end = tl.load(seg_indptr + pid_s + 1)

    # Adjust N dim according to the specific LoRA adapter
    cur_n = tl.minimum(N, rank * NUM_SLICES)

    # Map logical sequence index to physical index
    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
    s_offset_physical = tl.load(
        permutation + s_offset_logical, mask=s_offset_logical < seg_end
    )

    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
    k_offset = tl.arange(0, BLOCK_K)
    x_ptrs = x + (
        s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
    )
    w_ptrs = (weights + w_index * w_stride_0) + (
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

    # Iterate to compute the block in output matrix
    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        x_tile = tl.load(
            x_ptrs,
            mask=(s_offset_logical[:, None] < seg_end)
            & (k_offset[None, :] < K - k * BLOCK_K),
            other=0.0,
        )
        w_tile = tl.load(
            w_ptrs,
            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < cur_n),
            other=0.0,
        )
        partial_sum += tl.dot(x_tile, w_tile)

        x_ptrs += BLOCK_K * x_stride_1
        w_ptrs += BLOCK_K * w_stride_2

    # Store result to output matrix
    partial_sum = partial_sum.to(x.dtype.element_ty)
    output_ptr = output + (
        s_offset_physical[:, None] * output_stride_0
        + n_offset[None, :] * output_stride_1
    )
    output_mask = (s_offset_logical[:, None] < seg_end) & (n_offset[None, :] < cur_n)
    tl.store(output_ptr, partial_sum, mask=output_mask)


def chunked_sgmv_lora_shrink_forward(
    x: torch.Tensor,
    weights: torch.Tensor,
    batch_info: LoRABatchInfo,
    num_slices: int,
) -> torch.Tensor:
    # x: (s, input_dim)
    # weights: (num_lora, num_slices * r, input_dim)
    # output: (s, num_slices * r)
    # num_slices: qkv=3, gate_up=2, others=1
    # when called with multiple slices, the weights.shape[-2] will be num_slices * r
    # input_dim is much larger than r

    assert x.is_contiguous()
    assert weights.is_contiguous()
    assert len(x.shape) == 2
    assert len(weights.shape) == 3

    # Block shapes
    # TODO (lifuhuang): experiment with split-k
    BLOCK_M = batch_info.max_len
    BLOCK_N = 16
    BLOCK_K = 256

    S = x.shape[0]
    N = weights.shape[1]
    K = weights.shape[2]
    assert x.shape[-1] == K

    num_segments = batch_info.num_segments
    grid = (
        triton.cdiv(N, BLOCK_N),
        batch_info.bs if batch_info.use_cuda_graph else num_segments,
    )

    output = torch.empty((S, N), device=x.device, dtype=x.dtype)
    _chunked_lora_shrink_kernel[grid](
        x=x,
        weights=weights,
        output=output,
        seg_indptr=batch_info.seg_indptr,
        weight_indices=batch_info.weight_indices,
        lora_ranks=batch_info.lora_ranks,
        permutation=batch_info.permutation,
        num_segs=num_segments,
        # constants
        N=N,
        K=K,
        NUM_SLICES=num_slices,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
    )

    return output
sglang/srt/lora/utils.py
CHANGED
@@ -5,24 +5,27 @@ from typing import Iterable, Optional, Set, Tuple
 
 import torch
 
-from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.utils.hf_transformers_utils import AutoConfig
 
 
 @dataclass
 class LoRABatchInfo:
+    # The forward mode is using CUDA Graph.
+    use_cuda_graph: bool
+
     # Batch size
     bs: int
 
-    # Lengths of each sequence in shape (bs,)
-    seg_lens: torch.Tensor
-
-    # Indice pointers of each sequence in shape (bs + 1, )
-    seg_indptr: torch.Tensor
+    # Number of segments. For triton backend, it is equal to batch size.
+    num_segments: int
 
-    # Maximum sequence length of current batch
+    # Maximum segment length of current batch
     max_len: int
 
-    # The index of lora adapter used by each sequence, in shape (bs,)
+    # Indice pointers of each segment in shape (num_segments + 1, )
+    seg_indptr: torch.Tensor
+
+    # The index of lora adapter used by each segment, in shape (num_segments,)
     weight_indices: torch.Tensor
 
     # ranks of each lora adapter, in shape (lora_num,)
@@ -31,6 +34,12 @@ class LoRABatchInfo:
     # scaling of each lora adapter, in shape (lora_num,)
     scalings: torch.Tensor
 
+    # Lengths of each segments in shape (num_segments,)
+    seg_lens: Optional[torch.Tensor]
+
+    # The logical (re)ordering of input rows (tokens), in shape (num_tokens,)
+    permutation: Optional[torch.Tensor]
+
 
 class LoRAType(Enum):
     LORA_A = 0
@@ -48,14 +57,14 @@ def get_layer_id(name: str) -> int:
 
 
 def get_hidden_dim(
-    module_name: str, config: AutoConfig, base_model: torch.nn.Module
+    module_name: str, config: AutoConfig, base_model: torch.nn.Module, layer_idx: int
 ) -> Tuple[int]:
     """
     Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
     """
 
    if hasattr(base_model, "get_hidden_dim"):
-        return base_model.get_hidden_dim(module_name)
+        return base_model.get_hidden_dim(module_name, layer_idx)
     else:
         """
         WARNING: get_hidden_dim() is not defined,
@@ -89,6 +98,7 @@ def get_normalized_target_modules(
 ) -> set[str]:
     """
     Mapping a list of target module name to names of the normalized LoRA weights.
+    Handles both base module names (e.g., "gate_proj") and prefixed module names (e.g., "feed_forward.gate_proj").
     """
     params_mapping = {
         "q_proj": "qkv_proj",
@@ -100,7 +110,8 @@ def get_normalized_target_modules(
 
     result = set()
     for name in target_modules:
-        normalized_name = params_mapping.get(name, name)
+        base_name = name.split(".")[-1]
+        normalized_name = params_mapping.get(base_name, base_name)
         result.add(normalized_name)
     return result
 
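
To illustrate the behavioral change in get_normalized_target_modules (the base_name split shown above), here is a tiny hypothetical check. The module names are made up, and the expected output assumes the rest of params_mapping follows the usual pattern (k_proj/v_proj mapped to qkv_proj, gate_proj/up_proj mapped to gate_up_proj), which is not visible in this hunk.

from sglang.srt.lora.utils import get_normalized_target_modules

# Prefixed names are now reduced to their base name before normalization.
names = ["q_proj", "feed_forward.gate_proj", "model.layers.0.self_attn.v_proj"]
print(get_normalized_target_modules(names))
# Expected (under the assumed mapping): {'qkv_proj', 'gate_up_proj'}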
sglang/srt/managers/async_dynamic_batch_tokenizer.py
ADDED
@@ -0,0 +1,170 @@
"""
Asynchronous dynamic batch tokenizer for SGLang.

This module provides an async tokenizer with dynamic batching capabilities
to reduce tokenization overhead when multiple requests arrive concurrently.
"""

import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


class AsyncDynamicbatchTokenizer:
    """Asynchronous tokenizer with dynamic batching for single string prompts.

    Dynamically batches pending encode requests from a queue to reduce overhead.
    Only handles single string prompts - regular batch processing of multiple
    strings per request should be handled at a higher level.
    A single-thread ThreadPoolExecutor is used so the event loop stays responsive.

    Note: Uses lazy initialization for asyncio components because this class
    is instantiated in TokenizerManager.__init__() before the event loop starts.
    """

    def __init__(
        self,
        tokenizer,
        max_batch_size: int = 32,
        batch_wait_timeout_s: float = 0.002,
    ) -> None:
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.batch_wait_timeout_s = batch_wait_timeout_s

        # Single queue for all encode requests - initialized lazily
        self._queue: Optional[asyncio.Queue] = None
        self._batcher_task: Optional[asyncio.Task] = None

        # Single-thread executor for blocking tokenizer calls
        self._executor = ThreadPoolExecutor(max_workers=1)
        self._initialized = False

    def _ensure_initialized(self):
        """Lazy initialization of event loop dependent components."""
        if not self._initialized:
            self._queue = asyncio.Queue()
            self._batcher_task = asyncio.create_task(self._dynamic_batch_loop())
            self._initialized = True

    async def __call__(self, prompt: str, **kwargs) -> Any:
        """Encode a single prompt."""
        return await self.encode(prompt, **kwargs)

    async def encode(self, prompt: str, **kwargs) -> Any:
        """Encode a single prompt."""
        self._ensure_initialized()
        result_future: asyncio.Future = asyncio.get_running_loop().create_future()
        await self._queue.put((prompt, kwargs, result_future))
        return await result_future

    async def _dynamic_batch_loop(self):
        """Dynamically batch incoming encode requests for efficiency."""
        while True:
            try:
                # Get the first request
                prompt, kwargs, result_future = await self._queue.get()

                # Collect requests into dynamic batch
                prompts = [prompt]
                kwargs_list = [kwargs]
                result_futures = [result_future]

                # Check if there are more items immediately available in the queue
                # If queue is empty, process single item immediately without timeout
                if self._queue.empty():
                    # No other requests waiting, process immediately
                    pass
                else:
                    # There might be more requests, wait for dynamic batching opportunity
                    start_time = asyncio.get_running_loop().time()

                    # Collect more requests up to max_batch_size or batch_wait_timeout_s
                    while len(prompts) < self.max_batch_size:
                        elapsed = asyncio.get_running_loop().time() - start_time
                        if elapsed >= self.batch_wait_timeout_s:
                            break

                        remaining_time = self.batch_wait_timeout_s - elapsed
                        try:
                            prompt, kwargs, result_future = await asyncio.wait_for(
                                self._queue.get(), remaining_time
                            )
                            prompts.append(prompt)
                            kwargs_list.append(kwargs)
                            result_futures.append(result_future)
                        except asyncio.TimeoutError:
                            break

                # Log dynamic batch information
                logger.debug(
                    f"AsyncDynamicbatchTokenizer: Processing dynamic batch of size {len(prompts)}"
                )

                # Process the dynamic batch
                await self._process_dynamic_batch(prompts, kwargs_list, result_futures)

            except Exception as e:
                logger.error(f"Error in dynamic batch loop: {e}")
                # Continue the loop to handle other requests

    async def _process_dynamic_batch(
        self,
        prompts: List[str],
        kwargs_list: List[Dict],
        result_futures: List[asyncio.Future],
    ) -> None:
        """Process a dynamic batch of encode requests for single string prompts."""
        # Check if all kwargs are identical for efficient batch processing
        can_batch = len(set(str(sorted(kw.items())) for kw in kwargs_list)) == 1
        kwargs = kwargs_list[0] if can_batch else None

        try:
            # If every request uses identical kwargs we can run a single
            # batch tokenizer call for a big speed-up.
            if can_batch and len(prompts) > 1:
                encode_fn = partial(self.tokenizer, prompts, **kwargs)
                results = await asyncio.get_running_loop().run_in_executor(
                    self._executor, encode_fn
                )

                for i, fut in enumerate(result_futures):
                    if not fut.done():
                        data = {k: v[i] for k, v in results.items()}
                        fut.set_result(data)
            else:
                # Process each request individually due to different kwargs
                if len(prompts) > 1 and not can_batch:
                    logger.warning(
                        f"AsyncDynamicbatchTokenizer: Dynamic batching disabled for batch of {len(prompts)} "
                        f"requests due to differing kwargs. This reduces performance benefits. "
                        f"Consider using consistent tokenization parameters across requests."
                    )

                encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
                    self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs_list)
                ]
                results = await asyncio.get_running_loop().run_in_executor(
                    self._executor, encode_fn
                )

                for fut, res in zip(result_futures, results):
                    if not fut.done():
                        fut.set_result(res)
        except Exception as e:
            logger.error(f"Error in dynamic batch processing: {e}")
            for fut in result_futures:
                if not fut.done():
                    fut.set_exception(e)

    def __del__(self):
        """Clean up background tasks."""
        if hasattr(self, "_batcher_task") and self._batcher_task:
            if not self._batcher_task.done():
                self._batcher_task.cancel()
        if hasattr(self, "_executor"):
            self._executor.shutdown(wait=False)