sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -16,9 +16,10 @@
16
16
  import contextlib
17
17
  import json
18
18
  import os
19
+ import tempfile
19
20
  import warnings
20
21
  from pathlib import Path
21
- from typing import Any, Dict, Optional, Type, Union
22
+ from typing import Any, Dict, List, Optional, Type, Union
22
23
 
23
24
  import torch
24
25
  from huggingface_hub import snapshot_download
@@ -38,30 +39,44 @@ from sglang.srt.configs import (
38
39
  ChatGLMConfig,
39
40
  DbrxConfig,
40
41
  DeepseekVL2Config,
42
+ DotsOCRConfig,
41
43
  DotsVLMConfig,
42
44
  ExaoneConfig,
45
+ FalconH1Config,
43
46
  KimiVLConfig,
44
47
  LongcatFlashConfig,
45
48
  MultiModalityConfig,
49
+ NemotronHConfig,
50
+ Olmo3Config,
46
51
  Qwen3NextConfig,
47
52
  Step3VLConfig,
48
53
  )
54
+ from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
49
55
  from sglang.srt.configs.internvl import InternVLChatConfig
50
56
  from sglang.srt.connector import create_remote_connector
51
57
  from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
52
58
 
53
- _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
54
- ChatGLMConfig.model_type: ChatGLMConfig,
55
- DbrxConfig.model_type: DbrxConfig,
56
- ExaoneConfig.model_type: ExaoneConfig,
57
- DeepseekVL2Config.model_type: DeepseekVL2Config,
58
- MultiModalityConfig.model_type: MultiModalityConfig,
59
- KimiVLConfig.model_type: KimiVLConfig,
60
- InternVLChatConfig.model_type: InternVLChatConfig,
61
- Step3VLConfig.model_type: Step3VLConfig,
62
- LongcatFlashConfig.model_type: LongcatFlashConfig,
63
- Qwen3NextConfig.model_type: Qwen3NextConfig,
64
- DotsVLMConfig.model_type: DotsVLMConfig,
59
+ _CONFIG_REGISTRY: List[Type[PretrainedConfig]] = [
60
+ ChatGLMConfig,
61
+ DbrxConfig,
62
+ ExaoneConfig,
63
+ DeepseekVL2Config,
64
+ MultiModalityConfig,
65
+ KimiVLConfig,
66
+ InternVLChatConfig,
67
+ Step3VLConfig,
68
+ LongcatFlashConfig,
69
+ Olmo3Config,
70
+ Qwen3NextConfig,
71
+ FalconH1Config,
72
+ DotsVLMConfig,
73
+ DotsOCRConfig,
74
+ NemotronHConfig,
75
+ DeepseekVLV2Config,
76
+ ]
77
+
78
+ _CONFIG_REGISTRY = {
79
+ config_cls.model_type: config_cls for config_cls in _CONFIG_REGISTRY
65
80
  }
66
81
 
67
82
  for name, cls in _CONFIG_REGISTRY.items():
@@ -102,6 +117,12 @@ def get_hf_text_config(config: PretrainedConfig):
102
117
  # if transformers config doesn't align with this assumption.
103
118
  assert hasattr(config.text_config, "num_attention_heads")
104
119
  return config.text_config
120
+
121
+ if hasattr(config, "llm_config"):
122
+ # PointsV1.5 Chat Model
123
+ assert hasattr(config.llm_config, "num_attention_heads")
124
+ return config.llm_config
125
+
105
126
  if hasattr(config, "language_config"):
106
127
  return config.language_config
107
128
  if hasattr(config, "thinker_config"):
@@ -119,6 +140,38 @@ def get_hf_text_config(config: PretrainedConfig):
119
140
  return config
120
141
 
121
142
 
143
+ # Temporary hack for DeepSeek-V3.2 model
144
+ def _load_deepseek_v32_model(
145
+ model_path: str,
146
+ trust_remote_code: bool = False,
147
+ revision: Optional[str] = None,
148
+ **kwargs,
149
+ ):
150
+ # first get the local path
151
+ local_path = download_from_hf(model_path)
152
+ # then load the config file in json
153
+ config_file = os.path.join(local_path, "config.json")
154
+ if not os.path.exists(config_file):
155
+ raise RuntimeError(f"Can't find config file in {local_path}.")
156
+
157
+ with open(config_file, "r") as f:
158
+ config_json = json.load(f)
159
+
160
+ config_json["architectures"] = ["DeepseekV3ForCausalLM"]
161
+ config_json["model_type"] = "deepseek_v3"
162
+
163
+ tmp_path = os.path.join(tempfile.gettempdir(), "_tmp_config_folder")
164
+ os.makedirs(tmp_path, exist_ok=True)
165
+
166
+ unique_path = os.path.join(tmp_path, f"deepseek_v32_{os.getpid()}")
167
+ with open(unique_path, "w") as f:
168
+ json.dump(config_json, f)
169
+
170
+ return AutoConfig.from_pretrained(
171
+ unique_path, trust_remote_code=trust_remote_code, revision=revision, **kwargs
172
+ )
173
+
174
+
122
175
  @lru_cache_frozenset(maxsize=32)
123
176
  def get_config(
124
177
  model: str,
@@ -140,9 +193,22 @@ def get_config(
140
193
  client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
141
194
  model = client.get_local_dir()
142
195
 
143
- config = AutoConfig.from_pretrained(
144
- model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
145
- )
196
+ try:
197
+ config = AutoConfig.from_pretrained(
198
+ model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
199
+ )
200
+ if "deepseek-ai/DeepSeek-OCR" in model:
201
+ config.model_type = "deepseek-ocr"
202
+ # Due to an unknown reason, Hugging Face’s AutoConfig mistakenly recognizes the configuration of deepseek-ocr as deepseekvl2.
203
+ # This is a temporary workaround and will require further optimization.
204
+
205
+ except ValueError as e:
206
+ if not "deepseek_v32" in str(e):
207
+ raise e
208
+ config = _load_deepseek_v32_model(
209
+ model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
210
+ )
211
+
146
212
  if (
147
213
  config.architectures is not None
148
214
  and config.architectures[0] == "Phi4MMForCausalLM"
@@ -158,7 +224,8 @@ def get_config(
158
224
  "intermediate_size": 4304,
159
225
  "model_type": "siglip_vision_model",
160
226
  "num_attention_heads": 16,
161
- "num_hidden_layers": 26, # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
227
+ "num_hidden_layers": 26,
228
+ # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
162
229
  "patch_size": 14,
163
230
  }
164
231
  config.vision_config = SiglipVisionConfig(**vision_config)
@@ -374,8 +441,8 @@ def get_processor(
374
441
  **kwargs,
375
442
  )
376
443
 
377
- # fix: for Qwen2-VL model, inject default 'size' if not provided.
378
- if config.model_type in {"qwen2_vl"}:
444
+ # fix: for Qwen2-VL and Sarashina2Vision models, inject default 'size' if not provided.
445
+ if config.model_type in {"qwen2_vl", "sarashina2_vision"}:
379
446
  if "size" not in kwargs:
380
447
  kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520}
381
448
 
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import os
3
2
  from dataclasses import dataclass
4
3
  from multiprocessing import shared_memory
5
4
  from pathlib import Path
@@ -11,14 +11,14 @@ from sglang.srt.distributed.naive_distributed import (
11
11
  get_naive_distributed,
12
12
  set_naive_distributed,
13
13
  )
14
- from sglang.srt.host_shared_memory import (
14
+ from sglang.srt.layers.parameter import ModelWeightParameter
15
+ from sglang.srt.server_args import ServerArgs
16
+ from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
17
+ from sglang.srt.utils.host_shared_memory import (
15
18
  HostSharedMemoryManager,
16
19
  get_host_shared_memory_manager,
17
20
  set_host_shared_memory_manager,
18
21
  )
19
- from sglang.srt.layers.parameter import ModelWeightParameter
20
- from sglang.srt.server_args import ServerArgs
21
- from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
@@ -17,10 +17,18 @@ import torch
17
17
  from packaging import version
18
18
  from torch.multiprocessing import reductions
19
19
 
20
+ from sglang.srt.utils import is_npu
21
+
22
+ _is_npu = is_npu()
23
+
20
24
 
21
25
  def monkey_patch_torch_reductions():
22
26
  """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
23
27
 
28
+ # NPU does not currently support UUID, so this patch is skipped on NPU for now; support is expected in the fourth quarter.
29
+ if _is_npu:
30
+ return
31
+
24
32
  if hasattr(reductions, "_reduce_tensor_original"):
25
33
  return
26
34
 
@@ -0,0 +1,199 @@
1
+ """Merge Chrome trace files from multiple ranks (TP, DP, PP, EP) into a single trace."""
2
+
3
+ import glob
4
+ import gzip
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ProfileMerger:
    """Merge per-rank Chrome trace files (TP, DP, PP, EP) into a single trace.

    Source traces are gzip-compressed JSON files in ``output_dir`` whose names
    start with ``profile_id`` and contain rank markers such as ``TP-0`` or
    ``DP-1``. The merged result is written next to them as
    ``merged-<profile_id>.trace.json.gz``.
    """

    def __init__(self, output_dir: str, profile_id: str):
        """Store the locations used by the merge.

        Args:
            output_dir: Directory holding the per-rank trace files; the merged
                trace is written here as well.
            profile_id: File-name prefix shared by all traces of one profile.
        """
        self.output_dir = output_dir
        self.profile_id = profile_id
        self.merged_trace_path = os.path.join(
            output_dir, f"merged-{profile_id}.trace.json.gz"
        )

        # Rank types in the order used when parsing file names and building
        # the "[TP00-DP01-...]" label. File ordering uses _get_rank_sort_key.
        self.rank_types = ["tp", "dp", "pp", "ep"]

        # Sort index multipliers: DP (highest) > EP > PP > TP (lowest).
        # These control the visual ordering of processes in the trace viewer.
        self.sort_index_multipliers = {
            "dp_rank": 100_000_000,
            "ep_rank": 1_000_000,
            "pp_rank": 10_000,
            "tp_rank": 100,
        }

        # sort_index is only rewritten for events whose pid is below this value.
        self.pid_sort_index_threshold = 1000

    def merge_chrome_traces(self) -> str:
        """Merge Chrome traces from all ranks into a single trace.

        Events from every discovered file are concatenated in rank order.
        ``deviceProperties`` entries are collected from all files; any other
        top-level key is taken from the first file that provides it.

        Returns:
            Path to merged trace file.

        Raises:
            ValueError: If no trace files found.
        """
        trace_files = self._discover_trace_files()
        if not trace_files:
            raise ValueError(f"No trace files found for profile_id: {self.profile_id}")

        logger.info("Found %d trace files to merge", len(trace_files))

        merged_trace: Dict[str, Any] = {"traceEvents": []}
        all_device_properties: List[Any] = []

        for trace_file in sorted(trace_files, key=self._get_rank_sort_key):
            rank_info = self._extract_rank_info(trace_file)
            logger.info("Processing %s with rank info: %s", trace_file, rank_info)

            output = self._handle_file(trace_file, rank_info)
            merged_trace["traceEvents"].extend(output["traceEvents"])

            # pop() with a truthiness check tolerates a missing or null
            # "deviceProperties" entry instead of raising on extend(None).
            device_properties = output.pop("deviceProperties", None)
            if device_properties:
                all_device_properties.extend(device_properties)

            # First file wins for all remaining metadata. "traceEvents" is
            # already present in merged_trace, so setdefault never replaces it.
            for key, value in output.items():
                merged_trace.setdefault(key, value)

        if all_device_properties:
            merged_trace["deviceProperties"] = all_device_properties

        with gzip.open(self.merged_trace_path, "wb") as f:
            f.write(json.dumps(merged_trace).encode("utf-8"))

        logger.info("Merged profile saved to: %s", self.merged_trace_path)
        logger.info("Total events merged: %d", len(merged_trace["traceEvents"]))

        return self.merged_trace_path

    def _discover_trace_files(self) -> List[str]:
        """Return the sorted per-rank trace files that belong to ``profile_id``.

        A file qualifies when its name starts with ``profile_id``, ends with
        ``.trace.json.gz``, contains a ``TP-`` rank marker, and is not the
        merged output itself.
        """
        # Escape glob metacharacters so a directory or profile id containing
        # "[", "*" or "?" is matched literally.
        search_pattern = os.path.join(
            glob.escape(self.output_dir),
            f"{glob.escape(self.profile_id)}*.trace.json.gz",
        )
        merged_suffix = f"merged-{self.profile_id}.trace.json.gz"

        found = set()
        for path in glob.glob(search_pattern):
            # Test the file name only: a "TP-" somewhere in the directory part
            # of the path must not let unrelated traces through.
            name = os.path.basename(path)
            if name.endswith(merged_suffix) or "TP-" not in name:
                continue
            found.add(path)

        # Sorted so callers (and the summary) see a deterministic order.
        return sorted(found)

    def _extract_rank_info(self, filename: str) -> Dict[str, int]:
        """Extract rank info (TP/DP/PP/EP) from filename.

        Returns:
            Mapping such as ``{"tp_rank": 0, "dp_rank": 1}`` containing only
            the rank types that appear in the file name.
        """
        basename = os.path.basename(filename)
        rank_info: Dict[str, int] = {}

        for rank_type in self.rank_types:
            # The lookbehind rejects markers embedded in a longer word, e.g.
            # the "EP-7" inside "STEP-7".
            match = re.search(rf"(?<![A-Za-z]){rank_type.upper()}-(\d+)", basename)
            if match:
                rank_info[f"{rank_type}_rank"] = int(match.group(1))

        return rank_info

    def _create_rank_label(self, rank_info: Dict[str, int]) -> str:
        """Build a label like ``[TP00-DP01]``, or ``[Unknown]`` without ranks."""
        parts = [
            f"{rank_type.upper()}{rank_info[f'{rank_type}_rank']:02d}"
            for rank_type in self.rank_types
            if f"{rank_type}_rank" in rank_info
        ]
        return f"[{'-'.join(parts)}]" if parts else "[Unknown]"

    def _handle_file(self, path: str, rank_info: Dict[str, int]) -> Dict[str, Any]:
        """Load one gzip-compressed trace and relabel its events.

        Returns:
            The trace's top-level keys with ``traceEvents`` replaced by the
            processed events. On any failure the error is logged and
            ``{"traceEvents": []}`` is returned so that one unreadable file
            does not abort the whole merge.
        """
        logger.info("Processing file: %s", path)

        try:
            with gzip.open(path, "rt", encoding="utf-8") as f:
                trace = json.load(f)

            # The Chrome trace format also allows a bare JSON array of events.
            if isinstance(trace, list):
                trace = {"traceEvents": trace}

            output = {
                key: value for key, value in trace.items() if key != "traceEvents"
            }
            # "or []" also covers an explicit null "traceEvents" entry.
            output["traceEvents"] = self._process_events(
                trace.get("traceEvents") or [], rank_info
            )
            return output

        except Exception as e:
            # Deliberately best-effort: log and skip this file.
            logger.error("Failed to process trace file %s: %s", path, e)
            return {"traceEvents": []}

    def _process_events(
        self, events: List[Dict], rank_info: Dict[str, int]
    ) -> List[Dict]:
        """Process events: update sort_index and add rank labels to PIDs.

        Events are modified in place and the same list is returned. Entries
        that are not dicts or carry no ``pid`` are left untouched.
        """
        rank_label = self._create_rank_label(rank_info)

        for event in events:
            # Without a pid there is nothing to relabel; raising here would
            # make _handle_file discard every event of this rank.
            if not isinstance(event, dict) or "pid" not in event:
                continue

            if event.get("name") == "process_sort_index":
                pid = self._maybe_cast_int(event["pid"])
                if pid is not None and pid < self.pid_sort_index_threshold:
                    args = event.get("args")
                    if not isinstance(args, dict):
                        args = {}
                        event["args"] = args
                    args["sort_index"] = self._calculate_sort_index(rank_info, pid)

            # Prefix the pid so processes from different ranks stay distinct.
            event["pid"] = f"{rank_label} {event['pid']}"

        return events

    def _calculate_sort_index(self, rank_info: Dict[str, int], pid: int) -> int:
        """Return ``pid`` offset by each rank times its sort multiplier."""
        return pid + sum(
            rank_info.get(rank_key, 0) * multiplier
            for rank_key, multiplier in self.sort_index_multipliers.items()
        )

    def _get_rank_sort_key(self, path: str) -> Tuple[int, int, int, int]:
        """Return the ``(dp, ep, pp, tp)`` ranks of ``path``; missing ranks are 0."""
        rank_info = self._extract_rank_info(path)
        return tuple(
            rank_info.get(f"{rank_type}_rank", 0)
            for rank_type in ["dp", "ep", "pp", "tp"]
        )

    def _maybe_cast_int(self, x) -> Optional[int]:
        """Return ``int(x)``, or ``None`` when ``x`` cannot be converted."""
        try:
            return int(x)
        except (ValueError, TypeError):
            return None

    def get_merge_summary(self) -> Dict[str, Any]:
        """Summarize the merged trace file.

        Returns:
            A dict with the merged file path, event and source-file counts,
            source file names, profile id and device-properties count; or a
            dict with a single ``"error"`` key when the merged file is missing
            or cannot be read.
        """
        if not os.path.exists(self.merged_trace_path):
            return {"error": "Merged trace file not found"}

        try:
            # Explicit encoding, matching how source traces are read.
            with gzip.open(self.merged_trace_path, "rt", encoding="utf-8") as f:
                merged_data = json.load(f)

            trace_files = self._discover_trace_files()

            return {
                "merged_file": self.merged_trace_path,
                "total_events": len(merged_data.get("traceEvents", [])),
                "total_files": len(trace_files),
                "source_files": [os.path.basename(f) for f in trace_files],
                "profile_id": self.profile_id,
                "device_properties_count": len(merged_data.get("deviceProperties", [])),
            }
        except Exception as e:
            return {"error": f"Failed to read merged trace: {str(e)}"}