sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/configs/deepseekvl2.py
@@ -1,5 +1,4 @@
  import math
- import os
  from dataclasses import dataclass
  from typing import Dict, List, Optional, Tuple
 
@@ -12,6 +11,8 @@ from transformers import (
      ProcessorMixin,
  )
 
+ from sglang.srt.configs.deepseek_ocr import BASE_SIZE, IMAGE_SIZE, MAX_CROPS, MIN_CROPS
+
 
  def select_best_resolution(image_size, candidate_resolutions):
      # used for cropping
@@ -62,6 +63,7 @@ class DictOutput(object):
  class VLChatProcessorOutput(DictOutput):
      input_ids: torch.LongTensor
      target_ids: torch.LongTensor
+     images_crop: torch.LongTensor
      pixel_values: (
          torch.Tensor
      )  # rename from "images" to "pixel_values" for compatibility
@@ -105,6 +107,68 @@ class ImageTransform(object):
          return x
 
 
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+     best_ratio_diff = float("inf")
+     best_ratio = (1, 1)
+     area = width * height
+     for ratio in target_ratios:
+         target_aspect_ratio = ratio[0] / ratio[1]
+         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+         if ratio_diff < best_ratio_diff:
+             best_ratio_diff = ratio_diff
+             best_ratio = ratio
+         elif ratio_diff == best_ratio_diff:
+             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                 best_ratio = ratio
+     return best_ratio
+
+
+ def dynamic_preprocess(
+     image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
+ ):
+     orig_width, orig_height = image.size
+     aspect_ratio = orig_width / orig_height
+
+     # calculate the existing image aspect ratio
+     target_ratios = set(
+         (i, j)
+         for n in range(min_num, max_num + 1)
+         for i in range(1, n + 1)
+         for j in range(1, n + 1)
+         if i * j <= max_num and i * j >= min_num
+     )
+     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+     # find the closest aspect ratio to the target
+     target_aspect_ratio = find_closest_aspect_ratio(
+         aspect_ratio, target_ratios, orig_width, orig_height, image_size
+     )
+
+     # calculate the target width and height
+     target_width = image_size * target_aspect_ratio[0]
+     target_height = image_size * target_aspect_ratio[1]
+     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+     # resize the image
+     resized_img = image.resize((target_width, target_height))
+     processed_images = []
+     for i in range(blocks):
+         box = (
+             (i % (target_width // image_size)) * image_size,
+             (i // (target_width // image_size)) * image_size,
+             ((i % (target_width // image_size)) + 1) * image_size,
+             ((i // (target_width // image_size)) + 1) * image_size,
+         )
+         # split the image
+         split_img = resized_img.crop(box)
+         processed_images.append(split_img)
+     assert len(processed_images) == blocks
+     if use_thumbnail and len(processed_images) != 1:
+         thumbnail_img = image.resize((image_size, image_size))
+         processed_images.append(thumbnail_img)
+     return processed_images, target_aspect_ratio
+
+
  class DeepseekVLV2Processor(ProcessorMixin):
      tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
      attributes = ["tokenizer"]
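The dynamic_preprocess/find_closest_aspect_ratio pair added above picks the crop grid whose aspect ratio best matches the input image, then slices the resized image into image_size tiles. A minimal standalone sketch of the grid selection, assuming illustrative crop bounds of 2 and 6 in place of MIN_CROPS/MAX_CROPS (the real constants live in sglang.srt.configs.deepseek_ocr) and omitting the area-based tie-breaker:

def pick_crop_grid(width, height, min_num=2, max_num=6):
    # Candidate (cols, rows) grids whose tile count lies in [min_num, max_num],
    # mirroring the target_ratios set built in dynamic_preprocess.
    candidates = sorted(
        {
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num
        },
        key=lambda r: r[0] * r[1],
    )
    # Closest-aspect-ratio scan, as in find_closest_aspect_ratio.
    aspect = width / height
    return min(candidates, key=lambda r: abs(aspect - r[0] / r[1]))

# A 1600x800 page has aspect ratio 2.0, so the (2, 1) grid wins: with the
# 640 default tile size the image is resized to 1280x640 and split into two
# 640x640 tiles (at runtime the tile size is IMAGE_SIZE from the config).
print(pick_crop_grid(1600, 800))  # (2, 1)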
@@ -134,7 +198,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
          self.image_std = image_std
          self.normalize = normalize
          self.downsample_ratio = downsample_ratio
-
+         self.base_size = BASE_SIZE
          self.image_transform = ImageTransform(
              mean=image_mean, std=image_std, normalize=normalize
          )
@@ -177,7 +241,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
              **kwargs,
          )
 
-     def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
+     def format_messages_v2(self, messages: str, pil_images, max_req_input_len=-1):
          """play the role of format_messages_v2 and get_images_info in the last version"""
          tokenized_data = []
          masked_tokenized_data = []  # labels
@@ -187,35 +251,34 @@ class DeepseekVLV2Processor(ProcessorMixin):
 
          image_index = 0
          image_token_cnt = messages.count(self.image_token)
-         tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
+         (
+             input_ids,
+             images,
+             images_crop,
+             seq_mask,
+             spatial_crop,
+             num_image_tokens,
+             image_shapes,
+         ) = self.tokenize_with_images(
              messages,
              pil_images[image_index : image_index + image_token_cnt],
              bos=True,
              eos=True,
              cropping=len(pil_images) <= 2,
-             max_req_input_len=max_req_input_len,
          )
 
          image_index = image_token_cnt
-         tokenized_data += tokenized_str
-         if self.mask_prompt:
-             masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
-         else:
-             masked_tokenized_data += tokenized_str
          images_list += images
          images_seq_mask += seq_mask
-         images_spatial_crop += spatial_crop
-
-         assert len(tokenized_data) == len(
-             images_seq_mask
-         ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+         images_spatial_crop = spatial_crop
 
          return (
-             tokenized_data,
+             input_ids,
              masked_tokenized_data,
              images_list,
              images_seq_mask,
              images_spatial_crop,
+             images_crop,
          )
 
      @property
@@ -252,6 +315,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
          inference_mode: bool = True,
          system_prompt: str = "",
          max_req_input_len: int = -1,
+         cropping: bool = True,
          **kwargs,
      ):
          """
@@ -275,47 +339,22 @@ class DeepseekVLV2Processor(ProcessorMixin):
          - num_image_tokens (List[int]): the number of image tokens
          """
 
-         assert (
-             prompt is None or conversations is None
-         ), "prompt and conversations cannot be used at the same time."
-
+         prompt = conversations or prompt
          (
-             tokenized_str,
+             input_ids,
              masked_tokenized_str,
              images_list,
              images_seq_mask,
              images_spatial_crop,
-         ) = self.format_messages_v2(conversations, images, max_req_input_len)
+             images_crop,
+         ) = self.format_messages_v2(prompt, images, max_req_input_len)
 
-         assert (
-             len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
-         ), (
-             f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
-         )
-
-         input_ids = torch.LongTensor(tokenized_str)
          target_ids = torch.LongTensor(masked_tokenized_str)
-         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
-
-         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-         target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
-             self.ignore_id
-         )
-         input_ids[input_ids < 0] = self.pad_id
-
-         if inference_mode:
-             assert input_ids[-1] == self.eos_id
-             input_ids = input_ids[:-1]
-             target_ids = target_ids[:-1]
-             images_seq_mask = images_seq_mask[:-1]
 
          if len(images_list) == 0:
              images = torch.zeros((1, 3, self.image_size, self.image_size))
-             images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
          else:
              images = torch.stack(images_list, dim=0)
-             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
 
          images_spatial_crop = torch.stack(
              [images_spatial_crop], dim=0
@@ -324,6 +363,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
          prepare = VLChatProcessorOutput(
              input_ids=input_ids,
              target_ids=target_ids,
+             images_crop=images_crop,
              pixel_values=images,
              images_seq_mask=images_seq_mask,
              images_spatial_crop=images_spatial_crop,
@@ -341,10 +381,14 @@ class DeepseekVLV2Processor(ProcessorMixin):
          inference_mode: bool = True,
          system_prompt: str = "",
          max_req_input_len: int = -1,
+         text: list[str] = None,
          **kwargs,
      ):
+         assert text is None or isinstance(text, list)
+         if text is not None:
+             text = text[0]
          prepare = self.process_one(
-             prompt=prompt,
+             prompt=prompt or text,
              conversations=conversations,
              images=images,
              apply_sft_format=apply_sft_format,
369
413
  bos: bool = True,
370
414
  eos: bool = True,
371
415
  cropping: bool = True,
372
- max_req_input_len: int = -1,
373
416
  ):
374
417
  """Tokenize text with <image> tags."""
375
- images_list, images_seq_mask, images_spatial_crop = [], [], []
418
+
419
+ conversation = conversation
420
+ assert conversation.count(self.image_token) == len(images)
376
421
  text_splits = conversation.split(self.image_token)
422
+ images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
423
+ [],
424
+ [],
425
+ [],
426
+ [],
427
+ )
428
+ image_shapes = []
429
+ num_image_tokens = []
377
430
  tokenized_str = []
378
431
  for text_sep, image in zip(text_splits, images):
379
432
  """encode text_sep"""
380
433
  tokenized_sep = self.encode(text_sep, bos=False, eos=False)
434
+
381
435
  tokenized_str += tokenized_sep
382
436
  images_seq_mask += [False] * len(tokenized_sep)
383
437
 
384
- """select best resolution for anyres"""
385
- if cropping:
386
- best_width, best_height = select_best_resolution(
387
- image.size, self.candidate_resolutions
388
- )
438
+ image_shapes.append(image.size)
439
+
440
+ if image.size[0] <= 640 and image.size[1] <= 640:
441
+ crop_ratio = [1, 1]
389
442
  else:
390
- best_width, best_height = self.image_size, self.image_size
391
- # print(image.size, (best_width, best_height)) # check the select_best_resolutions func
443
+ if cropping:
444
+ images_crop_raw, crop_ratio = dynamic_preprocess(
445
+ image, image_size=IMAGE_SIZE
446
+ )
447
+ else:
448
+ crop_ratio = [1, 1]
392
449
 
393
450
  """process the global view"""
451
+ if self.image_size <= 640 and not cropping:
452
+ image = image.resize((self.image_size, self.image_size))
453
+
394
454
  global_view = ImageOps.pad(
395
455
  image,
396
- (self.image_size, self.image_size),
456
+ (self.base_size, self.base_size),
397
457
  color=tuple(int(x * 255) for x in self.image_transform.mean),
398
458
  )
399
459
  images_list.append(self.image_transform(global_view))
400
460
 
401
- """process the local views"""
402
- local_view = ImageOps.pad(
403
- image,
404
- (best_width, best_height),
405
- color=tuple(int(x * 255) for x in self.image_transform.mean),
406
- )
407
- for i in range(0, best_height, self.image_size):
408
- for j in range(0, best_width, self.image_size):
409
- images_list.append(
410
- self.image_transform(
411
- local_view.crop(
412
- (j, i, j + self.image_size, i + self.image_size)
413
- )
414
- )
415
- )
416
-
417
- """record height / width crop num"""
418
- num_width_tiles, num_height_tiles = (
419
- best_width // self.image_size,
420
- best_height // self.image_size,
421
- )
461
+ num_width_tiles, num_height_tiles = crop_ratio
422
462
  images_spatial_crop.append([num_width_tiles, num_height_tiles])
423
463
 
464
+ if num_width_tiles > 1 or num_height_tiles > 1:
465
+ for i in range(len(images_crop_raw)):
466
+ images_crop_list.append(self.image_transform(images_crop_raw[i]))
467
+
424
468
  """add image tokens"""
425
- h = w = math.ceil(
469
+ num_queries = math.ceil(
426
470
  (self.image_size // self.patch_size) / self.downsample_ratio
427
471
  )
428
- # global views tokens h * (w + 1), 1 is for line separator
429
- tokenized_image = [self.image_token_id] * h * (w + 1)
430
- # add a separator between global and local views
431
- tokenized_image += [self.image_token_id]
432
- # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
433
- tokenized_image += (
434
- [self.image_token_id]
435
- * (num_height_tiles * h)
436
- * (num_width_tiles * w + 1)
472
+ num_queries_base = math.ceil(
473
+ (self.base_size // self.patch_size) / self.downsample_ratio
437
474
  )
438
475
 
476
+ tokenized_image = (
477
+ [self.image_token_id] * num_queries_base + [self.image_token_id]
478
+ ) * num_queries_base
479
+ tokenized_image += [self.image_token_id]
480
+ if num_width_tiles > 1 or num_height_tiles > 1:
481
+ tokenized_image += (
482
+ [self.image_token_id] * (num_queries * num_width_tiles)
483
+ + [self.image_token_id]
484
+ ) * (num_queries * num_height_tiles)
439
485
  tokenized_str += tokenized_image
486
+
440
487
  images_seq_mask += [True] * len(tokenized_image)
441
- # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens
488
+ num_image_tokens.append(len(tokenized_image))
442
489
 
443
490
  """process the last text split"""
444
491
  tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
445
- # deal with video, limit with request len
446
- if max_req_input_len > -1:
447
- if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
448
- rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
449
- tokenized_str = tokenized_str[:rest]
450
- images_seq_mask = images_seq_mask[:rest]
492
+
451
493
  tokenized_str += tokenized_sep
452
494
  images_seq_mask += [False] * len(tokenized_sep)
453
495
 
@@ -463,7 +505,64 @@ class DeepseekVLV2Processor(ProcessorMixin):
              images_seq_mask
          ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
 
-         return tokenized_str, images_list, images_seq_mask, images_spatial_crop
+         masked_tokenized_str = []
+         for token_index in tokenized_str:
+             if token_index != self.image_token_id:
+                 masked_tokenized_str.append(token_index)
+             else:
+                 masked_tokenized_str.append(self.ignore_id)
+
+         assert (
+             len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+         ), (
+             f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+         )
+         input_ids = torch.LongTensor(tokenized_str)
+         target_ids = torch.LongTensor(masked_tokenized_str)
+         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+         target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+             self.ignore_id
+         )
+         input_ids[input_ids < 0] = self.pad_id
+
+         inference_mode = True
+
+         if inference_mode:
+             # Remove the ending eos token
+             assert input_ids[-1] == self.eos_id
+             input_ids = input_ids[:-1]
+             target_ids = target_ids[:-1]
+             images_seq_mask = images_seq_mask[:-1]
+
+         if len(images_list) == 0:
+             pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))
+             images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)
+             images_crop = torch.zeros(
+                 (1, 3, self.image_size, self.image_size)
+             ).unsqueeze(0)
+         else:
+             pixel_values = torch.stack(images_list, dim=0)
+             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+             if images_crop_list:
+                 images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)
+             else:
+                 images_crop = torch.zeros(
+                     (1, 3, self.image_size, self.image_size)
+                 ).unsqueeze(0)
+
+         input_ids = input_ids.unsqueeze(0)
+         return (
+             input_ids,
+             pixel_values,
+             images_crop,
+             images_seq_mask,
+             images_spatial_crop,
+             num_image_tokens,
+             image_shapes,
+         )
 
 
  class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
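To make the image-token accounting in the new tokenize_with_images concrete, the helper below reproduces it in isolation. The geometry defaults (base_size=1024, image_size=640, patch_size=16, downsample_ratio=4) are illustrative assumptions, not guaranteed model constants; the real values come from the processor and the deepseek_ocr config.

import math

def image_token_count(num_width_tiles, num_height_tiles, base_size=1024,
                      image_size=640, patch_size=16, downsample_ratio=4):
    # Mirrors the layout built above: the global view is num_queries_base rows
    # of num_queries_base tokens plus one separator token per row, followed by
    # one global/local separator; if the image was tiled, each of the
    # num_queries * num_height_tiles local rows adds num_queries *
    # num_width_tiles tokens plus one separator.
    num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
    num_queries_base = math.ceil((base_size // patch_size) / downsample_ratio)
    count = (num_queries_base + 1) * num_queries_base + 1
    if num_width_tiles > 1 or num_height_tiles > 1:
        count += (num_queries * num_width_tiles + 1) * (num_queries * num_height_tiles)
    return count

print(image_token_count(1, 1))  # 273 image tokens for an untiled image
print(image_token_count(2, 1))  # 483 once a 2x1 local tile grid is added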
@@ -548,7 +647,6 @@ class DeepseekVL2MlpProjectorConfig(PretrainedConfig):
 
 
  class DeepseekV2Config(PretrainedConfig):
-
      model_type = "deepseek_v2"
      keys_to_ignore_at_inference = ["past_key_values"]
 
sglang/srt/configs/dots_ocr.py (new file)
@@ -0,0 +1,64 @@
+ from typing import Optional
+
+ from transformers import AutoProcessor, Qwen2_5_VLProcessor
+ from transformers.image_processing_utils import BaseImageProcessor
+ from transformers.models.qwen2 import Qwen2Config
+
+ from sglang.srt.configs.dots_vlm import DotsVisionConfig
+
+
+ class DotsOCRConfig(Qwen2Config):
+     model_type = "dots_ocr"
+
+     def __init__(
+         self,
+         image_token_id=151665,
+         video_token_id=151656,
+         vision_config: Optional[dict] = None,
+         *args,
+         **kwargs
+     ):
+         super().__init__(*args, **kwargs)
+         self.image_token_id = image_token_id
+         self.video_token_id = video_token_id
+         self.vision_config = DotsVisionConfig(**(vision_config or {}))
+
+     def save_pretrained(self, save_directory, **kwargs):
+         self._auto_class = None
+         super().save_pretrained(save_directory, **kwargs)
+
+
+ class DummyVideoProcessor(BaseImageProcessor):
+     model_input_names = ["pixel_values"]
+
+     def __call__(self, *args, **kwargs):
+         return None
+
+
+ class DotsVLProcessor(Qwen2_5_VLProcessor):
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+         video_processor=None,
+         chat_template=None,
+         **kwargs
+     ):
+         if video_processor is None:
+             video_processor = DummyVideoProcessor()
+         super().__init__(
+             image_processor, tokenizer, video_processor, chat_template=chat_template
+         )
+         self.image_token = (
+             "<|imgpad|>"
+             if not hasattr(tokenizer, "image_token")
+             else tokenizer.image_token
+         )
+         self.image_token_id = (
+             tokenizer.image_token_id
+             if getattr(tokenizer, "image_token_id", None) is not None
+             else tokenizer.convert_tokens_to_ids(self.image_token)
+         )
+
+
+ AutoProcessor.register(DotsOCRConfig, DotsVLProcessor)
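Since the module calls AutoProcessor.register at import time, importing it is enough to wire DotsVLProcessor to the dots_ocr config type. A small, hypothetical usage sketch of the config class itself (the token-id defaults are the ones set in the definition above; leaving vision_config empty relies on DotsVisionConfig's own defaults, and the output path is a placeholder):

from sglang.srt.configs.dots_ocr import DotsOCRConfig

cfg = DotsOCRConfig()                      # Qwen2Config defaults underneath
print(cfg.model_type, cfg.image_token_id)  # dots_ocr 151665

# save_pretrained clears _auto_class first, so the serialized config.json
# does not pin a remote auto-class mapping.
cfg.save_pretrained("/tmp/dots_ocr_cfg")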
sglang/srt/configs/dots_vlm.py
@@ -1,10 +1,5 @@
- from typing import Any, List, Optional, Union
-
- from transformers import AutoProcessor, LlamaTokenizerFast, PretrainedConfig
- from transformers.feature_extraction_utils import BatchFeature
- from transformers.image_utils import ImageInput
- from transformers.processing_utils import ProcessingKwargs, Unpack
- from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers import AutoProcessor, PretrainedConfig
+ from transformers.processing_utils import ProcessingKwargs
 
  try:
      from transformers import Qwen2_5_VLProcessor