sglang 0.4.0.post1__tar.gz → 0.4.1__tar.gz

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (205)
  1. {sglang-0.4.0.post1 → sglang-0.4.1}/PKG-INFO +12 -7
  2. {sglang-0.4.0.post1 → sglang-0.4.1}/README.md +6 -3
  3. {sglang-0.4.0.post1 → sglang-0.4.1}/pyproject.toml +7 -6
  4. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_offline_throughput.py +6 -6
  5. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_one_batch.py +1 -0
  6. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_serving.py +9 -1
  7. sglang-0.4.1/sglang/check_env.py +305 -0
  8. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/runtime_endpoint.py +1 -0
  9. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/chat_template.py +32 -0
  10. sglang-0.4.1/sglang/llama3_eval.py +316 -0
  11. sglang-0.4.1/sglang/srt/aio_rwlock.py +100 -0
  12. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/model_config.py +8 -1
  13. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/constrained/xgrammar_backend.py +4 -1
  14. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/flashinfer_backend.py +51 -5
  15. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/triton_backend.py +16 -25
  16. sglang-0.4.1/sglang/srt/layers/attention/triton_ops/decode_attention.py +669 -0
  17. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/linear.py +20 -2
  18. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/logits_processor.py +133 -95
  19. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/layer.py +18 -39
  20. sglang-0.4.1/sglang/srt/layers/moe/fused_moe_native.py +46 -0
  21. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/__init__.py +3 -7
  22. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/fused_moe.py +174 -119
  23. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/layer.py +17 -49
  24. sglang-0.4.1/sglang/srt/layers/moe/topk.py +191 -0
  25. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/quantization/__init__.py +5 -50
  26. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/quantization/fp8.py +221 -36
  27. sglang-0.4.1/sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  28. sglang-0.4.1/sglang/srt/layers/quantization/fp8_utils.py +116 -0
  29. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/radix_attention.py +8 -1
  30. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/sampler.py +27 -5
  31. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/torchao_utils.py +31 -0
  32. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/detokenizer_manager.py +37 -17
  33. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/io_struct.py +39 -10
  34. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/schedule_batch.py +54 -34
  35. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/schedule_policy.py +64 -5
  36. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/scheduler.py +171 -136
  37. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/tokenizer_manager.py +184 -133
  38. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  39. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mem_cache/chunk_cache.py +2 -2
  40. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mem_cache/memory_pool.py +15 -8
  41. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mem_cache/radix_cache.py +12 -2
  42. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_executor/cuda_graph_runner.py +25 -11
  43. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_executor/model_runner.py +28 -14
  44. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_parallel.py +66 -5
  45. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/dbrx.py +1 -1
  46. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/deepseek.py +1 -1
  47. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/deepseek_v2.py +67 -18
  48. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/gemma2.py +34 -0
  49. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/gemma2_reward.py +0 -1
  50. sglang-0.4.1/sglang/srt/models/granite.py +517 -0
  51. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/grok.py +73 -9
  52. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llama.py +22 -0
  53. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llama_classification.py +11 -23
  54. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llama_reward.py +0 -2
  55. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llava.py +37 -14
  56. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/mixtral.py +2 -2
  57. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/olmoe.py +1 -1
  58. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/qwen2.py +20 -0
  59. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/qwen2_moe.py +1 -1
  60. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/xverse_moe.py +1 -1
  61. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/openai_api/adapter.py +8 -0
  62. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/openai_api/protocol.py +9 -4
  63. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/server.py +2 -1
  64. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/server_args.py +19 -9
  65. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/utils.py +40 -54
  66. sglang-0.4.1/sglang/test/test_block_fp8.py +341 -0
  67. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/test_utils.py +3 -2
  68. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/utils.py +10 -3
  69. sglang-0.4.1/sglang/version.py +1 -0
  70. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang.egg-info/PKG-INFO +12 -7
  71. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang.egg-info/SOURCES.txt +13 -7
  72. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang.egg-info/requires.txt +5 -3
  73. sglang-0.4.0.post1/sglang/check_env.py +0 -213
  74. sglang-0.4.0.post1/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -714
  75. sglang-0.4.0.post1/sglang/srt/layers/fused_moe_patch.py +0 -133
  76. sglang-0.4.0.post1/sglang/srt/layers/quantization/fp8_utils.py +0 -27
  77. sglang-0.4.0.post1/sglang/version.py +0 -1
  78. {sglang-0.4.0.post1 → sglang-0.4.1}/LICENSE +0 -0
  79. {sglang-0.4.0.post1 → sglang-0.4.1}/setup.cfg +0 -0
  80. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/__init__.py +0 -0
  81. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/api.py +0 -0
  82. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_latency.py +0 -0
  83. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_one_batch_server.py +0 -0
  84. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/global_config.py +0 -0
  85. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/__init__.py +0 -0
  86. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/__init__.py +0 -0
  87. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/anthropic.py +0 -0
  88. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/base_backend.py +0 -0
  89. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/litellm.py +0 -0
  90. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/openai.py +0 -0
  91. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/vertexai.py +0 -0
  92. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/choices.py +0 -0
  93. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/compiler.py +0 -0
  94. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/interpreter.py +0 -0
  95. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/ir.py +0 -0
  96. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/tracer.py +0 -0
  97. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/launch_server.py +0 -0
  98. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/launch_server_llavavid.py +0 -0
  99. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/_custom_ops.py +0 -0
  100. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/__init__.py +0 -0
  101. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/device_config.py +0 -0
  102. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/exaone.py +0 -0
  103. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/load_config.py +0 -0
  104. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/configs/qwen2vl.py +0 -0
  105. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/constrained/__init__.py +0 -0
  106. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  107. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/constrained/outlines_backend.py +0 -0
  108. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  109. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/conversation.py +0 -0
  110. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/__init__.py +0 -0
  111. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/communication_op.py +0 -0
  112. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  113. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  114. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  115. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  116. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  117. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  118. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  119. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  120. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  121. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/parallel_state.py +0 -0
  122. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/distributed/utils.py +0 -0
  123. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/hf_transformers_utils.py +0 -0
  124. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/activation.py +0 -0
  125. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/__init__.py +0 -0
  126. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  127. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  128. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  129. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  130. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  131. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/custom_op_util.py +0 -0
  132. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/layernorm.py +0 -0
  133. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/__init__.py +0 -0
  134. {sglang-0.4.0.post1/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/kernels.py +0 -0
  135. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/pooler.py +0 -0
  136. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/quantization/base_config.py +0 -0
  137. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/rotary_embedding.py +0 -0
  138. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  139. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/lora/lora.py +0 -0
  140. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/lora/lora_config.py +0 -0
  141. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/lora/lora_manager.py +0 -0
  142. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  143. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/image_processor.py +0 -0
  144. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/session_controller.py +0 -0
  145. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/tp_worker.py +0 -0
  146. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  147. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  148. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/metrics/collector.py +0 -0
  149. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/metrics/func_timer.py +0 -0
  150. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/mm_utils.py +0 -0
  151. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  152. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_loader/__init__.py +0 -0
  153. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_loader/loader.py +0 -0
  154. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_loader/utils.py +0 -0
  155. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/model_loader/weight_utils.py +0 -0
  156. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/baichuan.py +0 -0
  157. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/chatglm.py +0 -0
  158. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/commandr.py +0 -0
  159. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/exaone.py +0 -0
  160. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/gemma.py +0 -0
  161. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/gpt2.py +0 -0
  162. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/gpt_bigcode.py +0 -0
  163. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/internlm2.py +0 -0
  164. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/internlm2_reward.py +0 -0
  165. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llama_embedding.py +0 -0
  166. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/llavavid.py +0 -0
  167. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/minicpm.py +0 -0
  168. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/minicpm3.py +0 -0
  169. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/mistral.py +0 -0
  170. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/mixtral_quant.py +0 -0
  171. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/mllama.py +0 -0
  172. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/olmo.py +0 -0
  173. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/olmo2.py +0 -0
  174. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/phi3_small.py +0 -0
  175. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/qwen.py +0 -0
  176. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/qwen2_vl.py +0 -0
  177. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/registry.py +0 -0
  178. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/stablelm.py +0 -0
  179. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/torch_native_llama.py +0 -0
  180. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/xverse.py +0 -0
  181. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/models/yivl.py +0 -0
  182. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  183. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  184. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  185. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  186. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  187. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  188. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  189. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/srt/sampling/sampling_params.py +0 -0
  190. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/few_shot_gsm8k.py +0 -0
  191. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  192. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/run_eval.py +0 -0
  193. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/runners.py +0 -0
  194. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_common.py +0 -0
  195. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_gpqa.py +0 -0
  196. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_humaneval.py +0 -0
  197. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_math.py +0 -0
  198. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_mgsm.py +0 -0
  199. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/simple_eval_mmlu.py +0 -0
  200. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  201. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/test_activation.py +0 -0
  202. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/test_layernorm.py +0 -0
  203. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang/test/test_programs.py +0 -0
  204. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang.egg-info/dependency_links.txt +0 -0
  205. {sglang-0.4.0.post1 → sglang-0.4.1}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.4.0.post1 → sglang-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.0.post1
+ Version: 0.4.1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
  Requires-Dist: tqdm
  Requires-Dist: numpy
  Requires-Dist: IPython
+ Requires-Dist: setproctitle
  Provides-Extra: runtime-common
  Requires-Dist: aiohttp; extra == "runtime-common"
  Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
  Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
- Requires-Dist: torchao; extra == "runtime-common"
+ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
+ Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: flashinfer>=0.1.6; extra == "srt"
+ Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -311,8 +313,11 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+ | [**Documentation**](https://sgl-project.github.io/)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+ | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
  - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
@@ -353,7 +358,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.4.0.post1 → sglang-0.4.1}/README.md

@@ -12,8 +12,11 @@

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+ | [**Documentation**](https://sgl-project.github.io/)
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+ | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
+ | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

  ## News
  - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
@@ -54,7 +57,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).

{sglang-0.4.0.post1 → sglang-0.4.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.0.post1"
+ version = "0.4.1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -13,7 +13,7 @@ classifiers = [
      "Programming Language :: Python :: 3",
      "License :: OSI Approved :: Apache Software License",
  ]
- dependencies = ["requests", "tqdm", "numpy", "IPython"]
+ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]

  [project.optional-dependencies]
  runtime_common = ["aiohttp", "decord", "fastapi",
@@ -21,9 +21,9 @@ runtime_common = ["aiohttp", "decord", "fastapi",
      "orjson", "outlines>=0.0.44,<0.1.0",
      "packaging", "pillow", "prometheus-client>=0.20.0",
      "psutil", "pydantic", "python-multipart",
-     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
-     "xgrammar>=0.1.4"]
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer>=0.1.6"]
+     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
+     "xgrammar>=0.1.6"]
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post8"]

  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
  srt_xpu = ["sglang[runtime_common]"]
  #For Intel Gaudi(device : hpu) follow the installation guide
  #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
- srt_hpu = ["sglang[runtime_common]"]
+ srt_hpu = ["sglang[runtime_common]"]

  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+
  dev = ["sglang[all]", "sglang[test]"]
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
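
The notable dependency changes in 0.4.1: setproctitle joins the base dependencies, torchao and xgrammar gain minimum versions, flashinfer is pinned exactly to 0.1.6, and sgl-kernel appears as a new "srt" dependency. A quick way to spot-check what actually landed in an environment after upgrading (a minimal sketch; the distribution names follow the pyproject entries above):

    from importlib.metadata import PackageNotFoundError, version

    # Distribution names taken from the pyproject entries above.
    for pkg in ("sglang", "flashinfer", "sgl-kernel", "torchao", "xgrammar", "setproctitle"):
        try:
            print(pkg, version(pkg))
        except PackageNotFoundError:
            print(pkg, "not installed")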

{sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_offline_throughput.py

@@ -201,18 +201,17 @@ def throughput_test_once(
          for r in reqs
      ]

-     st = time.perf_counter()
      if profile:
          backend.start_profile()

+     st = time.perf_counter()
      gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+     latency = time.perf_counter() - st

      if profile:
          backend.stop_profile()
          monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))

-     latency = time.perf_counter() - st
-
      if backend_name == "runtime":
          gen_out = json.loads(gen_out)
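
The reordering above narrows the timed region: the timer now starts after the profiler is enabled, and latency is captured immediately after generate() returns, so profiler start-up and trace handling no longer inflate the measured throughput. A minimal sketch of the corrected pattern (backend stands in for the benchmark's engine or runtime object):

    import time

    def timed_generate(backend, prompt, sampling_params, profile=False):
        if profile:
            backend.start_profile()  # excluded from the timed region

        st = time.perf_counter()
        out = backend.generate(prompt=prompt, sampling_params=sampling_params)
        latency = time.perf_counter() - st  # captured before profiler teardown

        if profile:
            backend.stop_profile()
        return out, latency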

@@ -285,7 +284,7 @@ def throughput_test(
      else:
          raise ValueError('Please set backend to either "engine" or "runtime"')

-     tokenizer_id = server_args.model_path
+     tokenizer_id = server_args.tokenizer_path or server_args.model_path
      tokenizer = get_tokenizer(tokenizer_id)

      # Set global environmnets
@@ -304,8 +303,8 @@
      warmup_requests = sample_random_requests(
          input_len=256,
          output_len=16,
-         num_prompts=16,
-         range_ratio=0.8,
+         num_prompts=min(bench_args.num_prompts, 16),
+         range_ratio=1.0,
          tokenizer=tokenizer,
          dataset_path=bench_args.dataset_path,
      )
@@ -321,6 +320,7 @@
          extra_request_body=extra_request_body,
          profile=False,
      )
+     time.sleep(0.5)

      logging.info("\nBenchmark...")
      result = throughput_test_once(

{sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_one_batch.py

@@ -385,6 +385,7 @@ def latency_test(
          8, # shorter decoding to speed up the warmup
          server_args.device,
      )
+
      rank_print("Benchmark ...")

      # Run the sweep

{sglang-0.4.0.post1 → sglang-0.4.1}/sglang/bench_serving.py

@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
          },
          "stream": not args.disable_stream,
          "lora_path": request_func_input.lora_name,
+         "return_logprob": args.return_logprob,
+         "logprob_start_len": -1,
          **request_func_input.extra_request_body,
      }
      headers = {}
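
The two new payload fields correspond to the --return-logprob flag registered near the end of this file. Sent by hand, the same request looks roughly like this (a sketch against a locally launched server; the URL and prompt are placeholder assumptions, and the logprob fields mirror the diff above):

    import requests

    payload = {
        "text": "The capital of France is",  # placeholder prompt
        "sampling_params": {"temperature": 0, "max_new_tokens": 8},
        "stream": False,
        "return_logprob": True,
        "logprob_start_len": -1,  # same value the benchmark sends
    }
    resp = requests.post("http://localhost:30000/generate", json=payload)
    print(resp.json())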

@@ -911,7 +913,7 @@ async def benchmark(
          prompt=test_prompt,
          api_url=api_url,
          prompt_len=test_prompt_len,
-         output_len=test_output_len,
+         output_len=min(test_output_len, 32),
          lora_name=lora_name,
          extra_request_body=extra_request_body,
      )
@@ -922,6 +924,7 @@
          f"are correctly specified. Error: {test_output.error}"
      )
  else:
+     requests.post(base_url + "/flush_cache")
      print("Initial test run completed. Starting main benchmark run...")

  time.sleep(1.5)
@@ -1413,6 +1416,11 @@ if __name__ == "__main__":
      action="store_true",
      help="Disable ignoring EOS.",
  )
+ parser.add_argument(
+     "--return-logprob",
+     action="store_true",
+     help="Return logprob.",
+ )
  parser.add_argument(
      "--extra-request-body",
      metavar='{"key1": "value1", "key2": "value2"}',

sglang-0.4.1/sglang/check_env.py (new file)

@@ -0,0 +1,305 @@
+ """Check environment configurations and dependency versions."""
+
+ import importlib
+ import os
+ import resource
+ import subprocess
+ import sys
+ from collections import OrderedDict, defaultdict
+
+ import torch
+
+ from sglang.srt.utils import is_hip
+
+
+ def is_cuda_v2():
+     return torch.version.cuda is not None
+
+
+ # List of packages to check versions
+ PACKAGE_LIST = [
+     "sglang",
+     "flashinfer",
+     "triton",
+     "transformers",
+     "torchao",
+     "numpy",
+     "aiohttp",
+     "fastapi",
+     "hf_transfer",
+     "huggingface_hub",
+     "interegular",
+     "modelscope",
+     "orjson",
+     "outlines",
+     "packaging",
+     "psutil",
+     "pydantic",
+     "multipart",
+     "zmq",
+     "torchao",
+     "uvicorn",
+     "uvloop",
+     "vllm",
+     "xgrammar",
+     "openai",
+     "tiktoken",
+     "anthropic",
+     "litellm",
+     "decord",
+ ]
+
+
+ def get_package_versions(packages):
+     """
+     Get versions of specified packages.
+     """
+     versions = {}
+     for package in packages:
+         package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
+         try:
+             module = importlib.import_module(package_name)
+             if hasattr(module, "__version__"):
+                 versions[package_name] = module.__version__
+         except ModuleNotFoundError:
+             versions[package_name] = "Module Not Found"
+     return versions
+
+
+ def get_cuda_info():
+     """
+     Get CUDA-related information if available.
+     """
+     if is_cuda_v2():
+         cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+         if cuda_info["CUDA available"]:
+             cuda_info.update(_get_gpu_info())
+             cuda_info.update(_get_cuda_version_info())
+
+         return cuda_info
+     elif is_hip():
+         cuda_info = {"ROCM available": torch.cuda.is_available()}
+
+         if cuda_info["ROCM available"]:
+             cuda_info.update(_get_gpu_info())
+             cuda_info.update(_get_cuda_version_info())
+
+         return cuda_info
+
+
+ def _get_gpu_info():
+     """
+     Get information about available GPUs.
+     """
+     devices = defaultdict(list)
+     capabilities = defaultdict(list)
+     for k in range(torch.cuda.device_count()):
+         devices[torch.cuda.get_device_name(k)].append(str(k))
+         capability = torch.cuda.get_device_capability(k)
+         capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
+
+     gpu_info = {}
+     for name, device_ids in devices.items():
+         gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+     if len(capabilities) == 1:
+         # All GPUs have the same compute capability
+         cap, gpu_ids = list(capabilities.items())[0]
+         gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+     else:
+         # GPUs have different compute capabilities
+         for cap, gpu_ids in capabilities.items():
+             gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+     return gpu_info
+
+
+ def _get_cuda_version_info():
+     """
+     Get CUDA version information.
+     """
+     if is_cuda_v2():
+         from torch.utils.cpp_extension import CUDA_HOME
+
+         cuda_info = {"CUDA_HOME": CUDA_HOME}
+
+         if CUDA_HOME and os.path.isdir(CUDA_HOME):
+             cuda_info.update(_get_nvcc_info())
+             cuda_info.update(_get_cuda_driver_version())
+
+         return cuda_info
+     elif is_hip():
+         from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+         cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+         if ROCM_HOME and os.path.isdir(ROCM_HOME):
+             cuda_info.update(_get_nvcc_info())
+             cuda_info.update(_get_cuda_driver_version())
+
+         return cuda_info
+     else:
+         cuda_info = {"CUDA_HOME": ""}
+         return cuda_info
+
+
+ def _get_nvcc_info():
+     """
+     Get NVCC version information.
+     """
+     if is_cuda_v2():
+         from torch.utils.cpp_extension import CUDA_HOME
+
+         try:
+             nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+             nvcc_output = (
+                 subprocess.check_output(f'"{nvcc}" -V', shell=True)
+                 .decode("utf-8")
+                 .strip()
+             )
+             return {
+                 "NVCC": nvcc_output[
+                     nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
+                         "Build"
+                     )
+                 ].strip()
+             }
+         except subprocess.SubprocessError:
+             return {"NVCC": "Not Available"}
+     elif is_hip():
+         from torch.utils.cpp_extension import ROCM_HOME
+
+         try:
+             hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
+             hipcc_output = (
+                 subprocess.check_output(f'"{hipcc}" --version', shell=True)
+                 .decode("utf-8")
+                 .strip()
+             )
+             return {
+                 "HIPCC": hipcc_output[
+                     hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
+                 ].strip()
+             }
+         except subprocess.SubprocessError:
+             return {"HIPCC": "Not Available"}
+     else:
+         return {"NVCC": "Not Available"}
+
+
+ def _get_cuda_driver_version():
+     """
+     Get CUDA driver version.
+     """
+     versions = set()
+     if is_cuda_v2():
+         try:
+             output = subprocess.check_output(
+                 [
+                     "nvidia-smi",
+                     "--query-gpu=driver_version",
+                     "--format=csv,noheader,nounits",
+                 ]
+             )
+             versions = set(output.decode().strip().split("\n"))
+             if len(versions) == 1:
+                 return {"CUDA Driver Version": versions.pop()}
+             else:
+                 return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+         except subprocess.SubprocessError:
+             return {"CUDA Driver Version": "Not Available"}
+     elif is_hip():
+         try:
+             output = subprocess.check_output(
+                 [
+                     "rocm-smi",
+                     "--showdriverversion",
+                     "--csv",
+                 ]
+             )
+             versions = set(output.decode().strip().split("\n"))
+             versions.discard("name, value")
+             ver = versions.pop()
+             ver = ver.replace('"Driver version", ', "").replace('"', "")
+
+             return {"ROCM Driver Version": ver}
+         except subprocess.SubprocessError:
+             return {"ROCM Driver Version": "Not Available"}
+     else:
+         return {"CUDA Driver Version": "Not Available"}
+
+
+ def get_gpu_topology():
+     """
+     Get GPU topology information.
+     """
+     if is_cuda_v2():
+         try:
+             result = subprocess.run(
+                 ["nvidia-smi", "topo", "-m"],
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 check=True,
+             )
+             return "\n" + result.stdout if result.returncode == 0 else None
+         except subprocess.SubprocessError:
+             return None
+     elif is_hip():
+         try:
+             result = subprocess.run(
+                 ["rocm-smi", "--showtopotype"],
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 check=True,
+             )
+             return "\n" + result.stdout if result.returncode == 0 else None
+         except subprocess.SubprocessError:
+             return None
+     else:
+         return None
+
+
+ def get_hypervisor_vendor():
+     try:
+         output = subprocess.check_output(["lscpu"], text=True)
+         for line in output.split("\n"):
+             if "Hypervisor vendor:" in line:
+                 return line.split(":")[1].strip()
+         return None
+     except:
+         return None
+
+
+ def check_env():
+     """
+     Check and print environment information.
+     """
+     env_info = OrderedDict()
+     env_info["Python"] = sys.version.replace("\n", "")
+     env_info.update(get_cuda_info())
+     env_info["PyTorch"] = torch.__version__
+     env_info.update(get_package_versions(PACKAGE_LIST))
+
+     gpu_topo = get_gpu_topology()
+     if gpu_topo:
+         if is_cuda_v2():
+             env_info["NVIDIA Topology"] = gpu_topo
+         elif is_hip():
+             env_info["AMD Topology"] = gpu_topo
+
+     hypervisor_vendor = get_hypervisor_vendor()
+     if hypervisor_vendor:
+         env_info["Hypervisor vendor"] = hypervisor_vendor
+
+     ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+     env_info["ulimit soft"] = ulimit_soft
+
+     for k, v in env_info.items():
+         print(f"{k}: {v}")
+
+
+ if __name__ == "__main__":
+     check_env()
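
Because of the __main__ guard, the checker can be run directly with python3 -m sglang.check_env; programmatic use is equally small:

    from sglang.check_env import check_env

    # Prints Python, CUDA/ROCm and driver info, versions for everything in
    # PACKAGE_LIST, GPU topology, hypervisor vendor, and the soft ulimit.
    check_env()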

{sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/backend/runtime_endpoint.py

@@ -55,6 +55,7 @@ class RuntimeEndpoint(BaseBackend):
          self.base_url + "/flush_cache",
          api_key=self.api_key,
          verify=self.verify,
+         method="POST",
      )
      self._assert_success(res)
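
This makes the client issue the cache flush as a POST, consistent with the requests.post(base_url + "/flush_cache") call added to bench_serving.py above. Done by hand it amounts to (a sketch; port 30000 is an assumed default for a locally launched server):

    import requests

    resp = requests.post("http://localhost:30000/flush_cache")
    resp.raise_for_status()  # non-2xx means the flush was rejected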

{sglang-0.4.0.post1 → sglang-0.4.1}/sglang/lang/chat_template.py

@@ -320,6 +320,28 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="granite-3-instruct",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "<|start_of_role|>system<|end_of_role|>",
+                 "<|end_of_text|>",
+             ),
+             "user": (
+                 "<|start_of_role|>user<|end_of_role|>",
+                 "<|end_of_text|>",
+             ),
+             "assistant": (
+                 "<|start_of_role|>assistant<|end_of_role|>",
+                 "<|end_of_text|>",
+             ),
+         },
+         stop_str=("<|end_of_text|>",),
+     )
+ )
+

  @register_chat_template_matching_function
  def match_dbrx(model_path: str):

@@ -402,6 +424,16 @@ def match_c4ai_command_r(model_path: str):
      return get_chat_template("c4ai-command-r")


+ @register_chat_template_matching_function
+ def match_granite_instruct(model_path: str):
+     model_path = model_path.lower()
+     # When future versions of Granite are released, this code may
+     # need to be updated. For now, assume that the Granite 3.0
+     # template works across the board.
+     if "granite" in model_path and "instruct" in model_path:
+         return get_chat_template("granite-3-instruct")
+
+
  if __name__ == "__main__":
      messages = [
          {"role": "system", "content": None}, # None means default