sglang 0.4.0__tar.gz → 0.4.0.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. {sglang-0.4.0 → sglang-0.4.0.post2}/PKG-INFO +15 -9
  2. {sglang-0.4.0 → sglang-0.4.0.post2}/README.md +8 -4
  3. {sglang-0.4.0 → sglang-0.4.0.post2}/pyproject.toml +7 -6
  4. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/__init__.py +1 -1
  5. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_offline_throughput.py +18 -6
  6. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_one_batch.py +13 -0
  7. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_serving.py +8 -1
  8. sglang-0.4.0.post2/sglang/check_env.py +305 -0
  9. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/runtime_endpoint.py +1 -0
  10. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/chat_template.py +32 -0
  11. sglang-0.4.0.post2/sglang/llama3_eval.py +316 -0
  12. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/outlines_backend.py +5 -0
  13. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/xgrammar_backend.py +9 -6
  14. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/__init__.py +5 -2
  15. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  16. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/flashinfer_backend.py +22 -5
  17. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/torch_native_backend.py +22 -8
  18. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_backend.py +38 -33
  19. sglang-0.4.0.post2/sglang/srt/layers/attention/triton_ops/decode_attention.py +669 -0
  20. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  21. sglang-0.4.0.post2/sglang/srt/layers/ep_moe/__init__.py +0 -0
  22. sglang-0.4.0.post2/sglang/srt/layers/ep_moe/kernels.py +349 -0
  23. sglang-0.4.0.post2/sglang/srt/layers/ep_moe/layer.py +665 -0
  24. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
  25. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/layer.py +1 -1
  26. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/logits_processor.py +133 -95
  27. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/quantization/__init__.py +2 -47
  28. sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8.py +607 -0
  29. sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8_utils.py +27 -0
  30. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/radix_attention.py +11 -2
  31. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/sampler.py +29 -5
  32. sglang-0.4.0.post2/sglang/srt/layers/torchao_utils.py +108 -0
  33. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/detokenizer_manager.py +37 -17
  34. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/io_struct.py +39 -10
  35. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/schedule_batch.py +39 -24
  36. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/schedule_policy.py +64 -5
  37. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/scheduler.py +236 -197
  38. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tokenizer_manager.py +99 -58
  39. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
  40. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  41. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/chunk_cache.py +2 -2
  42. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/memory_pool.py +5 -1
  43. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/radix_cache.py +12 -2
  44. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/cuda_graph_runner.py +39 -11
  45. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/model_runner.py +24 -9
  46. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_parallel.py +67 -10
  47. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/commandr.py +2 -2
  48. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/deepseek_v2.py +87 -7
  49. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma2.py +34 -0
  50. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma2_reward.py +0 -1
  51. sglang-0.4.0.post2/sglang/srt/models/granite.py +517 -0
  52. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/grok.py +72 -13
  53. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama.py +22 -5
  54. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_classification.py +11 -23
  55. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_reward.py +0 -2
  56. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llava.py +37 -14
  57. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mixtral.py +12 -9
  58. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/phi3_small.py +0 -5
  59. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2.py +20 -0
  60. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2_moe.py +0 -5
  61. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/torch_native_llama.py +0 -5
  62. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/openai_api/adapter.py +4 -0
  63. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/openai_api/protocol.py +9 -4
  64. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/sampling_batch_info.py +9 -8
  65. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/server.py +4 -4
  66. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/server_args.py +62 -13
  67. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/utils.py +57 -10
  68. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_utils.py +3 -2
  69. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/utils.py +10 -3
  70. sglang-0.4.0.post2/sglang/version.py +1 -0
  71. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/PKG-INFO +15 -9
  72. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/SOURCES.txt +7 -0
  73. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/requires.txt +6 -4
  74. sglang-0.4.0/sglang/check_env.py +0 -213
  75. sglang-0.4.0/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -714
  76. sglang-0.4.0/sglang/srt/layers/torchao_utils.py +0 -95
  77. sglang-0.4.0/sglang/version.py +0 -1
  78. {sglang-0.4.0 → sglang-0.4.0.post2}/LICENSE +0 -0
  79. {sglang-0.4.0 → sglang-0.4.0.post2}/setup.cfg +0 -0
  80. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/api.py +0 -0
  81. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_latency.py +0 -0
  82. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/bench_one_batch_server.py +0 -0
  83. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/global_config.py +0 -0
  84. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/__init__.py +0 -0
  85. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/__init__.py +0 -0
  86. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/anthropic.py +0 -0
  87. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/base_backend.py +0 -0
  88. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/litellm.py +0 -0
  89. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/openai.py +0 -0
  90. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/backend/vertexai.py +0 -0
  91. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/choices.py +0 -0
  92. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/compiler.py +0 -0
  93. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/interpreter.py +0 -0
  94. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/ir.py +0 -0
  95. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/lang/tracer.py +0 -0
  96. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/launch_server.py +0 -0
  97. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/launch_server_llavavid.py +0 -0
  98. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/_custom_ops.py +0 -0
  99. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/__init__.py +0 -0
  100. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/device_config.py +0 -0
  101. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/exaone.py +0 -0
  102. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/load_config.py +0 -0
  103. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/model_config.py +0 -0
  104. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/configs/qwen2vl.py +0 -0
  105. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/__init__.py +0 -0
  106. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  107. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  108. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/conversation.py +0 -0
  109. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/__init__.py +0 -0
  110. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/communication_op.py +0 -0
  111. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  112. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  113. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  114. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  115. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  116. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  117. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  118. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  119. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  120. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/parallel_state.py +0 -0
  121. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/distributed/utils.py +0 -0
  122. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/hf_transformers_utils.py +0 -0
  123. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/activation.py +0 -0
  124. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  125. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  126. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/custom_op_util.py +0 -0
  127. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_patch.py +0 -0
  128. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/fused_moe_triton/__init__.py +0 -0
  129. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/layernorm.py +0 -0
  130. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/linear.py +0 -0
  131. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/pooler.py +0 -0
  132. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
  133. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
  134. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  135. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora.py +0 -0
  136. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora_config.py +0 -0
  137. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/lora/lora_manager.py +0 -0
  138. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
  139. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/image_processor.py +0 -0
  140. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/session_controller.py +0 -0
  141. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/managers/tp_worker.py +0 -0
  142. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
  143. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/metrics/collector.py +0 -0
  144. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/metrics/func_timer.py +0 -0
  145. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/mm_utils.py +0 -0
  146. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  147. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/__init__.py +0 -0
  148. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/loader.py +0 -0
  149. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/utils.py +0 -0
  150. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
  151. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/baichuan.py +0 -0
  152. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/chatglm.py +0 -0
  153. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/dbrx.py +0 -0
  154. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/deepseek.py +0 -0
  155. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/exaone.py +0 -0
  156. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gemma.py +0 -0
  157. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gpt2.py +0 -0
  158. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
  159. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/internlm2.py +0 -0
  160. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/internlm2_reward.py +0 -0
  161. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llama_embedding.py +0 -0
  162. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/llavavid.py +0 -0
  163. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/minicpm.py +0 -0
  164. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/minicpm3.py +0 -0
  165. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mistral.py +0 -0
  166. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mixtral_quant.py +0 -0
  167. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/mllama.py +0 -0
  168. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmo.py +0 -0
  169. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmo2.py +0 -0
  170. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/olmoe.py +0 -0
  171. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen.py +0 -0
  172. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/qwen2_vl.py +0 -0
  173. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/registry.py +0 -0
  174. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/stablelm.py +0 -0
  175. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/xverse.py +0 -0
  176. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/xverse_moe.py +0 -0
  177. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/models/yivl.py +0 -0
  178. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  179. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  180. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  181. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  182. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  183. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  184. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/srt/sampling/sampling_params.py +0 -0
  185. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/few_shot_gsm8k.py +0 -0
  186. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  187. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/run_eval.py +0 -0
  188. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/runners.py +0 -0
  189. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_common.py +0 -0
  190. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_gpqa.py +0 -0
  191. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_humaneval.py +0 -0
  192. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_math.py +0 -0
  193. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_mgsm.py +0 -0
  194. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/simple_eval_mmlu.py +0 -0
  195. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  196. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_activation.py +0 -0
  197. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_layernorm.py +0 -0
  198. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang/test/test_programs.py +0 -0
  199. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/dependency_links.txt +0 -0
  200. {sglang-0.4.0 → sglang-0.4.0.post2}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.4.0
3
+ Version: 0.4.0.post2
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -215,6 +215,7 @@ Requires-Dist: requests
215
215
  Requires-Dist: tqdm
216
216
  Requires-Dist: numpy
217
217
  Requires-Dist: IPython
218
+ Requires-Dist: setproctitle
218
219
  Provides-Extra: runtime-common
219
220
  Requires-Dist: aiohttp; extra == "runtime-common"
220
221
  Requires-Dist: decord; extra == "runtime-common"
@@ -232,16 +233,17 @@ Requires-Dist: psutil; extra == "runtime-common"
232
233
  Requires-Dist: pydantic; extra == "runtime-common"
233
234
  Requires-Dist: python-multipart; extra == "runtime-common"
234
235
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
235
- Requires-Dist: torchao; extra == "runtime-common"
236
+ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
237
+ Requires-Dist: gemlite; extra == "runtime-common"
236
238
  Requires-Dist: uvicorn; extra == "runtime-common"
237
239
  Requires-Dist: uvloop; extra == "runtime-common"
238
- Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
240
+ Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
239
241
  Provides-Extra: srt
240
242
  Requires-Dist: sglang[runtime_common]; extra == "srt"
241
243
  Requires-Dist: torch; extra == "srt"
242
- Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
244
+ Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
243
245
  Requires-Dist: cuda-python; extra == "srt"
244
- Requires-Dist: flashinfer>=0.1.6; extra == "srt"
246
+ Requires-Dist: flashinfer==0.1.6; extra == "srt"
245
247
  Provides-Extra: srt-hip
246
248
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
247
249
  Requires-Dist: torch; extra == "srt-hip"
@@ -311,10 +313,14 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
311
313
 
312
314
  --------------------------------------------------------------------------------
313
315
 
314
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
315
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
316
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
317
+ | [**Documentation**](https://sgl-project.github.io/)
318
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
319
+ | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
320
+ | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
316
321
 
317
322
  ## News
323
+ - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
318
324
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
319
325
  - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
320
326
  - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -346,13 +352,13 @@ The core features include:
346
352
  - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
347
353
 
348
354
  ## Benchmark And Performance
349
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
355
+ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
350
356
 
351
357
  ## Roadmap
352
358
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
353
359
 
354
360
  ## Adoption and Sponsorship
355
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
361
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
356
362
 
357
363
  ## Acknowledgment and Citation
358
364
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
@@ -12,10 +12,14 @@
12
12
 
13
13
  --------------------------------------------------------------------------------
14
14
 
15
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
16
- [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
15
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
16
+ | [**Documentation**](https://sgl-project.github.io/)
17
+ | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
18
+ | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
19
+ | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
17
20
 
18
21
  ## News
22
+ - [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
19
23
  - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
20
24
  - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
21
25
  - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -47,13 +51,13 @@ The core features include:
47
51
  - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
48
52
 
49
53
  ## Benchmark And Performance
50
- Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
54
+ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
51
55
 
52
56
  ## Roadmap
53
57
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
54
58
 
55
59
  ## Adoption and Sponsorship
56
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
60
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
57
61
 
58
62
  ## Acknowledgment and Citation
59
63
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.4.0"
7
+ version = "0.4.0.post2"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,7 +13,7 @@ classifiers = [
13
13
  "Programming Language :: Python :: 3",
14
14
  "License :: OSI Approved :: Apache Software License",
15
15
  ]
16
- dependencies = ["requests", "tqdm", "numpy", "IPython"]
16
+ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
17
17
 
18
18
  [project.optional-dependencies]
19
19
  runtime_common = ["aiohttp", "decord", "fastapi",
@@ -21,9 +21,9 @@ runtime_common = ["aiohttp", "decord", "fastapi",
21
21
  "orjson", "outlines>=0.0.44,<0.1.0",
22
22
  "packaging", "pillow", "prometheus-client>=0.20.0",
23
23
  "psutil", "pydantic", "python-multipart",
24
- "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
25
- "xgrammar>=0.1.4"]
26
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python", "flashinfer>=0.1.6"]
24
+ "pyzmq>=25.1.2", "torchao>=0.7.0", "gemlite", "uvicorn", "uvloop",
25
+ "xgrammar>=0.1.6"]
26
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6"]
27
27
 
28
28
  # HIP (Heterogeneous-computing Interface for Portability) for AMD
29
29
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -33,7 +33,7 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
33
33
  srt_xpu = ["sglang[runtime_common]"]
34
34
  #For Intel Gaudi(device : hpu) follow the installation guide
35
35
  #https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
36
- srt_hpu = ["sglang[runtime_common]"]
36
+ srt_hpu = ["sglang[runtime_common]"]
37
37
 
38
38
  openai = ["openai>=1.0", "tiktoken"]
39
39
  anthropic = ["anthropic>=0.20.0"]
@@ -50,6 +50,7 @@ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
50
50
  all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
51
51
  all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
52
52
  all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
53
+
53
54
  dev = ["sglang[all]", "sglang[test]"]
54
55
  dev_hip = ["sglang[all_hip]", "sglang[test]"]
55
56
  dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
@@ -66,7 +66,7 @@ from sglang.version import __version__
66
66
 
67
67
  __all__ += ["__version__"]
68
68
 
69
- # SGL Backends
69
+ # SGLang Backends
70
70
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
71
71
  from sglang.utils import LazyImport
72
72
 
@@ -201,18 +201,17 @@ def throughput_test_once(
201
201
  for r in reqs
202
202
  ]
203
203
 
204
- st = time.perf_counter()
205
204
  if profile:
206
205
  backend.start_profile()
207
206
 
207
+ st = time.perf_counter()
208
208
  gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
209
+ latency = time.perf_counter() - st
209
210
 
210
211
  if profile:
211
212
  backend.stop_profile()
212
213
  monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
213
214
 
214
- latency = time.perf_counter() - st
215
-
216
215
  if backend_name == "runtime":
217
216
  gen_out = json.loads(gen_out)
218
217
 
@@ -285,7 +284,7 @@ def throughput_test(
285
284
  else:
286
285
  raise ValueError('Please set backend to either "engine" or "runtime"')
287
286
 
288
- tokenizer_id = server_args.model_path
287
+ tokenizer_id = server_args.tokenizer_path or server_args.model_path
289
288
  tokenizer = get_tokenizer(tokenizer_id)
290
289
 
291
290
  # Set global environmnets
@@ -304,8 +303,8 @@ def throughput_test(
304
303
  warmup_requests = sample_random_requests(
305
304
  input_len=256,
306
305
  output_len=16,
307
- num_prompts=16,
308
- range_ratio=0.8,
306
+ num_prompts=min(bench_args.num_prompts, 16),
307
+ range_ratio=1.0,
309
308
  tokenizer=tokenizer,
310
309
  dataset_path=bench_args.dataset_path,
311
310
  )
@@ -321,6 +320,19 @@ def throughput_test(
321
320
  extra_request_body=extra_request_body,
322
321
  profile=False,
323
322
  )
323
+ time.sleep(0.5)
324
+
325
+ try:
326
+ import os
327
+ import pwd
328
+
329
+ from gemlite.core import GemLiteLinearTriton
330
+
331
+ GemLiteLinearTriton.cache_config(
332
+ f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
333
+ )
334
+ except ImportError:
335
+ pass
324
336
 
325
337
  logging.info("\nBenchmark...")
326
338
  result = throughput_test_once(
@@ -385,6 +385,19 @@ def latency_test(
385
385
  8, # shorter decoding to speed up the warmup
386
386
  server_args.device,
387
387
  )
388
+
389
+ try:
390
+ import os
391
+ import pwd
392
+
393
+ from gemlite.core import GemLiteLinearTriton
394
+
395
+ GemLiteLinearTriton.cache_config(
396
+ f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
397
+ )
398
+ except ImportError:
399
+ pass
400
+
388
401
  rank_print("Benchmark ...")
389
402
 
390
403
  # Run the sweep
@@ -321,6 +321,8 @@ async def async_request_sglang_generate(
321
321
  },
322
322
  "stream": not args.disable_stream,
323
323
  "lora_path": request_func_input.lora_name,
324
+ "return_logprob": args.return_logprob,
325
+ "logprob_start_len": -1,
324
326
  **request_func_input.extra_request_body,
325
327
  }
326
328
  headers = {}
@@ -911,7 +913,7 @@ async def benchmark(
911
913
  prompt=test_prompt,
912
914
  api_url=api_url,
913
915
  prompt_len=test_prompt_len,
914
- output_len=test_output_len,
916
+ output_len=min(test_output_len, 32),
915
917
  lora_name=lora_name,
916
918
  extra_request_body=extra_request_body,
917
919
  )
@@ -1413,6 +1415,11 @@ if __name__ == "__main__":
1413
1415
  action="store_true",
1414
1416
  help="Disable ignoring EOS.",
1415
1417
  )
1418
+ parser.add_argument(
1419
+ "--return-logprob",
1420
+ action="store_true",
1421
+ help="Return logprob.",
1422
+ )
1416
1423
  parser.add_argument(
1417
1424
  "--extra-request-body",
1418
1425
  metavar='{"key1": "value1", "key2": "value2"}',
@@ -0,0 +1,305 @@
1
+ """Check environment configurations and dependency versions."""
2
+
3
+ import importlib
4
+ import os
5
+ import resource
6
+ import subprocess
7
+ import sys
8
+ from collections import OrderedDict, defaultdict
9
+
10
+ import torch
11
+
12
+ from sglang.srt.utils import is_hip
13
+
14
+
15
def is_cuda_v2():
    """Return True when the installed PyTorch build reports a CUDA version."""
    cuda_version = torch.version.cuda
    return cuda_version is not None
17
+
18
+
19
# Importable module names whose versions are reported by the environment
# check. Each entry must appear only once: the report is keyed by module name,
# so a duplicate only causes a redundant import.
PACKAGE_LIST = [
    "sglang",
    "flashinfer",
    "triton",
    "transformers",
    "torchao",
    "numpy",
    "aiohttp",
    "fastapi",
    "hf_transfer",
    "huggingface_hub",
    "interegular",
    "modelscope",
    "orjson",
    "outlines",
    "packaging",
    "psutil",
    "pydantic",
    "multipart",
    "zmq",
    "uvicorn",
    "uvloop",
    "vllm",
    "xgrammar",
    "openai",
    "tiktoken",
    "anthropic",
    "litellm",
    "decord",
]
51
+
52
+
53
def get_package_versions(packages):
    """
    Get versions of specified packages.

    Args:
        packages: Iterable of importable module names. A trailing ``==``,
            ``>=`` or ``<=`` version specifier is stripped before importing.

    Returns:
        A dict mapping each module name to one of:
        - its ``__version__`` attribute,
        - the installed distribution version of the same name when the
          module has no ``__version__``,
        - ``"Version Unknown"`` when neither is available,
        - ``"Module Not Found"`` when the module is not installed,
        - ``"Import Failed (<reason>)"`` when the module exists but raises
          ``ImportError`` while being imported.
    """
    # Imported locally so the block does not rely on the submodule having
    # been loaded as a side effect of ``import importlib``.
    from importlib import metadata

    versions = {}
    for package in packages:
        package_name = (
            package.split("==")[0].split(">=")[0].split("<=")[0].strip()
        )
        try:
            module = importlib.import_module(package_name)
        except ModuleNotFoundError:
            versions[package_name] = "Module Not Found"
            continue
        except ImportError as exc:
            # A broken install should be reported, not abort the whole check.
            versions[package_name] = f"Import Failed ({exc})"
            continue

        version = getattr(module, "__version__", None)
        if version is None:
            # Fall back to distribution metadata instead of silently dropping
            # the package from the report.
            try:
                version = metadata.version(package_name)
            except metadata.PackageNotFoundError:
                version = "Version Unknown"
        versions[package_name] = version
    return versions
67
+
68
+
69
def get_cuda_info():
    """
    Get CUDA-related information if available.

    Returns:
        A dict that always contains an availability flag: ``"CUDA available"``
        for CUDA builds, ``"ROCM available"`` for HIP builds. When the device
        runtime is available, the GPU and toolkit/driver details are merged
        in. When neither the CUDA nor the HIP branch applies, the dict is
        ``{"CUDA available": False}`` so callers can always ``update()`` from
        the result.
    """
    if is_cuda_v2():
        availability_key = "CUDA available"
    elif is_hip():
        availability_key = "ROCM available"
    else:
        # Previously this path fell through and returned None, which broke
        # callers that merge the result into another dict.
        return {"CUDA available": False}

    cuda_info = {availability_key: torch.cuda.is_available()}
    if cuda_info[availability_key]:
        cuda_info.update(_get_gpu_info())
        cuda_info.update(_get_cuda_version_info())
    return cuda_info
89
+
90
+
91
+ def _get_gpu_info():
92
+ """
93
+ Get information about available GPUs.
94
+ """
95
+ devices = defaultdict(list)
96
+ capabilities = defaultdict(list)
97
+ for k in range(torch.cuda.device_count()):
98
+ devices[torch.cuda.get_device_name(k)].append(str(k))
99
+ capability = torch.cuda.get_device_capability(k)
100
+ capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
101
+
102
+ gpu_info = {}
103
+ for name, device_ids in devices.items():
104
+ gpu_info[f"GPU {','.join(device_ids)}"] = name
105
+
106
+ if len(capabilities) == 1:
107
+ # All GPUs have the same compute capability
108
+ cap, gpu_ids = list(capabilities.items())[0]
109
+ gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
110
+ else:
111
+ # GPUs have different compute capabilities
112
+ for cap, gpu_ids in capabilities.items():
113
+ gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
114
+
115
+ return gpu_info
116
+
117
+
118
def _get_cuda_version_info():
    """
    Get CUDA version information.

    Returns:
        A dict holding the toolkit home (``"CUDA_HOME"`` or ``"ROCM_HOME"``)
        and, when that path is an existing directory, the compiler and driver
        version entries. ``{"CUDA_HOME": ""}`` when neither the CUDA nor the
        HIP branch applies.
    """
    if is_cuda_v2():
        from torch.utils.cpp_extension import CUDA_HOME

        home_key, toolkit_home = "CUDA_HOME", CUDA_HOME
    elif is_hip():
        from torch.utils.cpp_extension import ROCM_HOME

        home_key, toolkit_home = "ROCM_HOME", ROCM_HOME
    else:
        return {"CUDA_HOME": ""}

    info = {home_key: toolkit_home}
    # Only query the compiler/driver when the toolkit directory really exists.
    if toolkit_home and os.path.isdir(toolkit_home):
        info.update(_get_nvcc_info())
        info.update(_get_cuda_driver_version())
    return info
145
+
146
+
147
def _run_version_command(executable, flag):
    """Run ``executable flag`` without a shell and return its stripped stdout."""
    return subprocess.check_output([executable, flag]).decode("utf-8").strip()


def _slice_between_markers(text, start_marker, end_marker):
    """Return ``text`` from the last ``start_marker`` up to the last ``end_marker``.

    A missing start marker falls back to the beginning of the text, and a
    missing (or earlier) end marker falls back to the end of the text, so a
    changed tool output format never yields a truncated or empty slice.
    """
    start = text.rfind(start_marker)
    end = text.rfind(end_marker)
    if start == -1:
        start = 0
    if end == -1 or end < start:
        end = len(text)
    return text[start:end].strip()


def _get_nvcc_info():
    """
    Get NVCC version information.

    Returns:
        ``{"NVCC": <version text>}`` for CUDA builds or
        ``{"HIPCC": <version text>}`` for HIP builds. The value is
        ``"Not Available"`` when the toolkit home is unset, the compiler
        cannot be executed, or it exits with an error.
    """
    if is_cuda_v2():
        from torch.utils.cpp_extension import CUDA_HOME

        if not CUDA_HOME:
            return {"NVCC": "Not Available"}
        try:
            nvcc_output = _run_version_command(
                os.path.join(CUDA_HOME, "bin", "nvcc"), "-V"
            )
        # OSError covers a missing or non-executable binary now that no shell
        # is involved.
        except (OSError, subprocess.SubprocessError):
            return {"NVCC": "Not Available"}
        return {
            "NVCC": _slice_between_markers(
                nvcc_output, "Cuda compilation tools", "Build"
            )
        }
    elif is_hip():
        from torch.utils.cpp_extension import ROCM_HOME

        if not ROCM_HOME:
            return {"HIPCC": "Not Available"}
        try:
            hipcc_output = _run_version_command(
                os.path.join(ROCM_HOME, "bin", "hipcc"), "--version"
            )
        except (OSError, subprocess.SubprocessError):
            return {"HIPCC": "Not Available"}
        return {
            "HIPCC": _slice_between_markers(
                hipcc_output, "HIP version", "AMD clang"
            )
        }
    else:
        return {"NVCC": "Not Available"}
189
+
190
+
191
def _get_cuda_driver_version():
    """
    Get CUDA driver version.

    Returns:
        For CUDA builds: ``{"CUDA Driver Version": v}`` when every GPU reports
        the same driver, or ``{"CUDA Driver Versions": "v1, v2"}`` when they
        differ. For HIP builds: ``{"ROCM Driver Version": v}``. The value is
        ``"Not Available"`` when the query tool is missing, fails, or prints
        nothing usable.
    """
    if is_cuda_v2():
        try:
            output = subprocess.check_output(
                [
                    "nvidia-smi",
                    "--query-gpu=driver_version",
                    "--format=csv,noheader,nounits",
                ]
            )
        # OSError covers a missing nvidia-smi binary (FileNotFoundError),
        # which is not a SubprocessError.
        except (OSError, subprocess.SubprocessError):
            return {"CUDA Driver Version": "Not Available"}

        versions = {
            line.strip() for line in output.decode().splitlines() if line.strip()
        }
        if not versions:
            return {"CUDA Driver Version": "Not Available"}
        if len(versions) == 1:
            return {"CUDA Driver Version": versions.pop()}
        return {"CUDA Driver Versions": ", ".join(sorted(versions))}
    elif is_hip():
        try:
            output = subprocess.check_output(
                [
                    "rocm-smi",
                    "--showdriverversion",
                    "--csv",
                ]
            )
        except (OSError, subprocess.SubprocessError):
            return {"ROCM Driver Version": "Not Available"}

        # Drop blank lines and the CSV header, then prefer the row that names
        # the driver version rather than picking an arbitrary one.
        rows = [
            line.strip()
            for line in output.decode().splitlines()
            if line.strip() and line.strip() != "name, value"
        ]
        if not rows:
            return {"ROCM Driver Version": "Not Available"}
        row = next((r for r in rows if "Driver version" in r), rows[0])
        ver = row.replace('"Driver version", ', "").replace('"', "")
        return {"ROCM Driver Version": ver}
    else:
        return {"CUDA Driver Version": "Not Available"}
231
+
232
+
233
def get_gpu_topology():
    """
    Get GPU topology information.

    Returns:
        The topology tool's stdout prefixed with a newline, or ``None`` when
        neither the CUDA nor the HIP branch applies, the tool is not
        installed, or it exits with an error.
    """
    if is_cuda_v2():
        command = ["nvidia-smi", "topo", "-m"]
    elif is_hip():
        command = ["rocm-smi", "--showtopotype"]
    else:
        return None

    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    # OSError covers a missing binary (FileNotFoundError), e.g. a CUDA build
    # of torch on a machine without the NVIDIA driver tools.
    except (OSError, subprocess.SubprocessError):
        return None
    # check=True already guarantees a zero return code here.
    return "\n" + result.stdout
263
+
264
+
265
def get_hypervisor_vendor():
    """
    Get the hypervisor vendor reported by ``lscpu``.

    Returns:
        The text after ``"Hypervisor vendor:"`` on the first matching line of
        the ``lscpu`` output, or ``None`` when there is no such line or
        ``lscpu`` cannot be run.
    """
    try:
        output = subprocess.check_output(["lscpu"], text=True)
    # Best-effort lookup: swallow only tool/decoding failures, so that
    # KeyboardInterrupt and SystemExit still propagate.
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        return None

    for line in output.splitlines():
        if "Hypervisor vendor:" in line:
            return line.split(":", 1)[1].strip()
    return None
274
+
275
+
276
def check_env():
    """
    Check and print environment information.

    Collects the Python version, CUDA/ROCm details, the PyTorch version, the
    versions of the packages in ``PACKAGE_LIST``, the GPU topology, the
    hypervisor vendor and the soft open-file limit, then prints one
    ``key: value`` line per entry in insertion order.
    """
    env_info = OrderedDict()
    env_info["Python"] = sys.version.replace("\n", "")
    # Tolerate a None result (no CUDA/ROCm branch taken) so the report still
    # prints on CPU-only builds instead of failing in update().
    env_info.update(get_cuda_info() or {})
    env_info["PyTorch"] = torch.__version__
    env_info.update(get_package_versions(PACKAGE_LIST))

    gpu_topo = get_gpu_topology()
    if gpu_topo:
        if is_cuda_v2():
            env_info["NVIDIA Topology"] = gpu_topo
        elif is_hip():
            env_info["AMD Topology"] = gpu_topo

    hypervisor_vendor = get_hypervisor_vendor()
    if hypervisor_vendor:
        env_info["Hypervisor vendor"] = hypervisor_vendor

    # Only the soft limit is reported; the hard limit is discarded.
    ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
    env_info["ulimit soft"] = ulimit_soft

    for k, v in env_info.items():
        print(f"{k}: {v}")


if __name__ == "__main__":
    check_env()
@@ -55,6 +55,7 @@ class RuntimeEndpoint(BaseBackend):
55
55
  self.base_url + "/flush_cache",
56
56
  api_key=self.api_key,
57
57
  verify=self.verify,
58
+ method="POST",
58
59
  )
59
60
  self._assert_success(res)
60
61
 
@@ -320,6 +320,28 @@ register_chat_template(
320
320
  )
321
321
  )
322
322
 
323
# Granite 3 instruct template: every role is wrapped in
# <|start_of_role|>{role}<|end_of_role|> ... <|end_of_text|>, and generation
# stops at <|end_of_text|>.
register_chat_template(
    ChatTemplate(
        name="granite-3-instruct",
        default_system_prompt=None,
        role_prefix_and_suffix={
            role: (
                f"<|start_of_role|>{role}<|end_of_role|>",
                "<|end_of_text|>",
            )
            for role in ("system", "user", "assistant")
        },
        stop_str=("<|end_of_text|>",),
    )
)
344
+
323
345
 
324
346
  @register_chat_template_matching_function
325
347
  def match_dbrx(model_path: str):
@@ -402,6 +424,16 @@ def match_c4ai_command_r(model_path: str):
402
424
  return get_chat_template("c4ai-command-r")
403
425
 
404
426
 
427
@register_chat_template_matching_function
def match_granite_instruct(model_path: str):
    """Return the granite-3-instruct template for Granite instruct model paths.

    The match is case-insensitive and requires both "granite" and "instruct"
    to appear in ``model_path``; otherwise ``None`` is returned.
    """
    lowered_path = model_path.lower()
    # When future versions of Granite are released, this code may
    # need to be updated. For now, assume that the Granite 3.0
    # template works across the board.
    if all(keyword in lowered_path for keyword in ("granite", "instruct")):
        return get_chat_template("granite-3-instruct")
    return None
435
+
436
+
405
437
  if __name__ == "__main__":
406
438
  messages = [
407
439
  {"role": "system", "content": None}, # None means default