sglang 0.3.1.post2__tar.gz → 0.3.1.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. {sglang-0.3.1.post2/sglang.egg-info → sglang-0.3.1.post3}/PKG-INFO +3 -2
  2. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/README.md +2 -1
  3. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/pyproject.toml +1 -1
  4. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_latency.py +8 -1
  5. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/activation.py +3 -2
  6. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/attention_backend.py +3 -1
  7. sglang-0.3.1.post3/sglang/srt/layers/linear.py +1133 -0
  8. sglang-0.3.1.post3/sglang/srt/layers/quantization/__init__.py +76 -0
  9. sglang-0.3.1.post3/sglang/srt/layers/quantization/base_config.py +122 -0
  10. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/baichuan.py +1 -1
  11. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/chatglm.py +6 -6
  12. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/commandr.py +7 -7
  13. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/dbrx.py +7 -7
  14. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/deepseek.py +7 -7
  15. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/deepseek_v2.py +7 -7
  16. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/exaone.py +6 -6
  17. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gemma.py +6 -6
  18. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gemma2.py +6 -6
  19. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/gpt_bigcode.py +6 -6
  20. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/grok.py +6 -6
  21. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/internlm2.py +6 -6
  22. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama.py +6 -6
  23. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama_classification.py +1 -1
  24. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llava.py +1 -1
  25. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llavavid.py +1 -1
  26. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/minicpm.py +6 -6
  27. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/minicpm3.py +1 -1
  28. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mixtral.py +6 -6
  29. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mixtral_quant.py +6 -6
  30. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/olmoe.py +1 -1
  31. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen.py +6 -6
  32. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen2.py +6 -6
  33. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/qwen2_moe.py +7 -7
  34. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/stablelm.py +6 -6
  35. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/xverse.py +1 -1
  36. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/xverse_moe.py +1 -1
  37. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/yivl.py +1 -1
  38. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/utils.py +21 -1
  39. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_utils.py +4 -2
  40. sglang-0.3.1.post3/sglang/version.py +1 -0
  41. {sglang-0.3.1.post2 → sglang-0.3.1.post3/sglang.egg-info}/PKG-INFO +3 -2
  42. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/SOURCES.txt +3 -0
  43. sglang-0.3.1.post2/sglang/version.py +0 -1
  44. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/LICENSE +0 -0
  45. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/setup.cfg +0 -0
  46. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/__init__.py +0 -0
  47. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/api.py +0 -0
  48. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_server_latency.py +0 -0
  49. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/bench_serving.py +0 -0
  50. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/check_env.py +0 -0
  51. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/global_config.py +0 -0
  52. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/__init__.py +0 -0
  53. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/__init__.py +0 -0
  54. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/anthropic.py +0 -0
  55. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/base_backend.py +0 -0
  56. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/litellm.py +0 -0
  57. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/openai.py +0 -0
  58. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/runtime_endpoint.py +0 -0
  59. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/backend/vertexai.py +0 -0
  60. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/chat_template.py +0 -0
  61. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/choices.py +0 -0
  62. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/compiler.py +0 -0
  63. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/interpreter.py +0 -0
  64. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/ir.py +0 -0
  65. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/lang/tracer.py +0 -0
  66. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/launch_server.py +0 -0
  67. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/launch_server_llavavid.py +0 -0
  68. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/__init__.py +0 -0
  69. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/exaone.py +0 -0
  70. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/configs/model_config.py +0 -0
  71. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/__init__.py +0 -0
  72. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/base_tool_cache.py +0 -0
  73. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/fsm_cache.py +0 -0
  74. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/constrained/jump_forward.py +0 -0
  75. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/conversation.py +0 -0
  76. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/hf_transformers_utils.py +0 -0
  77. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/flashinfer_utils.py +0 -0
  78. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  79. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  80. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/fused_moe/layer.py +0 -0
  81. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/layernorm.py +0 -0
  82. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/logits_processor.py +0 -0
  83. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/pooler.py +0 -0
  84. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/radix_attention.py +0 -0
  85. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/sampler.py +0 -0
  86. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/torchao_utils.py +0 -0
  87. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/decode_attention.py +0 -0
  88. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/extend_attention.py +0 -0
  89. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/layers/triton_attention/prefill_attention.py +0 -0
  90. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora.py +0 -0
  91. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora_config.py +0 -0
  92. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/lora/lora_manager.py +0 -0
  93. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/controller_multi.py +0 -0
  94. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/controller_single.py +0 -0
  95. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/detokenizer_manager.py +0 -0
  96. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/io_struct.py +0 -0
  97. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/policy_scheduler.py +0 -0
  98. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/schedule_batch.py +0 -0
  99. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/tokenizer_manager.py +0 -0
  100. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/managers/tp_worker.py +0 -0
  101. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  102. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  103. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
  104. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/memory_pool.py +0 -0
  105. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mem_cache/radix_cache.py +0 -0
  106. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/mm_utils.py +0 -0
  107. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
  108. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  109. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/model_executor/model_runner.py +0 -0
  110. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/llama_embedding.py +0 -0
  111. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/models/mistral.py +0 -0
  112. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/openai_api/adapter.py +0 -0
  113. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/openai_api/protocol.py +0 -0
  114. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  115. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  116. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  117. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  118. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  119. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  120. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  121. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/sampling/sampling_params.py +0 -0
  122. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/server.py +0 -0
  123. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/srt/server_args.py +0 -0
  124. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/few_shot_gsm8k.py +0 -0
  125. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/run_eval.py +0 -0
  126. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/runners.py +0 -0
  127. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_common.py +0 -0
  128. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_gpqa.py +0 -0
  129. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_humaneval.py +0 -0
  130. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_math.py +0 -0
  131. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_mgsm.py +0 -0
  132. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/simple_eval_mmlu.py +0 -0
  133. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  134. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_activation.py +0 -0
  135. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_layernorm.py +0 -0
  136. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/test/test_programs.py +0 -0
  137. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang/utils.py +0 -0
  138. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/dependency_links.txt +0 -0
  139. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/requires.txt +0 -0
  140. {sglang-0.3.1.post2 → sglang-0.3.1.post3}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.1.post2
3
+ Version: 0.3.1.post3
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
318
318
  ### Method 2: From source
319
319
  ```
320
320
  # Use the last release branch
321
- git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
321
+ git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
322
322
  cd sglang
323
323
 
324
324
  pip install --upgrade pip
@@ -499,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
499
499
  - Llama / Llama 2 / Llama 3 / Llama 3.1
500
500
  - Mistral / Mixtral / Mistral NeMo
501
501
  - Gemma / Gemma 2
502
+ - OLMoE
502
503
  - Qwen / Qwen 2 / Qwen 2 MoE
503
504
  - DeepSeek / DeepSeek 2
504
505
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
60
60
  ### Method 2: From source
61
61
  ```
62
62
  # Use the last release branch
63
- git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
63
+ git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
64
64
  cd sglang
65
65
 
66
66
  pip install --upgrade pip
@@ -241,6 +241,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
241
241
  - Llama / Llama 2 / Llama 3 / Llama 3.1
242
242
  - Mistral / Mixtral / Mistral NeMo
243
243
  - Gemma / Gemma 2
244
+ - OLMoE
244
245
  - Qwen / Qwen 2 / Qwen 2 MoE
245
246
  - DeepSeek / DeepSeek 2
246
247
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.3.1.post2"
7
+ version = "0.3.1.post3"
8
8
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -64,8 +64,13 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
64
64
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
65
65
  from sglang.srt.model_executor.model_runner import ModelRunner
66
66
  from sglang.srt.sampling.sampling_params import SamplingParams
67
+ from sglang.srt.server import _set_envs_and_config
67
68
  from sglang.srt.server_args import ServerArgs
68
- from sglang.srt.utils import kill_child_process, suppress_other_loggers
69
+ from sglang.srt.utils import (
70
+ configure_logger,
71
+ kill_child_process,
72
+ suppress_other_loggers,
73
+ )
69
74
 
70
75
 
71
76
  @dataclasses.dataclass
@@ -341,6 +346,8 @@ def latency_test(
341
346
  bench_args,
342
347
  tp_rank,
343
348
  ):
349
+ configure_logger(server_args, prefix=f" TP{tp_rank}")
350
+ _set_envs_and_config(server_args)
344
351
  rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
345
352
 
346
353
  # Load the model
@@ -31,8 +31,9 @@ from vllm.distributed import (
31
31
  get_tensor_model_parallel_world_size,
32
32
  )
33
33
  from vllm.model_executor.custom_op import CustomOp
34
- from vllm.model_executor.layers.quantization import QuantizationConfig
35
- from vllm.model_executor.utils import set_weight_attrs
34
+
35
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
36
+ from sglang.srt.utils import set_weight_attrs
36
37
 
37
38
  logger = logging.getLogger(__name__)
38
39
 
@@ -346,7 +346,9 @@ class TritonAttnBackend(AttentionBackend):
346
346
 
347
347
  self.decode_attention_fwd = decode_attention_fwd
348
348
  self.extend_attention_fwd = extend_attention_fwd
349
- self.num_head = model_runner.model_config.num_attention_heads
349
+ self.num_head = (
350
+ model_runner.model_config.num_attention_heads // model_runner.tp_size
351
+ )
350
352
 
351
353
  if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
352
354
  self.reduce_dtype = torch.float32