sglang 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +758 -0
  7. sglang/check_env.py +171 -0
  8. sglang/lang/backend/__init__.py +0 -0
  9. sglang/lang/backend/anthropic.py +77 -0
  10. sglang/lang/backend/base_backend.py +80 -0
  11. sglang/lang/backend/litellm.py +90 -0
  12. sglang/lang/backend/openai.py +438 -0
  13. sglang/lang/backend/runtime_endpoint.py +283 -0
  14. sglang/lang/backend/vertexai.py +149 -0
  15. sglang/lang/tracer.py +1 -1
  16. sglang/launch_server.py +1 -1
  17. sglang/launch_server_llavavid.py +1 -4
  18. sglang/srt/conversation.py +1 -1
  19. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  20. sglang/srt/layers/extend_attention.py +0 -39
  21. sglang/srt/layers/linear.py +869 -0
  22. sglang/srt/layers/quantization/__init__.py +49 -0
  23. sglang/srt/layers/quantization/fp8.py +662 -0
  24. sglang/srt/layers/radix_attention.py +31 -5
  25. sglang/srt/layers/token_attention.py +1 -51
  26. sglang/srt/managers/controller/cuda_graph_runner.py +14 -12
  27. sglang/srt/managers/controller/infer_batch.py +47 -49
  28. sglang/srt/managers/controller/manager_multi.py +107 -100
  29. sglang/srt/managers/controller/manager_single.py +76 -96
  30. sglang/srt/managers/controller/model_runner.py +35 -23
  31. sglang/srt/managers/controller/tp_worker.py +127 -138
  32. sglang/srt/managers/detokenizer_manager.py +49 -5
  33. sglang/srt/managers/io_struct.py +36 -17
  34. sglang/srt/managers/tokenizer_manager.py +228 -125
  35. sglang/srt/memory_pool.py +19 -6
  36. sglang/srt/model_loader/model_loader.py +277 -0
  37. sglang/srt/model_loader/utils.py +260 -0
  38. sglang/srt/models/chatglm.py +1 -0
  39. sglang/srt/models/dbrx.py +1 -0
  40. sglang/srt/models/grok.py +1 -0
  41. sglang/srt/models/internlm2.py +317 -0
  42. sglang/srt/models/llama2.py +65 -16
  43. sglang/srt/models/llama_classification.py +1 -0
  44. sglang/srt/models/llava.py +1 -0
  45. sglang/srt/models/llavavid.py +1 -0
  46. sglang/srt/models/minicpm.py +1 -0
  47. sglang/srt/models/mixtral.py +1 -0
  48. sglang/srt/models/mixtral_quant.py +1 -0
  49. sglang/srt/models/qwen.py +1 -0
  50. sglang/srt/models/qwen2.py +6 -0
  51. sglang/srt/models/qwen2_moe.py +7 -4
  52. sglang/srt/models/stablelm.py +1 -0
  53. sglang/srt/openai_api/adapter.py +432 -0
  54. sglang/srt/openai_api/api_adapter.py +432 -0
  55. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  56. sglang/srt/openai_api/openai_protocol.py +207 -0
  57. sglang/srt/openai_api/protocol.py +208 -0
  58. sglang/srt/openai_protocol.py +17 -0
  59. sglang/srt/sampling_params.py +2 -0
  60. sglang/srt/server.py +113 -84
  61. sglang/srt/server_args.py +23 -15
  62. sglang/srt/utils.py +16 -117
  63. sglang/test/test_conversation.py +1 -1
  64. sglang/test/test_openai_protocol.py +1 -1
  65. sglang/test/test_programs.py +1 -1
  66. sglang/test/test_utils.py +2 -2
  67. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -167
  68. sglang-0.1.22.dist-info/RECORD +103 -0
  69. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
  70. sglang-0.1.21.dist-info/RECORD +0 -82
  71. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
  72. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,49 @@
1
+ # temporarily adapted from vLLM
2
+ # FIXME: the model loader refactoring is still in progress
3
+
4
+ from typing import Dict, Type
5
+
6
+ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
7
+ from vllm.model_executor.layers.quantization.awq import AWQConfig
8
+ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
9
+ from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
10
+ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
11
+ CompressedTensorsConfig,
12
+ )
13
+ from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
14
+ from vllm.model_executor.layers.quantization.gptq import GPTQConfig
15
+ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
16
+ from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config
17
+ from vllm.model_executor.layers.quantization.marlin import MarlinConfig
18
+ from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
19
+
20
+ from sglang.srt.layers.quantization.fp8 import Fp8Config
21
+
22
# Registry mapping each supported quantization method name to its config
# class. get_quantization_config() resolves names against this table.
QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "aqlm": AQLMConfig,
    "awq": AWQConfig,
    "deepspeedfp": DeepSpeedFPConfig,
    "fp8": Fp8Config,
    # NOTE: do not reorder the entries below. The order of the gptq methods
    # is important for config.py, which iterates over this dict when calling
    # override_quantization_method(..).
    "marlin": MarlinConfig,
    "gptq_marlin_24": GPTQMarlin24Config,
    "gptq_marlin": GPTQMarlinConfig,
    "gptq": GPTQConfig,
    "squeezellm": SqueezeLLMConfig,
    "compressed-tensors": CompressedTensorsConfig,
    "bitsandbytes": BitsAndBytesConfig,
}
37
+
38
+
39
def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    """Return the config class registered for a quantization method name.

    Args:
        quantization: Method name; must be a key of ``QUANTIZATION_METHODS``.

    Returns:
        The class stored in ``QUANTIZATION_METHODS`` under ``quantization``.

    Raises:
        ValueError: If ``quantization`` is not a key of
            ``QUANTIZATION_METHODS``.
    """
    # Happy path first: a registered name resolves straight from the registry.
    if quantization in QUANTIZATION_METHODS:
        return QUANTIZATION_METHODS[quantization]

    # Anything else is rejected with the name that was asked for.
    raise ValueError(f"Invalid quantization method: {quantization}")
43
+
44
+
45
# Names exported by this module for `from ... import *`.
__all__ = [
    "QuantizationConfig",
    "get_quantization_config",
    "QUANTIZATION_METHODS",
]