sglang 0.1.21__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +976 -0
  7. sglang/check_env.py +171 -0
  8. sglang/global_config.py +3 -2
  9. sglang/lang/backend/__init__.py +0 -0
  10. sglang/lang/backend/anthropic.py +77 -0
  11. sglang/lang/backend/base_backend.py +80 -0
  12. sglang/lang/backend/litellm.py +90 -0
  13. sglang/lang/backend/openai.py +438 -0
  14. sglang/lang/backend/runtime_endpoint.py +283 -0
  15. sglang/lang/backend/vertexai.py +149 -0
  16. sglang/lang/interpreter.py +1 -0
  17. sglang/lang/tracer.py +1 -1
  18. sglang/launch_server.py +1 -1
  19. sglang/launch_server_llavavid.py +1 -4
  20. sglang/srt/conversation.py +1 -1
  21. sglang/srt/hf_transformers_utils.py +13 -1
  22. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  23. sglang/srt/layers/extend_attention.py +0 -39
  24. sglang/srt/layers/linear.py +869 -0
  25. sglang/srt/layers/logits_processor.py +4 -5
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +39 -24
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +72 -28
  31. sglang/srt/managers/controller/infer_batch.py +90 -63
  32. sglang/srt/managers/controller/manager_multi.py +107 -100
  33. sglang/srt/managers/controller/manager_single.py +76 -96
  34. sglang/srt/managers/controller/model_runner.py +41 -26
  35. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  36. sglang/srt/managers/controller/tp_worker.py +136 -149
  37. sglang/srt/managers/detokenizer_manager.py +49 -5
  38. sglang/srt/managers/io_struct.py +36 -17
  39. sglang/srt/managers/tokenizer_manager.py +228 -125
  40. sglang/srt/memory_pool.py +32 -11
  41. sglang/srt/model_loader/model_loader.py +277 -0
  42. sglang/srt/model_loader/utils.py +260 -0
  43. sglang/srt/models/chatglm.py +1 -0
  44. sglang/srt/models/dbrx.py +1 -0
  45. sglang/srt/models/deepseek.py +430 -0
  46. sglang/srt/models/gpt_bigcode.py +282 -0
  47. sglang/srt/models/grok.py +1 -0
  48. sglang/srt/models/internlm2.py +317 -0
  49. sglang/srt/models/llama2.py +81 -23
  50. sglang/srt/models/llama_classification.py +1 -0
  51. sglang/srt/models/llava.py +1 -0
  52. sglang/srt/models/llavavid.py +1 -0
  53. sglang/srt/models/minicpm.py +1 -0
  54. sglang/srt/models/mixtral.py +1 -0
  55. sglang/srt/models/mixtral_quant.py +1 -0
  56. sglang/srt/models/qwen.py +1 -0
  57. sglang/srt/models/qwen2.py +6 -0
  58. sglang/srt/models/qwen2_moe.py +7 -4
  59. sglang/srt/models/stablelm.py +1 -0
  60. sglang/srt/openai_api/adapter.py +432 -0
  61. sglang/srt/openai_api/api_adapter.py +432 -0
  62. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  63. sglang/srt/openai_api/openai_protocol.py +207 -0
  64. sglang/srt/openai_api/protocol.py +208 -0
  65. sglang/srt/openai_protocol.py +17 -0
  66. sglang/srt/sampling_params.py +2 -0
  67. sglang/srt/server.py +132 -84
  68. sglang/srt/server_args.py +35 -21
  69. sglang/srt/utils.py +65 -117
  70. sglang/test/test_conversation.py +1 -1
  71. sglang/test/test_openai_protocol.py +1 -1
  72. sglang/test/test_programs.py +1 -1
  73. sglang/test/test_utils.py +2 -2
  74. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/METADATA +162 -168
  75. sglang-0.1.24.dist-info/RECORD +105 -0
  76. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/WHEEL +1 -1
  77. sglang-0.1.21.dist-info/RECORD +0 -82
  78. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/LICENSE +0 -0
  79. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/top_level.txt +0 -0
@@ -34,12 +34,11 @@ class LogitProcessorOutput:
 @dataclasses.dataclass
 class LogitsMetadata:
     forward_mode: ForwardMode
-    extend_seq_lens: torch.Tensor
-    extend_start_loc: torch.Tensor
-
-    # For logprobs
     return_logprob: bool
-    top_logprobs_nums: List[int]
+
+    extend_seq_lens: torch.Tensor = None
+    extend_start_loc: torch.Tensor = None
+    top_logprobs_nums: List[int] = None
 
     @classmethod
     def from_input_metadata(cls, input_metadata: InputMetadata):
@@ -0,0 +1,49 @@
+# temporarily adapted from vLLM
+# FIXME: in progress of refactoring the model loader
+
+from typing import Dict, Type
+
+from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
+from vllm.model_executor.layers.quantization.awq import AWQConfig
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+    CompressedTensorsConfig,
+)
+from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config
+from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
+
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+
+QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
+    "aqlm": AQLMConfig,
+    "awq": AWQConfig,
+    "deepspeedfp": DeepSpeedFPConfig,
+    "fp8": Fp8Config,
+    # The order of gptq methods is important for config.py iteration over
+    # override_quantization_method(..)
+    "marlin": MarlinConfig,
+    "gptq_marlin_24": GPTQMarlin24Config,
+    "gptq_marlin": GPTQMarlinConfig,
+    "gptq": GPTQConfig,
+    "squeezellm": SqueezeLLMConfig,
+    "compressed-tensors": CompressedTensorsConfig,
+    "bitsandbytes": BitsAndBytesConfig,
+}
+
+
+def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+    if quantization not in QUANTIZATION_METHODS:
+        raise ValueError(f"Invalid quantization method: {quantization}")
+    return QUANTIZATION_METHODS[quantization]
+
+
+__all__ = [
+    "QuantizationConfig",
+    "get_quantization_config",
+    "QUANTIZATION_METHODS",
+]