sglang 0.4.3__py3-none-any.whl → 0.4.3.post2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (44)
  1. sglang/lang/backend/openai.py +5 -0
  2. sglang/lang/chat_template.py +22 -7
  3. sglang/lang/ir.py +1 -0
  4. sglang/srt/configs/__init__.py +6 -3
  5. sglang/srt/configs/model_config.py +2 -0
  6. sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
  7. sglang/srt/entrypoints/engine.py +17 -2
  8. sglang/srt/hf_transformers_utils.py +2 -3
  9. sglang/srt/layers/attention/flashinfer_backend.py +101 -30
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  11. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  12. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  13. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  14. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  15. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  16. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  17. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  18. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  19. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  20. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  21. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  22. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  23. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  24. sglang/srt/managers/image_processor.py +217 -122
  25. sglang/srt/managers/schedule_batch.py +1 -0
  26. sglang/srt/model_executor/forward_batch_info.py +4 -1
  27. sglang/srt/model_executor/model_runner.py +1 -0
  28. sglang/srt/models/deepseek_nextn.py +295 -0
  29. sglang/srt/models/deepseek_v2.py +9 -3
  30. sglang/srt/models/llava.py +2 -1
  31. sglang/srt/models/qwen2_5_vl.py +722 -0
  32. sglang/srt/models/qwen2_vl.py +2 -1
  33. sglang/srt/openai_api/adapter.py +17 -3
  34. sglang/srt/server_args.py +6 -3
  35. sglang/srt/speculative/eagle_worker.py +7 -2
  36. sglang/srt/speculative/spec_info.py +11 -1
  37. sglang/utils.py +99 -19
  38. sglang/version.py +1 -1
  39. {sglang-0.4.3.dist-info → sglang-0.4.3.post2.dist-info}/METADATA +3 -3
  40. {sglang-0.4.3.dist-info → sglang-0.4.3.post2.dist-info}/RECORD +43 -27
  41. sglang/srt/configs/qwen2vl.py +0 -130
  42. {sglang-0.4.3.dist-info → sglang-0.4.3.post2.dist-info}/LICENSE +0 -0
  43. {sglang-0.4.3.dist-info → sglang-0.4.3.post2.dist-info}/WHEEL +0 -0
  44. {sglang-0.4.3.dist-info → sglang-0.4.3.post2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/qwen2vl.py
@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Qwen2VL model configuration"""
-
-import os
-from typing import Union
-
-from transformers import PretrainedConfig
-
-
-class Qwen2VLVisionConfig(PretrainedConfig):
-    model_type = "qwen2_vl"
-
-    def __init__(
-        self,
-        depth=32,
-        embed_dim=1280,
-        hidden_size=3584,
-        hidden_act="quick_gelu",
-        mlp_ratio=4,
-        num_heads=16,
-        in_channels=3,
-        patch_size=14,
-        spatial_merge_size=2,
-        temporal_patch_size=2,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.depth = depth
-        self.embed_dim = embed_dim
-        self.hidden_size = hidden_size
-        self.hidden_act = hidden_act
-        self.mlp_ratio = mlp_ratio
-        self.num_heads = num_heads
-        self.in_channels = in_channels
-        self.patch_size = patch_size
-        self.spatial_merge_size = spatial_merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
-    ) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs
-        )
-
-        if config_dict.get("model_type") == "qwen2_vl":
-            config_dict = config_dict["vision_config"]
-
-        return cls.from_dict(config_dict, **kwargs)
-
-
-class Qwen2VLConfig(PretrainedConfig):
-    model_type = "qwen2_vl"
-
-    def __init__(
-        self,
-        vocab_size=152064,
-        hidden_size=8192,
-        intermediate_size=29568,
-        num_hidden_layers=80,
-        num_attention_heads=64,
-        num_key_value_heads=8,
-        hidden_act="silu",
-        max_position_embeddings=32768,
-        initializer_range=0.02,
-        rms_norm_eps=1e-05,
-        use_cache=True,
-        tie_word_embeddings=False,
-        rope_theta=1000000.0,
-        use_sliding_window=False,
-        sliding_window=4096,
-        max_window_layers=80,
-        attention_dropout=0.0,
-        vision_config=None,
-        rope_scaling=None,
-        **kwargs,
-    ):
-        if isinstance(vision_config, dict):
-            self.vision_config = Qwen2VLVisionConfig(**vision_config)
-        elif vision_config is None:
-            self.vision_config = Qwen2VLVisionConfig()
-
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
-        self.max_window_layers = max_window_layers
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_dropout = attention_dropout
-        self.rope_scaling = rope_scaling
-
-        # NOTE(HandH1998): This is necessary for configuring the `rope_type`` of qwen2vl models after removing dependencies on vllm.
-        if self.rope_scaling is not None and "type" in self.rope_scaling:
-            if self.rope_scaling["type"] == "mrope":
-                self.rope_scaling["type"] = "default"
-            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
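The hunk above removes sglang/srt/configs/qwen2vl.py (item 41 in the file list). Its most distinctive piece is the NOTE(HandH1998) block: the config normalizes rope_scaling so that the "mrope" type shipped in Qwen2-VL checkpoints is rewritten to "default" and mirrored into a rope_type key. A minimal, self-contained sketch of that normalization on a plain dict (the helper name normalize_rope_scaling and the sample mrope_section values are illustrative, not part of sglang):

# Sketch of the rope_scaling normalization done in the removed
# Qwen2VLConfig.__init__ (helper name is illustrative, not an sglang API).
from typing import Optional


def normalize_rope_scaling(rope_scaling: Optional[dict]) -> Optional[dict]:
    # Qwen2-VL checkpoints ship e.g. {"type": "mrope", "mrope_section": [...]};
    # the config remaps "mrope" -> "default" and mirrors the value into "rope_type".
    if rope_scaling is not None and "type" in rope_scaling:
        if rope_scaling["type"] == "mrope":
            rope_scaling["type"] = "default"
        rope_scaling["rope_type"] = rope_scaling["type"]
    return rope_scaling


print(normalize_rope_scaling({"type": "mrope", "mrope_section": [16, 24, 24]}))
# -> {'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}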