sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (83)
  1. sglang/api.py +13 -1
  2. sglang/bench_latency.py +10 -5
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/global_config.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +60 -49
  7. sglang/lang/chat_template.py +10 -5
  8. sglang/lang/compiler.py +4 -0
  9. sglang/lang/interpreter.py +5 -2
  10. sglang/lang/ir.py +22 -4
  11. sglang/launch_server.py +8 -1
  12. sglang/srt/constrained/jump_forward.py +13 -2
  13. sglang/srt/conversation.py +50 -1
  14. sglang/srt/hf_transformers_utils.py +22 -23
  15. sglang/srt/layers/activation.py +24 -2
  16. sglang/srt/layers/decode_attention.py +338 -50
  17. sglang/srt/layers/extend_attention.py +3 -1
  18. sglang/srt/layers/fused_moe/__init__.py +1 -0
  19. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  20. sglang/srt/layers/fused_moe/layer.py +587 -0
  21. sglang/srt/layers/layernorm.py +3 -0
  22. sglang/srt/layers/logits_processor.py +64 -27
  23. sglang/srt/layers/radix_attention.py +41 -18
  24. sglang/srt/layers/sampler.py +154 -0
  25. sglang/srt/managers/controller_multi.py +2 -8
  26. sglang/srt/managers/controller_single.py +7 -10
  27. sglang/srt/managers/detokenizer_manager.py +20 -9
  28. sglang/srt/managers/io_struct.py +44 -11
  29. sglang/srt/managers/policy_scheduler.py +5 -2
  30. sglang/srt/managers/schedule_batch.py +59 -179
  31. sglang/srt/managers/tokenizer_manager.py +193 -84
  32. sglang/srt/managers/tp_worker.py +131 -50
  33. sglang/srt/mem_cache/memory_pool.py +82 -8
  34. sglang/srt/mm_utils.py +79 -7
  35. sglang/srt/model_executor/cuda_graph_runner.py +97 -28
  36. sglang/srt/model_executor/forward_batch_info.py +188 -82
  37. sglang/srt/model_executor/model_runner.py +269 -87
  38. sglang/srt/models/chatglm.py +6 -14
  39. sglang/srt/models/commandr.py +6 -2
  40. sglang/srt/models/dbrx.py +5 -1
  41. sglang/srt/models/deepseek.py +7 -3
  42. sglang/srt/models/deepseek_v2.py +12 -7
  43. sglang/srt/models/gemma.py +6 -2
  44. sglang/srt/models/gemma2.py +22 -8
  45. sglang/srt/models/gpt_bigcode.py +5 -1
  46. sglang/srt/models/grok.py +66 -398
  47. sglang/srt/models/internlm2.py +5 -1
  48. sglang/srt/models/llama2.py +7 -3
  49. sglang/srt/models/llama_classification.py +2 -2
  50. sglang/srt/models/llama_embedding.py +4 -0
  51. sglang/srt/models/llava.py +176 -59
  52. sglang/srt/models/minicpm.py +7 -3
  53. sglang/srt/models/mixtral.py +61 -255
  54. sglang/srt/models/mixtral_quant.py +6 -5
  55. sglang/srt/models/qwen.py +7 -4
  56. sglang/srt/models/qwen2.py +15 -5
  57. sglang/srt/models/qwen2_moe.py +7 -16
  58. sglang/srt/models/stablelm.py +6 -2
  59. sglang/srt/openai_api/adapter.py +149 -58
  60. sglang/srt/sampling/sampling_batch_info.py +209 -0
  61. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
  62. sglang/srt/server.py +107 -71
  63. sglang/srt/server_args.py +49 -15
  64. sglang/srt/utils.py +27 -18
  65. sglang/test/runners.py +38 -38
  66. sglang/test/simple_eval_common.py +9 -10
  67. sglang/test/simple_eval_gpqa.py +2 -1
  68. sglang/test/simple_eval_humaneval.py +2 -2
  69. sglang/test/simple_eval_math.py +2 -1
  70. sglang/test/simple_eval_mmlu.py +2 -1
  71. sglang/test/test_activation.py +55 -0
  72. sglang/test/test_programs.py +32 -5
  73. sglang/test/test_utils.py +37 -50
  74. sglang/version.py +1 -1
  75. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
  76. sglang-0.2.14.dist-info/RECORD +114 -0
  77. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  78. sglang/launch_server_llavavid.py +0 -29
  79. sglang/srt/model_loader/model_loader.py +0 -292
  80. sglang/srt/model_loader/utils.py +0 -275
  81. sglang-0.2.12.dist-info/RECORD +0 -112
  82. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  83. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
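Two of the renames above change import paths: item 19 turns sglang/srt/layers/fused_moe.py into a package (fused_moe/fused_moe.py, plus a new layer.py and an __init__.py that may re-export the old names), and item 61 moves sampling_params.py under the new sglang/srt/sampling/ package. A minimal compatibility sketch for the latter, assuming downstream code imports the module's SamplingParams class (the class itself is not shown in this diff):

```python
# 0.2.14 layout: the module now lives in the sampling package.
try:
    from sglang.srt.sampling.sampling_params import SamplingParams
except ImportError:
    # 0.2.12-era layout: the module sat at the package root.
    from sglang.srt.sampling_params import SamplingParams
```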
sglang/srt/model_loader/model_loader.py
@@ -1,292 +0,0 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- # temporarily adapted from https://github.com/vllm-project/vllm/blob/10383887e03412196a2689b9398290719c4797bf/vllm/model_executor/model_loader/loader.py
- # FIXME: in progress of refactoring the model loader
-
- import glob
- import os
- import re
- from typing import Any, Dict, Generator, List, Optional, Tuple, Type
-
- import torch
- from torch import nn
- from tqdm import tqdm
- from vllm.config import (
-     CacheConfig,
-     DeviceConfig,
-     LoadConfig,
-     LoadFormat,
-     LoRAConfig,
-     ModelConfig,
-     MultiModalConfig,
-     ParallelConfig,
-     SchedulerConfig,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
- from vllm.model_executor.model_loader.utils import (
-     get_model_architecture,
-     set_default_torch_dtype,
- )
- from vllm.platforms import current_platform
-
- from sglang.srt.model_loader.utils import (
-     download_safetensors_index_file_from_hf,
-     download_weights_from_hf,
-     filter_duplicate_safetensors_files,
-     get_quant_config,
-     safetensors_weights_iterator,
- )
-
-
- def _get_quantization_config(
-     model_config: ModelConfig, load_config: LoadConfig
- ) -> Optional[QuantizationConfig]:
-     """Get the quantization config."""
-     if model_config.quantization is not None:
-         quant_config = get_quant_config(model_config, load_config)
-         capability = current_platform.get_device_capability()
-         capability = capability[0] * 10 + capability[1]
-         if capability < quant_config.get_min_capability():
-             raise ValueError(
-                 f"The quantization method {model_config.quantization} is not "
-                 "supported for the current GPU. "
-                 f"Minimum capability: {quant_config.get_min_capability()}. "
-                 f"Current capability: {capability}."
-             )
-         supported_dtypes = quant_config.get_supported_act_dtypes()
-         if model_config.dtype not in supported_dtypes:
-             raise ValueError(
-                 f"{model_config.dtype} is not supported for quantization "
-                 f"method {model_config.quantization}. Supported dtypes: "
-                 f"{supported_dtypes}"
-             )
-         return quant_config
-     return None
-
-
- def _get_model_initialization_kwargs(
-     model_class: Type[nn.Module],
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
- ) -> Dict[str, Any]:
-     """Get extra kwargs for model initialization."""
-     extra_kwargs: Dict[str, Any] = {}
-
-     assert lora_config is None
-     assert multimodal_config is None
-
-     return extra_kwargs
-
-
- def _initialize_model(
-     model_config: ModelConfig,
-     load_config: LoadConfig,
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
-     cache_config: CacheConfig,
- ) -> nn.Module:
-     """Initialize a model with the given configurations."""
-     model_class = get_model_architecture(model_config)[0]
-     quant_config = _get_quantization_config(model_config, load_config)
-
-     return model_class(
-         config=model_config.hf_config,
-         cache_config=cache_config,
-         quant_config=quant_config,
-         efficient_weight_load=True,
-         **_get_model_initialization_kwargs(model_class, lora_config, multimodal_config),
-     )
-
-
- class ModelLoader:
-     """Model loader that can load different file types from disk."""
-
-     def __init__(self, load_config: LoadConfig):
-         self.load_config = load_config
-
-     def _prepare_weights(
-         self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
-     ) -> Tuple[str, List[str], bool]:
-         """Prepare weights for the model.
-
-         If the model is not local, it will be downloaded."""
-
-         is_local = os.path.isdir(model_name_or_path)
-         load_format = self.load_config.load_format
-         use_safetensors = False
-         # Some quantized models use .pt files for storing the weights.
-         if load_format == LoadFormat.AUTO:
-             allow_patterns = ["*.safetensors", "*.bin"]
-         elif load_format == LoadFormat.SAFETENSORS:
-             use_safetensors = True
-             allow_patterns = ["*.safetensors"]
-         elif load_format == LoadFormat.PT:
-             allow_patterns = ["*.pt"]
-         elif load_format == LoadFormat.NPCACHE:
-             allow_patterns = ["*.bin"]
-         else:
-             raise ValueError(f"Unknown load_format: {load_format}")
-
-         if fall_back_to_pt:
-             allow_patterns += ["*.pt"]
-
-         if not is_local:
-             hf_folder = download_weights_from_hf(
-                 model_name_or_path,
-                 self.load_config.download_dir,
-                 allow_patterns,
-                 revision,
-             )
-         else:
-             hf_folder = model_name_or_path
-
-         hf_weights_files: List[str] = []
-         for pattern in allow_patterns:
-             hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
-             if len(hf_weights_files) > 0:
-                 if pattern == "*.safetensors":
-                     use_safetensors = True
-                 break
-
-         if use_safetensors:
-             # For models like Mistral-7B-Instruct-v0.3
-             # there are both sharded safetensors files and a consolidated
-             # safetensors file. Using both breaks.
-             # Here, we download the `model.safetensors.index.json` and filter
-             # any files not found in the index.
-             if not is_local:
-                 download_safetensors_index_file_from_hf(
-                     model_name_or_path, self.load_config.download_dir, revision
-                 )
-             hf_weights_files = filter_duplicate_safetensors_files(
-                 hf_weights_files, hf_folder
-             )
-         else:
-             hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
-
-         if len(hf_weights_files) == 0:
-             raise RuntimeError(
-                 f"Cannot find any model weights with `{model_name_or_path}`"
-             )
-
-         return hf_folder, hf_weights_files, use_safetensors
-
-     def _get_weights_iterator(
-         self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
-     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-         """Get an iterator for the model weights based on the load format."""
-         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
-             model_name_or_path, revision, fall_back_to_pt
-         )
-         if self.load_config.load_format == LoadFormat.NPCACHE:
-             # Currently np_cache only support *.bin checkpoints
-             assert use_safetensors is False
-             weights_iterator = np_cache_weights_iterator(
-                 model_name_or_path,
-                 self.load_config.download_dir,
-                 hf_folder,
-                 hf_weights_files,
-             )
-         elif use_safetensors:
-             weights_iterator = safetensors_weights_iterator(hf_weights_files)
-         else:
-             weights_iterator = pt_weights_iterator(hf_weights_files)
-
-         return weights_iterator
-
-     def load_model(
-         self,
-         *,
-         model_config: ModelConfig,
-         device_config: DeviceConfig,
-         lora_config: Optional[LoRAConfig],
-         multimodal_config: Optional[MultiModalConfig],
-         parallel_config: ParallelConfig,
-         scheduler_config: SchedulerConfig,
-         cache_config: CacheConfig,
-     ) -> nn.Module:
-         with set_default_torch_dtype(model_config.dtype):
-             with torch.device(device_config.device):
-                 model = _initialize_model(
-                     model_config,
-                     self.load_config,
-                     lora_config,
-                     multimodal_config,
-                     cache_config,
-                 )
-             weights = self._get_weights_iterator(
-                 model_config.model,
-                 model_config.revision,
-                 fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
-             )
-
-             modules = {}
-             for name, module in model.named_modules():
-                 modules[name] = module
-
-             def apply_quant_method(module):
-                 quant_method = getattr(module, "quant_method", None)
-                 if quant_method is not None:
-                     # print("before apply quant", module.weight, module.weight.dtype)
-                     quant_method.process_weights_after_loading(module)
-                     # print("after apply quant", module.weight, module.weight.dtype)
-                 # FIXME: Remove this after Mixtral is updated
-                 # to use quant_method.
-                 if hasattr(module, "process_weights_after_loading"):
-                     module.process_weights_after_loading()
-
-             if torch.cuda.current_device() == 0:
-                 weights = tqdm(
-                     weights, total=model.get_num_params() * 1.5, desc="load model"
-                 )
-
-             num_shard = {}
-             num_loaded = {}
-             for name, loaded_weight in weights:
-                 model.load_weights(None, name, loaded_weight)
-                 module_name, shard_num = model.get_module_name(name)
-                 num_shard[module_name] = shard_num
-                 if module_name not in num_loaded:
-                     num_loaded[module_name] = 1
-                 else:
-                     num_loaded[module_name] += 1
-                 if num_loaded[module_name] == num_shard[module_name]:
-                     apply_quant_method(modules[module_name])
-
-         return model.eval()
-
-
- def get_model(
-     *,
-     model_config: ModelConfig,
-     load_config: LoadConfig,
-     device_config: DeviceConfig,
-     parallel_config: ParallelConfig,
-     scheduler_config: SchedulerConfig,
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
-     cache_config: CacheConfig,
- ) -> nn.Module:
-     loader = ModelLoader(load_config)
-     return loader.load_model(
-         model_config=model_config,
-         device_config=device_config,
-         lora_config=lora_config,
-         multimodal_config=multimodal_config,
-         parallel_config=parallel_config,
-         scheduler_config=scheduler_config,
-         cache_config=cache_config,
-     )
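One detail of the removed load_model worth noting: it counted arriving weight shards per module and ran quantization post-processing as soon as a module's last shard landed, rather than after the whole checkpoint. A self-contained sketch of that bookkeeping; get_module_name and the toy weight names below are stand-ins for the removed model API, not sglang code:

```python
from collections import defaultdict

# Stand-in for the removed model hook: in the real loader,
# model.get_module_name(weight_name) returned (module_name, num_shards).
def get_module_name(weight_name: str) -> tuple:
    module_name = weight_name.rsplit(".", 2)[0]  # strip ".q.weight" etc.
    return module_name, 3                        # e.g. q/k/v -> 3 shards

def process_weights_after_loading(module_name: str) -> None:
    # In the real loader this repacked quantized weights for the module.
    print(f"finalizing {module_name}")

num_shard = {}
num_loaded = defaultdict(int)

# (name, tensor) pairs as yielded by the weights iterator; tensors omitted.
for name in [
    "layers.0.qkv_proj.q.weight",
    "layers.0.qkv_proj.k.weight",
    "layers.0.qkv_proj.v.weight",
]:
    module_name, shard_num = get_module_name(name)
    num_shard[module_name] = shard_num
    num_loaded[module_name] += 1
    if num_loaded[module_name] == num_shard[module_name]:
        # Last shard for this module arrived: safe to post-process it now.
        process_weights_after_loading(module_name)
```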
sglang/srt/model_loader/utils.py
@@ -1,275 +0,0 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- # temporarily adapted from vLLM
- # FIXME: in progress of refactoring the model loader
- """Utilities for selecting and loading models."""
- import contextlib
- import fnmatch
- import hashlib
- import json
- import logging
- import os
- import tempfile
- from typing import Any, Generator, Iterable, List, Optional, Tuple, Type
-
- import filelock
- import huggingface_hub.constants
- import torch
- from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
- from safetensors.torch import load_file, safe_open, save_file
- from torch import nn
- from tqdm.auto import tqdm
- from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
- from vllm.config import LoadConfig, ModelConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
-
- from sglang.srt.layers.quantization import get_quantization_config
-
- logger = logging.getLogger(__name__)
- temp_dir = tempfile.gettempdir()
-
-
- @contextlib.contextmanager
- def set_default_torch_dtype(dtype: torch.dtype):
-     """Sets the default torch dtype to the given dtype."""
-     old_dtype = torch.get_default_dtype()
-     torch.set_default_dtype(dtype)
-     yield
-     torch.set_default_dtype(old_dtype)
-
-
- def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
-     architectures = getattr(model_config.hf_config, "architectures", [])
-     # Special handling for quantized Mixtral.
-     # FIXME(woosuk): This is a temporary hack.
-     if (
-         model_config.quantization is not None
-         and model_config.quantization != "fp8"
-         and "MixtralForCausalLM" in architectures
-     ):
-         architectures = ["QuantMixtralForCausalLM"]
-
-     for arch in architectures:
-         model_cls = ModelRegistry.load_model_cls(arch)
-         if model_cls is not None:
-             return (model_cls, arch)
-     raise ValueError(
-         f"Model architectures {architectures} are not supported for now. "
-         f"Supported architectures: {ModelRegistry.get_supported_archs()}"
-     )
-
-
- class DisabledTqdm(tqdm):
-
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs, disable=True)
-
-
- def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
-     lock_dir = cache_dir or temp_dir
-     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
-     model_name = model_name_or_path.replace("/", "-")
-     hash_name = hashlib.sha256(model_name.encode()).hexdigest()
-     # add hash to avoid conflict with old users' lock files
-     lock_file_name = hash_name + model_name + ".lock"
-     # mode 0o666 is required for the filelock to be shared across users
-     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
-     return lock
-
-
- def download_weights_from_hf(
-     model_name_or_path: str,
-     cache_dir: Optional[str],
-     allow_patterns: List[str],
-     revision: Optional[str] = None,
- ) -> str:
-     """Download model weights from Hugging Face Hub.
-
-     Args:
-         model_name_or_path (str): The model name or path.
-         cache_dir (Optional[str]): The cache directory to store the model
-             weights. If None, will use HF defaults.
-         allow_patterns (List[str]): The allowed patterns for the
-             weight files. Files matched by any of the patterns will be
-             downloaded.
-         revision (Optional[str]): The revision of the model.
-
-     Returns:
-         str: The path to the downloaded model weights.
-     """
-     if not huggingface_hub.constants.HF_HUB_OFFLINE:
-         # Before we download we look at that is available:
-         fs = HfFileSystem()
-         file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
-
-         # depending on what is available we download different things
-         for pattern in allow_patterns:
-             matching = fnmatch.filter(file_list, pattern)
-             if len(matching) > 0:
-                 allow_patterns = [pattern]
-                 break
-
-     logger.info("Using model weights format %s", allow_patterns)
-     # Use file lock to prevent multiple processes from
-     # downloading the same model weights at the same time.
-     with get_lock(model_name_or_path, cache_dir):
-         hf_folder = snapshot_download(
-             model_name_or_path,
-             allow_patterns=allow_patterns,
-             cache_dir=cache_dir,
-             tqdm_class=DisabledTqdm,
-             revision=revision,
-             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-         )
-     return hf_folder
-
-
- def download_safetensors_index_file_from_hf(
-     model_name_or_path: str,
-     cache_dir: Optional[str],
-     revision: Optional[str] = None,
- ) -> None:
-     """Download hf safetensors index file from Hugging Face Hub.
-
-     Args:
-         model_name_or_path (str): The model name or path.
-         cache_dir (Optional[str]): The cache directory to store the model
-             weights. If None, will use HF defaults.
-         revision (Optional[str]): The revision of the model.
-     """
-     # Use file lock to prevent multiple processes from
-     # downloading the same model weights at the same time.
-     with get_lock(model_name_or_path, cache_dir):
-         try:
-             # Download the safetensors index file.
-             hf_hub_download(
-                 repo_id=model_name_or_path,
-                 filename=SAFE_WEIGHTS_INDEX_NAME,
-                 cache_dir=cache_dir,
-                 revision=revision,
-                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-             )
-         # If file not found on remote or locally, we should not fail since
-         # only some models will have SAFE_WEIGHTS_INDEX_NAME.
-         except huggingface_hub.utils.EntryNotFoundError:
-             logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME)
-         except huggingface_hub.utils.LocalEntryNotFoundError:
-             logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME)
-
-
- # For models like Mistral-7B-v0.3, there are both sharded
- # safetensors files and a consolidated safetensors file.
- # Passing both of these to the weight loader functionality breaks.
- # So, we use the SAFE_WEIGHTS_INDEX_NAME to
- # look up which safetensors files should be used.
- def filter_duplicate_safetensors_files(
-     hf_weights_files: List[str], hf_folder: str
- ) -> List[str]:
-     # model.safetensors.index.json is a mapping from keys in the
-     # torch state_dict to safetensors file holding that weight.
-     index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME)
-     if not os.path.isfile(index_file_name):
-         return hf_weights_files
-
-     # Iterate through the weight_map (weight_name: safetensors files)
-     # to identify weights that we should use.
-     with open(index_file_name) as index_file:
-         weight_map = json.load(index_file)["weight_map"]
-         weight_files_in_index = set()
-         for weight_name in weight_map:
-             weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
-     # Filter out any fields that are not found in the index file.
-     hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
-     return hf_weights_files
-
-
- def safetensors_weights_iterator(
-     hf_weights_files: List[str],
- ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-     """Iterate over the weights in the model safetensor files."""
-     for st_file in hf_weights_files:
-         with safe_open(st_file, framework="pt") as f:
-             for name in f.keys():  # noqa: SIM118
-                 param = f.get_tensor(name)
-                 yield name, param
-
-
- def get_quant_config(
-     model_config: ModelConfig, load_config: LoadConfig
- ) -> QuantizationConfig:
-     quant_cls = get_quantization_config(model_config.quantization)
-     # Read the quantization config from the HF model config, if available.
-     hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
-     if hf_quant_config is None:
-         # compressed-tensors uses a compressions_config
-         hf_quant_config = getattr(model_config.hf_config, "compression_config", None)
-     if hf_quant_config is not None:
-         return quant_cls.from_config(hf_quant_config)
-     # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
-     if model_config.quantization == "bitsandbytes":
-         if (
-             not load_config.model_loader_extra_config
-             or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
-         ):
-             return quant_cls.from_config({"adapter_name_or_path": ""})
-         model_name_or_path = load_config.model_loader_extra_config[
-             "qlora_adapter_name_or_path"
-         ]
-
-     else:
-         model_name_or_path = model_config.model
-     is_local = os.path.isdir(model_name_or_path)
-     if not is_local:
-         # Download the config files.
-         with get_lock(model_name_or_path, load_config.download_dir):
-             hf_folder = snapshot_download(
-                 model_name_or_path,
-                 revision=model_config.revision,
-                 allow_patterns="*.json",
-                 cache_dir=load_config.download_dir,
-                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                 tqdm_class=DisabledTqdm,
-             )
-     else:
-         hf_folder = model_name_or_path
-
-     possible_config_filenames = quant_cls.get_config_filenames()
-
-     # If the quantization config is not found, use the default config.
-     if not possible_config_filenames:
-         return quant_cls()
-
-     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
-
-     quant_config_files = [
-         f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
-     ]
-     if len(quant_config_files) == 0:
-         raise ValueError(f"Cannot find the config file for {model_config.quantization}")
-     if len(quant_config_files) > 1:
-         raise ValueError(
-             f"Found multiple config files for {model_config.quantization}: "
-             f"{quant_config_files}"
-         )
-
-     quant_config_file = quant_config_files[0]
-     with open(quant_config_file, "r") as f:
-         config = json.load(f)
-
-     if model_config.quantization == "bitsandbytes":
-         config["adapter_name_or_path"] = model_name_or_path
-
-     return quant_cls.from_config(config)
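The piece of the removed utils.py that downstream code interacted with most is set_default_torch_dtype: it lets a model be constructed with, say, float16 parameters without threading a dtype argument through every layer. A minimal usage sketch (the nn.Linear model here is illustrative, not sglang code):

```python
import contextlib

import torch
from torch import nn

@contextlib.contextmanager
def set_default_torch_dtype(dtype: torch.dtype):
    """Temporarily set the default torch dtype, as in the removed utils.py."""
    old_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    yield
    torch.set_default_dtype(old_dtype)

with set_default_torch_dtype(torch.float16):
    layer = nn.Linear(8, 8)  # parameters are created as float16
assert layer.weight.dtype == torch.float16
assert torch.get_default_dtype() == torch.float32  # restored (torch's default)
```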
sglang-0.2.12.dist-info/RECORD
@@ -1,112 +0,0 @@
- sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
- sglang/api.py,sha256=gAY9JhqWXjrYoWnMvR-iiuuY1YSN94We-lc1LH0z3cw,6030
- sglang/bench_latency.py,sha256=E-cfuZSjBGonzKL0LgB0zAqMWpiP3qozB_Ht9dH8qvc,16207
- sglang/bench_serving.py,sha256=sS-fawAyzngrOVbPE3N1FBxPojoPd9vj9XQDsWpIYTQ,35798
- sglang/check_env.py,sha256=oU8VmjjPK2SviRhr41cF1953soBu-eTT5E0Hf04zMzo,4974
- sglang/global_config.py,sha256=9JxaFkBKSgep6BVeEl_kx9tuW9PqdijYELyBGTryl6o,1704
- sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
- sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
- sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
- sglang/version.py,sha256=X4KG3FscE5AhbGbcdDDgdDC550CVpxNMwdNLcx6EQ7M,23
- sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
- sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
- sglang/lang/compiler.py,sha256=1Tc6MQs4RsIfrNmmO7PMSUEHIqvNqKOp_HxaYqonwFE,7533
- sglang/lang/interpreter.py,sha256=3RIeSGdKlKTq2Ixg_Tyo0fGEDTvBKS2f9FaJYODBHzA,30102
- sglang/lang/ir.py,sha256=Ow6jXDPIeRd1piAuYjvgyFxfro1G2_-1QwUFfq4Aihs,16842
- sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
- sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
- sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
- sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
- sglang/lang/backend/runtime_endpoint.py,sha256=AaBc5yczchX7mkwiKDMyjLjBkJsh2Lubrfd9lvCOlDo,9544
- sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
- sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
- sglang/srt/hf_transformers_utils.py,sha256=Tf_RplcW7llVXsigRvSGqmeAUxBeAL8rPCkzuqWfZ8U,11925
- sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
- sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
- sglang/srt/sampling_params.py,sha256=5V1MhhEvyCWZrCF5VmQxcKNuKVoC4LynY-q4Bx3P3mo,4876
- sglang/srt/server.py,sha256=FvczPB9ojDVLIdC2kic0RLAmOTt0WZrql_BvYzwbeRY,18495
- sglang/srt/server_args.py,sha256=GLuJkgwv-Osmf3IqCvZqfdqIBJjcHkdtoNT0_zq75Kc,16849
- sglang/srt/utils.py,sha256=ReJqGMdquK_cfve269yjpWWQaozTVoEHSLG5P3CKvAg,24102
- sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
- sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
- sglang/srt/constrained/fsm_cache.py,sha256=QTrBFoZCp2FeigtIakz2MCgQLtvQFXgl2lDPQaGtu9M,2784
- sglang/srt/constrained/jump_forward.py,sha256=IgZ8D0woy5FLIQvXkE8wZRYejDsfVkjU0sqUlkiv_f4,6193
- sglang/srt/layers/activation.py,sha256=MXkuGi5caKHEwqUegoEfOk2Omab8OLrxP-sjPj2TVzU,1197
- sglang/srt/layers/decode_attention.py,sha256=Vgxd2rWzSZkNFp0bjZRAUAusG4bz6iy3D0CULnN-cdk,8904
- sglang/srt/layers/extend_attention.py,sha256=_LOgzSr-1c2UweHZXADjWHbXOmd2JPm-tUMb1vwTTZI,14197
- sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
- sglang/srt/layers/layernorm.py,sha256=RzN4eESN9S8mw32r2Nxarq7wKFdeG1yhxPmehUMx79s,2073
- sglang/srt/layers/logits_processor.py,sha256=iewPk7VR4jdJeLH6NAO_XqwqM4RhIHdWJzj7-qPRYIw,11362
- sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
- sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
- sglang/srt/layers/radix_attention.py,sha256=LpfTizXKXm1oS5oUfh6aowZceHUHqnquvx-GpfyYjdk,7508
- sglang/srt/managers/controller_multi.py,sha256=LYI-XE9h57DW8Uh4gpd8upsC3p2dd5weKzddEH274jg,6626
- sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk4SwANKxTX-Y,5112
- sglang/srt/managers/detokenizer_manager.py,sha256=OXufjdCt2ebt-S7MDndjY9Ew16rP4fhualGgj6YEKp0,6295
- sglang/srt/managers/io_struct.py,sha256=Xvfl6DNZ2Ek2S4qlRzpVo3foc-aC-1-N-5odcJ4gdq4,9446
- sglang/srt/managers/policy_scheduler.py,sha256=KRFaZwjCAkPQDX3W8lbzrxYqgOe7LKFDj2BPlcmlnR8,8379
- sglang/srt/managers/schedule_batch.py,sha256=iZ2OwdEn5As7cVGAoe0x97cMCPSS6q_SI_iG79mF8LQ,31111
- sglang/srt/managers/tokenizer_manager.py,sha256=TIIo4YlfdM10LE4JVqv2cO2uDJJtKXDagwzfjMCDU5Q,24858
- sglang/srt/managers/tp_worker.py,sha256=qOx99QL6BIW0aOz7SknWqgflLeNeFYpJsGq0ZsYmYFY,32805
- sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
- sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
- sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
- sglang/srt/mem_cache/memory_pool.py,sha256=eXDCstd5Mvu1CbHt1y9z27Eq60QYwW45FsKbZspu4yw,5310
- sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=xQgTTtoMkvYJhYyRJHxPdybmPtfvcODqPLW9btUFt60,10003
- sglang/srt/model_executor/forward_batch_info.py,sha256=B3flTlRNLMa7Km7use1O0Z2YL3-a6rw1BodNKjKV51g,11049
- sglang/srt/model_executor/model_runner.py,sha256=ZlFgqBNuqgWpa-NrjkfTT-_amtea33H9M1tBl-MT_nk,16977
- sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
- sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
- sglang/srt/models/chatglm.py,sha256=7bHU2AFoppINDZm0EdxgtAJe7rwr9OPkhOCfq2qNrIA,13862
- sglang/srt/models/commandr.py,sha256=5BEtIS2uUQJANkkY-6ZeDqlrpUK5yXVYHiztU3vsTKY,14172
- sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
- sglang/srt/models/deepseek.py,sha256=E5W4nkH-Ne449rAIwQZgz-FAH2Qqp2r1vNfboyk5wEg,16024
- sglang/srt/models/deepseek_v2.py,sha256=NMcckZb48kVUwAmDA2l8wO19T6DNkJOkKAhHa6utBZM,26968
- sglang/srt/models/gemma.py,sha256=ilfN_NOcz7hpwEJ2y7NW3fBFmFO7YfjhdFDbfzl2qww,12285
- sglang/srt/models/gemma2.py,sha256=ybQOXAPofw_Pv3mBer7dTpH4SlZt6Gf2I462Q3lOIww,16359
- sglang/srt/models/gpt_bigcode.py,sha256=OKk9UP67as3T5bePlTRGHTCD-1wqaUEk92AowXPm6dg,10204
- sglang/srt/models/grok.py,sha256=M9rtdXslqYBle5VyZqFVHiJUXq_q_aHbza63xa03zqI,27861
- sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
- sglang/srt/models/llama2.py,sha256=HmzE1I8OnesmrdPY5b56l7okhWH_lRvWAg16K-UwKHg,14300
- sglang/srt/models/llama_classification.py,sha256=Dvzy3PfETiJtnKFOk8qDDLUoZECf_cpSrNeA60PaDo4,4932
- sglang/srt/models/llama_embedding.py,sha256=e2lpZ6GHKrHT1rr7_5gHGoCpfqdOBMusZCz34n62lec,3542
- sglang/srt/models/llava.py,sha256=-ysi192vpBDxNaMS8qaLOhC34lXQyRtbG_0niVaceSo,18436
- sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
- sglang/srt/models/minicpm.py,sha256=ea_OyiwVTo6Tg9jNRAwqxETnA6FFeAqlIbiUS-xViEI,13843
- sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
- sglang/srt/models/mixtral.py,sha256=raSLbp6AfWg5_u-f-lYeRejE9koAjbHt8iIHXd3nURM,21397
- sglang/srt/models/mixtral_quant.py,sha256=xYeeatZ9OfwCTas_KbH9nl6lnUT4YqSY7NAxpgLp5LE,14222
- sglang/srt/models/qwen.py,sha256=43ea6gn4wHzAaI3JTDLtl08aEm0vIqgzbVH9M8oeuY0,10006
- sglang/srt/models/qwen2.py,sha256=Hyhks2r4KHpKeb9iHZpnvEVc5klmnrPwcLohqg8j1kw,12284
- sglang/srt/models/qwen2_moe.py,sha256=pTfBivDyzdbcP22_7PdmdPqgx34esH8J98r-EgFA9Uw,17747
- sglang/srt/models/stablelm.py,sha256=yPrdzPEoUD2s_Q3RgOq7BBC7z-UtEaACzabqbDRs2tA,11368
- sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
- sglang/srt/openai_api/adapter.py,sha256=fgUAPAcQ_mUJszbpsI_cgv2vzOAS7AKKAJPi2B91aw4,42490
- sglang/srt/openai_api/protocol.py,sha256=knf-nds0XO2LYg-hPM-Ho1f1y2XZIV_Gvg3xcCKLfgQ,9411
- sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
- sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
- sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
- sglang/test/runners.py,sha256=FYLbrWePfTacN5bsbAgMl5RiDI4g_Bsbwh1gXqRwr0Y,7794
- sglang/test/simple_eval_common.py,sha256=HL1bfgkTAKP7sk-kShg73WTeADhuBD6xSsuLbV_9C3s,12359
- sglang/test/simple_eval_gpqa.py,sha256=CaRAuHdZj0m4mRm4tH9k7cB0kQxe0LHwlz7Vn1qyKps,3189
- sglang/test/simple_eval_humaneval.py,sha256=iCtN2LBL6j3nxMDjRJ--m0MCNPAwDo81gJ2whE-2Rt0,5674
- sglang/test/simple_eval_math.py,sha256=EQblQmtUt-kl558drzhP7c6KhpDNgr1EJhhKx5eeHM4,2519
- sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
- sglang/test/simple_eval_mmlu.py,sha256=KqSSdSu2qfoKQ870ttxev1NJ7c90xv2mvKOQsSODtAw,4326
- sglang/test/test_layernorm.py,sha256=VDdoeqGvebUa-l3rDiid6cC7wZq0Phpbm5fxxD0-cpg,1910
- sglang/test/test_programs.py,sha256=vRhKIriZgSk_Zn8gGviIfiY_suOBA7Ni7P0NaQM2Esk,13894
- sglang/test/test_utils.py,sha256=cO0ZbnfBS_MxyZ6MDyA7DrDVwu3umKRb3WP_dwggPng,14505
- sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
- sglang-0.2.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.2.12.dist-info/METADATA,sha256=k4QBFP1vyWHeXgCA9Npoz7Wb8qT9aC8rL7R1QP2J60g,34314
- sglang-0.2.12.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- sglang-0.2.12.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.2.12.dist-info/RECORD,,
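The RECORD file removed above follows the standard wheel layout: one CSV row per installed file, in the form path,sha256=<urlsafe-base64 digest with padding stripped>,size, with the RECORD's own row left blank (the trailing RECORD,, line). A sketch of a verifier built on that format; verify_record is a hypothetical helper, not part of sglang:

```python
import base64
import csv
import hashlib
from pathlib import Path

def verify_record(site_packages: Path, record_file: Path) -> None:
    """Check each RECORD row (path, sha256=<digest>, size) against disk."""
    with record_file.open(newline="") as f:
        for path, hash_spec, size in csv.reader(f):
            if not hash_spec:  # the RECORD lists itself with empty fields
                continue
            algo, _, expected = hash_spec.partition("=")
            data = (site_packages / path).read_bytes()
            # Wheel digests are urlsafe base64 with the '=' padding removed.
            actual = (
                base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
                .rstrip(b"=")
                .decode()
            )
            if actual != expected or len(data) != int(size):
                raise ValueError(f"RECORD mismatch for {path}")

# Usage (paths are illustrative):
# verify_record(Path("site-packages"),
#               Path("site-packages/sglang-0.2.14.dist-info/RECORD"))
```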