sglang 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (85)
  1. sglang/api.py +7 -1
  2. sglang/bench_latency.py +9 -6
  3. sglang/bench_serving.py +46 -22
  4. sglang/global_config.py +1 -1
  5. sglang/lang/backend/runtime_endpoint.py +60 -49
  6. sglang/lang/compiler.py +2 -2
  7. sglang/lang/interpreter.py +4 -2
  8. sglang/lang/ir.py +16 -7
  9. sglang/srt/constrained/base_tool_cache.py +1 -1
  10. sglang/srt/constrained/fsm_cache.py +12 -2
  11. sglang/srt/constrained/jump_forward.py +13 -2
  12. sglang/srt/layers/activation.py +32 -0
  13. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  14. sglang/srt/layers/extend_attention.py +9 -2
  15. sglang/srt/layers/fused_moe/__init__.py +1 -0
  16. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  17. sglang/srt/layers/fused_moe/layer.py +587 -0
  18. sglang/srt/layers/layernorm.py +65 -0
  19. sglang/srt/layers/logits_processor.py +7 -2
  20. sglang/srt/layers/pooler.py +50 -0
  21. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  22. sglang/srt/layers/radix_attention.py +40 -16
  23. sglang/srt/managers/detokenizer_manager.py +31 -9
  24. sglang/srt/managers/io_struct.py +63 -0
  25. sglang/srt/managers/policy_scheduler.py +173 -25
  26. sglang/srt/managers/schedule_batch.py +115 -97
  27. sglang/srt/managers/tokenizer_manager.py +194 -112
  28. sglang/srt/managers/tp_worker.py +290 -359
  29. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  30. sglang/srt/mem_cache/chunk_cache.py +43 -20
  31. sglang/srt/mem_cache/memory_pool.py +2 -2
  32. sglang/srt/mem_cache/radix_cache.py +74 -40
  33. sglang/srt/model_executor/cuda_graph_runner.py +71 -25
  34. sglang/srt/model_executor/forward_batch_info.py +293 -156
  35. sglang/srt/model_executor/model_runner.py +77 -57
  36. sglang/srt/models/chatglm.py +2 -2
  37. sglang/srt/models/commandr.py +1 -1
  38. sglang/srt/models/deepseek.py +2 -2
  39. sglang/srt/models/deepseek_v2.py +7 -6
  40. sglang/srt/models/gemma.py +1 -1
  41. sglang/srt/models/gemma2.py +11 -6
  42. sglang/srt/models/grok.py +50 -396
  43. sglang/srt/models/internlm2.py +2 -7
  44. sglang/srt/models/llama2.py +4 -4
  45. sglang/srt/models/llama_embedding.py +88 -0
  46. sglang/srt/models/minicpm.py +2 -2
  47. sglang/srt/models/mixtral.py +56 -254
  48. sglang/srt/models/mixtral_quant.py +1 -4
  49. sglang/srt/models/qwen.py +2 -2
  50. sglang/srt/models/qwen2.py +2 -2
  51. sglang/srt/models/qwen2_moe.py +2 -13
  52. sglang/srt/models/stablelm.py +1 -1
  53. sglang/srt/openai_api/adapter.py +187 -48
  54. sglang/srt/openai_api/protocol.py +37 -1
  55. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  56. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  57. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  58. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  59. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  60. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  61. sglang/srt/sampling_params.py +31 -8
  62. sglang/srt/server.py +91 -29
  63. sglang/srt/server_args.py +32 -19
  64. sglang/srt/utils.py +32 -15
  65. sglang/test/run_eval.py +10 -1
  66. sglang/test/runners.py +81 -73
  67. sglang/test/simple_eval_humaneval.py +2 -8
  68. sglang/test/simple_eval_mgsm.py +203 -0
  69. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  70. sglang/test/test_layernorm.py +60 -0
  71. sglang/test/test_programs.py +36 -7
  72. sglang/test/test_utils.py +24 -2
  73. sglang/utils.py +0 -1
  74. sglang/version.py +1 -1
  75. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/METADATA +33 -16
  76. sglang-0.2.13.dist-info/RECORD +112 -0
  77. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
  78. sglang/srt/layers/linear.py +0 -884
  79. sglang/srt/layers/quantization/__init__.py +0 -64
  80. sglang/srt/layers/quantization/fp8.py +0 -677
  81. sglang/srt/model_loader/model_loader.py +0 -292
  82. sglang/srt/model_loader/utils.py +0 -275
  83. sglang-0.2.11.dist-info/RECORD +0 -102
  84. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
  85. {sglang-0.2.11.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
sglang/srt/model_loader/model_loader.py (deleted)
@@ -1,292 +0,0 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- # temporarily adapted from https://github.com/vllm-project/vllm/blob/10383887e03412196a2689b9398290719c4797bf/vllm/model_executor/model_loader/loader.py
- # FIXME: in progress of refactoring the model loader
-
- import glob
- import os
- import re
- from typing import Any, Dict, Generator, List, Optional, Tuple, Type
-
- import torch
- from torch import nn
- from tqdm import tqdm
- from vllm.config import (
-     CacheConfig,
-     DeviceConfig,
-     LoadConfig,
-     LoadFormat,
-     LoRAConfig,
-     ModelConfig,
-     MultiModalConfig,
-     ParallelConfig,
-     SchedulerConfig,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
- from vllm.model_executor.model_loader.utils import (
-     get_model_architecture,
-     set_default_torch_dtype,
- )
- from vllm.platforms import current_platform
-
- from sglang.srt.model_loader.utils import (
-     download_safetensors_index_file_from_hf,
-     download_weights_from_hf,
-     filter_duplicate_safetensors_files,
-     get_quant_config,
-     safetensors_weights_iterator,
- )
-
-
- def _get_quantization_config(
-     model_config: ModelConfig, load_config: LoadConfig
- ) -> Optional[QuantizationConfig]:
-     """Get the quantization config."""
-     if model_config.quantization is not None:
-         quant_config = get_quant_config(model_config, load_config)
-         capability = current_platform.get_device_capability()
-         capability = capability[0] * 10 + capability[1]
-         if capability < quant_config.get_min_capability():
-             raise ValueError(
-                 f"The quantization method {model_config.quantization} is not "
-                 "supported for the current GPU. "
-                 f"Minimum capability: {quant_config.get_min_capability()}. "
-                 f"Current capability: {capability}."
-             )
-         supported_dtypes = quant_config.get_supported_act_dtypes()
-         if model_config.dtype not in supported_dtypes:
-             raise ValueError(
-                 f"{model_config.dtype} is not supported for quantization "
-                 f"method {model_config.quantization}. Supported dtypes: "
-                 f"{supported_dtypes}"
-             )
-         return quant_config
-     return None
-
-
- def _get_model_initialization_kwargs(
-     model_class: Type[nn.Module],
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
- ) -> Dict[str, Any]:
-     """Get extra kwargs for model initialization."""
-     extra_kwargs: Dict[str, Any] = {}
-
-     assert lora_config is None
-     assert multimodal_config is None
-
-     return extra_kwargs
-
-
- def _initialize_model(
-     model_config: ModelConfig,
-     load_config: LoadConfig,
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
-     cache_config: CacheConfig,
- ) -> nn.Module:
-     """Initialize a model with the given configurations."""
-     model_class = get_model_architecture(model_config)[0]
-     quant_config = _get_quantization_config(model_config, load_config)
-
-     return model_class(
-         config=model_config.hf_config,
-         cache_config=cache_config,
-         quant_config=quant_config,
-         efficient_weight_load=True,
-         **_get_model_initialization_kwargs(model_class, lora_config, multimodal_config),
-     )
-
-
- class ModelLoader:
-     """Model loader that can load different file types from disk."""
-
-     def __init__(self, load_config: LoadConfig):
-         self.load_config = load_config
-
-     def _prepare_weights(
-         self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
-     ) -> Tuple[str, List[str], bool]:
-         """Prepare weights for the model.
-
-         If the model is not local, it will be downloaded."""
-
-         is_local = os.path.isdir(model_name_or_path)
-         load_format = self.load_config.load_format
-         use_safetensors = False
-         # Some quantized models use .pt files for storing the weights.
-         if load_format == LoadFormat.AUTO:
-             allow_patterns = ["*.safetensors", "*.bin"]
-         elif load_format == LoadFormat.SAFETENSORS:
-             use_safetensors = True
-             allow_patterns = ["*.safetensors"]
-         elif load_format == LoadFormat.PT:
-             allow_patterns = ["*.pt"]
-         elif load_format == LoadFormat.NPCACHE:
-             allow_patterns = ["*.bin"]
-         else:
-             raise ValueError(f"Unknown load_format: {load_format}")
-
-         if fall_back_to_pt:
-             allow_patterns += ["*.pt"]
-
-         if not is_local:
-             hf_folder = download_weights_from_hf(
-                 model_name_or_path,
-                 self.load_config.download_dir,
-                 allow_patterns,
-                 revision,
-             )
-         else:
-             hf_folder = model_name_or_path
-
-         hf_weights_files: List[str] = []
-         for pattern in allow_patterns:
-             hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
-             if len(hf_weights_files) > 0:
-                 if pattern == "*.safetensors":
-                     use_safetensors = True
-                 break
-
-         if use_safetensors:
-             # For models like Mistral-7B-Instruct-v0.3
-             # there are both sharded safetensors files and a consolidated
-             # safetensors file. Using both breaks.
-             # Here, we download the `model.safetensors.index.json` and filter
-             # any files not found in the index.
-             if not is_local:
-                 download_safetensors_index_file_from_hf(
-                     model_name_or_path, self.load_config.download_dir, revision
-                 )
-             hf_weights_files = filter_duplicate_safetensors_files(
-                 hf_weights_files, hf_folder
-             )
-         else:
-             hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
-
-         if len(hf_weights_files) == 0:
-             raise RuntimeError(
-                 f"Cannot find any model weights with `{model_name_or_path}`"
-             )
-
-         return hf_folder, hf_weights_files, use_safetensors
-
-     def _get_weights_iterator(
-         self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
-     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-         """Get an iterator for the model weights based on the load format."""
-         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
-             model_name_or_path, revision, fall_back_to_pt
-         )
-         if self.load_config.load_format == LoadFormat.NPCACHE:
-             # Currently np_cache only support *.bin checkpoints
-             assert use_safetensors is False
-             weights_iterator = np_cache_weights_iterator(
-                 model_name_or_path,
-                 self.load_config.download_dir,
-                 hf_folder,
-                 hf_weights_files,
-             )
-         elif use_safetensors:
-             weights_iterator = safetensors_weights_iterator(hf_weights_files)
-         else:
-             weights_iterator = pt_weights_iterator(hf_weights_files)
-
-         return weights_iterator
-
-     def load_model(
-         self,
-         *,
-         model_config: ModelConfig,
-         device_config: DeviceConfig,
-         lora_config: Optional[LoRAConfig],
-         multimodal_config: Optional[MultiModalConfig],
-         parallel_config: ParallelConfig,
-         scheduler_config: SchedulerConfig,
-         cache_config: CacheConfig,
-     ) -> nn.Module:
-         with set_default_torch_dtype(model_config.dtype):
-             with torch.device(device_config.device):
-                 model = _initialize_model(
-                     model_config,
-                     self.load_config,
-                     lora_config,
-                     multimodal_config,
-                     cache_config,
-                 )
-             weights = self._get_weights_iterator(
-                 model_config.model,
-                 model_config.revision,
-                 fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
-             )
-
-             modules = {}
-             for name, module in model.named_modules():
-                 modules[name] = module
-
-             def apply_quant_method(module):
-                 quant_method = getattr(module, "quant_method", None)
-                 if quant_method is not None:
-                     # print("before apply quant", module.weight, module.weight.dtype)
-                     quant_method.process_weights_after_loading(module)
-                     # print("after apply quant", module.weight, module.weight.dtype)
-                 # FIXME: Remove this after Mixtral is updated
-                 # to use quant_method.
-                 if hasattr(module, "process_weights_after_loading"):
-                     module.process_weights_after_loading()
-
-             if torch.cuda.current_device() == 0:
-                 weights = tqdm(
-                     weights, total=model.get_num_params() * 1.5, desc="load model"
-                 )
-
-             num_shard = {}
-             num_loaded = {}
-             for name, loaded_weight in weights:
-                 model.load_weights(None, name, loaded_weight)
-                 module_name, shard_num = model.get_module_name(name)
-                 num_shard[module_name] = shard_num
-                 if module_name not in num_loaded:
-                     num_loaded[module_name] = 1
-                 else:
-                     num_loaded[module_name] += 1
-                 if num_loaded[module_name] == num_shard[module_name]:
-                     apply_quant_method(modules[module_name])
-
-         return model.eval()
-
-
- def get_model(
-     *,
-     model_config: ModelConfig,
-     load_config: LoadConfig,
-     device_config: DeviceConfig,
-     parallel_config: ParallelConfig,
-     scheduler_config: SchedulerConfig,
-     lora_config: Optional[LoRAConfig],
-     multimodal_config: Optional[MultiModalConfig],
-     cache_config: CacheConfig,
- ) -> nn.Module:
-     loader = ModelLoader(load_config)
-     return loader.load_model(
-         model_config=model_config,
-         device_config=device_config,
-         lora_config=lora_config,
-         multimodal_config=multimodal_config,
-         parallel_config=parallel_config,
-         scheduler_config=scheduler_config,
-         cache_config=cache_config,
-     )
sglang/srt/model_loader/utils.py (deleted)
@@ -1,275 +0,0 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
- # temporarily adapted from vLLM
- # FIXME: in progress of refactoring the model loader
- """Utilities for selecting and loading models."""
- import contextlib
- import fnmatch
- import hashlib
- import json
- import logging
- import os
- import tempfile
- from typing import Any, Generator, Iterable, List, Optional, Tuple, Type
-
- import filelock
- import huggingface_hub.constants
- import torch
- from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
- from safetensors.torch import load_file, safe_open, save_file
- from torch import nn
- from tqdm.auto import tqdm
- from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
- from vllm.config import LoadConfig, ModelConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
-
- from sglang.srt.layers.quantization import get_quantization_config
-
- logger = logging.getLogger(__name__)
- temp_dir = tempfile.gettempdir()
-
-
- @contextlib.contextmanager
- def set_default_torch_dtype(dtype: torch.dtype):
-     """Sets the default torch dtype to the given dtype."""
-     old_dtype = torch.get_default_dtype()
-     torch.set_default_dtype(dtype)
-     yield
-     torch.set_default_dtype(old_dtype)
-
-
- def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
-     architectures = getattr(model_config.hf_config, "architectures", [])
-     # Special handling for quantized Mixtral.
-     # FIXME(woosuk): This is a temporary hack.
-     if (
-         model_config.quantization is not None
-         and model_config.quantization != "fp8"
-         and "MixtralForCausalLM" in architectures
-     ):
-         architectures = ["QuantMixtralForCausalLM"]
-
-     for arch in architectures:
-         model_cls = ModelRegistry.load_model_cls(arch)
-         if model_cls is not None:
-             return (model_cls, arch)
-     raise ValueError(
-         f"Model architectures {architectures} are not supported for now. "
-         f"Supported architectures: {ModelRegistry.get_supported_archs()}"
-     )
-
-
- class DisabledTqdm(tqdm):
-
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs, disable=True)
-
-
- def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
-     lock_dir = cache_dir or temp_dir
-     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
-     model_name = model_name_or_path.replace("/", "-")
-     hash_name = hashlib.sha256(model_name.encode()).hexdigest()
-     # add hash to avoid conflict with old users' lock files
-     lock_file_name = hash_name + model_name + ".lock"
-     # mode 0o666 is required for the filelock to be shared across users
-     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
-     return lock
-
-
- def download_weights_from_hf(
-     model_name_or_path: str,
-     cache_dir: Optional[str],
-     allow_patterns: List[str],
-     revision: Optional[str] = None,
- ) -> str:
-     """Download model weights from Hugging Face Hub.
-
-     Args:
-         model_name_or_path (str): The model name or path.
-         cache_dir (Optional[str]): The cache directory to store the model
-             weights. If None, will use HF defaults.
-         allow_patterns (List[str]): The allowed patterns for the
-             weight files. Files matched by any of the patterns will be
-             downloaded.
-         revision (Optional[str]): The revision of the model.
-
-     Returns:
-         str: The path to the downloaded model weights.
-     """
-     if not huggingface_hub.constants.HF_HUB_OFFLINE:
-         # Before we download we look at that is available:
-         fs = HfFileSystem()
-         file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
-
-         # depending on what is available we download different things
-         for pattern in allow_patterns:
-             matching = fnmatch.filter(file_list, pattern)
-             if len(matching) > 0:
-                 allow_patterns = [pattern]
-                 break
-
-     logger.info("Using model weights format %s", allow_patterns)
-     # Use file lock to prevent multiple processes from
-     # downloading the same model weights at the same time.
-     with get_lock(model_name_or_path, cache_dir):
-         hf_folder = snapshot_download(
-             model_name_or_path,
-             allow_patterns=allow_patterns,
-             cache_dir=cache_dir,
-             tqdm_class=DisabledTqdm,
-             revision=revision,
-             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-         )
-     return hf_folder
-
-
- def download_safetensors_index_file_from_hf(
-     model_name_or_path: str,
-     cache_dir: Optional[str],
-     revision: Optional[str] = None,
- ) -> None:
-     """Download hf safetensors index file from Hugging Face Hub.
-
-     Args:
-         model_name_or_path (str): The model name or path.
-         cache_dir (Optional[str]): The cache directory to store the model
-             weights. If None, will use HF defaults.
-         revision (Optional[str]): The revision of the model.
-     """
-     # Use file lock to prevent multiple processes from
-     # downloading the same model weights at the same time.
-     with get_lock(model_name_or_path, cache_dir):
-         try:
-             # Download the safetensors index file.
-             hf_hub_download(
-                 repo_id=model_name_or_path,
-                 filename=SAFE_WEIGHTS_INDEX_NAME,
-                 cache_dir=cache_dir,
-                 revision=revision,
-                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-             )
-         # If file not found on remote or locally, we should not fail since
-         # only some models will have SAFE_WEIGHTS_INDEX_NAME.
-         except huggingface_hub.utils.EntryNotFoundError:
-             logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME)
-         except huggingface_hub.utils.LocalEntryNotFoundError:
-             logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME)
-
-
- # For models like Mistral-7B-v0.3, there are both sharded
- # safetensors files and a consolidated safetensors file.
- # Passing both of these to the weight loader functionality breaks.
- # So, we use the SAFE_WEIGHTS_INDEX_NAME to
- # look up which safetensors files should be used.
- def filter_duplicate_safetensors_files(
-     hf_weights_files: List[str], hf_folder: str
- ) -> List[str]:
-     # model.safetensors.index.json is a mapping from keys in the
-     # torch state_dict to safetensors file holding that weight.
-     index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME)
-     if not os.path.isfile(index_file_name):
-         return hf_weights_files
-
-     # Iterate through the weight_map (weight_name: safetensors files)
-     # to identify weights that we should use.
-     with open(index_file_name) as index_file:
-         weight_map = json.load(index_file)["weight_map"]
-     weight_files_in_index = set()
-     for weight_name in weight_map:
-         weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
-     # Filter out any fields that are not found in the index file.
-     hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
-     return hf_weights_files
-
-
- def safetensors_weights_iterator(
-     hf_weights_files: List[str],
- ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-     """Iterate over the weights in the model safetensor files."""
-     for st_file in hf_weights_files:
-         with safe_open(st_file, framework="pt") as f:
-             for name in f.keys():  # noqa: SIM118
-                 param = f.get_tensor(name)
-                 yield name, param
-
-
- def get_quant_config(
-     model_config: ModelConfig, load_config: LoadConfig
- ) -> QuantizationConfig:
-     quant_cls = get_quantization_config(model_config.quantization)
-     # Read the quantization config from the HF model config, if available.
-     hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
-     if hf_quant_config is None:
-         # compressed-tensors uses a compressions_config
-         hf_quant_config = getattr(model_config.hf_config, "compression_config", None)
-     if hf_quant_config is not None:
-         return quant_cls.from_config(hf_quant_config)
-     # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
-     if model_config.quantization == "bitsandbytes":
-         if (
-             not load_config.model_loader_extra_config
-             or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
-         ):
-             return quant_cls.from_config({"adapter_name_or_path": ""})
-         model_name_or_path = load_config.model_loader_extra_config[
-             "qlora_adapter_name_or_path"
-         ]
-
-     else:
-         model_name_or_path = model_config.model
-     is_local = os.path.isdir(model_name_or_path)
-     if not is_local:
-         # Download the config files.
-         with get_lock(model_name_or_path, load_config.download_dir):
-             hf_folder = snapshot_download(
-                 model_name_or_path,
-                 revision=model_config.revision,
-                 allow_patterns="*.json",
-                 cache_dir=load_config.download_dir,
-                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                 tqdm_class=DisabledTqdm,
-             )
-     else:
-         hf_folder = model_name_or_path
-
-     possible_config_filenames = quant_cls.get_config_filenames()
-
-     # If the quantization config is not found, use the default config.
-     if not possible_config_filenames:
-         return quant_cls()
-
-     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
-
-     quant_config_files = [
-         f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
-     ]
-     if len(quant_config_files) == 0:
-         raise ValueError(f"Cannot find the config file for {model_config.quantization}")
-     if len(quant_config_files) > 1:
-         raise ValueError(
-             f"Found multiple config files for {model_config.quantization}: "
-             f"{quant_config_files}"
-         )
-
-     quant_config_file = quant_config_files[0]
-     with open(quant_config_file, "r") as f:
-         config = json.load(f)
-
-     if model_config.quantization == "bitsandbytes":
-         config["adapter_name_or_path"] = model_name_or_path
-
-     return quant_cls.from_config(config)
sglang-0.2.11.dist-info/RECORD (deleted)
@@ -1,102 +0,0 @@
- sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
- sglang/api.py,sha256=gAY9JhqWXjrYoWnMvR-iiuuY1YSN94We-lc1LH0z3cw,6030
- sglang/bench_latency.py,sha256=CXvukEW0IeoH2IwN2vuriC0eHBdJsz3lgT7OwwNo_7A,16146
- sglang/bench_serving.py,sha256=M0YQT6xElpkx-FtmyUe6lhX1DZfVLGh54qd6qfFYquc,34801
- sglang/check_env.py,sha256=oU8VmjjPK2SviRhr41cF1953soBu-eTT5E0Hf04zMzo,4974
- sglang/global_config.py,sha256=9JxaFkBKSgep6BVeEl_kx9tuW9PqdijYELyBGTryl6o,1704
- sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
- sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
- sglang/utils.py,sha256=C50xm06WWKpKB8kSNs9vO4egJ2QTk_OAA6M13S2cB_A,8369
- sglang/version.py,sha256=_MLx4ac1juJPWEEiC9kMQISX3x3jFBr507jM2P_hxMg,23
- sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
- sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
- sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
- sglang/lang/interpreter.py,sha256=3RIeSGdKlKTq2Ixg_Tyo0fGEDTvBKS2f9FaJYODBHzA,30102
- sglang/lang/ir.py,sha256=FGWghAfVW9IcxcrVqHiqpf7vmWzuNYoVTMSbBZkYVRk,16839
- sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
- sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
- sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
- sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
- sglang/lang/backend/runtime_endpoint.py,sha256=AaBc5yczchX7mkwiKDMyjLjBkJsh2Lubrfd9lvCOlDo,9544
- sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
- sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
- sglang/srt/hf_transformers_utils.py,sha256=Tf_RplcW7llVXsigRvSGqmeAUxBeAL8rPCkzuqWfZ8U,11925
- sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
- sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
- sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
- sglang/srt/server.py,sha256=hUNnTvH4c1AI2JJzoBUf9TQuTelx-vulcqwkEplw7Gk,16699
- sglang/srt/server_args.py,sha256=SmvnebtDTsvPNDyW6lltuJKC7h8eVdYmurY1ieIMySA,16475
- sglang/srt/utils.py,sha256=GcRFf3pb5l-Q5TJU4gF-Wp7Ct46l3BO0aMpjlyHXp3I,23766
- sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
- sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
- sglang/srt/constrained/fsm_cache.py,sha256=GoPBr_9ZdJizF2PKbYoQw2I4ckfrUYwCeMZxB9sY3TM,2639
- sglang/srt/constrained/jump_forward.py,sha256=IgZ8D0woy5FLIQvXkE8wZRYejDsfVkjU0sqUlkiv_f4,6193
- sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1bebtY-p9MBndBaoIE2VXrk,5180
- sglang/srt/layers/extend_attention.py,sha256=V5pm7toSDlzByaV4lGRgXVGWFUPf68chvvahlT2h4mk,14092
- sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
- sglang/srt/layers/linear.py,sha256=3Se2FRXyqXcd-uvNx2b7s-jolsUTEVeYBMYHmV82wPw,34518
- sglang/srt/layers/logits_processor.py,sha256=wHKB1FjbfY0a7KGw5dCsEhmO4sc7VMy3gYtSPv4oQYM,11097
- sglang/srt/layers/radix_attention.py,sha256=lXwm-qs7hPy_EFV1Zf2pPQ0-drAdrO8V5J4eX0LwLtU,7505
- sglang/srt/layers/token_attention.py,sha256=pdBORaWQGvDy_Aitcq0XDHk2Rravol-jZZkrsgkXeng,8849
- sglang/srt/layers/quantization/__init__.py,sha256=JMlgE-FWS759lfQ9Uc6mGFqBbTFLlvKeVEFpZLATe14,2536
- sglang/srt/layers/quantization/fp8.py,sha256=GQOLeGbrcUfwO-7oClzDda0RXGPHR70ZXUHArZsa174,25511
- sglang/srt/managers/controller_multi.py,sha256=LYI-XE9h57DW8Uh4gpd8upsC3p2dd5weKzddEH274jg,6626
- sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk4SwANKxTX-Y,5112
- sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
- sglang/srt/managers/io_struct.py,sha256=VK61d6zfnBz5a3IMmwYsa5PNa9jUXPPmED1TdDRQGDs,7345
- sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
- sglang/srt/managers/schedule_batch.py,sha256=sKQAHRL6VoapGiO7yQV796gW4sVGAgVVBMtmENbKtvg,29641
- sglang/srt/managers/tokenizer_manager.py,sha256=wqb6zQbkHYcSNU14Auuh5519CVMmfbKGBQvn_IwDSAo,21408
- sglang/srt/managers/tp_worker.py,sha256=3sHlN4hxksF22lkOJ8i3X6WSH4_5POy74BfbIAzIDtM,35216
- sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
- sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
- sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
- sglang/srt/mem_cache/memory_pool.py,sha256=oOKtPTgzujo9gHXykSuER7VKqQRuwNKlXyXlaK-3dxo,5280
- sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=EyI8sMMoVlOjdTT2Y3cfwo1-uQ43QCQ1skx5BNgchjE,9433
- sglang/srt/model_executor/forward_batch_info.py,sha256=P5bGeLsnFbEqgWLI5X5Eg0XFCG1j2oWZOsIAMZNkZW4,9022
- sglang/srt/model_executor/model_runner.py,sha256=yzkJLIM41mhbfgfq87ToskAaA1PS67YzhmoSMbflkZI,17479
- sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
- sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
- sglang/srt/models/chatglm.py,sha256=7bHU2AFoppINDZm0EdxgtAJe7rwr9OPkhOCfq2qNrIA,13862
- sglang/srt/models/commandr.py,sha256=5BEtIS2uUQJANkkY-6ZeDqlrpUK5yXVYHiztU3vsTKY,14172
- sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
- sglang/srt/models/deepseek.py,sha256=E5W4nkH-Ne449rAIwQZgz-FAH2Qqp2r1vNfboyk5wEg,16024
- sglang/srt/models/deepseek_v2.py,sha256=NMcckZb48kVUwAmDA2l8wO19T6DNkJOkKAhHa6utBZM,26968
- sglang/srt/models/gemma.py,sha256=ilfN_NOcz7hpwEJ2y7NW3fBFmFO7YfjhdFDbfzl2qww,12285
- sglang/srt/models/gemma2.py,sha256=D8GZOI1tAbEV9PaBmJSsJRzCmvaK3tGXttIbrMb5yiQ,16426
- sglang/srt/models/gpt_bigcode.py,sha256=OKk9UP67as3T5bePlTRGHTCD-1wqaUEk92AowXPm6dg,10204
- sglang/srt/models/grok.py,sha256=M9rtdXslqYBle5VyZqFVHiJUXq_q_aHbza63xa03zqI,27861
- sglang/srt/models/internlm2.py,sha256=CKWBL0dBvLdaEUeJOUvLUNPb8BLrAZ8_BSf2mfFQhfU,12225
- sglang/srt/models/llama2.py,sha256=3ZEWi0PVCDNjTrVNvLs1ESdyTcZhJlZjaH5uyS46JyM,14288
- sglang/srt/models/llama_classification.py,sha256=Dvzy3PfETiJtnKFOk8qDDLUoZECf_cpSrNeA60PaDo4,4932
- sglang/srt/models/llava.py,sha256=-ysi192vpBDxNaMS8qaLOhC34lXQyRtbG_0niVaceSo,18436
- sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
- sglang/srt/models/minicpm.py,sha256=ea_OyiwVTo6Tg9jNRAwqxETnA6FFeAqlIbiUS-xViEI,13843
- sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
- sglang/srt/models/mixtral.py,sha256=raSLbp6AfWg5_u-f-lYeRejE9koAjbHt8iIHXd3nURM,21397
- sglang/srt/models/mixtral_quant.py,sha256=xYeeatZ9OfwCTas_KbH9nl6lnUT4YqSY7NAxpgLp5LE,14222
- sglang/srt/models/qwen.py,sha256=43ea6gn4wHzAaI3JTDLtl08aEm0vIqgzbVH9M8oeuY0,10006
- sglang/srt/models/qwen2.py,sha256=Hyhks2r4KHpKeb9iHZpnvEVc5klmnrPwcLohqg8j1kw,12284
- sglang/srt/models/qwen2_moe.py,sha256=PZdhEf0DUuGWsld3TyDWlIqSbrrOdqvCD4lAtCPWXeg,18147
- sglang/srt/models/stablelm.py,sha256=yPrdzPEoUD2s_Q3RgOq7BBC7z-UtEaACzabqbDRs2tA,11368
- sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
- sglang/srt/openai_api/adapter.py,sha256=Eq44_hGwHcglCKOc6WqWDxBsgyRqtuC6VR4HB4GLfUY,38193
- sglang/srt/openai_api/protocol.py,sha256=pcRgmDM3Kozh74Aj-qEo8q64BI6hEjrdhYDU4m9srdI,8294
- sglang/test/run_eval.py,sha256=kbM6SiosfXj-1uYTFXPWMd7hZDvJZwV-AmdHi_WfP3A,3559
- sglang/test/runners.py,sha256=APXXbrqmUGUqnX7T1Aq8X2NJQkIqtv6B42a2ybdlPjA,7459
- sglang/test/simple_eval_common.py,sha256=HL1bfgkTAKP7sk-kShg73WTeADhuBD6xSsuLbV_9C3s,12359
- sglang/test/simple_eval_gpqa.py,sha256=CaRAuHdZj0m4mRm4tH9k7cB0kQxe0LHwlz7Vn1qyKps,3189
- sglang/test/simple_eval_humaneval.py,sha256=k50DKoAbXiw-ubrFXHet9B-7tboHU2dQJf5G3C-KKq4,5838
- sglang/test/simple_eval_math.py,sha256=EQblQmtUt-kl558drzhP7c6KhpDNgr1EJhhKx5eeHM4,2519
- sglang/test/simple_eval_mmlu.py,sha256=KqSSdSu2qfoKQ870ttxev1NJ7c90xv2mvKOQsSODtAw,4326
- sglang/test/test_programs.py,sha256=e9_ifoIvuI1Ctkbkz3wfdZLBBSRikby8ywcodBIkf9M,13826
- sglang/test/test_utils.py,sha256=ITQcY3WGV4kLGWEkfU-AeuFX8yGLmq9LEK5jHiuW7Sw,13991
- sglang-0.2.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.2.11.dist-info/METADATA,sha256=gSQA5-Hf9y41ulOKiMeHRu4Nf-c9Nbt6xhmlCGzvhNY,33783
- sglang-0.2.11.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- sglang-0.2.11.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.2.11.dist-info/RECORD,,