sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (56)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/tracer.py +6 -4
  11. sglang/launch_server.py +2 -1
  12. sglang/srt/constrained/fsm_cache.py +1 -0
  13. sglang/srt/constrained/jump_forward.py +1 -0
  14. sglang/srt/conversation.py +2 -2
  15. sglang/srt/hf_transformers_utils.py +2 -1
  16. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  17. sglang/srt/layers/extend_attention.py +1 -0
  18. sglang/srt/layers/logits_processor.py +114 -54
  19. sglang/srt/layers/radix_attention.py +2 -1
  20. sglang/srt/layers/token_attention.py +1 -0
  21. sglang/srt/managers/detokenizer_manager.py +5 -1
  22. sglang/srt/managers/io_struct.py +12 -0
  23. sglang/srt/managers/router/infer_batch.py +70 -33
  24. sglang/srt/managers/router/manager.py +7 -2
  25. sglang/srt/managers/router/model_rpc.py +116 -73
  26. sglang/srt/managers/router/model_runner.py +111 -167
  27. sglang/srt/managers/router/radix_cache.py +46 -38
  28. sglang/srt/managers/tokenizer_manager.py +56 -11
  29. sglang/srt/memory_pool.py +5 -14
  30. sglang/srt/model_config.py +7 -0
  31. sglang/srt/models/commandr.py +376 -0
  32. sglang/srt/models/dbrx.py +413 -0
  33. sglang/srt/models/dbrx_config.py +281 -0
  34. sglang/srt/models/gemma.py +22 -20
  35. sglang/srt/models/llama2.py +23 -21
  36. sglang/srt/models/llava.py +12 -10
  37. sglang/srt/models/mixtral.py +27 -25
  38. sglang/srt/models/qwen.py +23 -21
  39. sglang/srt/models/qwen2.py +23 -21
  40. sglang/srt/models/stablelm.py +20 -21
  41. sglang/srt/models/yivl.py +6 -5
  42. sglang/srt/openai_api_adapter.py +356 -0
  43. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  44. sglang/srt/sampling_params.py +2 -0
  45. sglang/srt/server.py +68 -447
  46. sglang/srt/server_args.py +76 -49
  47. sglang/srt/utils.py +88 -32
  48. sglang/srt/weight_utils.py +402 -0
  49. sglang/test/test_programs.py +8 -7
  50. sglang/test/test_utils.py +195 -7
  51. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
  52. sglang-0.1.15.dist-info/RECORD +69 -0
  53. sglang-0.1.14.dist-info/RECORD +0 -64
  54. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  55. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
  56. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/srt/weight_utils.py (added)
@@ -0,0 +1,402 @@
+ # The vllm PR https://github.com/vllm-project/vllm/pull/4097 broke the sglang code.
+ # In order to adapt to the latest code without modifying too much code,
+ # we copied the previous vllm/model_executor/weight_utils.py from
+ # https://github.com/vllm-project/vllm/blob/05434764cd99990035779cf9a4ed86623b528825/vllm/model_executor/weight_utils.py
+
+ """Utilities for downloading and initializing model weights."""
+ import fnmatch
+ import glob
+ import hashlib
+ import json
+ import os
+ from collections import defaultdict
+ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
+
+ import filelock
+ import huggingface_hub.constants
+ import numpy as np
+ import torch
+ from huggingface_hub import HfFileSystem, snapshot_download
+ from safetensors.torch import load_file, safe_open, save_file
+ from tqdm.auto import tqdm
+
+ from vllm.config import ModelConfig
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import (QuantizationConfig,
+                                                      get_quantization_config)
+ from vllm.model_executor.layers.quantization.schema import QuantParamSchema
+
+ logger = init_logger(__name__)
+
+ # use system-level temp directory for file locks, so that multiple users
+ # can share the same lock without error.
+ # lock files in the temp directory will be automatically deleted when the
+ # system reboots, so users will not complain about annoying lock files
+ temp_dir = os.environ.get('TMPDIR') or os.environ.get(
+     'TEMP') or os.environ.get('TMP') or "/tmp/"
+
+
+ def enable_hf_transfer():
+     """Automatically activate hf_transfer.
+     """
+     if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
+         try:
+             # enable hf hub transfer if available
+             import hf_transfer  # type: ignore # noqa
+             huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+         except ImportError:
+             pass
+
+
+ enable_hf_transfer()
+
+
+ class Disabledtqdm(tqdm):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs, disable=True)
+
+
+ def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
+     lock_dir = cache_dir or temp_dir
+     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+     model_name = model_name_or_path.replace("/", "-")
+     hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+     # add hash to avoid conflict with old users' lock files
+     lock_file_name = hash_name + model_name + ".lock"
+     # mode 0o666 is required for the filelock to be shared across users
+     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                              mode=0o666)
+     return lock
+
+
+ def _shared_pointers(tensors):
+     ptrs = defaultdict(list)
+     for k, v in tensors.items():
+         ptrs[v.data_ptr()].append(k)
+     failing = []
+     for _, names in ptrs.items():
+         if len(names) > 1:
+             failing.append(names)
+     return failing
+
+
+ def convert_bin_to_safetensor_file(
+     pt_filename: str,
+     sf_filename: str,
+ ) -> None:
+     loaded = torch.load(pt_filename, map_location="cpu")
+     if "state_dict" in loaded:
+         loaded = loaded["state_dict"]
+     shared = _shared_pointers(loaded)
+     for shared_weights in shared:
+         for name in shared_weights[1:]:
+             loaded.pop(name)
+
+     # For tensors to be contiguous
+     loaded = {k: v.contiguous() for k, v in loaded.items()}
+
+     dirname = os.path.dirname(sf_filename)
+     os.makedirs(dirname, exist_ok=True)
+     save_file(loaded, sf_filename, metadata={"format": "pt"})
+
+     # check file size
+     sf_size = os.stat(sf_filename).st_size
+     pt_size = os.stat(pt_filename).st_size
+     if (sf_size - pt_size) / pt_size > 0.01:
+         raise RuntimeError(f"""The file size difference is more than 1%:
+          - {sf_filename}: {sf_size}
+          - {pt_filename}: {pt_size}
+          """)
+
+     # check if the tensors are the same
+     reloaded = load_file(sf_filename)
+     for k in loaded:
+         pt_tensor = loaded[k]
+         sf_tensor = reloaded[k]
+         if not torch.equal(pt_tensor, sf_tensor):
+             raise RuntimeError(f"The output tensors do not match for key {k}")
+
+
+ # TODO(woosuk): Move this to other place.
+ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+     quant_cls = get_quantization_config(model_config.quantization)
+     # Read the quantization config from the HF model config, if available.
+     hf_quant_config = getattr(model_config.hf_config, "quantization_config",
+                               None)
+     if hf_quant_config is not None:
+         return quant_cls.from_config(hf_quant_config)
+     model_name_or_path = model_config.model
+     is_local = os.path.isdir(model_name_or_path)
+     if not is_local:
+         # Download the config files.
+         with get_lock(model_name_or_path, model_config.download_dir):
+             hf_folder = snapshot_download(model_name_or_path,
+                                           revision=model_config.revision,
+                                           allow_patterns="*.json",
+                                           cache_dir=model_config.download_dir,
+                                           tqdm_class=Disabledtqdm)
+     else:
+         hf_folder = model_name_or_path
+     config_files = glob.glob(os.path.join(hf_folder, "*.json"))
+
+     quant_config_files = [
+         f for f in config_files if any(
+             f.endswith(x) for x in quant_cls.get_config_filenames())
+     ]
+     if len(quant_config_files) == 0:
+         raise ValueError(
+             f"Cannot find the config file for {model_config.quantization}")
+     if len(quant_config_files) > 1:
+         raise ValueError(
+             f"Found multiple config files for {model_config.quantization}: "
+             f"{quant_config_files}")
+
+     quant_config_file = quant_config_files[0]
+     with open(quant_config_file, "r") as f:
+         config = json.load(f)
+     return quant_cls.from_config(config)
+
+
+ def prepare_hf_model_weights(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: str = "auto",
+     fall_back_to_pt: bool = True,
+     revision: Optional[str] = None,
+ ) -> Tuple[str, List[str], bool]:
+     # Download model weights from huggingface.
+     is_local = os.path.isdir(model_name_or_path) \
+         and load_format != "tensorizer"
+     use_safetensors = False
+     # Some quantized models use .pt files for storing the weights.
+     if load_format == "auto":
+         allow_patterns = ["*.safetensors", "*.bin"]
+     elif load_format == "safetensors":
+         use_safetensors = True
+         allow_patterns = ["*.safetensors"]
+     elif load_format == "pt":
+         allow_patterns = ["*.pt"]
+     elif load_format == "npcache":
+         allow_patterns = ["*.bin"]
+     elif load_format == "tensorizer":
+         allow_patterns = ["*.tensors"]
+     else:
+         raise ValueError(f"Unknown load_format: {load_format}")
+
+     if fall_back_to_pt:
+         allow_patterns += ["*.pt"]
+
+     if not is_local and load_format != "tensorizer":
+         # Before we download, we look at what is available:
+         fs = HfFileSystem()
+         file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+         # depending on what is available we download different things
+         for pattern in allow_patterns:
+             matching = fnmatch.filter(file_list, pattern)
+             if len(matching) > 0:
+                 allow_patterns = [pattern]
+                 break
+
+         logger.info(f"Using model weights format {allow_patterns}")
+         # Use file lock to prevent multiple processes from
+         # downloading the same model weights at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             hf_folder = snapshot_download(model_name_or_path,
+                                           allow_patterns=allow_patterns,
+                                           cache_dir=cache_dir,
+                                           tqdm_class=Disabledtqdm,
+                                           revision=revision)
+     else:
+         hf_folder = model_name_or_path
+     hf_weights_files: List[str] = []
+     for pattern in allow_patterns:
+         hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+         if len(hf_weights_files) > 0:
+             if pattern == "*.safetensors":
+                 use_safetensors = True
+             break
+     if not use_safetensors:
+         # Exclude files that are not needed for inference.
+         # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+         blacklist = [
+             "training_args.bin",
+             "optimizer.bin",
+             "optimizer.pt",
+             "scheduler.pt",
+             "scaler.pt",
+         ]
+         hf_weights_files = [
+             f for f in hf_weights_files
+             if not any(f.endswith(x) for x in blacklist)
+         ]
+
+     if load_format == "tensorizer":
+         return hf_folder, hf_weights_files, use_safetensors
+
+     if len(hf_weights_files) == 0:
+         raise RuntimeError(
+             f"Cannot find any model weights with `{model_name_or_path}`")
+
+     return hf_folder, hf_weights_files, use_safetensors
+
+
+ def hf_model_weights_iterator(
+     model_name_or_path: str,
+     cache_dir: Optional[str] = None,
+     load_format: Union[Tuple, str] = "auto",
+     revision: Optional[str] = None,
+     fall_back_to_pt: Optional[bool] = True,
+ ) -> Iterator[Tuple[str, torch.Tensor]]:
+     hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
+         model_name_or_path,
+         cache_dir=cache_dir,
+         load_format=load_format,
+         fall_back_to_pt=fall_back_to_pt,
+         revision=revision)
+
+     if load_format == "npcache":
+         # Currently np_cache only support *.bin checkpoints
+         assert use_safetensors is False
+
+         # Convert the model weights from torch tensors to numpy arrays for
+         # faster loading.
+         np_folder = os.path.join(hf_folder, "np")
+         os.makedirs(np_folder, exist_ok=True)
+         weight_names_file = os.path.join(np_folder, "weight_names.json")
+         # Use file lock to prevent multiple processes from
+         # dumping the same model weights to numpy at the same time.
+         with get_lock(model_name_or_path, cache_dir):
+             if not os.path.exists(weight_names_file):
+                 weight_names = []
+                 for bin_file in hf_weights_files:
+                     state = torch.load(bin_file, map_location="cpu")
+                     for name, param in state.items():
+                         param_path = os.path.join(np_folder, name)
+                         with open(param_path, "wb") as f:
+                             np.save(f, param.cpu().detach().numpy())
+                         weight_names.append(name)
+                 with open(weight_names_file, "w") as f:
+                     json.dump(weight_names, f)
+
+         with open(weight_names_file, "r") as f:
+             weight_names = json.load(f)
+
+         for name in weight_names:
+             param_path = os.path.join(np_folder, name)
+             with open(param_path, "rb") as f:
+                 param = np.load(f)
+             yield name, torch.from_numpy(param)
+     elif load_format == "tensorizer":
+         from vllm.model_executor.tensorizer_loader import (TensorDeserializer,
+                                                            open_stream,
+                                                            tensorizer_warning)
+         tensorizer_args = load_format.params
+         tensorizer_warning(
+             "Deserializing HuggingFace models is not optimized for "
+             "loading on vLLM, as tensorizer is forced to load to CPU. "
+             "Consider deserializing a vLLM model instead for faster "
+             "load times. See the examples/tensorize_vllm_model.py example "
+             "script for serializing vLLM models.")
+
+         deserializer_args = tensorizer_args.deserializer_params
+         stream_params = tensorizer_args.stream_params
+         stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
+         with TensorDeserializer(stream, **deserializer_args,
+                                 device="cpu") as state:
+             for name, param in state.items():
+                 yield name, param
+             del state
+     elif use_safetensors:
+         for st_file in hf_weights_files:
+             with safe_open(st_file, framework="pt") as f:
+                 for name in f.keys():  # noqa: SIM118
+                     param = f.get_tensor(name)
+                     yield name, param
+     else:
+         for bin_file in hf_weights_files:
+             state = torch.load(bin_file, map_location="cpu")
+             for name, param in state.items():
+                 yield name, param
+             del state
+             torch.cuda.empty_cache()
+
+
+ def kv_cache_scales_loader(
+         filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int,
+         model_type: Optional[str]) -> Iterable[Tuple[int, float]]:
+     """
+     A simple utility to read in KV cache scaling factors that have been
+     previously serialized to disk. Used by the model to populate the appropriate
+     KV cache scaling factors. The serialization should represent a dictionary
+     whose keys are the TP ranks and values are another dictionary mapping layers
+     to their KV cache scaling factors.
+     Keep this function in sync with the output of examples/fp8/extract_scales.py
+     """
+     try:
+         with open(filename) as f:
+             context = {
+                 "model_type": model_type,
+                 "num_hidden_layers": num_hidden_layers,
+                 "tp_rank": tp_rank,
+                 "tp_size": tp_size,
+             }
+             schema_dct = json.load(f)
+             schema = QuantParamSchema.model_validate(schema_dct,
+                                                      context=context)
+             layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
+             return layer_scales_map.items()
+
+     except FileNotFoundError:
+         logger.error(f"File or directory '{filename}' not found.")
+     except json.JSONDecodeError:
+         logger.error(f"Error decoding JSON in file '{filename}'.")
+     except Exception as e:
+         logger.error(f"An error occurred while reading '{filename}': {e}")
+     # This section is reached if and only if any of the excepts are hit
+     # Return an empty iterable (list) => no KV cache scales are loaded
+     # which ultimately defaults to 1.0 scales
+     logger.warning("Defaulting to KV cache scaling factors = 1.0 "
+                    f"for all layers in TP rank {tp_rank} "
+                    "as an error occurred during loading.")
+     return []
+
+
+ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
+     """Convert a PySafeSlice object from safetensors to torch.Tensor.
+
+     A PySafeSlice object supports indexing, which is done before loading the
+     actual tensor and can reduce the amount of memory being read. However,
+     it does not support more advanced functionalities like `.view()` or
+     `.t()`. Therefore, if we need to modify the loaded tensor with these
+     more complicated operators, we need to convert it to a tensor first.
+     """
+     if not isinstance(x, torch.Tensor):
+         x = x[:]
+     return x
+
+
+ def default_weight_loader(param: torch.Tensor,
+                           loaded_weight: torch.Tensor) -> None:
+     """Default weight loader."""
+     assert param.size() == loaded_weight.size()
+     param.data.copy_(loaded_weight)
+
+
+ def initialize_dummy_weights(
+     model: torch.nn.Module,
+     low: float = -1e-3,
+     high: float = 1e-3,
+ ) -> None:
+     """Initialize model weights with random values.
+
+     The model weights must be randomly initialized for accurate performance
+     measurements. Additionally, the model weights should not cause NaNs in the
+     forward pass. We empirically found that initializing the weights with
+     values between -1e-3 and 1e-3 works well for most models.
+     """
+     for param in model.state_dict().values():
+         if torch.is_floating_point(param):
+             param.data.uniform_(low, high)
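
The new file above is a vendored copy of vllm's weight-loading helpers, so sglang model code can keep calling hf_model_weights_iterator and default_weight_loader even as upstream vllm changes. As a minimal sketch of that calling pattern (not code from this release): the TinyModel module and the "org/model" checkpoint id below are illustrative placeholders, and only tensors whose names and shapes match the module are copied.

# Sketch: how a model's load_weights() path typically combines the vendored
# helpers. TinyModel and "org/model" are placeholders for illustration.
import torch.nn as nn

from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 16)


def load_weights(model: nn.Module, model_name_or_path: str) -> None:
    params = dict(model.named_parameters())
    # Resolves a local checkpoint directory or downloads from the HF Hub under
    # a file lock, then yields (name, tensor) pairs from the weight shards.
    for name, loaded_weight in hf_model_weights_iterator(model_name_or_path):
        param = params.get(name)
        if param is not None and param.shape == loaded_weight.shape:
            default_weight_loader(param, loaded_weight)


load_weights(TinyModel(), "org/model")  # replace with a real checkpoint id or path
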
sglang/test/test_programs.py
@@ -226,7 +226,7 @@ Action 3: Finish [United States].\n
 
  def test_parallel_decoding():
      max_tokens = 64
-     number = 5
+     fork_size = 5
 
      @sgl.function
      def parallel_decoding(s, topic):
@@ -234,17 +234,17 @@ def test_parallel_decoding():
          s += "USER: Give some tips for " + topic + ".\n"
          s += (
              "ASSISTANT: Okay. Here are "
-             + str(number)
+             + str(fork_size)
              + " concise tips, each under 8 words:\n"
          )
 
          # Generate skeleton
-         for i in range(1, 1 + number):
+         for i in range(1, 1 + fork_size):
              s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"
 
          # Generate detailed tips
-         forks = s.fork(number)
-         for i in range(number):
+         forks = s.fork(fork_size)
+         for i in range(fork_size):
              forks[
                  i
              ] += f"Now, I expand tip {i+1} into a detailed paragraph:\nTip {i+1}:"
@@ -253,7 +253,7 @@ def test_parallel_decoding():
 
          # Concatenate tips and summarize
          s += "Here are these tips with detailed explanation:\n"
-         for i in range(number):
+         for i in range(fork_size):
              s += f"Tip {i+1}:" + forks[i]["detailed_tip"] + "\n"
 
          s += "\nIn summary," + sgl.gen("summary", max_tokens=512)
@@ -296,7 +296,7 @@ def test_parallel_encoding(check_answer=True):
  def test_image_qa():
      @sgl.function
      def image_qa(s, question):
-         s += sgl.user(sgl.image("test_image.png") + question)
+         s += sgl.user(sgl.image("example_image.png") + question)
          s += sgl.assistant(sgl.gen("answer"))
 
      state = image_qa.run(
@@ -313,6 +313,7 @@ def test_image_qa():
  def test_stream():
      @sgl.function
      def qa(s, question):
+         s += sgl.system("You are a helpful assistant.")
          s += sgl.user(question)
          s += sgl.assistant(sgl.gen("answer"))
 
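
The test changes above rename the fork count to fork_size and add a system message to the streaming test; both tests exercise sglang's skeleton-then-expand fork pattern against a running backend. A condensed, self-contained sketch of that pattern follows; the endpoint URL, topic, and token budgets are placeholders rather than values from the test suite.

# Sketch of the fork/join pattern exercised by test_parallel_decoding.
# The endpoint URL and prompt wording are placeholders.
import sglang as sgl


@sgl.function
def parallel_tips(s, topic, fork_size=3):
    s += "USER: Give some tips for " + topic + ".\n"
    s += f"ASSISTANT: Okay. Here are {fork_size} concise tips:\n"

    # Generate a short skeleton, one line per tip.
    for i in range(1, 1 + fork_size):
        s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"

    # Fork the state and expand each tip independently.
    forks = s.fork(fork_size)
    for i in range(fork_size):
        forks[i] += f"Tip {i + 1} in detail:" + sgl.gen("detailed_tip", max_tokens=64)

    # Join the fork results back into the main stream and summarize.
    for i in range(fork_size):
        s += f"Tip {i + 1}: " + forks[i]["detailed_tip"] + "\n"
    s += "In summary," + sgl.gen("summary", max_tokens=64)


# Assumes an sglang runtime is already serving at this placeholder address.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = parallel_tips.run(topic="better sleep")
print(state["summary"])
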