sglang 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (65)
  1. sglang/__init__.py +3 -1
  2. sglang/api.py +3 -3
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +148 -12
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/global_config.py +8 -1
  8. sglang/lang/interpreter.py +114 -67
  9. sglang/lang/ir.py +17 -2
  10. sglang/srt/constrained/fsm_cache.py +3 -0
  11. sglang/srt/flush_cache.py +1 -1
  12. sglang/srt/hf_transformers_utils.py +75 -1
  13. sglang/srt/layers/extend_attention.py +17 -0
  14. sglang/srt/layers/fused_moe.py +485 -0
  15. sglang/srt/layers/logits_processor.py +12 -7
  16. sglang/srt/layers/radix_attention.py +10 -3
  17. sglang/srt/layers/token_attention.py +16 -1
  18. sglang/srt/managers/controller/dp_worker.py +110 -0
  19. sglang/srt/managers/controller/infer_batch.py +619 -0
  20. sglang/srt/managers/controller/manager_multi.py +191 -0
  21. sglang/srt/managers/controller/manager_single.py +97 -0
  22. sglang/srt/managers/controller/model_runner.py +462 -0
  23. sglang/srt/managers/controller/radix_cache.py +267 -0
  24. sglang/srt/managers/controller/schedule_heuristic.py +59 -0
  25. sglang/srt/managers/controller/tp_worker.py +791 -0
  26. sglang/srt/managers/detokenizer_manager.py +45 -45
  27. sglang/srt/managers/io_struct.py +15 -11
  28. sglang/srt/managers/router/infer_batch.py +103 -59
  29. sglang/srt/managers/router/manager.py +1 -1
  30. sglang/srt/managers/router/model_rpc.py +175 -122
  31. sglang/srt/managers/router/model_runner.py +91 -104
  32. sglang/srt/managers/router/radix_cache.py +7 -1
  33. sglang/srt/managers/router/scheduler.py +6 -6
  34. sglang/srt/managers/tokenizer_manager.py +152 -89
  35. sglang/srt/model_config.py +4 -5
  36. sglang/srt/models/commandr.py +10 -13
  37. sglang/srt/models/dbrx.py +9 -15
  38. sglang/srt/models/gemma.py +8 -15
  39. sglang/srt/models/grok.py +671 -0
  40. sglang/srt/models/llama2.py +19 -15
  41. sglang/srt/models/llava.py +84 -20
  42. sglang/srt/models/llavavid.py +11 -20
  43. sglang/srt/models/mixtral.py +248 -118
  44. sglang/srt/models/mixtral_quant.py +373 -0
  45. sglang/srt/models/qwen.py +9 -13
  46. sglang/srt/models/qwen2.py +11 -13
  47. sglang/srt/models/stablelm.py +9 -15
  48. sglang/srt/models/yivl.py +17 -22
  49. sglang/srt/openai_api_adapter.py +140 -95
  50. sglang/srt/openai_protocol.py +10 -1
  51. sglang/srt/server.py +77 -42
  52. sglang/srt/server_args.py +51 -6
  53. sglang/srt/utils.py +124 -66
  54. sglang/test/test_programs.py +44 -0
  55. sglang/test/test_utils.py +32 -1
  56. sglang/utils.py +22 -4
  57. {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/METADATA +15 -9
  58. sglang-0.1.17.dist-info/RECORD +81 -0
  59. sglang/srt/backend_config.py +0 -13
  60. sglang/srt/models/dbrx_config.py +0 -281
  61. sglang/srt/weight_utils.py +0 -417
  62. sglang-0.1.16.dist-info/RECORD +0 -72
  63. {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
  64. {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
  65. {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py

@@ -1,12 +1,17 @@
 # Adapted from
-# https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Iterable
 
 import torch
+import tqdm
 from torch import nn
 from transformers import LlamaConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.config import CacheConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -20,11 +25,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.router.model_runner import InputMetadata
-from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator
+from sglang.srt.managers.controller.model_runner import InputMetadata
 
 
 class LlamaMLP(nn.Module):
@@ -152,6 +157,10 @@ class LlamaDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
@@ -250,6 +259,7 @@ class LlamaForCausalLM(nn.Module):
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -270,13 +280,7 @@
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -286,9 +290,9 @@ class LlamaForCausalLM(nn.Module):
            ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision
-        ):
+        if get_tensor_model_parallel_rank() == 0:
+            weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
sglang/srt/models/llava.py

@@ -1,23 +1,26 @@
 """Inference-only LLaVa model compatible with HuggingFace weights."""
 
-from typing import List, Optional
+from typing import List, Iterable, Optional, Tuple
 
 import numpy as np
 import torch
 from torch import nn
-from transformers import CLIPVisionModel, LlavaConfig
+from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.router.infer_batch import ForwardMode
-from sglang.srt.managers.router.model_runner import InputMetadata
+from sglang.srt.managers.controller.infer_batch import ForwardMode
+from sglang.srt.managers.controller.model_runner import InputMetadata
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
 from sglang.srt.models.llama2 import LlamaForCausalLM
-from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.mistral import MistralForCausalLM
 
 
 class LlavaLlamaForCausalLM(nn.Module):
@@ -25,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -233,13 +237,7 @@ class LlavaLlamaForCausalLM(nn.Module):
         elif input_metadata.forward_mode == ForwardMode.DECODE:
             return self.language_model(input_ids, positions, input_metadata)
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # load clip vision model by cfg['mm_vision_tower']:
         # huggingface_name or path_of_clip_relative_to_llava_model_dir
         vision_path = self.config.mm_vision_tower
@@ -272,9 +270,8 @@ class LlavaLlamaForCausalLM(nn.Module):
            "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
         }
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision
-        ):
+        weights = list(weights)
+        for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
             if "projector" in name or "vision_tower" in name:
                 for weight_name, param_name in projector_weights.items():
@@ -285,9 +282,7 @@ class LlavaLlamaForCausalLM(nn.Module):
                     weight_loader(param, loaded_weight)
 
         # load language model
-        self.language_model.load_weights(
-            model_name_or_path, cache_dir, load_format, revision
-        )
+        self.language_model.load_weights(weights)
 
         monkey_path_clip_vision_embed_forward()
 
@@ -296,8 +291,73 @@
         return self.image_size // self.patch_size
 
 
-first_call = True
+class LlavaQwenForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
+        self.config = config
+        self.vision_tower = None
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 151646
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(config, quant_config=quant_config)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
 
+class LlavaMistralForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
+        self.config = config
+        self.vision_tower = None
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = MistralConfig(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 32000
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = MistralForCausalLM(config, quant_config=quant_config)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+first_call = True
 
 def clip_vision_embed_forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
     batch_size = pixel_values.shape[0]
@@ -328,4 +388,8 @@ def monkey_path_clip_vision_embed_forward():
     )
 
 
-EntryClass = LlavaLlamaForCausalLM
+EntryClass = [
+    LlavaLlamaForCausalLM,
+    LlavaQwenForCausalLM,
+    LlavaMistralForCausalLM
+]
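
Note: `EntryClass` changes from a single class to a list, so one module can register several LLaVA variants (Llama, Qwen2, and Mistral language backbones). A hedged sketch of how a registry might consume either form; sglang's actual resolution code is not part of this diff:

```python
# Hypothetical sketch: resolving model classes from a module whose EntryClass
# may be a single class or a list of classes.
import importlib
from typing import Dict, List, Type


def entry_classes(module_name: str) -> Dict[str, Type]:
    """Map architecture names (class __name__) to model classes."""
    module = importlib.import_module(module_name)
    entry = getattr(module, "EntryClass")
    classes: List[Type] = entry if isinstance(entry, list) else [entry]
    return {cls.__name__: cls for cls in classes}


# arch_to_cls = entry_classes("sglang.srt.models.llava")
# model_cls = arch_to_cls["LlavaQwenForCausalLM"]
```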
sglang/srt/models/llavavid.py

@@ -1,24 +1,24 @@
 """Inference-only LLaVa video model compatible with HuggingFace weights."""
 
-import os
-from typing import List, Optional
+from typing import List, Iterable, Optional, Tuple
 
 import numpy as np
 import torch
 from torch import nn
-from transformers import CLIPVisionModel, LlamaConfig, LlavaConfig
+from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.router.infer_batch import ForwardMode
-from sglang.srt.managers.router.model_runner import InputMetadata
+from sglang.srt.managers.controller.infer_batch import ForwardMode
+from sglang.srt.managers.controller.model_runner import InputMetadata
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
 from sglang.srt.models.llama2 import LlamaForCausalLM
-from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator
 
 
 class LlavaVidForCausalLM(nn.Module):
@@ -26,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -65,7 +66,6 @@ class LlavaVidForCausalLM(nn.Module):
         pad_ids = pad_value * (
             (new_image_feature_len + len(pad_value)) // len(pad_value)
         )
-        # print(input_ids)
         offset = input_ids.index(self.config.image_token_index)
         # old_len + pad_len - 1, because we need to remove image_token_id
         new_input_ids = (
@@ -200,13 +200,7 @@ class LlavaVidForCausalLM(nn.Module):
         elif input_metadata.forward_mode == ForwardMode.DECODE:
             return self.language_model(input_ids, positions, input_metadata)
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # load clip vision model by cfg['mm_vision_tower']:
         # huggingface_name or path_of_clip_relative_to_llava_model_dir
         vision_path = self.config.mm_vision_tower
@@ -244,9 +238,8 @@ class LlavaVidForCausalLM(nn.Module):
            "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
         }
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision
-        ):
+        weights = list(weights)
+        for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
             if "projector" in name or "vision_tower" in name:
                 for weight_name, param_name in projector_weights.items():
@@ -261,9 +254,7 @@ class LlavaVidForCausalLM(nn.Module):
                     weight_loader(param, loaded_weight)
 
         # load language model
-        self.language_model.load_weights(
-            model_name_or_path, cache_dir, load_format, revision
-        )
+        self.language_model.load_weights(weights)
 
         monkey_path_clip_vision_embed_forward()
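
Note: the `weights = list(weights)` line in both LLaVA loaders exists because the incoming iterable may be a one-shot generator: the loader scans it once for projector/vision-tower weights and then passes the same pairs to `language_model.load_weights()`. A small self-contained illustration of the pitfall it avoids (the names below are illustrative only):

```python
# Why the loaders materialize the weights iterator: a generator can only be
# consumed once, so a second pass over it would silently see nothing.
def fake_weights():
    yield "multi_modal_projector.linear_1.weight", "tensor-1"
    yield "model.layers.0.self_attn.qkv_proj.weight", "tensor-2"


gen = fake_weights()
projector_names = [n for n, _ in gen if "projector" in n]  # first pass
assert projector_names == ["multi_modal_projector.linear_1.weight"]
assert list(gen) == []  # second pass: the generator is already exhausted

# Materializing first keeps both passes working on the same data.
pairs = list(fake_weights())
assert [n for n, _ in pairs if "projector" in n]  # projector scan still works
assert len(pairs) == 2                            # full list remains for the LM
```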