sglang-0.1.16-py3-none-any.whl → sglang-0.1.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +3 -1
- sglang/api.py +3 -3
- sglang/backend/anthropic.py +1 -1
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +148 -12
- sglang/backend/runtime_endpoint.py +18 -10
- sglang/global_config.py +8 -1
- sglang/lang/interpreter.py +114 -67
- sglang/lang/ir.py +17 -2
- sglang/srt/constrained/fsm_cache.py +3 -0
- sglang/srt/flush_cache.py +1 -1
- sglang/srt/hf_transformers_utils.py +75 -1
- sglang/srt/layers/extend_attention.py +17 -0
- sglang/srt/layers/fused_moe.py +485 -0
- sglang/srt/layers/logits_processor.py +12 -7
- sglang/srt/layers/radix_attention.py +10 -3
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/controller/dp_worker.py +110 -0
- sglang/srt/managers/controller/infer_batch.py +619 -0
- sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang/srt/managers/controller/manager_single.py +97 -0
- sglang/srt/managers/controller/model_runner.py +462 -0
- sglang/srt/managers/controller/radix_cache.py +267 -0
- sglang/srt/managers/controller/schedule_heuristic.py +59 -0
- sglang/srt/managers/controller/tp_worker.py +791 -0
- sglang/srt/managers/detokenizer_manager.py +45 -45
- sglang/srt/managers/io_struct.py +15 -11
- sglang/srt/managers/router/infer_batch.py +103 -59
- sglang/srt/managers/router/manager.py +1 -1
- sglang/srt/managers/router/model_rpc.py +175 -122
- sglang/srt/managers/router/model_runner.py +91 -104
- sglang/srt/managers/router/radix_cache.py +7 -1
- sglang/srt/managers/router/scheduler.py +6 -6
- sglang/srt/managers/tokenizer_manager.py +152 -89
- sglang/srt/model_config.py +4 -5
- sglang/srt/models/commandr.py +10 -13
- sglang/srt/models/dbrx.py +9 -15
- sglang/srt/models/gemma.py +8 -15
- sglang/srt/models/grok.py +671 -0
- sglang/srt/models/llama2.py +19 -15
- sglang/srt/models/llava.py +84 -20
- sglang/srt/models/llavavid.py +11 -20
- sglang/srt/models/mixtral.py +248 -118
- sglang/srt/models/mixtral_quant.py +373 -0
- sglang/srt/models/qwen.py +9 -13
- sglang/srt/models/qwen2.py +11 -13
- sglang/srt/models/stablelm.py +9 -15
- sglang/srt/models/yivl.py +17 -22
- sglang/srt/openai_api_adapter.py +140 -95
- sglang/srt/openai_protocol.py +10 -1
- sglang/srt/server.py +77 -42
- sglang/srt/server_args.py +51 -6
- sglang/srt/utils.py +124 -66
- sglang/test/test_programs.py +44 -0
- sglang/test/test_utils.py +32 -1
- sglang/utils.py +22 -4
- {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/METADATA +15 -9
- sglang-0.1.17.dist-info/RECORD +81 -0
- sglang/srt/backend_config.py +0 -13
- sglang/srt/models/dbrx_config.py +0 -281
- sglang/srt/weight_utils.py +0 -417
- sglang-0.1.16.dist-info/RECORD +0 -72
- {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
- {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
- {sglang-0.1.16.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama2.py
CHANGED
@@ -1,12 +1,17 @@
 # Adapted from
-# https://github.com/vllm-project/vllm/blob/
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Iterable
 
 import torch
+import tqdm
 from torch import nn
 from transformers import LlamaConfig
-from vllm.
+from vllm.config import CacheConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -20,11 +25,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.
-from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator
+from sglang.srt.managers.controller.model_runner import InputMetadata
 
 
 class LlamaMLP(nn.Module):
@@ -152,6 +157,10 @@ class LlamaDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+                config, "original_max_position_embeddings", None):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
@@ -250,6 +259,7 @@ class LlamaForCausalLM(nn.Module):
         self,
         config: LlamaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -270,13 +280,7 @@ class LlamaForCausalLM(nn.Module):
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -286,9 +290,9 @@ class LlamaForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
-
-
-
+        if get_tensor_model_parallel_rank() == 0:
+            weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
+        for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
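The key interface change above is the `load_weights` signature: instead of taking `model_name_or_path`, `cache_dir`, `load_format`, and `revision` and pulling weights itself via `hf_model_weights_iterator`, the model now consumes an iterator of `(name, tensor)` pairs supplied by the runtime, following vLLM's loader convention. Below is a minimal sketch of a compatible caller, assuming a plain local PyTorch state dict; `iter_checkpoint_weights` and the checkpoint path are illustrative names, not sglang or vLLM APIs.

```python
# A minimal sketch of feeding the new (name, tensor) interface from a local
# PyTorch state dict. `iter_checkpoint_weights` is a hypothetical helper.
from typing import Iterable, Tuple

import torch


def iter_checkpoint_weights(path: str) -> Iterable[Tuple[str, torch.Tensor]]:
    """Yield (parameter_name, tensor) pairs from a PyTorch state dict file."""
    state_dict = torch.load(path, map_location="cpu")
    for name, tensor in state_dict.items():
        yield name, tensor


# Hypothetical usage with the new signature; the model walks the iterator once:
# model = LlamaForCausalLM(config)
# model.load_weights(iter_checkpoint_weights("/path/to/pytorch_model.bin"))
```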
sglang/srt/models/llava.py
CHANGED
@@ -1,23 +1,26 @@
 """Inference-only LLaVa model compatible with HuggingFace weights."""
 
-from typing import List, Optional
+from typing import List, Iterable, Optional, Tuple
 
 import numpy as np
 import torch
 from torch import nn
-from transformers import CLIPVisionModel, LlavaConfig
+from transformers import CLIPVisionModel, CLIPVisionConfig, LlavaConfig, Qwen2Config, MistralConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.
-from sglang.srt.managers.
+from sglang.srt.managers.controller.infer_batch import ForwardMode
+from sglang.srt.managers.controller.model_runner import InputMetadata
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
 from sglang.srt.models.llama2 import LlamaForCausalLM
-from sglang.srt.
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.mistral import MistralForCausalLM
 
 
 class LlavaLlamaForCausalLM(nn.Module):
@@ -25,6 +28,7 @@ class LlavaLlamaForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -233,13 +237,7 @@ class LlavaLlamaForCausalLM(nn.Module):
         elif input_metadata.forward_mode == ForwardMode.DECODE:
             return self.language_model(input_ids, positions, input_metadata)
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # load clip vision model by cfg['mm_vision_tower']:
         # huggingface_name or path_of_clip_relative_to_llava_model_dir
         vision_path = self.config.mm_vision_tower
@@ -272,9 +270,8 @@ class LlavaLlamaForCausalLM(nn.Module):
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
         }
         params_dict = dict(self.named_parameters())
-
-
-        ):
+        weights = list(weights)
+        for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
             if "projector" in name or "vision_tower" in name:
                 for weight_name, param_name in projector_weights.items():
@@ -285,9 +282,7 @@ class LlavaLlamaForCausalLM(nn.Module):
                     weight_loader(param, loaded_weight)
 
         # load language model
-        self.language_model.load_weights(
-            model_name_or_path, cache_dir, load_format, revision
-        )
+        self.language_model.load_weights(weights)
 
         monkey_path_clip_vision_embed_forward()
 
@@ -296,8 +291,73 @@ class LlavaLlamaForCausalLM(nn.Module):
         return self.image_size // self.patch_size
 
 
-
+class LlavaQwenForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
+        self.config = config
+        self.vision_tower = None
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 151646
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(config, quant_config=quant_config)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
 
+class LlavaMistralForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__(config, quant_config=quant_config, cache_config=cache_config)
+        self.config = config
+        self.vision_tower = None
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = MistralConfig(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 32000
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = MistralForCausalLM(config, quant_config=quant_config)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+first_call = True
 
 def clip_vision_embed_forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
     batch_size = pixel_values.shape[0]
@@ -328,4 +388,8 @@ def monkey_path_clip_vision_embed_forward():
     )
 
 
-EntryClass =
+EntryClass = [
+    LlavaLlamaForCausalLM,
+    LlavaQwenForCausalLM,
+    LlavaMistralForCausalLM
+]
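Both new wrappers (`LlavaQwenForCausalLM`, `LlavaMistralForCausalLM`) follow the same pattern: fill in config attributes only when the HuggingFace config leaves them unset (for example, `projector_hidden_act` defaults to `"gelu"`, and `image_token_index` to 151646 for Qwen or 32000 for Mistral), then build the projector and the wrapped language model. The standalone sketch below illustrates that default-filling pattern; `DummyConfig` and `fill_missing` are made-up names for the example, not sglang or transformers APIs.

```python
# Standalone illustration of the getattr/setattr default-filling pattern used
# by the new Llava* wrappers. `DummyConfig` and `fill_missing` are hypothetical.
class DummyConfig:
    """Stand-in for a HuggingFace config object with some attributes unset."""
    projector_hidden_act = None
    image_token_index = None


def fill_missing(config, defaults):
    """Set each attribute only if the config does not already define it."""
    for key, value in defaults.items():
        if getattr(config, key, None) is None:
            setattr(config, key, value)
    return config


cfg = fill_missing(DummyConfig(), {"projector_hidden_act": "gelu",
                                   "image_token_index": 151646})
assert cfg.projector_hidden_act == "gelu"
assert cfg.image_token_index == 151646
```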
sglang/srt/models/llavavid.py
CHANGED
@@ -1,24 +1,24 @@
 """Inference-only LLaVa video model compatible with HuggingFace weights."""
 
-import
-from typing import List, Optional
+from typing import List, Iterable, Optional, Tuple
 
 import numpy as np
 import torch
 from torch import nn
-from transformers import CLIPVisionModel,
+from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.
-from sglang.srt.managers.
+from sglang.srt.managers.controller.infer_batch import ForwardMode
+from sglang.srt.managers.controller.model_runner import InputMetadata
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
 from sglang.srt.models.llama2 import LlamaForCausalLM
-from sglang.srt.weight_utils import default_weight_loader, hf_model_weights_iterator
 
 
 class LlavaVidForCausalLM(nn.Module):
@@ -26,6 +26,7 @@ class LlavaVidForCausalLM(nn.Module):
         self,
         config: LlavaConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
     ) -> None:
         super().__init__()
         self.config = config
@@ -65,7 +66,6 @@ class LlavaVidForCausalLM(nn.Module):
         pad_ids = pad_value * (
            (new_image_feature_len + len(pad_value)) // len(pad_value)
         )
-        # print(input_ids)
         offset = input_ids.index(self.config.image_token_index)
         # old_len + pad_len - 1, because we need to remove image_token_id
         new_input_ids = (
@@ -200,13 +200,7 @@ class LlavaVidForCausalLM(nn.Module):
         elif input_metadata.forward_mode == ForwardMode.DECODE:
             return self.language_model(input_ids, positions, input_metadata)
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # load clip vision model by cfg['mm_vision_tower']:
         # huggingface_name or path_of_clip_relative_to_llava_model_dir
         vision_path = self.config.mm_vision_tower
@@ -244,9 +238,8 @@ class LlavaVidForCausalLM(nn.Module):
             "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
         }
         params_dict = dict(self.named_parameters())
-
-
-        ):
+        weights = list(weights)
+        for name, loaded_weight in weights:
             # FIXME: why projector weights read two times?
             if "projector" in name or "vision_tower" in name:
                 for weight_name, param_name in projector_weights.items():
@@ -261,9 +254,7 @@ class LlavaVidForCausalLM(nn.Module):
                     weight_loader(param, loaded_weight)
 
         # load language model
-        self.language_model.load_weights(
-            model_name_or_path, cache_dir, load_format, revision
-        )
+        self.language_model.load_weights(weights)
 
         monkey_path_clip_vision_embed_forward()
 
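As in `llava.py`, the rewritten loader materializes the incoming iterator (`weights = list(weights)`) before its first pass, because the same pairs are walked twice: once to pick up projector and vision-tower tensors, and again inside the delegated `self.language_model.load_weights(weights)`. A plain generator would be exhausted after the first pass; the snippet below illustrates the difference (standalone Python, not sglang code).

```python
# Why `weights = list(weights)` matters: a generator yields its items once,
# so a second pass over the same object sees nothing. Plain illustration.
def weight_stream():
    yield ("vision_tower.proj", 1)
    yield ("model.layers.0.weight", 2)


gen = weight_stream()
first_pass = [name for name, _ in gen]    # consumes the generator
second_pass = [name for name, _ in gen]   # already exhausted
assert first_pass == ["vision_tower.proj", "model.layers.0.weight"]
assert second_pass == []

weights = list(weight_stream())           # materialize once
assert [n for n, _ in weights] == [n for n, _ in weights]  # reusable after list()
```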