sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +17 -8
- sglang/bench_serving.py +33 -38
- sglang/global_config.py +5 -17
- sglang/lang/backend/runtime_endpoint.py +5 -2
- sglang/lang/interpreter.py +1 -4
- sglang/launch_server.py +3 -6
- sglang/launch_server_llavavid.py +7 -8
- sglang/srt/{model_config.py → configs/model_config.py} +5 -0
- sglang/srt/constrained/__init__.py +2 -0
- sglang/srt/constrained/fsm_cache.py +33 -38
- sglang/srt/constrained/jump_forward.py +0 -1
- sglang/srt/conversation.py +4 -1
- sglang/srt/hf_transformers_utils.py +1 -3
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention_backend.py +480 -0
- sglang/srt/layers/flashinfer_utils.py +235 -0
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +12 -0
- sglang/srt/layers/logits_processor.py +64 -77
- sglang/srt/layers/radix_attention.py +11 -161
- sglang/srt/layers/sampler.py +38 -122
- sglang/srt/layers/torchao_utils.py +75 -0
- sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
- sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
- sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
- sglang/srt/lora/lora.py +403 -0
- sglang/srt/lora/lora_config.py +43 -0
- sglang/srt/lora/lora_manager.py +259 -0
- sglang/srt/managers/controller_multi.py +1 -5
- sglang/srt/managers/controller_single.py +0 -5
- sglang/srt/managers/io_struct.py +16 -1
- sglang/srt/managers/policy_scheduler.py +122 -5
- sglang/srt/managers/schedule_batch.py +105 -71
- sglang/srt/managers/tokenizer_manager.py +17 -8
- sglang/srt/managers/tp_worker.py +188 -121
- sglang/srt/model_executor/cuda_graph_runner.py +69 -133
- sglang/srt/model_executor/forward_batch_info.py +35 -312
- sglang/srt/model_executor/model_runner.py +123 -154
- sglang/srt/models/baichuan.py +416 -0
- sglang/srt/models/chatglm.py +1 -5
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/exaone.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +1 -5
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama.py +51 -5
- sglang/srt/models/llama_classification.py +1 -20
- sglang/srt/models/llava.py +30 -5
- sglang/srt/models/llavavid.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/minicpm3.py +669 -0
- sglang/srt/models/mixtral.py +6 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/models/qwen.py +1 -5
- sglang/srt/models/qwen2.py +1 -5
- sglang/srt/models/qwen2_moe.py +6 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/xverse.py +375 -0
- sglang/srt/models/xverse_moe.py +445 -0
- sglang/srt/openai_api/adapter.py +65 -46
- sglang/srt/openai_api/protocol.py +11 -3
- sglang/srt/sampling/sampling_batch_info.py +46 -80
- sglang/srt/server.py +30 -15
- sglang/srt/server_args.py +163 -28
- sglang/srt/utils.py +19 -51
- sglang/test/few_shot_gsm8k.py +132 -0
- sglang/test/runners.py +114 -22
- sglang/test/test_programs.py +7 -5
- sglang/test/test_utils.py +85 -2
- sglang/utils.py +32 -37
- sglang/version.py +1 -1
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
- sglang-0.3.1.post1.dist-info/RECORD +130 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
- sglang-0.3.0.dist-info/RECORD +0 -118
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@ limitations under the License.
|
|
18
18
|
import asyncio
|
19
19
|
import concurrent.futures
|
20
20
|
import dataclasses
|
21
|
+
import json
|
21
22
|
import logging
|
22
23
|
import multiprocessing as mp
|
23
24
|
import os
|
@@ -77,7 +78,6 @@ class TokenizerManager:
|
|
77
78
|
self,
|
78
79
|
server_args: ServerArgs,
|
79
80
|
port_args: PortArgs,
|
80
|
-
model_override_args: dict = None,
|
81
81
|
):
|
82
82
|
self.server_args = server_args
|
83
83
|
|
@@ -95,7 +95,7 @@ class TokenizerManager:
|
|
95
95
|
self.hf_config = get_config(
|
96
96
|
self.model_path,
|
97
97
|
trust_remote_code=server_args.trust_remote_code,
|
98
|
-
model_override_args=model_override_args,
|
98
|
+
model_override_args=json.loads(server_args.json_model_override_args),
|
99
99
|
)
|
100
100
|
self.is_generation = is_generation_model(
|
101
101
|
self.hf_config.architectures, self.server_args.is_embedding
|
@@ -188,6 +188,7 @@ class TokenizerManager:
|
|
188
188
|
pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
|
189
189
|
obj.image_data if not_use_index else obj.image_data[index]
|
190
190
|
)
|
191
|
+
modalities = obj.modalities
|
191
192
|
return_logprob = (
|
192
193
|
obj.return_logprob if not_use_index else obj.return_logprob[index]
|
193
194
|
)
|
@@ -196,8 +197,6 @@ class TokenizerManager:
|
|
196
197
|
if not_use_index
|
197
198
|
else obj.logprob_start_len[index]
|
198
199
|
)
|
199
|
-
if return_logprob and logprob_start_len == -1:
|
200
|
-
logprob_start_len = len(input_ids) - 1
|
201
200
|
top_logprobs_num = (
|
202
201
|
obj.top_logprobs_num
|
203
202
|
if not_use_index
|
@@ -243,14 +242,13 @@ class TokenizerManager:
|
|
243
242
|
pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
|
244
243
|
obj.image_data[0]
|
245
244
|
)
|
245
|
+
modalities = obj.modalities
|
246
246
|
return_logprob = obj.return_logprob[0]
|
247
247
|
logprob_start_len = obj.logprob_start_len[0]
|
248
248
|
top_logprobs_num = obj.top_logprobs_num[0]
|
249
249
|
|
250
250
|
# Send to the controller
|
251
251
|
if self.is_generation:
|
252
|
-
if return_logprob and logprob_start_len == -1:
|
253
|
-
logprob_start_len = len(input_ids) - 1
|
254
252
|
tokenized_obj = TokenizedGenerateReqInput(
|
255
253
|
rid,
|
256
254
|
input_text,
|
@@ -263,6 +261,12 @@ class TokenizerManager:
|
|
263
261
|
logprob_start_len,
|
264
262
|
top_logprobs_num,
|
265
263
|
obj.stream,
|
264
|
+
modalities,
|
265
|
+
(
|
266
|
+
obj.lora_path[index]
|
267
|
+
if isinstance(obj.lora_path, list)
|
268
|
+
else obj.lora_path
|
269
|
+
),
|
266
270
|
)
|
267
271
|
else: # is embedding
|
268
272
|
tokenized_obj = TokenizedEmbeddingReqInput(
|
@@ -341,11 +345,10 @@ class TokenizerManager:
|
|
341
345
|
sampling_params = self._get_sampling_params(obj.sampling_params[index])
|
342
346
|
|
343
347
|
if self.is_generation:
|
344
|
-
if obj.return_logprob[index] and obj.logprob_start_len[index] == -1:
|
345
|
-
obj.logprob_start_len[index] = len(input_ids) - 1
|
346
348
|
pixel_values, image_hashes, image_sizes = (
|
347
349
|
await self._get_pixel_values(obj.image_data[index])
|
348
350
|
)
|
351
|
+
modalities = obj.modalities
|
349
352
|
|
350
353
|
tokenized_obj = TokenizedGenerateReqInput(
|
351
354
|
rid,
|
@@ -359,6 +362,12 @@ class TokenizerManager:
|
|
359
362
|
obj.logprob_start_len[index],
|
360
363
|
obj.top_logprobs_num[index],
|
361
364
|
obj.stream,
|
365
|
+
modalities,
|
366
|
+
(
|
367
|
+
obj.lora_path[index]
|
368
|
+
if isinstance(obj.lora_path, list)
|
369
|
+
else obj.lora_path
|
370
|
+
),
|
362
371
|
)
|
363
372
|
else:
|
364
373
|
tokenized_obj = TokenizedEmbeddingReqInput(
|