sglang 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +7 -1
- sglang/bench_latency.py +3 -2
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/interpreter.py +4 -2
- sglang/lang/ir.py +13 -4
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/layers/activation.py +0 -1
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/radix_attention.py +38 -14
- sglang/srt/managers/schedule_batch.py +9 -14
- sglang/srt/managers/tokenizer_manager.py +1 -1
- sglang/srt/managers/tp_worker.py +1 -7
- sglang/srt/model_executor/cuda_graph_runner.py +48 -17
- sglang/srt/model_executor/forward_batch_info.py +132 -58
- sglang/srt/model_executor/model_runner.py +61 -28
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/deepseek.py +2 -2
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +11 -5
- sglang/srt/models/grok.py +50 -396
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/mixtral.py +56 -254
- sglang/srt/models/mixtral_quant.py +1 -4
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +32 -21
- sglang/srt/sampling_params.py +0 -4
- sglang/srt/server.py +23 -15
- sglang/srt/server_args.py +7 -1
- sglang/srt/utils.py +1 -2
- sglang/test/runners.py +18 -10
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +5 -1
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/METADATA +12 -4
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/RECORD +48 -48
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
sglang/srt/server.py
CHANGED
@@ -288,6 +288,8 @@ def launch_server(
|
|
288
288
|
|
289
289
|
# Launch processes
|
290
290
|
tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
|
291
|
+
if server_args.chat_template:
|
292
|
+
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
|
291
293
|
pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
|
292
294
|
pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
|
293
295
|
|
@@ -358,6 +360,7 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
358
360
|
os.environ["NCCL_CUMEM_ENABLE"] = "0"
|
359
361
|
os.environ["NCCL_NVLS_ENABLE"] = "0"
|
360
362
|
os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
|
363
|
+
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
|
361
364
|
|
362
365
|
# Set ulimit
|
363
366
|
set_ulimit()
|
@@ -375,16 +378,11 @@ def _set_envs_and_config(server_args: ServerArgs):
|
|
375
378
|
# FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
|
376
379
|
maybe_set_triton_cache_manager()
|
377
380
|
|
378
|
-
# Set global chat template
|
379
|
-
if server_args.chat_template:
|
380
|
-
# TODO: replace this with huggingface transformers template
|
381
|
-
load_chat_template_for_openai_api(server_args.chat_template)
|
382
|
-
|
383
381
|
# Check flashinfer version
|
384
382
|
if not server_args.disable_flashinfer:
|
385
383
|
assert_pkg_version(
|
386
384
|
"flashinfer",
|
387
|
-
"0.1.
|
385
|
+
"0.1.5",
|
388
386
|
"Please uninstall the old version and "
|
389
387
|
"reinstall the latest version by following the instructions "
|
390
388
|
"at https://docs.flashinfer.ai/installation.html.",
|
@@ -533,11 +531,18 @@ class Runtime:
|
|
533
531
|
prompt: str,
|
534
532
|
sampling_params: Optional[Dict] = None,
|
535
533
|
):
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
534
|
+
if self.server_args.skip_tokenizer_init:
|
535
|
+
json_data = {
|
536
|
+
"input_ids": prompt,
|
537
|
+
"sampling_params": sampling_params,
|
538
|
+
"stream": True,
|
539
|
+
}
|
540
|
+
else:
|
541
|
+
json_data = {
|
542
|
+
"text": prompt,
|
543
|
+
"sampling_params": sampling_params,
|
544
|
+
"stream": True,
|
545
|
+
}
|
541
546
|
pos = 0
|
542
547
|
|
543
548
|
timeout = aiohttp.ClientTimeout(total=3 * 3600)
|
@@ -549,10 +554,13 @@ class Runtime:
|
|
549
554
|
if chunk == "data: [DONE]\n\n":
|
550
555
|
break
|
551
556
|
data = json.loads(chunk[5:].strip("\n"))
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
557
|
+
if hasattr(data, "text"):
|
558
|
+
cur = data["text"][pos:]
|
559
|
+
if cur:
|
560
|
+
yield cur
|
561
|
+
pos += len(cur)
|
562
|
+
else:
|
563
|
+
yield data
|
556
564
|
|
557
565
|
add_request = async_generate
|
558
566
|
|
sglang/srt/server_args.py
CHANGED
@@ -17,9 +17,12 @@ limitations under the License.
|
|
17
17
|
|
18
18
|
import argparse
|
19
19
|
import dataclasses
|
20
|
+
import logging
|
20
21
|
import random
|
21
22
|
from typing import List, Optional, Union
|
22
23
|
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
23
26
|
|
24
27
|
@dataclasses.dataclass
|
25
28
|
class ServerArgs:
|
@@ -46,7 +49,7 @@ class ServerArgs:
|
|
46
49
|
max_running_requests: Optional[int] = None
|
47
50
|
max_num_reqs: Optional[int] = None
|
48
51
|
max_total_tokens: Optional[int] = None
|
49
|
-
chunked_prefill_size: int =
|
52
|
+
chunked_prefill_size: int = 8192
|
50
53
|
max_prefill_tokens: int = 16384
|
51
54
|
schedule_policy: str = "lpm"
|
52
55
|
schedule_conservativeness: float = 1.0
|
@@ -446,6 +449,9 @@ class ServerArgs:
|
|
446
449
|
assert not (
|
447
450
|
self.dp_size > 1 and self.node_rank is not None
|
448
451
|
), "multi-node data parallel is not supported"
|
452
|
+
if "gemma-2" in self.model_path.lower():
|
453
|
+
logger.info(f"When using sliding window in gemma-2, turn on flashinfer.")
|
454
|
+
self.disable_flashinfer = False
|
449
455
|
|
450
456
|
|
451
457
|
@dataclasses.dataclass
|
sglang/srt/utils.py
CHANGED
@@ -35,7 +35,6 @@ import torch
|
|
35
35
|
import torch.distributed as dist
|
36
36
|
from fastapi.responses import JSONResponse
|
37
37
|
from packaging import version as pkg_version
|
38
|
-
from starlette.middleware.base import BaseHTTPMiddleware
|
39
38
|
from torch.nn.parameter import Parameter
|
40
39
|
from triton.runtime.cache import (
|
41
40
|
FileCacheManager,
|
@@ -644,7 +643,7 @@ def set_ulimit(target_soft_limit=65535):
|
|
644
643
|
logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
|
645
644
|
|
646
645
|
|
647
|
-
def
|
646
|
+
def is_llama3_405b_fp8_head_16(model_config):
|
648
647
|
"""Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
|
649
648
|
if (
|
650
649
|
model_config.hf_config.architectures[0] == "LlamaForCausalLM"
|
sglang/test/runners.py
CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
|
|
15
15
|
|
16
16
|
import json
|
17
17
|
import multiprocessing
|
18
|
+
import os
|
18
19
|
from dataclasses import dataclass
|
19
20
|
from typing import List, Union
|
20
21
|
|
@@ -31,8 +32,14 @@ DEFAULT_PROMPTS = [
|
|
31
32
|
"The capital of the United Kindom is",
|
32
33
|
"Today is a sunny day and I like",
|
33
34
|
"AI is a field of computer science focused on",
|
35
|
+
"Apple is red. Banana is Yellow. " * 800 + "Apple is",
|
34
36
|
]
|
35
37
|
|
38
|
+
dirpath = os.path.dirname(__file__)
|
39
|
+
with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
|
40
|
+
long_prompt = f.read()
|
41
|
+
DEFAULT_PROMPTS.append(long_prompt)
|
42
|
+
|
36
43
|
NUM_TOP_LOGPROBS = 5
|
37
44
|
|
38
45
|
|
@@ -125,16 +132,14 @@ class HFRunner:
|
|
125
132
|
)
|
126
133
|
|
127
134
|
logits = self.model.forward(input_ids).logits[0]
|
128
|
-
logprobs = F.log_softmax(
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
# print("index",
|
133
|
-
logprobs
|
134
|
-
|
135
|
-
|
136
|
-
]
|
137
|
-
prefill_logprobs.append(logprobs)
|
135
|
+
logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
|
136
|
+
logprobs, top_indices = torch.topk(
|
137
|
+
logprobs, k=NUM_TOP_LOGPROBS, dim=-1
|
138
|
+
)
|
139
|
+
# print("index", top_indices)
|
140
|
+
prefill_logprobs.append(logprobs.tolist())
|
141
|
+
del logits
|
142
|
+
del logprobs
|
138
143
|
|
139
144
|
out_queue.put(
|
140
145
|
ModelOutput(
|
@@ -174,6 +179,7 @@ class SRTRunner:
|
|
174
179
|
tp_size=1,
|
175
180
|
torch_dtype=torch.float16,
|
176
181
|
is_generation_model=None,
|
182
|
+
port=5157,
|
177
183
|
):
|
178
184
|
self.is_generation_model = (
|
179
185
|
is_generation_model(model_path)
|
@@ -184,6 +190,8 @@ class SRTRunner:
|
|
184
190
|
model_path=model_path,
|
185
191
|
tp_size=tp_size,
|
186
192
|
dtype=get_dtype_str(torch_dtype),
|
193
|
+
port=port,
|
194
|
+
mem_fraction_static=0.7,
|
187
195
|
)
|
188
196
|
|
189
197
|
def forward(
|
sglang/test/test_programs.py
CHANGED
@@ -103,16 +103,19 @@ def test_decode_int():
|
|
103
103
|
def test_decode_json_regex():
|
104
104
|
@sgl.function
|
105
105
|
def decode_json(s):
|
106
|
-
from sglang.lang.ir import REGEX_FLOAT, REGEX_INT,
|
106
|
+
from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
|
107
107
|
|
108
108
|
s += "Generate a JSON object to describe the basic city information of Paris.\n"
|
109
|
+
s += "Here are the JSON object:\n"
|
110
|
+
|
111
|
+
# NOTE: we recommend using dtype gen or whole regex string to control the output
|
109
112
|
|
110
113
|
with s.var_scope("json_output"):
|
111
114
|
s += "{\n"
|
112
|
-
s += ' "name": ' + sgl.gen(regex=
|
113
|
-
s += ' "population": ' + sgl.gen(regex=REGEX_INT
|
114
|
-
s += ' "area": ' + sgl.gen(regex=REGEX_INT
|
115
|
-
s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT) + "\n"
|
115
|
+
s += ' "name": ' + sgl.gen(regex=REGEX_STR) + ",\n"
|
116
|
+
s += ' "population": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
|
117
|
+
s += ' "area": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
|
118
|
+
s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT, stop=[" ", "\n"]) + "\n"
|
116
119
|
s += "}"
|
117
120
|
|
118
121
|
ret = decode_json.run(temperature=0.0)
|
@@ -359,6 +362,30 @@ def test_regex():
|
|
359
362
|
assert re.match(regex, answer)
|
360
363
|
|
361
364
|
|
365
|
+
def test_dtype_gen():
|
366
|
+
@sgl.function
|
367
|
+
def dtype_gen(s):
|
368
|
+
s += "Q: What is the full name of DNS?\n"
|
369
|
+
s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
|
370
|
+
s += "Q: Which year was DNS invented?\n"
|
371
|
+
s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
|
372
|
+
s += "Q: What is the value of pi?\n"
|
373
|
+
s += "A: " + sgl.gen("float_res", dtype=float) + "\n"
|
374
|
+
s += "Q: Is the sky blue?\n"
|
375
|
+
s += "A: " + sgl.gen("bool_res", dtype=bool) + "\n"
|
376
|
+
|
377
|
+
state = dtype_gen.run()
|
378
|
+
|
379
|
+
try:
|
380
|
+
state["int_res"] = int(state["int_res"])
|
381
|
+
state["float_res"] = float(state["float_res"])
|
382
|
+
state["bool_res"] = bool(state["bool_res"])
|
383
|
+
# assert state["str_res"].startswith('"') and state["str_res"].endswith('"')
|
384
|
+
except ValueError:
|
385
|
+
print(state)
|
386
|
+
raise
|
387
|
+
|
388
|
+
|
362
389
|
def test_completion_speculative():
|
363
390
|
@sgl.function(num_api_spec_tokens=64)
|
364
391
|
def gen_character_spec(s):
|
sglang/test/test_utils.py
CHANGED
@@ -21,7 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
|
21
21
|
from sglang.utils import get_exception_traceback
|
22
22
|
|
23
23
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
24
|
-
|
24
|
+
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
25
|
+
DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
|
26
|
+
DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
|
27
|
+
DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
|
28
|
+
DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
|
25
29
|
|
26
30
|
|
27
31
|
def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.2.
|
1
|
+
__version__ = "0.2.13"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.2.12
|
3
|
+
Version: 0.2.13
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
308
308
|
### Method 2: From source
|
309
309
|
```
|
310
310
|
# Use the last release branch
|
311
|
-
git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
|
311
|
+
git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
|
312
312
|
cd sglang
|
313
313
|
|
314
314
|
pip install --upgrade pip
|
@@ -329,11 +329,19 @@ docker run --gpus all \
|
|
329
329
|
--env "HF_TOKEN=<secret>" \
|
330
330
|
--ipc=host \
|
331
331
|
lmsysorg/sglang:latest \
|
332
|
-
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
|
332
|
+
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
|
333
333
|
```
|
334
334
|
|
335
|
+
### Method 4: Using docker compose
|
336
|
+
|
337
|
+
> This method is recommended if you plan to serve it as a service.
|
338
|
+
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
|
339
|
+
|
340
|
+
1. Copy the [compose.yaml](./docker/compose.yaml) to your local machine
|
341
|
+
2. Execute the command `docker compose up -d` in your terminal.
|
342
|
+
|
335
343
|
### Common Notes
|
336
|
-
-
|
344
|
+
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
|
337
345
|
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
338
346
|
|
339
347
|
## Backend: SGLang Runtime (SRT)
|
@@ -1,91 +1,91 @@
|
|
1
1
|
sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
|
2
|
-
sglang/api.py,sha256=
|
3
|
-
sglang/bench_latency.py,sha256=
|
2
|
+
sglang/api.py,sha256=sRuA17JzayE9SFOhaZFqKFJDb_aRpNlcyKiMA5BzsDk,6258
|
3
|
+
sglang/bench_latency.py,sha256=UM5noYvFb6hc7wS82WAFeWTx3u83vkg9pfhyW0KdvY4,16234
|
4
4
|
sglang/bench_serving.py,sha256=sS-fawAyzngrOVbPE3N1FBxPojoPd9vj9XQDsWpIYTQ,35798
|
5
5
|
sglang/check_env.py,sha256=oU8VmjjPK2SviRhr41cF1953soBu-eTT5E0Hf04zMzo,4974
|
6
|
-
sglang/global_config.py,sha256=
|
6
|
+
sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
|
7
7
|
sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
|
8
8
|
sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
|
9
9
|
sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
|
10
|
-
sglang/version.py,sha256=
|
10
|
+
sglang/version.py,sha256=C0atO05M0rfDTTHt02NxNa4jt0eSqXM4AxShEhb2epA,23
|
11
11
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
|
13
13
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
14
14
|
sglang/lang/compiler.py,sha256=1Tc6MQs4RsIfrNmmO7PMSUEHIqvNqKOp_HxaYqonwFE,7533
|
15
|
-
sglang/lang/interpreter.py,sha256=
|
16
|
-
sglang/lang/ir.py,sha256=
|
15
|
+
sglang/lang/interpreter.py,sha256=8QiLvjUgVJrtzIjS9lCUR01k7BeZWZQsmRAwLMz-cmA,30194
|
16
|
+
sglang/lang/ir.py,sha256=WOZdRbONMhhSeD75bvUeQRv4gObxVMtkvzmalRrVdkM,17261
|
17
17
|
sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
|
18
18
|
sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
|
20
20
|
sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
|
21
21
|
sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
|
22
22
|
sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
|
23
|
-
sglang/lang/backend/runtime_endpoint.py,sha256=
|
23
|
+
sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
|
24
24
|
sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
|
25
25
|
sglang/srt/conversation.py,sha256=V5YuoeO6-aLqGv0p3J2qx8TnBJbN1oTopYFutNul3GQ,16491
|
26
26
|
sglang/srt/hf_transformers_utils.py,sha256=Tf_RplcW7llVXsigRvSGqmeAUxBeAL8rPCkzuqWfZ8U,11925
|
27
27
|
sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
|
28
28
|
sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
|
29
|
-
sglang/srt/sampling_params.py,sha256=
|
30
|
-
sglang/srt/server.py,sha256=
|
31
|
-
sglang/srt/server_args.py,sha256=
|
32
|
-
sglang/srt/utils.py,sha256=
|
29
|
+
sglang/srt/sampling_params.py,sha256=CIrM-OLAjUJ8oSQfhXetjv50BAseexWYOV5Wr6LXYeY,4739
|
30
|
+
sglang/srt/server.py,sha256=gSGC6MJLLXsuusizKzTxJaaWiaQjsa-Zm5hxV2fYHb8,18845
|
31
|
+
sglang/srt/server_args.py,sha256=YoTVFzt65w1vjypyh0a4FV7BNreVGS49d8uf6TPrM_w,17083
|
32
|
+
sglang/srt/utils.py,sha256=MIDD53BT4ukaHO-zmEQZD5l7Xco_gefO0co4FJsMsn4,24053
|
33
33
|
sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
|
34
34
|
sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
|
35
35
|
sglang/srt/constrained/fsm_cache.py,sha256=QTrBFoZCp2FeigtIakz2MCgQLtvQFXgl2lDPQaGtu9M,2784
|
36
|
-
sglang/srt/constrained/jump_forward.py,sha256=
|
37
|
-
sglang/srt/layers/activation.py,sha256=
|
36
|
+
sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
|
37
|
+
sglang/srt/layers/activation.py,sha256=j2zQmY1snfB5DqrYr5KqRUEkMXQn6LVnkeur60FfMCU,1175
|
38
38
|
sglang/srt/layers/decode_attention.py,sha256=Vgxd2rWzSZkNFp0bjZRAUAusG4bz6iy3D0CULnN-cdk,8904
|
39
|
-
sglang/srt/layers/extend_attention.py,sha256=
|
40
|
-
sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
|
39
|
+
sglang/srt/layers/extend_attention.py,sha256=h4O0R7PJpAVKS3Vx_583zhrFPD0vv6XqzvOcHBI3zoc,14268
|
41
40
|
sglang/srt/layers/layernorm.py,sha256=RzN4eESN9S8mw32r2Nxarq7wKFdeG1yhxPmehUMx79s,2073
|
42
|
-
sglang/srt/layers/logits_processor.py,sha256=
|
41
|
+
sglang/srt/layers/logits_processor.py,sha256=wBgo6IVxWgV4vYRQesnuE2qA8ynB2oFtv0COZSAMIeA,11374
|
43
42
|
sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
|
44
43
|
sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
|
45
|
-
sglang/srt/layers/radix_attention.py,sha256=
|
44
|
+
sglang/srt/layers/radix_attention.py,sha256=EA7rc73ZGnle2tQlslF9Ri_VEY07jD0e0cPiKcsqOyA,8473
|
45
|
+
sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
|
46
|
+
sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
|
47
|
+
sglang/srt/layers/fused_moe/layer.py,sha256=ByNlMmmXsckcsjI12rhlg_IH0KvO6zWJoOYuk7i4ogY,20947
|
46
48
|
sglang/srt/managers/controller_multi.py,sha256=LYI-XE9h57DW8Uh4gpd8upsC3p2dd5weKzddEH274jg,6626
|
47
49
|
sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk4SwANKxTX-Y,5112
|
48
50
|
sglang/srt/managers/detokenizer_manager.py,sha256=OXufjdCt2ebt-S7MDndjY9Ew16rP4fhualGgj6YEKp0,6295
|
49
51
|
sglang/srt/managers/io_struct.py,sha256=Xvfl6DNZ2Ek2S4qlRzpVo3foc-aC-1-N-5odcJ4gdq4,9446
|
50
52
|
sglang/srt/managers/policy_scheduler.py,sha256=KRFaZwjCAkPQDX3W8lbzrxYqgOe7LKFDj2BPlcmlnR8,8379
|
51
|
-
sglang/srt/managers/schedule_batch.py,sha256=
|
52
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=
|
53
|
-
sglang/srt/managers/tp_worker.py,sha256=
|
53
|
+
sglang/srt/managers/schedule_batch.py,sha256=L9kBQZBfsy-2Arzkx4ZjKjNL-zN1BErnv9LqRi3CQNI,30657
|
54
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=4cf7JyuMGvLVp6Dv8pWG6c9285O6zuD2Ja0eEePUCNg,24857
|
55
|
+
sglang/srt/managers/tp_worker.py,sha256=TPtWHcLM-bh7GGdA7-8c-zdNLFeLxWNnl3iqODKwYWw,32583
|
54
56
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
|
55
57
|
sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
|
56
58
|
sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
|
57
59
|
sglang/srt/mem_cache/memory_pool.py,sha256=eXDCstd5Mvu1CbHt1y9z27Eq60QYwW45FsKbZspu4yw,5310
|
58
60
|
sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
|
59
|
-
sglang/srt/model_executor/cuda_graph_runner.py,sha256=
|
60
|
-
sglang/srt/model_executor/forward_batch_info.py,sha256=
|
61
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
62
|
-
sglang/srt/
|
63
|
-
sglang/srt/
|
64
|
-
sglang/srt/models/chatglm.py,sha256=7bHU2AFoppINDZm0EdxgtAJe7rwr9OPkhOCfq2qNrIA,13862
|
65
|
-
sglang/srt/models/commandr.py,sha256=5BEtIS2uUQJANkkY-6ZeDqlrpUK5yXVYHiztU3vsTKY,14172
|
61
|
+
sglang/srt/model_executor/cuda_graph_runner.py,sha256=xvhFptAJKonqnEjeVYaIiKwhEM4NzbSeF9YvC6YqVc8,11364
|
62
|
+
sglang/srt/model_executor/forward_batch_info.py,sha256=tcWwiKBU2W2USg19ASRlx-9utvYL6PTO0NPNyK5frJk,14272
|
63
|
+
sglang/srt/model_executor/model_runner.py,sha256=QpNzsV1WiH4_1T0klmM6GjivWI-fKLATC5E67C1LSYk,18158
|
64
|
+
sglang/srt/models/chatglm.py,sha256=aoEgA2nflcOCIKtZojhUoboqxSP6i5IrrvuDOpzNPnE,13844
|
65
|
+
sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
|
66
66
|
sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
|
67
|
-
sglang/srt/models/deepseek.py,sha256=
|
68
|
-
sglang/srt/models/deepseek_v2.py,sha256=
|
69
|
-
sglang/srt/models/gemma.py,sha256=
|
70
|
-
sglang/srt/models/gemma2.py,sha256=
|
67
|
+
sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
|
68
|
+
sglang/srt/models/deepseek_v2.py,sha256=uk--2a1e83H6U9wTx_wd3UvkS3VrSRSkjCOjky0R0uo,27004
|
69
|
+
sglang/srt/models/gemma.py,sha256=3orOUznoGt2NxVKO5c8AjD_ue0gWqwb7LnKbhlcS5Vg,12276
|
70
|
+
sglang/srt/models/gemma2.py,sha256=IUXKjwO11dpnhevmapS9jz_qPZvzSKrHhYHIXnBR9AU,16475
|
71
71
|
sglang/srt/models/gpt_bigcode.py,sha256=OKk9UP67as3T5bePlTRGHTCD-1wqaUEk92AowXPm6dg,10204
|
72
|
-
sglang/srt/models/grok.py,sha256=
|
72
|
+
sglang/srt/models/grok.py,sha256=TrYcCQZhV7f5SUntU4Lo4ZDC8uBi0Vg0SWtyYiZxdqs,14530
|
73
73
|
sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
|
74
74
|
sglang/srt/models/llama2.py,sha256=HmzE1I8OnesmrdPY5b56l7okhWH_lRvWAg16K-UwKHg,14300
|
75
75
|
sglang/srt/models/llama_classification.py,sha256=Dvzy3PfETiJtnKFOk8qDDLUoZECf_cpSrNeA60PaDo4,4932
|
76
76
|
sglang/srt/models/llama_embedding.py,sha256=e2lpZ6GHKrHT1rr7_5gHGoCpfqdOBMusZCz34n62lec,3542
|
77
77
|
sglang/srt/models/llava.py,sha256=-ysi192vpBDxNaMS8qaLOhC34lXQyRtbG_0niVaceSo,18436
|
78
78
|
sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
|
79
|
-
sglang/srt/models/minicpm.py,sha256=
|
79
|
+
sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
|
80
80
|
sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
|
81
|
-
sglang/srt/models/mixtral.py,sha256=
|
82
|
-
sglang/srt/models/mixtral_quant.py,sha256=
|
83
|
-
sglang/srt/models/qwen.py,sha256=
|
84
|
-
sglang/srt/models/qwen2.py,sha256=
|
85
|
-
sglang/srt/models/qwen2_moe.py,sha256
|
86
|
-
sglang/srt/models/stablelm.py,sha256=
|
81
|
+
sglang/srt/models/mixtral.py,sha256=cZK-1kGXQC8ZC0tFNmbAoqWlyrrvv5omumpDdEwzzss,13623
|
82
|
+
sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
|
83
|
+
sglang/srt/models/qwen.py,sha256=ssdSgVuhT1Ei0JPa0xwqzrwwPNwkCHRJA4q70hK-Z7E,9988
|
84
|
+
sglang/srt/models/qwen2.py,sha256=eeah76x-OYZiy6Bb1SDNVk8m_xXHYuh-P58GXjEFZ4w,12266
|
85
|
+
sglang/srt/models/qwen2_moe.py,sha256=-Ijn_H2IGCjQAYA-9teS9IXKTPMBWSkkPp0Nox6MCuQ,17729
|
86
|
+
sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
|
87
87
|
sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
|
88
|
-
sglang/srt/openai_api/adapter.py,sha256=
|
88
|
+
sglang/srt/openai_api/adapter.py,sha256=C53adcpLGfIUm_B259iWnOCQ3B3VjJbqFseqP8Vo-t8,43064
|
89
89
|
sglang/srt/openai_api/protocol.py,sha256=knf-nds0XO2LYg-hPM-Ho1f1y2XZIV_Gvg3xcCKLfgQ,9411
|
90
90
|
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
91
91
|
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
|
@@ -94,7 +94,7 @@ sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgc
|
|
94
94
|
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
|
95
95
|
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
|
96
96
|
sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
|
97
|
-
sglang/test/runners.py,sha256=
|
97
|
+
sglang/test/runners.py,sha256=J4XfBSPhZvLiHLrDsHUuIKjX3kzbMrD7fFEPr07SUkU,7975
|
98
98
|
sglang/test/simple_eval_common.py,sha256=HL1bfgkTAKP7sk-kShg73WTeADhuBD6xSsuLbV_9C3s,12359
|
99
99
|
sglang/test/simple_eval_gpqa.py,sha256=CaRAuHdZj0m4mRm4tH9k7cB0kQxe0LHwlz7Vn1qyKps,3189
|
100
100
|
sglang/test/simple_eval_humaneval.py,sha256=iCtN2LBL6j3nxMDjRJ--m0MCNPAwDo81gJ2whE-2Rt0,5674
|
@@ -102,11 +102,11 @@ sglang/test/simple_eval_math.py,sha256=EQblQmtUt-kl558drzhP7c6KhpDNgr1EJhhKx5eeH
|
|
102
102
|
sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
|
103
103
|
sglang/test/simple_eval_mmlu.py,sha256=KqSSdSu2qfoKQ870ttxev1NJ7c90xv2mvKOQsSODtAw,4326
|
104
104
|
sglang/test/test_layernorm.py,sha256=VDdoeqGvebUa-l3rDiid6cC7wZq0Phpbm5fxxD0-cpg,1910
|
105
|
-
sglang/test/test_programs.py,sha256=
|
106
|
-
sglang/test/test_utils.py,sha256=
|
105
|
+
sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
|
106
|
+
sglang/test/test_utils.py,sha256=Fw606sa8sTX6HJ7OCuyDUH8LQr9PvtwBKYnyZj2SLWU,14741
|
107
107
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
|
108
|
-
sglang-0.2.
|
109
|
-
sglang-0.2.
|
110
|
-
sglang-0.2.
|
111
|
-
sglang-0.2.
|
112
|
-
sglang-0.2.
|
108
|
+
sglang-0.2.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
109
|
+
sglang-0.2.13.dist-info/METADATA,sha256=oy69SBbn-iEZE0JRzPkHuhzRlAjNj6v8twSXrjsOWXs,34892
|
110
|
+
sglang-0.2.13.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
111
|
+
sglang-0.2.13.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
112
|
+
sglang-0.2.13.dist-info/RECORD,,
|