sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +15 -6
- sglang/srt/layers/attention/flashinfer_backend.py +17 -3
- sglang/srt/layers/linear.py +36 -98
- sglang/srt/layers/moe/fused_moe_triton/layer.py +37 -9
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +24 -16
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +106 -52
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -2
- sglang/srt/managers/configure_logging.py +43 -0
- sglang/srt/managers/detokenizer_manager.py +0 -2
- sglang/srt/managers/io_struct.py +29 -13
- sglang/srt/managers/scheduler.py +48 -9
- sglang/srt/managers/tokenizer_manager.py +109 -49
- sglang/srt/mem_cache/memory_pool.py +107 -52
- sglang/srt/metrics/collector.py +10 -5
- sglang/srt/model_executor/model_runner.py +43 -6
- sglang/srt/models/llama.py +37 -2
- sglang/srt/models/qwen2.py +11 -0
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +14 -5
- sglang/srt/sampling/sampling_params.py +1 -1
- sglang/srt/server.py +114 -61
- sglang/srt/server_args.py +27 -18
- sglang/srt/speculative/eagle_worker.py +1 -0
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/METADATA +12 -10
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/RECORD +39 -34
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -23,7 +23,6 @@ from typing import List, Optional
|
|
23
23
|
import torch
|
24
24
|
|
25
25
|
from sglang.srt.hf_transformers_utils import check_gguf_file
|
26
|
-
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
27
26
|
from sglang.srt.utils import (
|
28
27
|
get_amdgpu_memory_capacity,
|
29
28
|
get_hpu_memory_capacity,
|
@@ -32,6 +31,7 @@ from sglang.srt.utils import (
|
|
32
31
|
is_hip,
|
33
32
|
is_ipv6,
|
34
33
|
is_port_available,
|
34
|
+
nullable_str,
|
35
35
|
)
|
36
36
|
|
37
37
|
logger = logging.getLogger(__name__)
|
@@ -47,6 +47,7 @@ class ServerArgs:
|
|
47
47
|
trust_remote_code: bool = True
|
48
48
|
dtype: str = "auto"
|
49
49
|
kv_cache_dtype: str = "auto"
|
50
|
+
quantization_param_path: nullable_str = None
|
50
51
|
quantization: Optional[str] = None
|
51
52
|
context_length: Optional[int] = None
|
52
53
|
device: str = "cuda"
|
@@ -55,7 +56,6 @@ class ServerArgs:
|
|
55
56
|
is_embedding: bool = False
|
56
57
|
revision: Optional[str] = None
|
57
58
|
skip_tokenizer_init: bool = False
|
58
|
-
return_token_ids: bool = False
|
59
59
|
|
60
60
|
# Port for the HTTP server
|
61
61
|
host: str = "127.0.0.1"
|
@@ -91,7 +91,7 @@ class ServerArgs:
|
|
91
91
|
|
92
92
|
# API related
|
93
93
|
api_key: Optional[str] = None
|
94
|
-
file_storage_pth: str = "
|
94
|
+
file_storage_pth: str = "sglang_storage"
|
95
95
|
enable_cache_report: bool = False
|
96
96
|
|
97
97
|
# Data parallelism
|
@@ -156,6 +156,7 @@ class ServerArgs:
|
|
156
156
|
triton_attention_num_kv_splits: int = 8
|
157
157
|
num_continuous_decode_steps: int = 1
|
158
158
|
delete_ckpt_after_loading: bool = False
|
159
|
+
enable_memory_saver: bool = False
|
159
160
|
|
160
161
|
def __post_init__(self):
|
161
162
|
# Set missing default values
|
@@ -296,6 +297,11 @@ class ServerArgs:
|
|
296
297
|
"tokenizer if available, and 'slow' will "
|
297
298
|
"always use the slow tokenizer.",
|
298
299
|
)
|
300
|
+
parser.add_argument(
|
301
|
+
"--skip-tokenizer-init",
|
302
|
+
action="store_true",
|
303
|
+
help="If set, skip init tokenizer and pass input_ids in generate request",
|
304
|
+
)
|
299
305
|
parser.add_argument(
|
300
306
|
"--load-format",
|
301
307
|
type=str,
|
@@ -346,8 +352,17 @@ class ServerArgs:
|
|
346
352
|
"--kv-cache-dtype",
|
347
353
|
type=str,
|
348
354
|
default=ServerArgs.kv_cache_dtype,
|
349
|
-
choices=["auto", "fp8_e5m2"],
|
350
|
-
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" is supported for CUDA 11.8+.',
|
355
|
+
choices=["auto", "fp8_e5m2", "fp8_e4m3"],
|
356
|
+
help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
|
357
|
+
)
|
358
|
+
parser.add_argument(
|
359
|
+
"--quantization-param-path",
|
360
|
+
type=nullable_str,
|
361
|
+
default=None,
|
362
|
+
help="Path to the JSON file containing the KV cache "
|
363
|
+
"scaling factors. This should generally be supplied, when "
|
364
|
+
"KV cache dtype is FP8. Otherwise, KV cache scaling factors "
|
365
|
+
"default to 1.0, which may cause accuracy issues. ",
|
351
366
|
)
|
352
367
|
parser.add_argument(
|
353
368
|
"--quantization",
|
@@ -363,6 +378,7 @@ class ServerArgs:
|
|
363
378
|
"bitsandbytes",
|
364
379
|
"gguf",
|
365
380
|
"modelopt",
|
381
|
+
"w8a8_int8",
|
366
382
|
],
|
367
383
|
help="The quantization method.",
|
368
384
|
)
|
@@ -404,18 +420,6 @@ class ServerArgs:
|
|
404
420
|
"name, a tag name, or a commit id. If unspecified, will use "
|
405
421
|
"the default version.",
|
406
422
|
)
|
407
|
-
parser.add_argument(
|
408
|
-
"--skip-tokenizer-init",
|
409
|
-
action="store_true",
|
410
|
-
help="If set, skip init tokenizer and pass input_ids in generate request",
|
411
|
-
)
|
412
|
-
parser.add_argument(
|
413
|
-
"--return-token-ids",
|
414
|
-
action="store_true",
|
415
|
-
default=ServerArgs.return_token_ids,
|
416
|
-
help="Whether to return token IDs in the output, this may introduce additional overhead.",
|
417
|
-
)
|
418
|
-
|
419
423
|
# Memory and scheduling
|
420
424
|
parser.add_argument(
|
421
425
|
"--mem-fraction-static",
|
@@ -551,7 +555,7 @@ class ServerArgs:
|
|
551
555
|
"--decode-log-interval",
|
552
556
|
type=int,
|
553
557
|
default=ServerArgs.decode_log_interval,
|
554
|
-
help="The log interval of decode batch",
|
558
|
+
help="The log interval of decode batch.",
|
555
559
|
)
|
556
560
|
|
557
561
|
# API related
|
@@ -851,6 +855,11 @@ class ServerArgs:
|
|
851
855
|
action="store_true",
|
852
856
|
help="Delete the model checkpoint after loading the model.",
|
853
857
|
)
|
858
|
+
parser.add_argument(
|
859
|
+
"--enable-memory-saver",
|
860
|
+
action="store_true",
|
861
|
+
help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
|
862
|
+
)
|
854
863
|
|
855
864
|
@classmethod
|
856
865
|
def from_cli_args(cls, args: argparse.Namespace):
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from abc import ABC
|
2
|
+
from contextlib import contextmanager
|
3
|
+
|
4
|
+
try:
|
5
|
+
import torch_memory_saver
|
6
|
+
|
7
|
+
_primary_memory_saver = torch_memory_saver.TorchMemorySaver()
|
8
|
+
except ImportError:
|
9
|
+
pass
|
10
|
+
|
11
|
+
|
12
|
+
class TorchMemorySaverAdapter(ABC):
|
13
|
+
@staticmethod
|
14
|
+
def create(enable: bool):
|
15
|
+
return (
|
16
|
+
_TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
|
17
|
+
)
|
18
|
+
|
19
|
+
def configure_subprocess(self):
|
20
|
+
raise NotImplementedError
|
21
|
+
|
22
|
+
def region(self):
|
23
|
+
raise NotImplementedError
|
24
|
+
|
25
|
+
def pause(self):
|
26
|
+
raise NotImplementedError
|
27
|
+
|
28
|
+
def resume(self):
|
29
|
+
raise NotImplementedError
|
30
|
+
|
31
|
+
|
32
|
+
class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
|
33
|
+
def configure_subprocess(self):
|
34
|
+
return torch_memory_saver.configure_subprocess()
|
35
|
+
|
36
|
+
def region(self):
|
37
|
+
return _primary_memory_saver.region()
|
38
|
+
|
39
|
+
def pause(self):
|
40
|
+
return _primary_memory_saver.pause()
|
41
|
+
|
42
|
+
def resume(self):
|
43
|
+
return _primary_memory_saver.resume()
|
44
|
+
|
45
|
+
|
46
|
+
class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
|
47
|
+
@contextmanager
|
48
|
+
def configure_subprocess(self):
|
49
|
+
yield
|
50
|
+
|
51
|
+
@contextmanager
|
52
|
+
def region(self):
|
53
|
+
yield
|
54
|
+
|
55
|
+
def pause(self):
|
56
|
+
pass
|
57
|
+
|
58
|
+
def resume(self):
|
59
|
+
pass
|
sglang/srt/utils.py
CHANGED
@@ -97,6 +97,10 @@ def is_flashinfer_available():
|
|
97
97
|
return torch.cuda.is_available() and torch.version.cuda
|
98
98
|
|
99
99
|
|
100
|
+
def is_cuda_available():
|
101
|
+
return torch.cuda.is_available() and torch.version.cuda
|
102
|
+
|
103
|
+
|
100
104
|
def is_ipv6(address):
|
101
105
|
try:
|
102
106
|
ipaddress.IPv6Address(address)
|
@@ -1340,6 +1344,25 @@ def parse_tool_response(text, tools, **kwargs):
|
|
1340
1344
|
return text, call_info_list
|
1341
1345
|
|
1342
1346
|
|
1347
|
+
def permute_weight(x: torch.Tensor) -> torch.Tensor:
|
1348
|
+
b_ = x.shape[0]
|
1349
|
+
n_ = x.shape[1]
|
1350
|
+
k_ = x.shape[2]
|
1351
|
+
|
1352
|
+
x_ = x
|
1353
|
+
if x.dtype == torch.bfloat16 or x.dtype == torch.float16:
|
1354
|
+
x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 32), 4, 8)
|
1355
|
+
elif x.dtype == torch.float8_e4m3fnuz or x.dtype == torch.int8:
|
1356
|
+
x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 64), 4, 16)
|
1357
|
+
else:
|
1358
|
+
return x_
|
1359
|
+
|
1360
|
+
x_ = x_.permute(0, 1, 3, 4, 2, 5)
|
1361
|
+
x_ = x_.contiguous()
|
1362
|
+
x_ = x_.view(*x.shape)
|
1363
|
+
return x_
|
1364
|
+
|
1365
|
+
|
1343
1366
|
class MultiprocessingSerializer:
|
1344
1367
|
@staticmethod
|
1345
1368
|
def serialize(obj):
|
@@ -1375,3 +1398,9 @@ def debug_timing(func):
|
|
1375
1398
|
return func(*args, **kwargs)
|
1376
1399
|
|
1377
1400
|
return wrapper
|
1401
|
+
|
1402
|
+
|
1403
|
+
def nullable_str(val: str):
|
1404
|
+
if not val or val == "None":
|
1405
|
+
return None
|
1406
|
+
return val
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.4.1.post5"
|
1
|
+
__version__ = "0.4.1.post6"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.1.post5
|
3
|
+
Version: 0.4.1.post6
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
|
|
240
240
|
Provides-Extra: srt
|
241
241
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
242
242
|
Requires-Dist: cuda-python; extra == "srt"
|
243
|
-
Requires-Dist: sgl-kernel>=0.0.2.
|
243
|
+
Requires-Dist: sgl-kernel>=0.0.2.post12; extra == "srt"
|
244
244
|
Requires-Dist: torch; extra == "srt"
|
245
245
|
Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
|
246
246
|
Requires-Dist: flashinfer==0.1.6; extra == "srt"
|
@@ -259,6 +259,8 @@ Provides-Extra: anthropic
|
|
259
259
|
Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
|
260
260
|
Provides-Extra: litellm
|
261
261
|
Requires-Dist: litellm>=1.0.0; extra == "litellm"
|
262
|
+
Provides-Extra: torch-memory-saver
|
263
|
+
Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
|
262
264
|
Provides-Extra: test
|
263
265
|
Requires-Dist: jsonlines; extra == "test"
|
264
266
|
Requires-Dist: matplotlib; extra == "test"
|
@@ -314,9 +316,9 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
|
|
314
316
|
--------------------------------------------------------------------------------
|
315
317
|
|
316
318
|
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
|
317
|
-
| [**Documentation**](https://
|
318
|
-
| [**Join Slack**](https://
|
319
|
-
| [**Join Bi-Weekly Development Meeting**](https://
|
319
|
+
| [**Documentation**](https://docs.sglang.ai/)
|
320
|
+
| [**Join Slack**](https://slack.sglang.ai/)
|
321
|
+
| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
|
320
322
|
| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
|
321
323
|
|
322
324
|
## News
|
@@ -346,11 +348,11 @@ The core features include:
|
|
346
348
|
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
|
347
349
|
|
348
350
|
## Getting Started
|
349
|
-
- [Install SGLang](https://
|
350
|
-
- [Quick Start](https://
|
351
|
-
- [Backend Tutorial](https://
|
352
|
-
- [Frontend Tutorial](https://
|
353
|
-
- [Contribution Guide](https://
|
351
|
+
- [Install SGLang](https://docs.sglang.ai/start/install.html)
|
352
|
+
- [Quick Start](https://docs.sglang.ai/start/send_request.html)
|
353
|
+
- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
|
354
|
+
- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
|
355
|
+
- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
|
354
356
|
|
355
357
|
## Benchmark and Performance
|
356
358
|
Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
|
@@ -11,7 +11,7 @@ sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
|
|
11
11
|
sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
|
12
12
|
sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
|
13
13
|
sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
|
14
|
-
sglang/version.py,sha256=
|
14
|
+
sglang/version.py,sha256=67TlBPUpVb158CbDn3v32POQ-USKtg7P1fg71jmrBWc,28
|
15
15
|
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
|
17
17
|
sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
|
@@ -32,16 +32,17 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
|
|
32
32
|
sglang/srt/hf_transformers_utils.py,sha256=_24uqCkZ4dvS9Uc5p2cCzX0Q8ShUzrh_Hp6mvg7hxHY,7729
|
33
33
|
sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
|
34
34
|
sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
|
35
|
-
sglang/srt/server.py,sha256=
|
36
|
-
sglang/srt/server_args.py,sha256=
|
37
|
-
sglang/srt/
|
35
|
+
sglang/srt/server.py,sha256=g2Wf1S3tOev0T2Wn98UkaOuDYPMixsy2xUzW2jUrQ3o,37148
|
36
|
+
sglang/srt/server_args.py,sha256=N8ByNO3vlQ-nl_-rgiCsRkiksefKtyKY9W7-24rhQKw,36965
|
37
|
+
sglang/srt/torch_memory_saver_adapter.py,sha256=--FgbrcvJxTcRe856plD9ktqgrHGPTE18eZCJlE50hY,1255
|
38
|
+
sglang/srt/utils.py,sha256=8TobQ4TwR22aa4j3W-XMkhJVBsuZ85t0zI8Mupx7L3M,46180
|
38
39
|
sglang/srt/configs/__init__.py,sha256=Nvwtif0X9IYUtj0aL9XvAo_RRZcxTshsaliwc8djooU,347
|
39
40
|
sglang/srt/configs/chatglm.py,sha256=j-b0YkdYUmQm2y1kNmMJtKeACxWKmBbvNNkDWbs6kbI,2907
|
40
41
|
sglang/srt/configs/dbrx.py,sha256=tdhIkXAQl1yr0MxqFmsDG1E0e2puRTTKm6UTyANBLac,11005
|
41
42
|
sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
|
42
43
|
sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
|
43
44
|
sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
|
44
|
-
sglang/srt/configs/model_config.py,sha256=
|
45
|
+
sglang/srt/configs/model_config.py,sha256=qDTL1oxSlCxptPX8AI-VlEuxMB7m0UCAUDsbwXpUjow,16831
|
45
46
|
sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
|
46
47
|
sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
|
47
48
|
sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
|
@@ -64,18 +65,18 @@ sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcf
|
|
64
65
|
sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
|
65
66
|
sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
|
66
67
|
sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
|
67
|
-
sglang/srt/layers/linear.py,sha256=
|
68
|
+
sglang/srt/layers/linear.py,sha256=s5hGfdBgYkFMHolTTsSLXQdOay9HZxYyrS6AYFZaeYA,48860
|
68
69
|
sglang/srt/layers/logits_processor.py,sha256=r2yGmNqQTpi1l7qvN2Bvjb7lVKfBsxIBrJ6CpBh-_wg,12993
|
69
|
-
sglang/srt/layers/parameter.py,sha256=
|
70
|
+
sglang/srt/layers/parameter.py,sha256=pC6hz2Vu9bFKH4Mt5lh-BwNWUNrJO_GsaFY9aNVDsrY,14684
|
70
71
|
sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
|
71
|
-
sglang/srt/layers/radix_attention.py,sha256=
|
72
|
+
sglang/srt/layers/radix_attention.py,sha256=nVHKPFyr-CWNm6AnMGPhuuTFTtgYwPL8sAVBZ5u3d94,2232
|
72
73
|
sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
|
73
74
|
sglang/srt/layers/sampler.py,sha256=HQWi1zb1gmD9pHMQyEP3WPjnL8vy-ncZDVMENbjQW7c,6944
|
74
75
|
sglang/srt/layers/torchao_utils.py,sha256=8c2vzt106iP_QKbJtfN1GuABW8nCuP5dElQLUeci6qg,3934
|
75
|
-
sglang/srt/layers/vocab_parallel_embedding.py,sha256=
|
76
|
+
sglang/srt/layers/vocab_parallel_embedding.py,sha256=8TvdxJZipUy6Ewm8Ovsbho7GzZ_yvDZ-eXjK_8vc_8k,22149
|
76
77
|
sglang/srt/layers/attention/__init__.py,sha256=KlQ0fl-o9v_NxBDhNZ4dPW2uQ2HeJjLm-0MTMWgaa28,2980
|
77
78
|
sglang/srt/layers/attention/double_sparsity_backend.py,sha256=QEDF8tQKMkh-nbt4jHKHZhhgHuV0Fla_BPzzoo9JfT4,9231
|
78
|
-
sglang/srt/layers/attention/flashinfer_backend.py,sha256=
|
79
|
+
sglang/srt/layers/attention/flashinfer_backend.py,sha256=1He2KvcPQmLbr-8wkgy20NYjsu_hicW6NlumoVP9-kM,33842
|
79
80
|
sglang/srt/layers/attention/torch_native_backend.py,sha256=KrcAqTLVZLtwgOmB0xhwUUsX32M-5LYZpNxaRNT4VuA,9252
|
80
81
|
sglang/srt/layers/attention/triton_backend.py,sha256=44ScKsVs-rFvqsaAZG_mREEpczhGaUBvaflvWqrukVE,6743
|
81
82
|
sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=ltWcZ00ugpglSYvszpGb-UCpGIixdG25cWtSrOOOMik,17943
|
@@ -83,13 +84,13 @@ sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXf
|
|
83
84
|
sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
|
84
85
|
sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
|
85
86
|
sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
|
86
|
-
sglang/srt/layers/moe/topk.py,sha256=
|
87
|
+
sglang/srt/layers/moe/topk.py,sha256=qcWDUVvEV6TIO_idymStylkpPp6dMk-wbYj2Zq4ZYJ0,7057
|
87
88
|
sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
88
89
|
sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
|
89
90
|
sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
|
90
91
|
sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
|
91
92
|
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=3at2h0NDC8JF144jH6h5ze_YkBasvjo227bdFLiK0vs,36759
|
92
|
-
sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=
|
93
|
+
sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=KCYdT1kftwY8V_wRahoW6GbXkrm7lAZ86xvmu1qZK8w,21802
|
93
94
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=iNGsE2ZeVnQEnN4A8UJ9Jv0d3hbRF2MJ9oBgjup5Szk,2737
|
94
95
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=JJN0hryyLr5Zv3dSS7C8cPFhAwTT6XxUVnBGMZvV6JA,2752
|
95
96
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=ouRyZ5PEMPP2njPftCNhs-1g1y6wueWLmhI7G1SjV1k,4131
|
@@ -181,12 +182,14 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=LwEoCt1lUc0uvCvRhBAy6Gkx1
|
|
181
182
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json",sha256=aMP7oZmh8BZnPOrl0MFibcdhTn3VmOSjqoKoK2rMSbU,4323
|
182
183
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=sY2nWMPh9lsIkhPCjkHO245wpnfFbrHmzdcZDVFPVww,3265
|
183
184
|
"sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=Uz5X80VcNBOaxshwVNUEittHk2zqB4HQCfTJ4TPG5aM,3274
|
184
|
-
sglang/srt/layers/quantization/__init__.py,sha256=
|
185
|
+
sglang/srt/layers/quantization/__init__.py,sha256=vM6Vhlu-Jv4t9DDwywitXGz58psTQ5k7guVuK0o4jTk,4785
|
185
186
|
sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
|
186
|
-
sglang/srt/layers/quantization/fp8.py,sha256=
|
187
|
+
sglang/srt/layers/quantization/fp8.py,sha256=2k6vk2sTVB6JCtEJLsFFn5bJKR8lWwMRke4tu9nnTP0,34806
|
187
188
|
sglang/srt/layers/quantization/fp8_kernel.py,sha256=cYF4ckqrUyhCO9Ha7zi05R8EhRaqSa8rFpYisz-9Ed0,10743
|
188
|
-
sglang/srt/layers/quantization/fp8_utils.py,sha256=
|
189
|
-
sglang/srt/layers/quantization/
|
189
|
+
sglang/srt/layers/quantization/fp8_utils.py,sha256=7v-RNwuYXa-gPO3msRDB0Z3uajOQMYd2Cj0NMoq1hg4,4148
|
190
|
+
sglang/srt/layers/quantization/int8_kernel.py,sha256=t_BLVf8XjOyn7S3Lu3B4hXvw8DvTg4Anco7TNadL58U,1436
|
191
|
+
sglang/srt/layers/quantization/modelopt_quant.py,sha256=64Qec1kzduAcxyDLd_Y47wDHZ4ShS9Vb-Rf57jc1Zmg,6245
|
192
|
+
sglang/srt/layers/quantization/w8a8_int8.py,sha256=RO_s0KPH5wSx2HaI5PbAkdEXVqPS05AS6yo3oyZnIbw,3353
|
190
193
|
"sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json",sha256=tkLjwLC_aVXhzuvo-2QHkojXZauPJsf3jNHFn1S7uRA,3244
|
191
194
|
"sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json",sha256=Qoj9rLLRDbKM4IKBCXvN8RcxzSmNPd0TQUiM7CXDqHI,3241
|
192
195
|
"sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json",sha256=4D3Ku4y7BCVEJzueKvQC_KvOR026w3ONWsxfsA_YrEc,3249
|
@@ -229,27 +232,28 @@ sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
|
|
229
232
|
sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
|
230
233
|
sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
|
231
234
|
sglang/srt/managers/cache_controller.py,sha256=DXnIunJgtTws1WF2vZOYVQe56vacV7Mn4wL9zoG8Xz8,10909
|
235
|
+
sglang/srt/managers/configure_logging.py,sha256=wa1NLWaxC2NGSTJflZvCvUrONH4i6wreNvVHb90bd14,1374
|
232
236
|
sglang/srt/managers/data_parallel_controller.py,sha256=VZSXGsNJ029BJlu56lCugaapMPvzjzE2yFATd8KWLNY,8468
|
233
|
-
sglang/srt/managers/detokenizer_manager.py,sha256=
|
237
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
|
234
238
|
sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
|
235
|
-
sglang/srt/managers/io_struct.py,sha256=
|
239
|
+
sglang/srt/managers/io_struct.py,sha256=H1rNLCl2iqDijUGLBafjodTrohaUi1ztJn69XjkhjTk,16207
|
236
240
|
sglang/srt/managers/schedule_batch.py,sha256=jmPTc-XyI-AXktz9Rofs-Fb3OlOgb-bThI142kOy--g,47134
|
237
241
|
sglang/srt/managers/schedule_policy.py,sha256=aHkIL9pZtc4Kdmy8XU9tsjaDzdChVN2dnGKvJkSyqFg,17965
|
238
|
-
sglang/srt/managers/scheduler.py,sha256=
|
242
|
+
sglang/srt/managers/scheduler.py,sha256=Kn7NyoLwHIeuGKQercV4jKsC5-KVLK4JhRiflNNLu9A,66790
|
239
243
|
sglang/srt/managers/session_controller.py,sha256=0L9_3lhFGU4kLm8b2G1QAeslxvTT_y_Iw8spwrpgr30,5508
|
240
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=
|
244
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=p9k7fvFWyKkHO-Am-2JdbR6-VRsuGEiwQO7t1F7_rfs,35956
|
241
245
|
sglang/srt/managers/tp_worker.py,sha256=-bvUFCo544QQSEHqPPjeOvCWMEFn01Bva6AeO39Qe3o,8043
|
242
246
|
sglang/srt/managers/tp_worker_overlap_thread.py,sha256=rdHz2thdGSmceDedrolHOqjNPhrralyDTuNREL56oNI,9067
|
243
247
|
sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
|
244
248
|
sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
|
245
249
|
sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
|
246
|
-
sglang/srt/mem_cache/memory_pool.py,sha256=
|
250
|
+
sglang/srt/mem_cache/memory_pool.py,sha256=McBKAcV444ewM-idOuCbfeKHoF-lhCL9m5R27M8H9ew,20401
|
247
251
|
sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
|
248
|
-
sglang/srt/metrics/collector.py,sha256=
|
252
|
+
sglang/srt/metrics/collector.py,sha256=sbgruNDzxBmTd-lnRi8mBZGCt2J7qgRVvDk2LQ5HvQU,6936
|
249
253
|
sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
|
250
254
|
sglang/srt/model_executor/cuda_graph_runner.py,sha256=rGG0ZS673YC_RVaXMlmNTBJln-L7ugsgDz0Q6XmO0Cc,18544
|
251
255
|
sglang/srt/model_executor/forward_batch_info.py,sha256=Vu6qlbfm6dMUfvGaSmmLIroi8hBqfDpNVLxl7oECzIs,15001
|
252
|
-
sglang/srt/model_executor/model_runner.py,sha256=
|
256
|
+
sglang/srt/model_executor/model_runner.py,sha256=AQPN4q-Wuw3yCeFjXwWvN5m07geS07l21SXFKr-FeCk,31955
|
253
257
|
sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
|
254
258
|
sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
|
255
259
|
sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
|
@@ -270,7 +274,7 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
|
|
270
274
|
sglang/srt/models/grok.py,sha256=gIr6uFNLv42v-yjAko4w8uugAA7vE0396S23V98Aiu4,18002
|
271
275
|
sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
|
272
276
|
sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
|
273
|
-
sglang/srt/models/llama.py,sha256
|
277
|
+
sglang/srt/models/llama.py,sha256=r9MwIsKv5SrwpLewdB_gqai1YDfjyG-2dlT_pYPNIac,22087
|
274
278
|
sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
|
275
279
|
sglang/srt/models/llama_eagle.py,sha256=88DzR54DKBIKJ1h-bkIa8mc1qJnlkdZ1eGYY3c5mpBY,4442
|
276
280
|
sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
|
@@ -288,7 +292,8 @@ sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13
|
|
288
292
|
sglang/srt/models/olmoe.py,sha256=LiHVGfRaC5c_BU_vVgtV9uLuDH_SC0dw1kEc61posmI,15351
|
289
293
|
sglang/srt/models/phi3_small.py,sha256=44_my3QmgJ2N7SOkGZzEb62DXBeCVHojfmCWgkk2uCI,14802
|
290
294
|
sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
|
291
|
-
sglang/srt/models/qwen2.py,sha256=
|
295
|
+
sglang/srt/models/qwen2.py,sha256=aRumlGWYYUntMHR3LoOpeduelnzo9Ls0FXVwVKiL7tY,13332
|
296
|
+
sglang/srt/models/qwen2_eagle.py,sha256=KTtejEezdLfd_odg3Na1i5kBk7W-YFg9hImfWyrMgVc,4288
|
292
297
|
sglang/srt/models/qwen2_moe.py,sha256=6xRRJxWWh1M5UFPfvhsCpY477zv-30AeSRJXsvOkgFc,16542
|
293
298
|
sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
|
294
299
|
sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
|
@@ -299,17 +304,17 @@ sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQ
|
|
299
304
|
sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
|
300
305
|
sglang/srt/openai_api/adapter.py,sha256=Yv-rEA0Jd54iFlnkVy-OZM4EnPqkW_NLtDPGCiPWVWo,56386
|
301
306
|
sglang/srt/openai_api/protocol.py,sha256=v_YUwH1PF4vIVqSE5rj1ODdSglprTe_vGiXoS99cOV4,11613
|
302
|
-
sglang/srt/sampling/sampling_batch_info.py,sha256=
|
303
|
-
sglang/srt/sampling/sampling_params.py,sha256=
|
307
|
+
sglang/srt/sampling/sampling_batch_info.py,sha256=BEcDjMlTQ6wRuvwwCjB-2cy6GMgS3dpmjG4xetBuI4Q,9637
|
308
|
+
sglang/srt/sampling/sampling_params.py,sha256=YdfObBzfkgK9rU2XY6_7kxl7H1wjtDGrinpyIszTGUw,5678
|
304
309
|
sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
|
305
310
|
sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
|
306
311
|
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
|
307
312
|
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
|
308
313
|
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
|
309
|
-
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=
|
314
|
+
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=vmE5muVz_ztRA6glgYOiQnKas_zTvQZ3nxcUEQao-L8,3070
|
310
315
|
sglang/srt/speculative/build_eagle_tree.py,sha256=SIKuOFUOIzMLyanL5vViPmFBEiUHm_ezwiGuIyLmauE,9886
|
311
316
|
sglang/srt/speculative/eagle_utils.py,sha256=Z51xGuvn-ZIMp0OXENZUhpDOz8kTDkujhHZA-Z2MKbA,23422
|
312
|
-
sglang/srt/speculative/eagle_worker.py,sha256=
|
317
|
+
sglang/srt/speculative/eagle_worker.py,sha256=P__BMJ0eKLaPzCS8jEWylk2POstue5u3RIVZeFtj84I,7843
|
313
318
|
sglang/srt/speculative/spec_info.py,sha256=D7A27UU1iOwIBEjXTgAxZ7jdftbTiVlMCvK8GmYr2zg,488
|
314
319
|
sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
|
315
320
|
sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
|
@@ -327,8 +332,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
|
|
327
332
|
sglang/test/test_programs.py,sha256=AABFLu0W9FlK-VN2wb2rLkwFCK6YCkLYrgQClymzpcw,18835
|
328
333
|
sglang/test/test_utils.py,sha256=3xUJpb-HNSwzoRZ_eVO_Q52m5pWlQMU84PXnsSzoD9g,24585
|
329
334
|
sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
|
330
|
-
sglang-0.4.1.post5.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
331
|
-
sglang-0.4.1.
|
332
|
-
sglang-0.4.1.post5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
333
|
-
sglang-0.4.1.post5.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
334
|
-
sglang-0.4.1.post5.dist-info/RECORD,,
|
335
|
+
sglang-0.4.1.post6.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
|
336
|
+
sglang-0.4.1.post6.dist-info/METADATA,sha256=hls-gahHEVIiMlj9JHUiKHzKkiUiS_J5_JACvVh6riM,22527
|
337
|
+
sglang-0.4.1.post6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
338
|
+
sglang-0.4.1.post6.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
339
|
+
sglang-0.4.1.post6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|