sglang-0.1.18-py3-none-any.whl → sglang-0.1.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/api.py +26 -0
- sglang/backend/runtime_endpoint.py +18 -14
- sglang/bench_latency.py +34 -16
- sglang/global_config.py +1 -0
- sglang/lang/chat_template.py +41 -6
- sglang/lang/interpreter.py +5 -1
- sglang/lang/ir.py +61 -25
- sglang/srt/constrained/__init__.py +3 -2
- sglang/srt/hf_transformers_utils.py +7 -3
- sglang/srt/layers/extend_attention.py +2 -1
- sglang/srt/layers/fused_moe.py +181 -167
- sglang/srt/layers/logits_processor.py +55 -19
- sglang/srt/layers/radix_attention.py +24 -27
- sglang/srt/layers/token_attention.py +4 -1
- sglang/srt/managers/controller/infer_batch.py +2 -2
- sglang/srt/managers/controller/manager_single.py +1 -1
- sglang/srt/managers/controller/model_runner.py +27 -15
- sglang/srt/managers/controller/tp_worker.py +31 -14
- sglang/srt/managers/detokenizer_manager.py +4 -2
- sglang/srt/managers/io_struct.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +14 -13
- sglang/srt/model_config.py +6 -0
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/llama2.py +3 -3
- sglang/srt/models/llama_classification.py +10 -7
- sglang/srt/models/minicpm.py +373 -0
- sglang/srt/models/qwen2_moe.py +454 -0
- sglang/srt/openai_api_adapter.py +2 -2
- sglang/srt/openai_protocol.py +1 -1
- sglang/srt/server.py +17 -8
- sglang/srt/server_args.py +14 -16
- sglang/srt/utils.py +68 -35
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA +19 -13
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD +38 -35
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL +0 -0
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
```diff
@@ -432,13 +432,12 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
         if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
             raise Exception(
                 f"{pkg} is installed with version {installed_version}, which "
-                f"is less than the minimum required version {min_version}. " +
-                message
+                f"is less than the minimum required version {min_version}. " + message
             )
     except PackageNotFoundError:
         raise Exception(
-            f"{pkg} with minimum required version {min_version} is not installed. "
-            message
+            f"{pkg} with minimum required version {min_version} is not installed. "
+            + message
         )
 
 
```
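The hunk above only reflows string concatenation; the version-check pattern itself is unchanged. As a minimal standalone sketch of that pattern, using `importlib.metadata` and `packaging` the same way the surrounding file does (the helper name `check_min_version` here is illustrative, not sglang's API):

```python
# Standalone sketch of the version-check pattern in assert_pkg_version.
from importlib.metadata import PackageNotFoundError, version
from packaging import version as pkg_version


def check_min_version(pkg: str, min_version: str, message: str = "") -> None:
    try:
        installed = version(pkg)  # raises PackageNotFoundError if absent
    except PackageNotFoundError:
        raise Exception(
            f"{pkg} with minimum required version {min_version} is not installed. "
            + message
        )
    # packaging handles pre-releases and multi-digit components correctly,
    # unlike naive string comparison ("0.10.0" > "0.9.0").
    if pkg_version.parse(installed) < pkg_version.parse(min_version):
        raise Exception(
            f"{pkg} is installed with version {installed}, which "
            f"is less than the minimum required version {min_version}. " + message
        )


check_min_version("packaging", "20.0")
```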
```diff
@@ -459,13 +458,9 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
 
-
-    # compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
-    device_name = torch.cuda.get_device_name(gpu_id)
-    if "RTX 40" not in device_name:
-        import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
+    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
 
-        setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
+    setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
 def monkey_patch_vllm_dummy_weight_loader():
```
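This change drops the RTX 40-series guard and unconditionally replaces vllm's `gpu_p2p_access_check` with a stub that always returns `True`. A self-contained sketch of that setattr monkey-patch technique, applied to a toy module so it runs without vllm installed (`toy` stands in for `vllm.distributed.device_communicators.custom_all_reduce_utils`):

```python
# Toy demonstration of patching a module-level function via setattr.
import types

toy = types.ModuleType("toy")
toy.gpu_p2p_access_check = lambda src, dst: False  # original, conservative check

# Replace the function with a permissive stub, exactly the shape used above.
setattr(toy, "gpu_p2p_access_check", lambda *args, **kwargs: True)

assert toy.gpu_p2p_access_check(0, 1) is True
```

Because callers look the function up on the module at call time, every later call sees the stub without any caller-side changes.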
```diff
@@ -474,24 +469,40 @@ def monkey_patch_vllm_dummy_weight_loader():
     """
 
     from vllm.model_executor.model_loader.loader import (
-        …
-        …
-        …
-        …
+        CacheConfig,
+        DeviceConfig,
+        DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
+        MultiModalConfig,
+        _initialize_model,
+        initialize_dummy_weights,
+        nn,
+        set_default_torch_dtype,
     )
 
-    def load_model(
-        …
-        …
-        …
-        …
-        …
-        …
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(
-                    …
-                    …
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    lora_config,
+                    multimodal_config,
+                    cache_config,
+                )
 
         for _, module in model.named_modules():
             quant_method = getattr(module, "quant_method", None)
```
```diff
@@ -541,7 +552,7 @@ def get_ip_address(ifname):
     ip_address = fcntl.ioctl(
         s.fileno(),
         0x8915,  # SIOCGIFADDR
-        struct.pack(…
+        struct.pack("256s", bytes(ifname[:15], "utf-8")),
     )[20:24]
     return socket.inet_ntoa(ip_address)
 
```
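For context, the function this one-line change touches asks the kernel for an interface's IPv4 address via the `SIOCGIFADDR` ioctl (request code 0x8915) and decodes bytes 20–24 of the returned `ifreq` struct. A runnable sketch, assuming a Linux host with an interface named `eth0` (the interface name is the only assumption):

```python
# Linux-only sketch of the SIOCGIFADDR lookup shown in the hunk above.
import fcntl
import socket
import struct


def get_ip_address(ifname: str) -> str:
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    packed = fcntl.ioctl(
        s.fileno(),
        0x8915,  # SIOCGIFADDR: get interface address
        struct.pack("256s", bytes(ifname[:15], "utf-8")),  # ifreq with ifname
    )[20:24]  # the IPv4 address sits at offset 20 of the ifreq struct
    return socket.inet_ntoa(packed)


if __name__ == "__main__":
    print(get_ip_address("eth0"))  # e.g. "10.0.0.5"
```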
```diff
@@ -550,44 +561,66 @@ def send_addrs_to_rank_0(model_port_args, server_args):
     assert server_args.node_rank != 0 and server_args.dp_size == 1
     import torch.distributed as dist
 
-    ifname = os.environ.get(…
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
     ip_addr = get_ip_address(ifname)
 
     num_tp_ports = server_args.tp_size // server_args.nnodes
     model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
     ip_addr = [int(x) for x in ip_addr.split(".")]
-    addrs_tensor = torch.tensor(…
+    addrs_tensor = torch.tensor(
+        ip_addr + model_port_args.model_tp_ports, dtype=torch.int
+    )
 
     init_method = f"tcp://{server_args.nccl_init_addr}"
-    dist.init_process_group(…
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
     dist.send(addrs_tensor, dst=0)
-    print(…
+    print(
+        f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
+    )
 
     dist.barrier()
-    dist.destroy_process_group()
+    dist.destroy_process_group()
 
 
 def receive_addrs(model_port_args, server_args):
     assert server_args.node_rank == 0 and server_args.dp_size == 1
     import torch.distributed as dist
 
-    ifname = os.environ.get(…
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
     ip_addr = get_ip_address(ifname)
 
     num_tp_ports = server_args.tp_size // server_args.nnodes
     model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
 
     init_method = f"tcp://{server_args.nccl_init_addr}"
-    dist.init_process_group(…
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
 
     for src_rank in range(1, server_args.nnodes):
         tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
         dist.recv(tensor, src=src_rank)
         ip = ".".join([str(x) for x in tensor[:4].tolist()])
         ports = tensor[4:].tolist()
-        model_port_args.model_tp_ips[…
-        …
+        model_port_args.model_tp_ips[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = [ip] * num_tp_ports
+        model_port_args.model_tp_ports[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = ports
         print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
 
     dist.barrier()
-    dist.destroy_process_group()
+    dist.destroy_process_group()
```
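Both functions exchange addresses by flattening them into a single int tensor that the gloo backend can ship: the four octets of the IPv4 address followed by that node's tensor-parallel ports. The packing round-trip can be checked locally without any process group; the address and port values below are made up for illustration:

```python
# Round-trip check of the IP/port packing used by send_addrs_to_rank_0 and
# receive_addrs. No torch.distributed setup is needed for the encoding itself.
import torch

ip, ports = "10.0.0.5", [30011, 30012]  # illustrative values

# Sender side: 4 address octets + TP ports in one int tensor.
addrs_tensor = torch.tensor([int(x) for x in ip.split(".")] + ports, dtype=torch.int)

# Receiver side: split the tensor back apart.
decoded_ip = ".".join(str(x) for x in addrs_tensor[:4].tolist())
decoded_ports = addrs_tensor[4:].tolist()

assert (decoded_ip, decoded_ports) == (ip, ports)
```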
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.18
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
```
```diff
@@ -241,7 +241,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
```
```diff
@@ -257,7 +257,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing
 
 The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
-- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
```
````diff
@@ -288,15 +288,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-### …
-…
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
````
````diff
@@ -518,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
````
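The updated `choices` tip points at why length normalization matters: without it, choices that tokenize into more tokens are systematically penalized. A hedged sketch of the selection rule, with made-up per-token log probabilities standing in for what the runtime would return:

```python
# Token-length normalized selection among choices; the logprob lists below
# are invented for illustration, not produced by sglang.
choice_token_logprobs = {
    "Paris": [-0.2, -0.1],             # 2 tokens, sum = -0.3
    "Saint-Denis": [-0.3, -0.4, -0.5], # 3 tokens, sum = -1.2
}


def normalized_logprob(logprobs: list[float]) -> float:
    return sum(logprobs) / len(logprobs)  # average per token


best = max(choice_token_logprobs, key=lambda c: normalized_logprob(choice_token_logprobs[c]))
print(best)  # -> "Paris" (-0.15 per token beats -0.4 per token)
```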
````diff
@@ -576,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
````
````diff
@@ -605,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
````
```diff
@@ -623,9 +628,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
```
```diff
@@ -638,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
```
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD
CHANGED
```diff
@@ -1,7 +1,7 @@
-sglang/__init__.py,sha256=…
-sglang/api.py,sha256=…
-sglang/bench_latency.py,sha256=…
-sglang/global_config.py,sha256=…
+sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
+sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
+sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
 sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
 sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
```
```diff
@@ -10,69 +10,72 @@ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2
 sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
 sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
 sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
-sglang/backend/runtime_endpoint.py,sha256=…
+sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
 sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/lang/chat_template.py,sha256=…
+sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=…
-sglang/lang/ir.py,sha256=…
+sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
 sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
 sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=…
+sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
 sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
-sglang/srt/model_config.py,sha256=…
-sglang/srt/openai_api_adapter.py,sha256=…
-sglang/srt/openai_protocol.py,sha256=…
+sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
 sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
-sglang/srt/server.py,sha256=…
-sglang/srt/server_args.py,sha256=…
-sglang/srt/utils.py,sha256=…
-sglang/srt/constrained/__init__.py,sha256=…
+sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
+sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
+sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
+sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
 sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
 sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
-sglang/srt/layers/extend_attention.py,sha256=…
-sglang/srt/layers/fused_moe.py,sha256=…
-sglang/srt/layers/logits_processor.py,sha256=…
-sglang/srt/layers/radix_attention.py,sha256=…
-sglang/srt/layers/token_attention.py,sha256=…
-sglang/srt/managers/detokenizer_manager.py,sha256=…
-sglang/srt/managers/io_struct.py,sha256=…
-sglang/srt/managers/tokenizer_manager.py,sha256=…
+sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
+sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
+sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
 sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
-sglang/srt/managers/controller/infer_batch.py,sha256=…
+sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
 sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
-sglang/srt/managers/controller/manager_single.py,sha256=…
-sglang/srt/managers/controller/model_runner.py,sha256=…
+sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
+sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
 sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
 sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
-sglang/srt/managers/controller/tp_worker.py,sha256=…
+sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
 sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
 sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
-sglang/srt/models/llama2.py,sha256=…
-sglang/srt/models/llama_classification.py,sha256=…
+sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
 sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
 sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
 sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
 sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
 sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
 sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
 sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
 sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
 sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
 sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
 sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
 sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
-sglang-0.1.18.dist-info/LICENSE,sha256=…
-sglang-0.1.18.dist-info/METADATA,sha256=…
-sglang-0.1.18.dist-info/WHEEL,sha256=…
-sglang-0.1.18.dist-info/top_level.txt,sha256=…
-sglang-0.1.18.dist-info/RECORD,,
+sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
+sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.19.dist-info/RECORD,,
```
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE
File without changes
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL
File without changes
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt
File without changes