sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +59 -2
- sglang/api.py +40 -11
- sglang/backend/anthropic.py +17 -3
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +160 -12
- sglang/backend/runtime_endpoint.py +62 -27
- sglang/backend/vertexai.py +1 -0
- sglang/bench_latency.py +320 -0
- sglang/global_config.py +24 -3
- sglang/lang/chat_template.py +122 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +206 -98
- sglang/lang/ir.py +98 -34
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +32 -0
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +9 -2
- sglang/srt/constrained/jump_forward.py +113 -24
- sglang/srt/conversation.py +4 -2
- sglang/srt/flush_cache.py +18 -0
- sglang/srt/hf_transformers_utils.py +144 -3
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +20 -1
- sglang/srt/layers/fused_moe.py +596 -0
- sglang/srt/layers/logits_processor.py +190 -61
- sglang/srt/layers/radix_attention.py +62 -53
- sglang/srt/layers/token_attention.py +21 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/controller/infer_batch.py +908 -0
- sglang/srt/managers/controller/manager_multi.py +195 -0
- sglang/srt/managers/controller/manager_single.py +177 -0
- sglang/srt/managers/controller/model_runner.py +359 -0
- sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
- sglang/srt/managers/controller/schedule_heuristic.py +65 -0
- sglang/srt/managers/controller/tp_worker.py +813 -0
- sglang/srt/managers/detokenizer_manager.py +42 -40
- sglang/srt/managers/io_struct.py +44 -10
- sglang/srt/managers/tokenizer_manager.py +224 -82
- sglang/srt/memory_pool.py +52 -59
- sglang/srt/model_config.py +97 -2
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +369 -0
- sglang/srt/models/dbrx.py +406 -0
- sglang/srt/models/gemma.py +34 -38
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +47 -37
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +92 -27
- sglang/srt/models/llavavid.py +298 -0
- sglang/srt/models/minicpm.py +366 -0
- sglang/srt/models/mixtral.py +302 -127
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +40 -35
- sglang/srt/models/qwen2.py +33 -36
- sglang/srt/models/qwen2_moe.py +473 -0
- sglang/srt/models/stablelm.py +33 -39
- sglang/srt/models/yivl.py +19 -26
- sglang/srt/openai_api_adapter.py +411 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +197 -481
- sglang/srt/server_args.py +190 -74
- sglang/srt/utils.py +460 -95
- sglang/test/test_programs.py +73 -10
- sglang/test/test_utils.py +226 -7
- sglang/utils.py +97 -27
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
- sglang-0.1.21.dist-info/RECORD +82 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/managers/router/infer_batch.py +0 -503
- sglang/srt/managers/router/manager.py +0 -79
- sglang/srt/managers/router/model_rpc.py +0 -686
- sglang/srt/managers/router/model_runner.py +0 -514
- sglang/srt/managers/router/scheduler.py +0 -70
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.1.14
|
3
|
+
Version: 0.1.21
|
4
4
|
Summary: A structured generation language for LLMs.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -212,35 +212,37 @@ Requires-Python: >=3.8
|
|
212
212
|
Description-Content-Type: text/markdown
|
213
213
|
License-File: LICENSE
|
214
214
|
Requires-Dist: requests
|
215
|
+
Requires-Dist: tqdm
|
216
|
+
Requires-Dist: numpy
|
215
217
|
Provides-Extra: all
|
216
218
|
Requires-Dist: sglang[srt] ; extra == 'all'
|
217
219
|
Requires-Dist: sglang[openai] ; extra == 'all'
|
218
220
|
Requires-Dist: sglang[anthropic] ; extra == 'all'
|
221
|
+
Requires-Dist: sglang[litellm] ; extra == 'all'
|
219
222
|
Provides-Extra: anthropic
|
220
223
|
Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
|
221
|
-
|
224
|
+
Provides-Extra: litellm
|
225
|
+
Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
|
222
226
|
Provides-Extra: openai
|
223
227
|
Requires-Dist: openai >=1.0 ; extra == 'openai'
|
224
|
-
Requires-Dist:
|
228
|
+
Requires-Dist: tiktoken ; extra == 'openai'
|
225
229
|
Provides-Extra: srt
|
226
230
|
Requires-Dist: aiohttp ; extra == 'srt'
|
227
231
|
Requires-Dist: fastapi ; extra == 'srt'
|
232
|
+
Requires-Dist: hf-transfer ; extra == 'srt'
|
233
|
+
Requires-Dist: huggingface-hub ; extra == 'srt'
|
234
|
+
Requires-Dist: interegular ; extra == 'srt'
|
235
|
+
Requires-Dist: packaging ; extra == 'srt'
|
236
|
+
Requires-Dist: pillow ; extra == 'srt'
|
228
237
|
Requires-Dist: psutil ; extra == 'srt'
|
238
|
+
Requires-Dist: pydantic ; extra == 'srt'
|
229
239
|
Requires-Dist: rpyc ; extra == 'srt'
|
230
240
|
Requires-Dist: torch ; extra == 'srt'
|
231
|
-
Requires-Dist: uvloop ; extra == 'srt'
|
232
241
|
Requires-Dist: uvicorn ; extra == 'srt'
|
242
|
+
Requires-Dist: uvloop ; extra == 'srt'
|
233
243
|
Requires-Dist: zmq ; extra == 'srt'
|
234
|
-
Requires-Dist: vllm
|
235
|
-
Requires-Dist:
|
236
|
-
Requires-Dist: lark ; extra == 'srt'
|
237
|
-
Requires-Dist: numba ; extra == 'srt'
|
238
|
-
Requires-Dist: pydantic ; extra == 'srt'
|
239
|
-
Requires-Dist: referencing ; extra == 'srt'
|
240
|
-
Requires-Dist: diskcache ; extra == 'srt'
|
241
|
-
Requires-Dist: cloudpickle ; extra == 'srt'
|
242
|
-
Requires-Dist: pillow ; extra == 'srt'
|
243
|
-
Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
|
244
|
+
Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
|
245
|
+
Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
|
244
246
|
|
245
247
|
<div align="center">
|
246
248
|
<img src="assets/logo.png" alt="logo" width="400"></img>
|
@@ -253,9 +255,9 @@ Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
|
|
253
255
|
SGLang is a structured generation language designed for large language models (LLMs).
|
254
256
|
It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
|
255
257
|
|
256
|
-
The core features of SGLang include:
|
257
|
-
- **
|
258
|
-
- **
|
258
|
+
The core features include:
|
259
|
+
- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
|
260
|
+
- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
|
259
261
|
|
260
262
|
## News
|
261
263
|
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
@@ -276,23 +278,33 @@ The core features of SGLang include:
|
|
276
278
|
### Method 1: With pip
|
277
279
|
```
|
278
280
|
pip install "sglang[all]"
|
281
|
+
|
282
|
+
# Install FlashInfer CUDA kernels
|
283
|
+
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
279
284
|
```
|
280
285
|
|
281
286
|
### Method 2: From source
|
282
287
|
```
|
283
|
-
git clone
|
288
|
+
git clone https://github.com/sgl-project/sglang.git
|
284
289
|
cd sglang
|
285
290
|
|
286
|
-
pip install --upgrade pip
|
287
291
|
pip install -e "python[all]"
|
292
|
+
|
293
|
+
# Install FlashInfer CUDA kernels
|
294
|
+
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
|
288
295
|
```
|
289
296
|
|
290
|
-
###
|
291
|
-
|
292
|
-
- For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
|
293
|
-
- For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
|
294
|
-
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
|
297
|
+
### Method 3: Using docker
|
298
|
+
The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
|
295
299
|
|
300
|
+
### Common Notes
|
301
|
+
- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
|
302
|
+
```
|
303
|
+
pip uninstall -y triton triton-nightly
|
304
|
+
pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
|
305
|
+
```
|
306
|
+
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
|
307
|
+
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
|
296
308
|
|
297
309
|
## Quick Start
|
298
310
|
The example below shows how to use sglang to answer a multi-turn question.
|
@@ -512,8 +524,8 @@ for out in state.text_iter():
|
|
512
524
|
```
|
513
525
|
|
514
526
|
### Tips and Implementation Details
|
515
|
-
- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
|
516
|
-
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
|
527
|
+
- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
|
528
|
+
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
|
517
529
|
|
518
530
|
## Backend: SGLang Runtime (SRT)
|
519
531
|
The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
|
@@ -541,7 +553,6 @@ curl http://localhost:30000/generate \
|
|
541
553
|
Learn more about the argument format [here](docs/sampling_params.md).
|
542
554
|
|
543
555
|
### OpenAI Compatible API
|
544
|
-
|
545
556
|
In addition, the server supports an experimental OpenAI-compatible API.
|
546
557
|
|
547
558
|
```python
|
@@ -571,15 +582,16 @@ response = client.chat.completions.create(
|
|
571
582
|
print(response)
|
572
583
|
```
|
573
584
|
|
574
|
-
|
575
|
-
|
585
|
+
By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
|
586
|
+
|
587
|
+
If needed, you can also override the chat template when launching the server:
|
576
588
|
|
577
589
|
```
|
578
590
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
|
579
591
|
```
|
580
592
|
|
581
593
|
If the chat template you are looking for is missing, you are welcome to contribute it.
|
582
|
-
Meanwhile, you can also
|
594
|
+
Meanwhile, you can also temporarily register your chat template as follows:
|
583
595
|
|
584
596
|
```json
|
585
597
|
{
|
@@ -598,58 +610,75 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
|
|
598
610
|
```
|
599
611
|
|
600
612
|
### Additional Arguments
|
601
|
-
- Add `--tp 2` to enable tensor parallelism.
|
613
|
+
- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
|
602
614
|
```
|
603
615
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
|
604
616
|
```
|
617
|
+
- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
|
618
|
+
```
|
619
|
+
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
|
620
|
+
```
|
605
621
|
- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
|
606
622
|
```
|
607
623
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
|
608
624
|
```
|
609
|
-
-
|
625
|
+
- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
|
626
|
+
- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
|
627
|
+
```
|
628
|
+
# Node 0
|
629
|
+
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
|
630
|
+
|
631
|
+
# Node 1
|
632
|
+
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
|
633
|
+
```
|
610
634
|
|
611
635
|
### Supported Models
|
612
636
|
- Llama
|
613
637
|
- Mistral
|
614
638
|
- Mixtral
|
615
|
-
- Qwen / Qwen 2
|
616
|
-
- Gemma
|
617
|
-
- Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
|
639
|
+
- Qwen / Qwen 2 / Qwen 2 MoE
|
640
|
+
- Gemma / Gemma 2
|
618
641
|
- `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
|
619
642
|
- LLaVA
|
620
643
|
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
|
621
644
|
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
|
622
645
|
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
|
646
|
+
- LLaVA-NeXT-Video
|
647
|
+
- see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
|
623
648
|
- Yi-VL
|
624
649
|
- see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
|
625
|
-
-
|
650
|
+
- StableLM
|
651
|
+
- Command-R
|
652
|
+
- DBRX
|
653
|
+
- Grok
|
654
|
+
- ChatGLM
|
655
|
+
- AWQ/GPTQ/Marlin quantization
|
626
656
|
|
627
|
-
|
657
|
+
Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
|
628
658
|
|
659
|
+
## Benchmark And Performance
|
629
660
|
- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
|
630
661
|

|
631
662
|
|
632
663
|
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
|
633
664
|

|
634
665
|
|
635
|
-
Learn more [
|
666
|
+
- Learn more about the above [results](docs/benchmark_results.md).
|
667
|
+
- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
|
636
668
|
|
637
669
|
## Roadmap
|
638
670
|
https://github.com/sgl-project/sglang/issues/157
|
639
671
|
|
640
672
|
## Citation And Acknowledgment
|
641
673
|
```
|
642
|
-
@misc{
|
643
|
-
title={
|
644
|
-
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
|
645
|
-
year={
|
674
|
+
@misc{zheng2024sglang,
|
675
|
+
title={SGLang: Efficient Execution of Structured Language Model Programs},
|
676
|
+
author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
|
677
|
+
year={2024},
|
646
678
|
eprint={2312.07104},
|
647
679
|
archivePrefix={arXiv},
|
648
680
|
primaryClass={cs.AI}
|
649
681
|
}
|
650
682
|
```
|
651
683
|
|
652
|
-
[](https://huggingface.co/papers/2312.07104)
|
653
|
-
|
654
|
-
|
655
684
|
We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
|
@@ -0,0 +1,82 @@
|
|
1
|
+
sglang/__init__.py,sha256=vvd5xGflm3C6lftzWLBh2W9kpr0PgM8RWCApp-VmHs0,1116
|
2
|
+
sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
|
3
|
+
sglang/bench_latency.py,sha256=b3tnG-FumU7ZHArNDFJAnxof6McAUu4q_O88nTZtooQ,10409
|
4
|
+
sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
|
5
|
+
sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
|
6
|
+
sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
|
7
|
+
sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
|
8
|
+
sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
|
10
|
+
sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
|
11
|
+
sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
|
12
|
+
sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
|
13
|
+
sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
|
14
|
+
sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
|
15
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
|
17
|
+
sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
|
18
|
+
sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
|
19
|
+
sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
|
20
|
+
sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
|
21
|
+
sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
|
22
|
+
sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
|
23
|
+
sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
|
24
|
+
sglang/srt/memory_pool.py,sha256=CZeW1s2bbD4XznIf6XT3WyMCyQEOtYM5RrvlPbN3WuE,3448
|
25
|
+
sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
|
26
|
+
sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
|
27
|
+
sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
|
28
|
+
sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
|
29
|
+
sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
|
30
|
+
sglang/srt/server.py,sha256=naq38YJNErLYbD_9p-w6JSUHYWDh58k5uVPRyM5kZY4,13194
|
31
|
+
sglang/srt/server_args.py,sha256=EjDYdeeh4yLFO9BCkjV03h-gbLcjk41RDNfGxjzuyj8,12577
|
32
|
+
sglang/srt/utils.py,sha256=Tbm50WWWNEbaO5RNEcybpmwQtsNbOd0bAAZp50LKQMo,19366
|
33
|
+
sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
|
34
|
+
sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
|
35
|
+
sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
|
36
|
+
sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
|
37
|
+
sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
|
38
|
+
sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
|
39
|
+
sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
|
40
|
+
sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
|
41
|
+
sglang/srt/layers/radix_attention.py,sha256=2WgUw39eC2wv61OcGimnSf-Jps4M7mAO5hqomszukvY,5735
|
42
|
+
sglang/srt/layers/token_attention.py,sha256=skkKJCNblFDP7Vqc9oGgK6493A50r6sOHZlPXFfokVM,8667
|
43
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
|
44
|
+
sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
|
45
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
|
46
|
+
sglang/srt/managers/controller/cuda_graph_runner.py,sha256=ki_yS6sb1CQe5bPgC3Sz_sxl2V-y_qhLUK4P86sK-2Y,7011
|
47
|
+
sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
|
48
|
+
sglang/srt/managers/controller/infer_batch.py,sha256=-tEwHPXoK6lV48aQnXC78-wDYQIfLjT4BF8DGS0bvnY,33066
|
49
|
+
sglang/srt/managers/controller/manager_multi.py,sha256=Xp8QR7fhUXzyifA0PC0it9VbsYSQj__gM2cDml-t9Kw,6767
|
50
|
+
sglang/srt/managers/controller/manager_single.py,sha256=WodzU8MuDzjoxbw3z0uCbdcnIsa_7JLyUCytsfCFU24,5506
|
51
|
+
sglang/srt/managers/controller/model_runner.py,sha256=XfDZ_KwuwlILNGdPeEDPgyoxRSBypnWk0eL5tVWdAtk,13387
|
52
|
+
sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
|
53
|
+
sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
|
54
|
+
sglang/srt/managers/controller/tp_worker.py,sha256=D_MgXTgtdvJhxh1eVSKi8GhYzArcwYBoLEWExIt0mL8,31863
|
55
|
+
sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
|
56
|
+
sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
|
57
|
+
sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
|
58
|
+
sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
|
59
|
+
sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
|
60
|
+
sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
|
61
|
+
sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
|
62
|
+
sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
|
63
|
+
sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
|
64
|
+
sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
|
65
|
+
sglang/srt/models/minicpm.py,sha256=RFTlREqaQn0EUEwBkJcQgGvdVSZtiIQhSAOhUGsk-OM,13256
|
66
|
+
sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
|
67
|
+
sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
|
68
|
+
sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
|
69
|
+
sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
|
70
|
+
sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
|
71
|
+
sglang/srt/models/qwen2_moe.py,sha256=DEdIveL882HM5kY1mLJui48gaOOL7ELacCtgMxrUa_s,17514
|
72
|
+
sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
|
73
|
+
sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
|
74
|
+
sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
|
75
|
+
sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
|
76
|
+
sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
|
77
|
+
sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
|
78
|
+
sglang-0.1.21.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
79
|
+
sglang-0.1.21.dist-info/METADATA,sha256=i2-wXDSvTGOEWa-JRxbq3G_ur-WM-4X_dVLD5nKjx28,30776
|
80
|
+
sglang-0.1.21.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
81
|
+
sglang-0.1.21.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
82
|
+
sglang-0.1.21.dist-info/RECORD,,
|
sglang/srt/backend_config.py
DELETED