sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.14
+ Version: 0.1.21
  Summary: A structured generation language for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -212,35 +212,37 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
+ Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: all
  Requires-Dist: sglang[srt] ; extra == 'all'
  Requires-Dist: sglang[openai] ; extra == 'all'
  Requires-Dist: sglang[anthropic] ; extra == 'all'
+ Requires-Dist: sglang[litellm] ; extra == 'all'
  Provides-Extra: anthropic
  Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
- Requires-Dist: numpy ; extra == 'anthropic'
+ Provides-Extra: litellm
+ Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
  Provides-Extra: openai
  Requires-Dist: openai >=1.0 ; extra == 'openai'
- Requires-Dist: numpy ; extra == 'openai'
+ Requires-Dist: tiktoken ; extra == 'openai'
  Provides-Extra: srt
  Requires-Dist: aiohttp ; extra == 'srt'
  Requires-Dist: fastapi ; extra == 'srt'
+ Requires-Dist: hf-transfer ; extra == 'srt'
+ Requires-Dist: huggingface-hub ; extra == 'srt'
+ Requires-Dist: interegular ; extra == 'srt'
+ Requires-Dist: packaging ; extra == 'srt'
+ Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
+ Requires-Dist: pydantic ; extra == 'srt'
  Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
- Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
+ Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: zmq ; extra == 'srt'
- Requires-Dist: vllm >=0.3.3 ; extra == 'srt'
- Requires-Dist: interegular ; extra == 'srt'
- Requires-Dist: lark ; extra == 'srt'
- Requires-Dist: numba ; extra == 'srt'
- Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: referencing ; extra == 'srt'
- Requires-Dist: diskcache ; extra == 'srt'
- Requires-Dist: cloudpickle ; extra == 'srt'
- Requires-Dist: pillow ; extra == 'srt'
- Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
+ Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
  <div align="center">
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +255,9 @@ Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
- The core features of SGLang include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+ The core features include:
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -276,23 +278,33 @@ The core features of SGLang include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
 
  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang
 
- pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
 
- ### Notes
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-   - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-   - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
 
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
  ## Quick Start
  The example below shows how to use sglang to answer a multi-turn question.
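The Quick Start program itself falls in the part of the README that the next hunk elides. For orientation only, a minimal multi-turn program with the SGLang frontend could look like the sketch below; the questions and the endpoint address are illustrative assumptions, not the elided example:

```python
from sglang import RuntimeEndpoint, assistant, function, gen, set_default_backend, system, user

@function
def multi_turn_question(s, question_1, question_2):
    # Each statement appends to the same conversation state `s`.
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

# Assumes a local SRT server started with sglang.launch_server on port 30000.
set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United Kingdom?",
    question_2="List two local attractions.",
)
print(state["answer_1"])
print(state["answer_2"])
```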
@@ -512,8 +524,8 @@ for out in state.text_iter():
  ```
 
  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
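To make the two bullets above concrete, here is a minimal sketch of token-length normalized selection; the choice strings and log probabilities are hypothetical inputs, not SGLang's internals:

```python
def select_choice(choice_token_logprobs):
    """Pick the choice with the highest mean per-token log probability.

    `choice_token_logprobs` maps each choice string to the log
    probabilities of its tokens under the model (hypothetical inputs).
    """
    def normalized(logprobs):
        # Dividing by the token count avoids penalizing choices that
        # merely tokenize into more pieces.
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))

# "Paris" wins: mean -0.15 vs. mean -0.35 for the longer choice.
print(select_choice({
    "Paris": [-0.1, -0.2],
    "the city of Paris": [-0.3, -0.4, -0.5, -0.2],
}))
```

For the `regex` constraint, decoding proceeds token by token, and any token that the regex's finite-state machine disallows in the current state is masked out before sampling; schematically (the `allowed_token_ids` input stands in for the FSM query):

```python
import math

def mask_disallowed(logits, allowed_token_ids):
    # Tokens outside the allowed set get probability zero after softmax,
    # so decoding can never leave the language of the regex.
    allowed = set(allowed_token_ids)
    return [x if i in allowed else -math.inf for i, x in enumerate(logits)]
```

Masking with negative infinity leaves the relative probabilities of the allowed tokens untouched, which is why the scheme works for both greedy decoding (`temperature=0`) and sampling.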
  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -541,7 +553,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).
 
  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.
 
  ```python
@@ -571,15 +582,16 @@ response = client.chat.completions.create(
  print(response)
  ```
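The client setup is elided by the hunk boundary above. A minimal sketch of such a client, assuming the server from the previous section listens on port 30000; the `api_key` placeholder and the `model` name are illustrative:

```python
import openai

# Point the official OpenAI client at the local SGLang server
# (assumed address and placeholder credentials).
client = openai.Client(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```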
 
- In above example, the server uses the chat template specified in the model tokenizer.
- You can override the chat template if needed when launching the server:
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+ If needed, you can also override the chat template when launching the server:
 
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
  ```
 
  If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporary register your chat template as follows:
+ Meanwhile, you can also temporarily register your chat template as follows:
 
  ```json
  {
@@ -598,58 +610,75 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
 
  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If the server reports `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+ ```
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) for tips on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. For example, if you have two nodes with two GPUs each and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
 
  ### Supported Models
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
-   - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
    - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
  - Yi-VL
    - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - Grok
+ - ChatGLM
+ - AWQ/GPTQ/Marlin quantization
 
- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)
 
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
 
- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157
 
  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-   title={Efficiently Programming Large Language Models using SGLang},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2023},
+ @misc{zheng2024sglang,
+   title={SGLang: Efficient Execution of Structured Language Model Programs},
+   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+   year={2024},
    eprint={2312.07104},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
  }
  ```
 
- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
  We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
@@ -0,0 +1,82 @@
+ sglang/__init__.py,sha256=vvd5xGflm3C6lftzWLBh2W9kpr0PgM8RWCApp-VmHs0,1116
+ sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+ sglang/bench_latency.py,sha256=b3tnG-FumU7ZHArNDFJAnxof6McAUu4q_O88nTZtooQ,10409
+ sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
+ sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+ sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+ sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+ sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+ sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
+ sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+ sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
+ sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+ sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+ sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+ sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+ sglang/srt/memory_pool.py,sha256=CZeW1s2bbD4XznIf6XT3WyMCyQEOtYM5RrvlPbN3WuE,3448
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+ sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+ sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+ sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
+ sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+ sglang/srt/server.py,sha256=naq38YJNErLYbD_9p-w6JSUHYWDh58k5uVPRyM5kZY4,13194
+ sglang/srt/server_args.py,sha256=EjDYdeeh4yLFO9BCkjV03h-gbLcjk41RDNfGxjzuyj8,12577
+ sglang/srt/utils.py,sha256=Tbm50WWWNEbaO5RNEcybpmwQtsNbOd0bAAZp50LKQMo,19366
+ sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+ sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+ sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+ sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+ sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+ sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+ sglang/srt/layers/radix_attention.py,sha256=2WgUw39eC2wv61OcGimnSf-Jps4M7mAO5hqomszukvY,5735
+ sglang/srt/layers/token_attention.py,sha256=skkKJCNblFDP7Vqc9oGgK6493A50r6sOHZlPXFfokVM,8667
+ sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+ sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+ sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
+ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=ki_yS6sb1CQe5bPgC3Sz_sxl2V-y_qhLUK4P86sK-2Y,7011
+ sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+ sglang/srt/managers/controller/infer_batch.py,sha256=-tEwHPXoK6lV48aQnXC78-wDYQIfLjT4BF8DGS0bvnY,33066
+ sglang/srt/managers/controller/manager_multi.py,sha256=Xp8QR7fhUXzyifA0PC0it9VbsYSQj__gM2cDml-t9Kw,6767
+ sglang/srt/managers/controller/manager_single.py,sha256=WodzU8MuDzjoxbw3z0uCbdcnIsa_7JLyUCytsfCFU24,5506
+ sglang/srt/managers/controller/model_runner.py,sha256=XfDZ_KwuwlILNGdPeEDPgyoxRSBypnWk0eL5tVWdAtk,13387
+ sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
+ sglang/srt/managers/controller/tp_worker.py,sha256=D_MgXTgtdvJhxh1eVSKi8GhYzArcwYBoLEWExIt0mL8,31863
+ sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+ sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+ sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+ sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+ sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+ sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+ sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+ sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
+ sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+ sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+ sglang/srt/models/minicpm.py,sha256=RFTlREqaQn0EUEwBkJcQgGvdVSZtiIQhSAOhUGsk-OM,13256
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+ sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+ sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+ sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+ sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+ sglang/srt/models/qwen2_moe.py,sha256=DEdIveL882HM5kY1mLJui48gaOOL7ELacCtgMxrUa_s,17514
+ sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+ sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+ sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+ sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+ sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+ sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+ sglang-0.1.21.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.21.dist-info/METADATA,sha256=i2-wXDSvTGOEWa-JRxbq3G_ur-WM-4X_dVLD5nKjx28,30776
+ sglang-0.1.21.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+ sglang-0.1.21.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.21.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (70.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -1,13 +0,0 @@
1
- """
2
- Backend configurations, may vary with different serving platforms.
3
- """
4
-
5
- from dataclasses import dataclass
6
-
7
-
8
- @dataclass
9
- class BackendConfig:
10
- extend_dependency_time: float = 0.03
11
-
12
-
13
- GLOBAL_BACKEND_CONFIG = BackendConfig()