sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.14
+ Version: 0.1.21
  Summary: A structured generation language for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -212,35 +212,37 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
+ Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: all
  Requires-Dist: sglang[srt] ; extra == 'all'
  Requires-Dist: sglang[openai] ; extra == 'all'
  Requires-Dist: sglang[anthropic] ; extra == 'all'
+ Requires-Dist: sglang[litellm] ; extra == 'all'
  Provides-Extra: anthropic
  Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
- Requires-Dist: numpy ; extra == 'anthropic'
+ Provides-Extra: litellm
+ Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
  Provides-Extra: openai
  Requires-Dist: openai >=1.0 ; extra == 'openai'
- Requires-Dist: numpy ; extra == 'openai'
+ Requires-Dist: tiktoken ; extra == 'openai'
  Provides-Extra: srt
  Requires-Dist: aiohttp ; extra == 'srt'
  Requires-Dist: fastapi ; extra == 'srt'
+ Requires-Dist: hf-transfer ; extra == 'srt'
+ Requires-Dist: huggingface-hub ; extra == 'srt'
+ Requires-Dist: interegular ; extra == 'srt'
+ Requires-Dist: packaging ; extra == 'srt'
+ Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
+ Requires-Dist: pydantic ; extra == 'srt'
  Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
- Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
+ Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: zmq ; extra == 'srt'
- Requires-Dist: vllm >=0.3.3 ; extra == 'srt'
- Requires-Dist: interegular ; extra == 'srt'
- Requires-Dist: lark ; extra == 'srt'
- Requires-Dist: numba ; extra == 'srt'
- Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: referencing ; extra == 'srt'
- Requires-Dist: diskcache ; extra == 'srt'
- Requires-Dist: cloudpickle ; extra == 'srt'
- Requires-Dist: pillow ; extra == 'srt'
- Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
+ Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
  <div align="center">
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +255,9 @@ Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
- The core features of SGLang include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+ The core features include:
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -276,23 +278,33 @@ The core features of SGLang include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
 
  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang
 
- pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
 
- ### Notes
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-   - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-   - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
 
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
  ## Quick Start
  The example below shows how to use sglang to answer a multi-turn question.
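The Quick Start program itself falls in the part of the README that the next hunk elides. For orientation only, a minimal multi-turn program with the SGLang frontend could look like the sketch below; the questions and the endpoint address are illustrative assumptions, not the elided example:

```python
from sglang import RuntimeEndpoint, assistant, function, gen, set_default_backend, system, user

@function
def multi_turn_question(s, question_1, question_2):
    # Each statement appends to the same conversation state `s`.
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

# Assumes a local SRT server started with sglang.launch_server on port 30000.
set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United Kingdom?",
    question_2="List two local attractions.",
)
print(state["answer_1"])
print(state["answer_2"])
```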
@@ -512,8 +524,8 @@ for out in state.text_iter():
  ```
 
  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
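To make the two bullets above concrete, here is a minimal sketch of token-length normalized selection; the choice strings and log probabilities are hypothetical inputs, not SGLang's internals:

```python
def select_choice(choice_token_logprobs):
    """Pick the choice with the highest mean per-token log probability.

    `choice_token_logprobs` maps each choice string to the log
    probabilities of its tokens under the model (hypothetical inputs).
    """
    def normalized(logprobs):
        # Dividing by the token count avoids penalizing choices that
        # merely tokenize into more pieces.
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))

# "Paris" wins: mean -0.15 vs. mean -0.35 for the longer choice.
print(select_choice({
    "Paris": [-0.1, -0.2],
    "the city of Paris": [-0.3, -0.4, -0.5, -0.2],
}))
```

For the `regex` constraint, decoding proceeds token by token, and any token that the regex's finite-state machine disallows in the current state is masked out before sampling; schematically (the `allowed_token_ids` input stands in for the FSM query):

```python
import math

def mask_disallowed(logits, allowed_token_ids):
    # Tokens outside the allowed set get probability zero after softmax,
    # so decoding can never leave the language of the regex.
    allowed = set(allowed_token_ids)
    return [x if i in allowed else -math.inf for i, x in enumerate(logits)]
```

Masking with negative infinity leaves the relative probabilities of the allowed tokens untouched, which is why the scheme works for both greedy decoding (`temperature=0`) and sampling.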
  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -541,7 +553,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).
 
  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.
 
  ```python
@@ -571,15 +582,16 @@ response = client.chat.completions.create(
  print(response)
  ```
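The client setup is elided by the hunk boundary above. A minimal sketch of such a client, assuming the server from the previous section listens on port 30000; the `api_key` placeholder and the `model` name are illustrative:

```python
import openai

# Point the official OpenAI client at the local SGLang server
# (assumed address and placeholder credentials).
client = openai.Client(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```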
 
- In above example, the server uses the chat template specified in the model tokenizer.
- You can override the chat template if needed when launching the server:
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+ If needed, you can also override the chat template when launching the server:
 
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
  ```
 
  If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporary register your chat template as follows:
+ Meanwhile, you can also temporarily register your chat template as follows:
 
  ```json
  {
@@ -598,58 +610,75 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
 
  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If the server reports `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+ ```
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) for tips on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. For example, if you have two nodes with two GPUs each and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
 
  ### Supported Models
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
-   - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
    - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
  - Yi-VL
    - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - Grok
+ - ChatGLM
+ - AWQ/GPTQ/Marlin quantization
 
- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)
 
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
 
- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157
 
  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-   title={Efficiently Programming Large Language Models using SGLang},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2023},
+ @misc{zheng2024sglang,
+   title={SGLang: Efficient Execution of Structured Language Model Programs},
+   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+   year={2024},
    eprint={2312.07104},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
  }
  ```
 
- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
  We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
@@ -0,0 +1,82 @@
+ sglang/__init__.py,sha256=vvd5xGflm3C6lftzWLBh2W9kpr0PgM8RWCApp-VmHs0,1116
+ sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+ sglang/bench_latency.py,sha256=b3tnG-FumU7ZHArNDFJAnxof6McAUu4q_O88nTZtooQ,10409
+ sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
+ sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+ sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+ sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+ sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+ sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
+ sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+ sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
+ sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+ sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+ sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+ sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+ sglang/srt/memory_pool.py,sha256=CZeW1s2bbD4XznIf6XT3WyMCyQEOtYM5RrvlPbN3WuE,3448
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+ sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+ sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+ sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
+ sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+ sglang/srt/server.py,sha256=naq38YJNErLYbD_9p-w6JSUHYWDh58k5uVPRyM5kZY4,13194
+ sglang/srt/server_args.py,sha256=EjDYdeeh4yLFO9BCkjV03h-gbLcjk41RDNfGxjzuyj8,12577
+ sglang/srt/utils.py,sha256=Tbm50WWWNEbaO5RNEcybpmwQtsNbOd0bAAZp50LKQMo,19366
+ sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+ sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+ sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+ sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+ sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+ sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+ sglang/srt/layers/radix_attention.py,sha256=2WgUw39eC2wv61OcGimnSf-Jps4M7mAO5hqomszukvY,5735
+ sglang/srt/layers/token_attention.py,sha256=skkKJCNblFDP7Vqc9oGgK6493A50r6sOHZlPXFfokVM,8667
+ sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+ sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+ sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
+ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=ki_yS6sb1CQe5bPgC3Sz_sxl2V-y_qhLUK4P86sK-2Y,7011
+ sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+ sglang/srt/managers/controller/infer_batch.py,sha256=-tEwHPXoK6lV48aQnXC78-wDYQIfLjT4BF8DGS0bvnY,33066
+ sglang/srt/managers/controller/manager_multi.py,sha256=Xp8QR7fhUXzyifA0PC0it9VbsYSQj__gM2cDml-t9Kw,6767
+ sglang/srt/managers/controller/manager_single.py,sha256=WodzU8MuDzjoxbw3z0uCbdcnIsa_7JLyUCytsfCFU24,5506
+ sglang/srt/managers/controller/model_runner.py,sha256=XfDZ_KwuwlILNGdPeEDPgyoxRSBypnWk0eL5tVWdAtk,13387
+ sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
+ sglang/srt/managers/controller/tp_worker.py,sha256=D_MgXTgtdvJhxh1eVSKi8GhYzArcwYBoLEWExIt0mL8,31863
+ sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+ sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+ sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+ sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+ sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+ sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+ sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+ sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
+ sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+ sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+ sglang/srt/models/minicpm.py,sha256=RFTlREqaQn0EUEwBkJcQgGvdVSZtiIQhSAOhUGsk-OM,13256
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+ sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+ sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+ sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+ sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+ sglang/srt/models/qwen2_moe.py,sha256=DEdIveL882HM5kY1mLJui48gaOOL7ELacCtgMxrUa_s,17514
+ sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+ sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+ sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+ sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+ sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+ sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+ sglang-0.1.21.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.21.dist-info/METADATA,sha256=i2-wXDSvTGOEWa-JRxbq3G_ur-WM-4X_dVLD5nKjx28,30776
+ sglang-0.1.21.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
+ sglang-0.1.21.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.21.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (70.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
@@ -1,13 +0,0 @@
1
- """
2
- Backend configurations, may vary with different serving platforms.
3
- """
4
-
5
- from dataclasses import dataclass
6
-
7
-
8
- @dataclass
9
- class BackendConfig:
10
- extend_dependency_time: float = 0.03
11
-
12
-
13
- GLOBAL_BACKEND_CONFIG = BackendConfig()