sglang 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. sglang/__init__.py +5 -1
  2. sglang/api.py +8 -3
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +148 -12
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/global_config.py +11 -1
  8. sglang/lang/chat_template.py +9 -2
  9. sglang/lang/interpreter.py +161 -81
  10. sglang/lang/ir.py +29 -11
  11. sglang/lang/tracer.py +1 -1
  12. sglang/launch_server.py +1 -2
  13. sglang/launch_server_llavavid.py +31 -0
  14. sglang/srt/constrained/fsm_cache.py +3 -0
  15. sglang/srt/flush_cache.py +16 -0
  16. sglang/srt/hf_transformers_utils.py +83 -2
  17. sglang/srt/layers/extend_attention.py +17 -0
  18. sglang/srt/layers/fused_moe.py +485 -0
  19. sglang/srt/layers/logits_processor.py +12 -7
  20. sglang/srt/layers/radix_attention.py +10 -3
  21. sglang/srt/layers/token_attention.py +16 -1
  22. sglang/srt/managers/controller/dp_worker.py +110 -0
  23. sglang/srt/managers/controller/infer_batch.py +619 -0
  24. sglang/srt/managers/controller/manager_multi.py +191 -0
  25. sglang/srt/managers/controller/manager_single.py +97 -0
  26. sglang/srt/managers/controller/model_runner.py +462 -0
  27. sglang/srt/managers/controller/radix_cache.py +267 -0
  28. sglang/srt/managers/controller/schedule_heuristic.py +59 -0
  29. sglang/srt/managers/controller/tp_worker.py +791 -0
  30. sglang/srt/managers/detokenizer_manager.py +45 -45
  31. sglang/srt/managers/io_struct.py +26 -10
  32. sglang/srt/managers/router/infer_batch.py +130 -74
  33. sglang/srt/managers/router/manager.py +7 -9
  34. sglang/srt/managers/router/model_rpc.py +224 -135
  35. sglang/srt/managers/router/model_runner.py +94 -107
  36. sglang/srt/managers/router/radix_cache.py +54 -18
  37. sglang/srt/managers/router/scheduler.py +23 -34
  38. sglang/srt/managers/tokenizer_manager.py +183 -88
  39. sglang/srt/model_config.py +5 -2
  40. sglang/srt/models/commandr.py +15 -22
  41. sglang/srt/models/dbrx.py +22 -29
  42. sglang/srt/models/gemma.py +14 -24
  43. sglang/srt/models/grok.py +671 -0
  44. sglang/srt/models/llama2.py +24 -23
  45. sglang/srt/models/llava.py +85 -25
  46. sglang/srt/models/llavavid.py +298 -0
  47. sglang/srt/models/mixtral.py +254 -130
  48. sglang/srt/models/mixtral_quant.py +373 -0
  49. sglang/srt/models/qwen.py +28 -25
  50. sglang/srt/models/qwen2.py +17 -22
  51. sglang/srt/models/stablelm.py +21 -26
  52. sglang/srt/models/yivl.py +17 -25
  53. sglang/srt/openai_api_adapter.py +140 -95
  54. sglang/srt/openai_protocol.py +10 -1
  55. sglang/srt/server.py +101 -52
  56. sglang/srt/server_args.py +59 -11
  57. sglang/srt/utils.py +242 -75
  58. sglang/test/test_programs.py +44 -0
  59. sglang/test/test_utils.py +32 -1
  60. sglang/utils.py +95 -26
  61. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/METADATA +23 -13
  62. sglang-0.1.17.dist-info/RECORD +81 -0
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -402
  66. sglang-0.1.15.dist-info/RECORD +0 -69
  67. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
  69. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.15
3
+ Version: 0.1.17
4
4
  Summary: A structured generation language for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -217,9 +217,12 @@ Provides-Extra: all
217
217
  Requires-Dist: sglang[srt] ; extra == 'all'
218
218
  Requires-Dist: sglang[openai] ; extra == 'all'
219
219
  Requires-Dist: sglang[anthropic] ; extra == 'all'
220
+ Requires-Dist: sglang[litellm] ; extra == 'all'
220
221
  Provides-Extra: anthropic
221
222
  Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
222
223
  Requires-Dist: numpy ; extra == 'anthropic'
224
+ Provides-Extra: litellm
225
+ Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
223
226
  Provides-Extra: openai
224
227
  Requires-Dist: openai >=1.0 ; extra == 'openai'
225
228
  Requires-Dist: numpy ; extra == 'openai'
@@ -233,12 +236,14 @@ Requires-Dist: torch ; extra == 'srt'
233
236
  Requires-Dist: uvloop ; extra == 'srt'
234
237
  Requires-Dist: uvicorn ; extra == 'srt'
235
238
  Requires-Dist: zmq ; extra == 'srt'
236
- Requires-Dist: vllm >=0.4.2 ; extra == 'srt'
239
+ Requires-Dist: vllm ==0.4.3 ; extra == 'srt'
237
240
  Requires-Dist: interegular ; extra == 'srt'
238
241
  Requires-Dist: pydantic ; extra == 'srt'
239
242
  Requires-Dist: pillow ; extra == 'srt'
240
- Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
241
243
  Requires-Dist: packaging ; extra == 'srt'
244
+ Requires-Dist: huggingface-hub ; extra == 'srt'
245
+ Requires-Dist: hf-transfer ; extra == 'srt'
246
+ Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
242
247
 
243
248
  <div align="center">
244
249
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -251,9 +256,9 @@ Requires-Dist: packaging ; extra == 'srt'
251
256
  SGLang is a structured generation language designed for large language models (LLMs).
252
257
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
253
258
 
254
- The core features of SGLang include:
259
+ The core features include:
255
260
  - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
256
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
261
+ - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
257
262
 
258
263
  ## News
259
264
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -286,12 +291,8 @@ pip install -e "python[all]"
286
291
  ```
287
292
 
288
293
  ### Notes
289
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
290
- - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
291
- - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
292
294
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
293
295
 
294
-
295
296
  ## Quick Start
296
297
  The example below shows how to use sglang to answer a multi-turn question.
297
298
 
@@ -568,15 +569,17 @@ response = client.chat.completions.create(
568
569
  print(response)
569
570
  ```
570
571
 
571
- In above example, the server uses the chat template specified in the model tokenizer.
572
- You can override the chat template if needed when launching the server:
572
+
573
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
574
+
575
+ If needed, you can also override the chat template when launching the server:
573
576
 
574
577
  ```
575
578
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
576
579
  ```
577
580
 
578
581
  If the chat template you are looking for is missing, you are welcome to contribute it.
579
- Meanwhile, you can also temporary register your chat template as follows:
582
+ Meanwhile, you can also temporarily register your chat template as follows:
580
583
 
581
584
  ```json
582
585
  {
@@ -599,11 +602,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
599
602
  ```
600
603
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
601
604
  ```
605
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
606
+ ```
607
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
608
+ ```
602
609
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
603
610
  ```
604
611
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
605
612
  ```
606
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
613
+ - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
614
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
607
615
 
608
616
  ### Supported Models
609
617
  - Llama
@@ -617,6 +625,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
617
625
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
618
626
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
619
627
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
628
+ - LLaVA-NeXT-Video
629
+ - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
620
630
  - Yi-VL
621
631
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
622
632
  - StableLM
@@ -0,0 +1,81 @@
1
+ sglang/__init__.py,sha256=yEHUYdlMU-BtdYBBPSNKnqUTfQ4cdwWwWqA1BfLVB1M,1116
2
+ sglang/api.py,sha256=imnZeqgNmkex9Wg3B5VQ1M8FlBZzx9Wh9D0q5ibO0Bc,4548
3
+ sglang/global_config.py,sha256=Osa7UjpAXjEcULYvMUSa93JrvNP03vR0xLGy2gQ6uJw,1233
4
+ sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
5
+ sglang/launch_server_llavavid.py,sha256=UWo_qUCJ9yknp1TVPzrz4B_aZtEuQpLQq0l96FMgynI,1058
6
+ sglang/utils.py,sha256=-IlcZtGHnOB4Gl_ltsQZPw9Epe7maUnXFTRtvMniw2k,8146
7
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
9
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
10
+ sglang/backend/litellm.py,sha256=Y8lfWN0z8_hKvLMJbl-Xuw7Yn_5drNusC_wJv4BOQUY,2439
11
+ sglang/backend/openai.py,sha256=Xv_QJc6tN5W1Da2fu3kzvrrfT9RvW921_Cwq8R_Ak9Y,14711
12
+ sglang/backend/runtime_endpoint.py,sha256=8NyWgMvhzUcA5VEsPLo1AacZ_UPVSnpxpzt6vYdVQSU,8871
13
+ sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
14
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
16
+ sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
17
+ sglang/lang/interpreter.py,sha256=_QIzpnfSr02JUkeaJzTcZxxF4gv0naY16fvVkDZH9xE,29493
18
+ sglang/lang/ir.py,sha256=EMAXzC7upkx6qvKzCss8p7OSQYAXCT1hCl649s0Kp_c,13882
19
+ sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
20
+ sglang/srt/conversation.py,sha256=NwTVuQXd3NqPq5WCllaYUgPLG2w2pMMbzIKDQfJMMO0,15491
21
+ sglang/srt/flush_cache.py,sha256=N0etybT9tIS8_zreJFu64j9TYHKiR3sVXMTjHwHK8X0,382
22
+ sglang/srt/hf_transformers_utils.py,sha256=3aDNhwxaaObiMCrw9nqzBILoosIx1-Qy7COK6NIHtog,8244
23
+ sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
24
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
25
+ sglang/srt/model_config.py,sha256=6XJHUtev-hI-E3NAIoWiNKtpZfN2hHoaxs_r79vGDe8,1724
26
+ sglang/srt/openai_api_adapter.py,sha256=BDUwhTQpFJHHnWsw4a9XsoGhEZkfgZqd3EUbkD5g5ko,15089
27
+ sglang/srt/openai_protocol.py,sha256=jChImDalBjYk9tzBccb_m5eVVJExdHm9LhCJ4Cso5LU,5350
28
+ sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
29
+ sglang/srt/server.py,sha256=O1lJq6F95ZHeVb4aantcE7SnnM3XM7JSCa6il8vf_Mg,11595
30
+ sglang/srt/server_args.py,sha256=N5sLrpLBL6Zkfspgvanl8-9bKhMSM2Lrv9gHJ8ENmLc,10822
31
+ sglang/srt/utils.py,sha256=pvyyPvJF6RnoR0DG0wSDo73mSS_2x2MhtKqVmXObtyA,14654
32
+ sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
33
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
34
+ sglang/srt/constrained/fsm_cache.py,sha256=RmAdaAAXlh_KeDiK4w3AARiEnvrbsuELROBgMzJvZKk,967
35
+ sglang/srt/constrained/jump_forward.py,sha256=fUa4AlnGX40gYiWTLuICTJfq4b7wA3AL5dydTqT3jz4,2483
36
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
37
+ sglang/srt/layers/extend_attention.py,sha256=JUYuYSAhfbgOXrwIK5YHJCXPq54a6IZ7vQrze-3VvMQ,12955
38
+ sglang/srt/layers/fused_moe.py,sha256=0JchWmMrqF4Dqn3_gcBcaS2_uypgmOiEE0vjfo-l24U,19484
39
+ sglang/srt/layers/logits_processor.py,sha256=96WMfpBAD-nQNq4cQ4edfhqqS3HuDkAIj42EWj_8Rwo,7283
40
+ sglang/srt/layers/radix_attention.py,sha256=xsF8G-jrXi076Xwk_7-eD-FbNJvDvGGH6Pk4EzMUduA,5818
41
+ sglang/srt/layers/token_attention.py,sha256=rVbPlFpmLoU3nx3qtK2YZdynDxfvMKtQNTPeKi0KNP0,8823
42
+ sglang/srt/managers/detokenizer_manager.py,sha256=XzhlONpgAQBPUWotCGJn6XnIA7YTm6JEmHxj0Zbn6_A,3452
43
+ sglang/srt/managers/io_struct.py,sha256=oWHLvrdszhY8y5pNlFtoVYDBVslEM-rMCegIfbFYOco,4370
44
+ sglang/srt/managers/tokenizer_manager.py,sha256=uV8JuASF2pm95Hvit9dUF4y7juowp1aZ_Yl26Wh-mr0,14827
45
+ sglang/srt/managers/controller/dp_worker.py,sha256=xN7oQ3TG0FeX5K7nv6p3tUXCSE8wn0svdBHFePWe2ZU,3635
46
+ sglang/srt/managers/controller/infer_batch.py,sha256=sMjntty1MPDo__QzsxBVavMFeVIBlWU1x2lfRIP3Fmg,22716
47
+ sglang/srt/managers/controller/manager_multi.py,sha256=VmDkViOc3KFZA5HCcqC1mTmwuVda95NqELzMOrjNsp4,6629
48
+ sglang/srt/managers/controller/manager_single.py,sha256=CMaEl304o1SvNl3t-BpBrrQeyjmfdxNkKxlZh_c49sQ,3222
49
+ sglang/srt/managers/controller/model_runner.py,sha256=08HpdJYih-Nz_IlJ5a_53bb133ESEJ18Y_KSNJ0mTNQ,16993
50
+ sglang/srt/managers/controller/radix_cache.py,sha256=QnScfPDzy_QgZt0nM2BzDI_hDiohmDpJ8QKlAHAspxw,8127
51
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=DUNbv8DWSjk6I1pabfPGTYhZRz8vAFCsAh8IQcm1jxM,2276
52
+ sglang/srt/managers/controller/tp_worker.py,sha256=7qkDHURfeEPDSbUuN_-glwdgJ66H6dXd49yV8DT5JK0,31306
53
+ sglang/srt/managers/router/infer_batch.py,sha256=PEq_tCQNnmSDerlL6RRjJKadFwgP0r7l67OZypHq-II,22088
54
+ sglang/srt/managers/router/manager.py,sha256=3kTf05O2ADU91wIDoFpIZJXEz1dWeMKis0hn7j1dbzo,2693
55
+ sglang/srt/managers/router/model_rpc.py,sha256=-W-oWF1nOiWp7TwjTUo0DN4-mPdTK4S8noiVkLoQ-vo,31877
56
+ sglang/srt/managers/router/model_runner.py,sha256=PG7iSADgk_E1Eb60mS13Gl5MgHidEmi3YnO4k_Oz-7E,16515
57
+ sglang/srt/managers/router/radix_cache.py,sha256=QnScfPDzy_QgZt0nM2BzDI_hDiohmDpJ8QKlAHAspxw,8127
58
+ sglang/srt/managers/router/scheduler.py,sha256=od3fjTNyTjwTDbXVfT8jEHNPvNDk6Ss9NUUkIeXyq8s,2268
59
+ sglang/srt/models/commandr.py,sha256=JWjljtNr_t_L9PdPuymo6beUS0_EJ7NHZHrhKD3xoL0,13606
60
+ sglang/srt/models/dbrx.py,sha256=Wr45o_DTU1YTq3h5caTAH_1R3nYCSwRyKha64Ygl4Ak,14074
61
+ sglang/srt/models/gemma.py,sha256=rOw9WBNZqdeKfJT9wUa-y5sAj-pAj0YNfjk-dKtxEhA,11501
62
+ sglang/srt/models/grok.py,sha256=R_Y6CptcPgYvRt9YWob-LG2D3hTCa9VxjmA2k734Xlg,26944
63
+ sglang/srt/models/llama2.py,sha256=-IKmBoUDcZ76dRjMSNy0rUPB7NdDh4Ayc8skV0WlRCA,11959
64
+ sglang/srt/models/llava.py,sha256=S9Kz87les4Z_nZ2KAp1ZgmaK-ntILdZHqqqadJBLAt4,17893
65
+ sglang/srt/models/llavavid.py,sha256=8SVkICyDSvsw-5aSmGqSLT9S1xw8ouH0gJmAAeFLOPo,13029
66
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
67
+ sglang/srt/models/mixtral.py,sha256=dDdwkxHOfZdtfr3CixjXIZwNmB5DBfZPSQGmdz2-cJQ,20727
68
+ sglang/srt/models/mixtral_quant.py,sha256=ZP5YfMaZUfthXwSO_84o6L6Be8RhJR-1-lvG5w42wis,13636
69
+ sglang/srt/models/qwen.py,sha256=5Q10AAzBy79SRtZinpnRQYJskjGst2jf4IhJBkmDtjE,9419
70
+ sglang/srt/models/qwen2.py,sha256=_7wLaaDEs_RUgS1cjC8wgk7JqJ6CngHPNTMsDdH5Yok,11465
71
+ sglang/srt/models/stablelm.py,sha256=rzkCKYC0mGg1geFTedcbtyoOFgr_s9HacYbdb_9XJMU,10781
72
+ sglang/srt/models/yivl.py,sha256=wHaoyC2JAvhWssfgwN84BRG8CND4d7TMj1Q-pzbDea8,4367
73
+ sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
74
+ sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
75
+ sglang/test/test_programs.py,sha256=HIfIEjO6fgBmbLIy4z4zpbz6oVw2GvHP8CeVQd69YDU,13378
76
+ sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
77
+ sglang-0.1.17.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
78
+ sglang-0.1.17.dist-info/METADATA,sha256=AZQ36_LEiRR8Bf2AmS0qQMdFBmQK8boZwnlgFaLeoUg,29242
79
+ sglang-0.1.17.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
80
+ sglang-0.1.17.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
81
+ sglang-0.1.17.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- """
2
- Backend configurations, may vary with different serving platforms.
3
- """
4
-
5
- from dataclasses import dataclass
6
-
7
-
8
- @dataclass
9
- class BackendConfig:
10
- extend_dependency_time: float = 0.03
11
-
12
-
13
- GLOBAL_BACKEND_CONFIG = BackendConfig()
@@ -1,281 +0,0 @@
1
- # Adapted from:
2
- # https://github.com/vllm-project/vllm/blob/14ccd94c89d0ffd9da283545d93ab1dfea5da340/vllm/transformers_utils/configs/dbrx.py
3
- # yapf: disable
4
- # ruff: noqa: E501
5
- # coding=utf-8
6
- # Copied from
7
- # https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
8
- """Dbrx configuration."""
9
-
10
- # FIXME: remove this once vllm releases a new version
11
-
12
- from typing import Any, Optional
13
-
14
- from transformers.configuration_utils import PretrainedConfig
15
- from transformers.utils import logging
16
-
17
- logger = logging.get_logger(__name__)
18
-
19
- DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
20
-
21
-
22
- class DbrxAttentionConfig(PretrainedConfig):
23
- """Configuration class for Dbrx Attention.
24
-
25
- [`DbrxAttention`] class. It is used to instantiate attention layers
26
- according to the specified arguments, defining the layers architecture.
27
-
28
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
29
- documentation from [`PretrainedConfig`] for more information.
30
-
31
- Args:
32
- attn_pdrop (`float`, *optional*, defaults to 0.0):
33
- The dropout probability for the attention layers.
34
- clip_qkv (`float`, *optional*, defaults to None):
35
- If not `None`, clip the queries, keys, and values in the attention layer to this value.
36
- kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
37
- rope_theta (float): The base frequency for rope.
38
- """
39
-
40
- def __init__(
41
- self,
42
- attn_pdrop: float = 0,
43
- clip_qkv: Optional[float] = None,
44
- kv_n_heads: int = 1,
45
- rope_theta: float = 10000.0,
46
- **kwargs: Any,
47
- ):
48
- super().__init__(**kwargs)
49
- self.attn_pdrop = attn_pdrop
50
- self.clip_qkv = clip_qkv
51
- self.kv_n_heads = kv_n_heads
52
- self.rope_theta = rope_theta
53
-
54
- for k in ["model_type"]:
55
- if k in kwargs:
56
- kwargs.pop(k)
57
- if len(kwargs) != 0:
58
- raise ValueError(f"Found unknown {kwargs=}")
59
-
60
- @classmethod
61
- def from_pretrained(
62
- cls, pretrained_model_name_or_path: str, **kwargs: Any
63
- ) -> "PretrainedConfig":
64
- cls._set_token_in_kwargs(kwargs)
65
-
66
- config_dict, kwargs = cls.get_config_dict(
67
- pretrained_model_name_or_path, **kwargs
68
- )
69
-
70
- if config_dict.get("model_type") == "dbrx":
71
- config_dict = config_dict["attn_config"]
72
-
73
- if (
74
- "model_type" in config_dict
75
- and hasattr(cls, "model_type")
76
- and config_dict["model_type"] != cls.model_type
77
- ):
78
- logger.warning(
79
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
80
- + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
81
- )
82
-
83
- return cls.from_dict(config_dict, **kwargs)
84
-
85
-
86
- class DbrxFFNConfig(PretrainedConfig):
87
- """Configuration class for Dbrx FFN.
88
-
89
- [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
90
- the specified arguments, defining the layers architecture.
91
-
92
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
93
- documentation from [`PretrainedConfig`] for more information.
94
-
95
- Args:
96
- ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
97
- The dict should have a key 'name' with the value being the name of
98
- the activation function along with any additional keyword arguments.
99
- ffn_hidden_size (int, optional): The hidden size of the feedforward network.
100
- moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
101
- moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
102
- moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
103
- moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
104
- moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
105
- uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
106
- This should only be used for benchmarking purposes.
107
- """
108
-
109
- def __init__(
110
- self,
111
- ffn_act_fn: Optional[dict] = None,
112
- ffn_hidden_size: int = 3584,
113
- moe_num_experts: int = 4,
114
- moe_top_k: int = 1,
115
- moe_jitter_eps: Optional[float] = None,
116
- moe_loss_weight: float = 0.01,
117
- moe_normalize_expert_weights: Optional[float] = 1,
118
- uniform_expert_assignment: bool = False,
119
- **kwargs: Any,
120
- ):
121
- super().__init__()
122
- if ffn_act_fn is None:
123
- ffn_act_fn = {"name": "silu"}
124
- self.ffn_act_fn = ffn_act_fn
125
- self.ffn_hidden_size = ffn_hidden_size
126
- self.moe_num_experts = moe_num_experts
127
- self.moe_top_k = moe_top_k
128
- self.moe_jitter_eps = moe_jitter_eps
129
- self.moe_loss_weight = moe_loss_weight
130
- self.moe_normalize_expert_weights = moe_normalize_expert_weights
131
- self.uniform_expert_assignment = uniform_expert_assignment
132
-
133
- for k in ["model_type"]:
134
- if k in kwargs:
135
- kwargs.pop(k)
136
- if len(kwargs) != 0:
137
- raise ValueError(f"Found unknown {kwargs=}")
138
-
139
- @classmethod
140
- def from_pretrained(
141
- cls, pretrained_model_name_or_path: str, **kwargs: Any
142
- ) -> "PretrainedConfig":
143
- cls._set_token_in_kwargs(kwargs)
144
-
145
- config_dict, kwargs = cls.get_config_dict(
146
- pretrained_model_name_or_path, **kwargs
147
- )
148
-
149
- if config_dict.get("model_type") == "dbrx":
150
- config_dict = config_dict["ffn_config"]
151
-
152
- if (
153
- "model_type" in config_dict
154
- and hasattr(cls, "model_type")
155
- and config_dict["model_type"] != cls.model_type
156
- ):
157
- logger.warning(
158
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
159
- + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
160
- )
161
-
162
- return cls.from_dict(config_dict, **kwargs)
163
-
164
-
165
- class DbrxConfig(PretrainedConfig):
166
- """Configuration class for Dbrx.
167
-
168
- [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
169
- specified arguments, defining the model architecture.
170
-
171
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
172
- documentation from [`PretrainedConfig`] for more information.
173
-
174
-
175
- Args:
176
- d_model (`int`, *optional*, defaults to 6144):
177
- Dimensionality of the embeddings and hidden states.
178
- n_heads (`int`, *optional*, defaults to 48):
179
- Number of attention heads for each attention layer in the Transformer encoder.
180
- n_layers (`int`, *optional*, defaults to 40):
181
- Number of hidden layers in the Transformer encoder.
182
- max_seq_len (`int`, *optional*, defaults to 32768):
183
- The maximum sequence length of the model.
184
- vocab_size (`int`, *optional*, defaults to 100352):
185
- Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
186
- the `inputs_ids` passed when calling [`DbrxModel`].
187
- resid_pdrop (`float`, *optional*, defaults to 0.0):
188
- The dropout probability applied to the attention output before combining with residual.
189
- emb_pdrop (`float`, *optional*, defaults to 0.0):
190
- The dropout probability for the embedding layer.
191
- attn_config (`dict`, *optional*):
192
- A dictionary used to configure the model's attention module.
193
- ffn_config (`dict`, *optional*):
194
- A dictionary used to configure the model's FFN module.
195
- use_cache (`bool`, *optional*, defaults to `False`):
196
- Whether or not the model should return the last key/values attentions (not used by all models).
197
- initializer_range (`float`, *optional*, defaults to 0.02):
198
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
199
- output_router_logits (`bool`, *optional*, defaults to `False`):
200
- Whether or not the router logits should be returned by the model. Enabling this will also
201
- allow the model to output the auxiliary loss. See [here]() for more details
202
- router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
203
- The aux loss factor for the total loss.
204
-
205
-
206
- Example:
207
- ```python
208
- >>> from transformers import DbrxConfig, DbrxModel
209
-
210
- >>> # Initializing a Dbrx configuration
211
- >>> configuration = DbrxConfig()
212
-
213
- >>> # Initializing a model (with random weights) from the configuration
214
- >>> model = DbrxModel(configuration)
215
-
216
- >>> # Accessing the model configuration
217
- >>> configuration = model.config
218
- ```
219
- """
220
-
221
- model_type = "dbrx"
222
- attribute_map = {
223
- "num_attention_heads": "n_heads",
224
- "hidden_size": "d_model",
225
- "num_hidden_layers": "n_layers",
226
- "max_position_embeddings": "max_seq_len",
227
- }
228
-
229
- def __init__(
230
- self,
231
- d_model: int = 2048,
232
- n_heads: int = 16,
233
- n_layers: int = 24,
234
- max_seq_len: int = 2048,
235
- vocab_size: int = 32000,
236
- resid_pdrop: float = 0.0,
237
- emb_pdrop: float = 0.0,
238
- attn_config: Optional[DbrxAttentionConfig] = None,
239
- ffn_config: Optional[DbrxFFNConfig] = None,
240
- use_cache: bool = True,
241
- initializer_range: float = 0.02,
242
- output_router_logits: bool = False,
243
- router_aux_loss_coef: float = 0.05,
244
- **kwargs: Any,
245
- ):
246
- if attn_config is None:
247
- self.attn_config = DbrxAttentionConfig()
248
- elif isinstance(attn_config, dict):
249
- self.attn_config = DbrxAttentionConfig(**attn_config)
250
- else:
251
- self.attn_config = attn_config
252
-
253
- if ffn_config is None:
254
- self.ffn_config = DbrxFFNConfig()
255
- elif isinstance(ffn_config, dict):
256
- self.ffn_config = DbrxFFNConfig(**ffn_config)
257
- else:
258
- self.ffn_config = ffn_config
259
-
260
- self.d_model = d_model
261
- self.n_heads = n_heads
262
- self.n_layers = n_layers
263
- self.max_seq_len = max_seq_len
264
- self.vocab_size = vocab_size
265
- self.resid_pdrop = resid_pdrop
266
- self.emb_pdrop = emb_pdrop
267
- self.use_cache = use_cache
268
- self.initializer_range = initializer_range
269
- self.output_router_logits = output_router_logits
270
- self.router_aux_loss_coef = router_aux_loss_coef
271
-
272
- tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
273
- if tie_word_embeddings:
274
- raise ValueError(
275
- "tie_word_embeddings is not supported for Dbrx models."
276
- )
277
-
278
- super().__init__(
279
- tie_word_embeddings=tie_word_embeddings,
280
- **kwargs,
281
- )