sglang 0.1.21__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +976 -0
  7. sglang/check_env.py +171 -0
  8. sglang/global_config.py +3 -2
  9. sglang/lang/backend/__init__.py +0 -0
  10. sglang/lang/backend/anthropic.py +77 -0
  11. sglang/lang/backend/base_backend.py +80 -0
  12. sglang/lang/backend/litellm.py +90 -0
  13. sglang/lang/backend/openai.py +438 -0
  14. sglang/lang/backend/runtime_endpoint.py +283 -0
  15. sglang/lang/backend/vertexai.py +149 -0
  16. sglang/lang/interpreter.py +1 -0
  17. sglang/lang/tracer.py +1 -1
  18. sglang/launch_server.py +1 -1
  19. sglang/launch_server_llavavid.py +1 -4
  20. sglang/srt/conversation.py +1 -1
  21. sglang/srt/hf_transformers_utils.py +13 -1
  22. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  23. sglang/srt/layers/extend_attention.py +0 -39
  24. sglang/srt/layers/linear.py +869 -0
  25. sglang/srt/layers/logits_processor.py +4 -5
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +39 -24
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +72 -28
  31. sglang/srt/managers/controller/infer_batch.py +90 -63
  32. sglang/srt/managers/controller/manager_multi.py +107 -100
  33. sglang/srt/managers/controller/manager_single.py +76 -96
  34. sglang/srt/managers/controller/model_runner.py +41 -26
  35. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  36. sglang/srt/managers/controller/tp_worker.py +136 -149
  37. sglang/srt/managers/detokenizer_manager.py +49 -5
  38. sglang/srt/managers/io_struct.py +36 -17
  39. sglang/srt/managers/tokenizer_manager.py +228 -125
  40. sglang/srt/memory_pool.py +32 -11
  41. sglang/srt/model_loader/model_loader.py +277 -0
  42. sglang/srt/model_loader/utils.py +260 -0
  43. sglang/srt/models/chatglm.py +1 -0
  44. sglang/srt/models/dbrx.py +1 -0
  45. sglang/srt/models/deepseek.py +430 -0
  46. sglang/srt/models/gpt_bigcode.py +282 -0
  47. sglang/srt/models/grok.py +1 -0
  48. sglang/srt/models/internlm2.py +317 -0
  49. sglang/srt/models/llama2.py +81 -23
  50. sglang/srt/models/llama_classification.py +1 -0
  51. sglang/srt/models/llava.py +1 -0
  52. sglang/srt/models/llavavid.py +1 -0
  53. sglang/srt/models/minicpm.py +1 -0
  54. sglang/srt/models/mixtral.py +1 -0
  55. sglang/srt/models/mixtral_quant.py +1 -0
  56. sglang/srt/models/qwen.py +1 -0
  57. sglang/srt/models/qwen2.py +6 -0
  58. sglang/srt/models/qwen2_moe.py +7 -4
  59. sglang/srt/models/stablelm.py +1 -0
  60. sglang/srt/openai_api/adapter.py +432 -0
  61. sglang/srt/openai_api/api_adapter.py +432 -0
  62. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  63. sglang/srt/openai_api/openai_protocol.py +207 -0
  64. sglang/srt/openai_api/protocol.py +208 -0
  65. sglang/srt/openai_protocol.py +17 -0
  66. sglang/srt/sampling_params.py +2 -0
  67. sglang/srt/server.py +132 -84
  68. sglang/srt/server_args.py +35 -21
  69. sglang/srt/utils.py +65 -117
  70. sglang/test/test_conversation.py +1 -1
  71. sglang/test/test_openai_protocol.py +1 -1
  72. sglang/test/test_programs.py +1 -1
  73. sglang/test/test_utils.py +2 -2
  74. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/METADATA +162 -168
  75. sglang-0.1.24.dist-info/RECORD +105 -0
  76. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/WHEEL +1 -1
  77. sglang-0.1.21.dist-info/RECORD +0 -82
  78. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/LICENSE +0 -0
  79. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/top_level.txt +0 -0
{sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/METADATA
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.21
- Summary: A structured generation langauge for LLMs.
+ Version: 0.1.24
+ Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
  http://www.apache.org/licenses/
@@ -236,12 +236,11 @@ Requires-Dist: packaging ; extra == 'srt'
  Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
  Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
  Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: zmq ; extra == 'srt'
- Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+ Requires-Dist: vllm ==0.5.3.post1 ; extra == 'srt'
  Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  <div align="center">
@@ -252,23 +251,29 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |

- SGLang is a structured generation language designed for large language models (LLMs).
- It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
+ - [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- - [2024/01] 🔥 SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).

+ <details>
+ <summary>More</summary>
+
+ - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
+
+ </details>
+
  ## Contents
  - [Install](#install)
- - [Quick Start](#quick-start)
- - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
+ - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Benchmark And Performance](#benchmark-and-performance)
  - [Roadmap](#roadmap)
  - [Citation And Acknowledgment](#citation-and-acknowledgment)
@@ -277,6 +282,7 @@ The core features include:

  ### Method 1: With pip
  ```
+ pip install --upgrade pip setuptools wheel
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
@@ -288,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  git clone https://github.com/sgl-project/sglang.git
  cd sglang

+ pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
@@ -297,6 +304,16 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ### Method 3: Using docker
  The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).

+ ```bash
+ docker run --gpus all \
+     -p 30000:30000 \
+     -v ~/.cache/huggingface:/root/.cache/huggingface \
+     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+     --ipc=host \
+     lmsysorg/sglang:latest \
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+ ```
+
  ### Common Notes
  - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
  ```
@@ -306,13 +323,131 @@ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

- ## Quick Start
+ ## Backend: SGLang Runtime (SRT)
+ The SGLang Runtime (SRT) is an efficient serving engine.
+
+ ### Quick Start
+ Launch a server
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
+ ```
+
+ Send a request
+ ```
+ curl http://localhost:30000/generate \
+   -H "Content-Type: application/json" \
+   -d '{
+     "text": "Once upon a time,",
+     "sampling_params": {
+       "max_new_tokens": 16,
+       "temperature": 0
+     }
+   }'
+ ```
+ Learn more about the argument format [here](docs/sampling_params.md).
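(For readers following along in Python, the same request can be issued with the `requests` library. A minimal sketch, not part of the README itself:)

```python
import requests

# Query the /generate endpoint shown in the curl example above.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Once upon a time,",
        "sampling_params": {
            "max_new_tokens": 16,
            "temperature": 0,
        },
    },
)
print(response.json())
```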
+
+ ### OpenAI Compatible API
+ In addition, the server supports OpenAI-compatible APIs.
+
+ ```python
+ import openai
+ client = openai.Client(
+     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+ # Text completion
+ response = client.completions.create(
+     model="default",
+     prompt="The capital of France is",
+     temperature=0,
+     max_tokens=32,
+ )
+ print(response)
+
+ # Chat completion
+ response = client.chat.completions.create(
+     model="default",
+     messages=[
+         {"role": "system", "content": "You are a helpful AI assistant"},
+         {"role": "user", "content": "List 3 countries and their capitals."},
+     ],
+     temperature=0,
+     max_tokens=64,
+ )
+ print(response)
+ ```
+
+ It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
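(A minimal sketch of the streaming support mentioned above, using the standard OpenAI client against the same endpoint; not part of the README itself:)

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# Stream a chat completion token by token instead of waiting for the full reply.
stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta is not None:
        print(delta, end="", flush=True)
```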
+
+ ### Additional Server Arguments
+ - Add `--tp 2` to enable tensor parallelism. If it reports `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
+ ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with `--tp`. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
+ ```
+ - If you see out-of-memory errors during serving, reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
+ ```
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs each and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
+
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
+ ```
+ - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - To enable fp8 quantization, add `--quantization fp8` on an fp16 checkpoint, or directly load an fp8 checkpoint without specifying any arguments; see the example after this list.
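(For example, a hypothetical launch command that combines the `--quantization fp8` flag above with the fp16 checkpoint used throughout this README:)

```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --quantization fp8
```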
+
+ ### Supported Models
+
+ - Llama / Llama 2 / Llama 3
+ - Mistral / Mixtral
+ - Gemma / Gemma 2
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - LLaVA 1.5 / 1.6
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - Yi-VL
+   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
+ - StableLM
+ - Command-R
+ - DBRX
+ - Grok
+ - ChatGLM
+ - InternLM 2
+ - Mistral NeMo
+
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+
+ ### Benchmark Performance
+
+ - Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+ ```
+ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+ ```
+ - Benchmark online serving. Launch a server first and run the following command.
+ ```
+ python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+ ```
+
+ ## Frontend: Structured Generation Language (SGLang)
+ The frontend language can be used with local models or API models.
+
+ ### Quick Start
  The example below shows how to use sglang to answer a multi-turn question.

- ### Using Local Models
+ #### Using Local Models
  First, launch a server with
  ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
  ```

  Then, connect to the server and answer a multi-turn question.
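(The connection code is unchanged between versions, so the diff elides it; only its tail appears in the next hunk. For orientation, a minimal sketch of such a multi-turn program with the public `sglang` frontend API might look like:)

```python
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def multi_turn_question(s, question_1, question_2):
    # Each += appends a turn; gen() fills in the assistant's reply.
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```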
@@ -341,7 +476,7 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### Using OpenAI Models
+ #### Using OpenAI Models
  Set the OpenAI API Key
  ```
  export OPENAI_API_KEY=sk-******
@@ -372,13 +507,12 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### More Examples
+ #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
  You can find more examples at [examples/quick_start](examples/quick_start).

- ## Frontend: Structured Generation Language (SGLang)
-
+ ### Language Feature
  To begin with, import sglang.
  ```python
  import sglang as sgl
@@ -391,7 +525,7 @@ The system will manage the state, chat template, parallelism and batching for you.

  The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)

- ### Control Flow
+ #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.

  ```python
@@ -406,7 +540,7 @@ def tool_use(s, question):
          s += "The key word to search is" + sgl.gen("word")
  ```
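(The body of `tool_use` is elided by the diff; a minimal sketch consistent with its visible last line, branching on a generated `choices` selection, might be:)

```python
import sglang as sgl

@sgl.function
def tool_use(s, question):
    s += "To answer this question: " + question + ". "
    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
    # Ordinary Python control flow decides which branch to generate next.
    if s["tool"] == "calculator":
        s += "The math expression is" + sgl.gen("expression")
    elif s["tool"] == "search engine":
        s += "The key word to search is" + sgl.gen("word")
```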

- ### Parallelism
+ #### Parallelism
  Use `fork` to launch parallel prompts.
  Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.

@@ -428,7 +562,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```
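(The body of `tip_suggestion` is elided; a minimal sketch consistent with its visible tail might be:)

```python
import sglang as sgl

@sgl.function
def tip_suggestion(s):
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )
    # fork() copies the prompt state; each copy generates independently in parallel.
    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i+1} into a paragraph:\n"
        f += sgl.gen("detailed_tip", max_tokens=256, stop="\n\n")
    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")
```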

- ### Multi Modality
+ #### Multi Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -440,7 +574,7 @@ def image_qa(s, image_file, question):

  See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
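(A minimal sketch of `image_qa` consistent with the hunk context above, assuming a vision model such as LLaVA is being served:)

```python
import sglang as sgl

@sgl.function
def image_qa(s, image_file, question):
    # Attach the image to the user turn, then generate an answer.
    s += sgl.user(sgl.image(image_file) + question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
```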

- ### Constrained Decoding
+ #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
  This is only supported for local models.

@@ -455,7 +589,7 @@ def regular_expression_gen(s):
      )
  ```
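(The body of `regular_expression_gen` is elided; a minimal sketch of regex-constrained generation, here with an IP-address pattern, might be:)

```python
import sglang as sgl

@sgl.function
def regular_expression_gen(s):
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + sgl.gen(
        "answer",
        temperature=0,
        # Decoding is constrained so the output must match this regex.
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )
```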

- ### JSON Decoding
+ #### JSON Decoding
  Use `regex` to specify a JSON schema with a regular expression.

  ```python
@@ -484,8 +618,7 @@ def character_gen(s, name):

  See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
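(The `character_gen` example is elided by the diff; a minimal sketch of the same idea with a smaller, hypothetical schema regex, where `city_regex` and `city_gen` are illustrative names:)

```python
import sglang as sgl

# A regex spelling out a small JSON object; generation must match it exactly.
city_regex = (
    r"""\{\n"""
    + r"""  "name": "[\w ]{1,24}",\n"""
    + r"""  "population": [0-9]{1,9}\n"""
    + r"""\}"""
)

@sgl.function
def city_gen(s, city):
    s += city + " is a city. Describe it in JSON:\n"
    s += sgl.gen("json_output", max_tokens=128, regex=city_regex)
```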

-
- ### Batching
+ #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.

  ```python
@@ -504,7 +637,7 @@ states = text_qa.run_batch(
  )
  ```
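(The `text_qa` definition and the batch arguments are elided; a minimal sketch consistent with the visible tail:)

```python
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

# Each dict supplies the arguments of one program; the runtime batches them.
states = text_qa.run_batch(
    [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ],
    progress_bar=True,
)
for state in states:
    print(state["answer"])
```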

- ### Streaming
+ #### Streaming
  Add `stream=True` to enable streaming.

  ```python
@@ -523,139 +656,10 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```
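(A minimal sketch of the elided streaming example, consistent with the visible `state.text_iter()` loop:)

```python
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

state = text_qa.run(
    question="What is the capital of France?",
    temperature=0.1,
    stream=True,
)
# text_iter() yields output fragments as they are generated.
for out in state.text_iter():
    print(out, end="", flush=True)
```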

- ### Tips and Implementation Details
+ #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
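(As a hedged illustration of the `choices` selection rule, illustrative standalone code rather than SGLang internals:)

```python
def select_choice(choice_token_logprobs):
    """Pick the choice whose mean per-token log probability is highest."""
    def mean_logprob(logprobs):
        return sum(logprobs) / len(logprobs)
    return max(choice_token_logprobs, key=lambda c: mean_logprob(choice_token_logprobs[c]))

# Normalization matters: "No" has the higher total log probability
# (-1.0 vs -2.0), but "Yes, absolutely" wins on the per-token mean
# (-0.5 vs -1.0), so longer choices are not unfairly penalized.
scores = {
    "No": [-1.0],
    "Yes, absolutely": [-0.5, -0.5, -0.5, -0.5],
}
print(select_choice(scores))  # -> Yes, absolutely
```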

- ## Backend: SGLang Runtime (SRT)
- The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
- However, it can also be used as a standalone API server.
- In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
-
- ### Usage
- Launch a server
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
- ```
-
- Send a request
- ```
- curl http://localhost:30000/generate \
-   -H "Content-Type: application/json" \
-   -d '{
-     "text": "Once upon a time,",
-     "sampling_params": {
-       "max_new_tokens": 16,
-       "temperature": 0
-     }
-   }'
- ```
- Learn more about the argument format [here](docs/sampling_params.md).
-
- ### OpenAI Compatible API
- In addition, the server supports an experimental OpenAI-compatible API.
-
- ```python
- import openai
- client = openai.Client(
-     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
-
- # Text completion
- response = client.completions.create(
-     model="default",
-     prompt="The capital of France is",
-     temperature=0,
-     max_tokens=32,
- )
- print(response)
-
- # Chat completion
- response = client.chat.completions.create(
-     model="default",
-     messages=[
-         {"role": "system", "content": "You are a helpful AI assistant"},
-         {"role": "user", "content": "List 3 countries and their capitals."},
-     ],
-     temperature=0,
-     max_tokens=64,
- )
- print(response)
- ```
-
- By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
-
- If needed, you can also override the chat template when launching the server:
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
- ```
-
- If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporarily register your chat template as follows:
-
- ```json
- {
-   "name": "my_model",
-   "system": "<|im_start|>system",
-   "user": "<|im_start|>user",
-   "assistant": "<|im_start|>assistant",
-   "sep_style": "CHATML",
-   "sep": "<|im_end|>",
-   "stop_str": ["<|im_end|>", "<|im_start|>"]
- }
- ```
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
- ```
-
- ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
- ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
- ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
- ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
- ```
- # Node 0
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
-
- # Node 1
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
- ```
-
- ### Supported Models
- - Llama
- - Mistral
- - Mixtral
- - Qwen / Qwen 2 / Qwen 2 MoE
- - Gemma / Gemma 2
-   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
- - LLaVA
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
- - LLaVA-NeXT-Video
-   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
- - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - StableLM
- - Command-R
- - DBRX
- - Grok
- - ChatGLM
- - AWQ/GPTQ/Marlin quantization
-
- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
-
  ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)
@@ -667,18 +671,8 @@ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
  - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
- https://github.com/sgl-project/sglang/issues/157
+ [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)

  ## Citation And Acknowledgment
- ```
- @misc{zheng2024sglang,
-   title={SGLang: Efficient Execution of Structured Language Model Programs},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2024},
-   eprint={2312.07104},
-   archivePrefix={arXiv},
-   primaryClass={cs.AI}
- }
- ```
-
- We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
sglang-0.1.24.dist-info/RECORD
@@ -0,0 +1,105 @@
+ sglang/__init__.py,sha256=nMs6lYeKcQpYArIaZLQ2VGNleY1dVvdBFaHyG7fpOsA,1141
+ sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
+ sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
+ sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
+ sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
+ sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
+ sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
+ sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
+ sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
+ sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+ sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+ sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
+ sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,4850
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=27j7H9p7TY4uUfF9f5E17FxK1xCNeNju4aut_PaWCrQ,29693
+ sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+ sglang/lang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/lang/backend/litellm.py,sha256=QsaLRh0KVyuaxRZGAvLOdCCSStIMs-V0XyMX0PR6y0w,2452
+ sglang/lang/backend/openai.py,sha256=-ScfI2TFALB_FTYBur9ab0gNYxK1ogHkhdLxX19t6-Y,14808
+ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQmHDla8R0e0,9208
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+ sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
+ sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+ sglang/srt/hf_transformers_utils.py,sha256=94mOI93B2xOmXKqfJfEoGxqHgwwlWNbPHgsA47AQJK8,11245
+ sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+ sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+ sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+ sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
+ sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
+ sglang/srt/server.py,sha256=JC6rs8mkWg2mWwriwZvYEZyO514_HJFOUNda-pu8U_4,14369
+ sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
+ sglang/srt/utils.py,sha256=ZB9WLlZ_GpKVpPJiETrYkqH10J8iWrN_4buxDnQoA88,18568
+ sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+ sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+ sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWHyBVotywosE-dOiPtaGY8,4615
+ sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
+ sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+ sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
+ sglang/srt/layers/logits_processor.py,sha256=KyRYANCiq9Cfu_VPjrIbSBAlqN_clcAgF3JrG9waU5k,9674
+ sglang/srt/layers/radix_attention.py,sha256=A3J_wOlysjblFXHgehAqRHBQmpYAHLyUovyLFsrMJ7A,6386
+ sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
+ sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
+ sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
+ sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
+ sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
+ sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
+ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
+ sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+ sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
+ sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
+ sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
+ sglang/srt/managers/controller/model_runner.py,sha256=927tf6nJjLjEDgz2wCDj2kvpZ-E_rAVm8PVKFVfP4p8,13951
+ sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
+ sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
+ sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
+ sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
+ sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
+ sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+ sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+ sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
+ sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+ sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+ sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
+ sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
+ sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
+ sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
+ sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
+ sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
+ sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
+ sglang/srt/models/minicpm.py,sha256=9uE8D-NopAj-sfaKJ7d-0x-PuCTEevQPoHPZvZlwstA,13277
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+ sglang/srt/models/mixtral.py,sha256=LWOIu3okC_30RWTy2Yh2xDjQzbiEBMEpZquleDMU1Y8,20831
+ sglang/srt/models/mixtral_quant.py,sha256=ObxdI5thDuy-7ljLMwWdmkuirhI1ESoA_h_mTYE5BE4,13656
+ sglang/srt/models/qwen.py,sha256=AUf9L6tkdFXn6VTlBariplMH7yM-o96JH0xLLoM4YgI,9440
+ sglang/srt/models/qwen2.py,sha256=87Tt1Bti-Py3AGudcf7k5ni-OHhtDKPj_Hke44YGw4U,11718
+ sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI8,17581
+ sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
+ sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+ sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
+ sglang/srt/openai_api/api_adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
+ sglang/srt/openai_api/openai_api_adapter.py,sha256=5pDaktIEteHxp3qN89U_U3ndd7N0FIfUZAM06YeziUY,15687
+ sglang/srt/openai_api/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
+ sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
+ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
+ sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
+ sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
+ sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
+ sglang-0.1.24.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.24.dist-info/METADATA,sha256=_HKFljParVedu-eht7OKKb_RpEkVcB-Wh_P_jRW3TJk,30933
+ sglang-0.1.24.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+ sglang-0.1.24.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.24.dist-info/RECORD,,
{sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (70.3.0)
+ Generator: setuptools (71.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any