sglang 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (78)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/runtime_endpoint.py +14 -4
  4. sglang/backend/vertexai.py +5 -4
  5. sglang/bench.py +627 -0
  6. sglang/bench_latency.py +22 -20
  7. sglang/bench_serving.py +758 -0
  8. sglang/check_env.py +171 -0
  9. sglang/global_config.py +3 -1
  10. sglang/lang/backend/__init__.py +0 -0
  11. sglang/lang/backend/anthropic.py +77 -0
  12. sglang/lang/backend/base_backend.py +80 -0
  13. sglang/lang/backend/litellm.py +90 -0
  14. sglang/lang/backend/openai.py +438 -0
  15. sglang/lang/backend/runtime_endpoint.py +283 -0
  16. sglang/lang/backend/vertexai.py +149 -0
  17. sglang/lang/chat_template.py +2 -2
  18. sglang/lang/ir.py +3 -3
  19. sglang/lang/tracer.py +1 -1
  20. sglang/launch_server.py +1 -1
  21. sglang/launch_server_llavavid.py +1 -4
  22. sglang/srt/conversation.py +1 -1
  23. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  24. sglang/srt/layers/extend_attention.py +0 -39
  25. sglang/srt/layers/linear.py +869 -0
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +31 -5
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +44 -18
  31. sglang/srt/managers/controller/infer_batch.py +76 -72
  32. sglang/srt/managers/controller/manager_multi.py +109 -98
  33. sglang/srt/managers/controller/manager_single.py +105 -50
  34. sglang/srt/managers/controller/model_runner.py +42 -18
  35. sglang/srt/managers/controller/radix_cache.py +4 -3
  36. sglang/srt/managers/controller/schedule_heuristic.py +4 -0
  37. sglang/srt/managers/controller/tp_worker.py +143 -156
  38. sglang/srt/managers/detokenizer_manager.py +49 -5
  39. sglang/srt/managers/io_struct.py +36 -17
  40. sglang/srt/managers/tokenizer_manager.py +228 -125
  41. sglang/srt/memory_pool.py +46 -58
  42. sglang/srt/model_loader/model_loader.py +277 -0
  43. sglang/srt/model_loader/utils.py +260 -0
  44. sglang/srt/models/chatglm.py +1 -0
  45. sglang/srt/models/dbrx.py +1 -0
  46. sglang/srt/models/grok.py +1 -0
  47. sglang/srt/models/internlm2.py +317 -0
  48. sglang/srt/models/llama2.py +65 -16
  49. sglang/srt/models/llama_classification.py +1 -0
  50. sglang/srt/models/llava.py +1 -0
  51. sglang/srt/models/llavavid.py +1 -0
  52. sglang/srt/models/minicpm.py +2 -8
  53. sglang/srt/models/mixtral.py +1 -0
  54. sglang/srt/models/mixtral_quant.py +1 -0
  55. sglang/srt/models/qwen.py +1 -0
  56. sglang/srt/models/qwen2.py +6 -0
  57. sglang/srt/models/qwen2_moe.py +130 -108
  58. sglang/srt/models/stablelm.py +1 -0
  59. sglang/srt/openai_api/adapter.py +432 -0
  60. sglang/srt/openai_api/api_adapter.py +432 -0
  61. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  62. sglang/srt/openai_api/openai_protocol.py +207 -0
  63. sglang/srt/openai_api/protocol.py +208 -0
  64. sglang/srt/openai_protocol.py +17 -0
  65. sglang/srt/sampling_params.py +2 -0
  66. sglang/srt/server.py +114 -90
  67. sglang/srt/server_args.py +27 -17
  68. sglang/srt/utils.py +17 -118
  69. sglang/test/test_conversation.py +1 -1
  70. sglang/test/test_openai_protocol.py +1 -1
  71. sglang/test/test_programs.py +1 -1
  72. sglang/test/test_utils.py +2 -2
  73. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -159
  74. sglang-0.1.22.dist-info/RECORD +103 -0
  75. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
  76. sglang-0.1.20.dist-info/RECORD +0 -82
  77. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
  78. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
{sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/METADATA

@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.20
- Summary: A structured generation langauge for LLMs.
+ Version: 0.1.22
+ Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
  http://www.apache.org/licenses/
@@ -236,7 +236,6 @@ Requires-Dist: packaging ; extra == 'srt'
  Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
  Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
  Requires-Dist: uvloop ; extra == 'srt'
@@ -252,23 +251,29 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |

- SGLang is a structured generation language designed for large language models (LLMs).
- It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
+ - [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- - [2024/01] 🔥 SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).

+ <details>
+ <summary>More</summary>
+
+ - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
+
+ </details>
+

  ## Contents
  - [Install](#install)
- - [Quick Start](#quick-start)
- - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
+ - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Benchmark And Performance](#benchmark-and-performance)
  - [Roadmap](#roadmap)
  - [Citation And Acknowledgment](#citation-and-acknowledgment)
@@ -297,6 +302,16 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ### Method 3: Using docker
  The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).

+ ```bash
+ docker run --gpus all \
+     -p 30000:30000 \
+     -v ~/.cache/huggingface:/root/.cache/huggingface \
+     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+     --ipc=host \
+     lmsysorg/sglang:latest \
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+ ```
+
  ### Common Notes
  - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
  ```
@@ -306,13 +321,129 @@ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

- ## Quick Start
+ ## Backend: SGLang Runtime (SRT)
+ The SGLang Runtime (SRT) is an efficient serving engine.
+
+ ### Quick Start
+ Launch a server
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
+ ```
+
+ Send a request
+ ```
+ curl http://localhost:30000/generate \
+   -H "Content-Type: application/json" \
+   -d '{
+     "text": "Once upon a time,",
+     "sampling_params": {
+       "max_new_tokens": 16,
+       "temperature": 0
+     }
+   }'
+ ```
+ Learn more about the argument format [here](docs/sampling_params.md).
+
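
For context, the request above maps one-to-one onto a plain HTTP POST, so it can be issued from Python as well. A minimal sketch using the `requests` library, assuming the same server is running on `localhost:30000` (illustrative only, not part of this package diff):

```python
# Illustrative Python equivalent of the curl call to /generate above.
import requests

response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Once upon a time,",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
)
print(response.json())
```
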
+ ### OpenAI Compatible API
+ In addition, the server supports OpenAI-compatible APIs.
+
+ ```python
+ import openai
+ client = openai.Client(
+     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+ # Text completion
+ response = client.completions.create(
+     model="default",
+     prompt="The capital of France is",
+     temperature=0,
+     max_tokens=32,
+ )
+ print(response)
+
+ # Chat completion
+ response = client.chat.completions.create(
+     model="default",
+     messages=[
+         {"role": "system", "content": "You are a helpful AI assistant"},
+         {"role": "user", "content": "List 3 countries and their capitals."},
+     ],
+     temperature=0,
+     max_tokens=64,
+ )
+ print(response)
+ ```
+
+ It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+
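
Since the README above advertises streaming on the OpenAI-compatible endpoints, here is a minimal consumption sketch, assuming the `openai` v1 Python client and the server from the Quick Start (illustrative only, not part of this package diff):

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# Request incremental chunks instead of a single final response.
stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Write a haiku about caching."}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries newly generated text in choices[0].delta.
    if chunk.choices and chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="", flush=True)
```
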
+ ### Additional Server Arguments
+ - Add `--tp 2` to enable tensor parallelism. If the server reports `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
+ ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
+ ```
+ - If you see out-of-memory errors during serving, try reducing the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
+ ```
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
+ - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+
+ ### Supported Models
+
+ - Llama / Llama 2 / Llama 3
+ - Mistral / Mixtral
+ - Gemma / Gemma 2
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - LLaVA 1.5 / 1.6
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - Yi-VL
+   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py)
+ - StableLM
+ - Command-R
+ - DBRX
+ - Grok
+ - ChatGLM
+ - InternLM 2
+
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+
+ ### Benchmark Performance
+
+ - Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+ ```
+ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+ ```
+ - Benchmark online serving. Launch a server first, then run the following command.
+ ```
+ python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+ ```
+
+ ## Frontend: Structured Generation Language (SGLang)
+ The frontend language can be used with local models or API models.
+
+ ### Quick Start
  The example below shows how to use sglang to answer a multi-turn question.

- ### Using Local Models
+ #### Using Local Models
  First, launch a server with
  ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
  ```

  Then, connect to the server and answer a multi-turn question.
@@ -341,7 +472,7 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### Using OpenAI Models
+ #### Using OpenAI Models
  Set the OpenAI API Key
  ```
  export OPENAI_API_KEY=sk-******
@@ -372,13 +503,12 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### More Examples
+ #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
  You can find more examples at [examples/quick_start](examples/quick_start).

- ## Frontend: Structured Generation Language (SGLang)
-
+ ### Language Feature
  To begin with, import sglang.
  ```python
  import sglang as sgl
@@ -391,7 +521,7 @@ The system will manage the state, chat template, parallelism and batching for yo

  The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)

- ### Control Flow
+ #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.

  ```python
@@ -406,7 +536,7 @@ def tool_use(s, question):
  s += "The key word to search is" + sgl.gen("word")
  ```

- ### Parallelism
+ #### Parallelism
  Use `fork` to launch parallel prompts.
  Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.

@@ -428,7 +558,7 @@ def tip_suggestion(s):
  s += "In summary" + sgl.gen("summary")
  ```

- ### Multi Modality
+ #### Multi Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -440,7 +570,7 @@ def image_qa(s, image_file, question):

  See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).

- ### Constrained Decoding
+ #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
  This is only supported for local models.

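
As background for the constrained decoding feature above: the "Tips and Implementation Details" section later in this README diff states that `regex` is enforced via logit bias masking. Below is a minimal sketch of one masked decoding step, where `allowed_token_ids` is a hypothetical stand-in for what a state machine compiled from the regex would actually produce (illustrative only, not the package's implementation):

```python
import math

def mask_logits(logits: list[float], allowed_token_ids: set[int]) -> list[float]:
    # Hypothetical helper: tokens the regex disallows get -inf,
    # so softmax/argmax can never pick them.
    return [
        logit if token_id in allowed_token_ids else -math.inf
        for token_id, logit in enumerate(logits)
    ]

logits = [1.2, 0.3, -0.5, 2.0]
masked = mask_logits(logits, allowed_token_ids={0, 2})
next_token = max(range(len(masked)), key=masked.__getitem__)  # greedy step
print(next_token)  # 0 — token 3 had the highest raw logit but is masked out
```
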
@@ -455,7 +585,7 @@ def regular_expression_gen(s):
  )
  ```

- ### JSON Decoding
+ #### JSON Decoding
  Use `regex` to specify a JSON schema with a regular expression.

  ```python
@@ -484,8 +614,7 @@ def character_gen(s, name):

  See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.

-
- ### Batching
+ #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.

  ```python
@@ -504,7 +633,7 @@ states = text_qa.run_batch(
  )
  ```

- ### Streaming
+ #### Streaming
  Add `stream=True` to enable streaming.

  ```python
@@ -523,131 +652,10 @@ for out in state.text_iter():
  print(out, end="", flush=True)
  ```

- ### Tips and Implementation Details
+ #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

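
To make the first tip above concrete, here is a small sketch of token-length normalized selection; the per-token log probabilities are hypothetical stand-ins for what the runtime would return (illustrative only, not part of this package diff):

```python
def select_choice(choices: dict[str, list[float]]) -> str:
    # Score each choice by its mean per-token log probability
    # (token-length normalization), then pick the best-scoring one.
    return max(choices, key=lambda c: sum(choices[c]) / len(choices[c]))

scores = {
    "Paris": [-0.2, -0.1],                          # mean -0.15
    "The city of Paris": [-0.3, -0.4, -0.2, -0.5],  # mean -0.35
}
print(select_choice(scores))  # "Paris" wins after normalization
```

Without the normalization, longer choices would be penalized simply for containing more tokens, since raw sequence log probabilities shrink as tokens are appended.
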
- ## Backend: SGLang Runtime (SRT)
- The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
- However, it can also be used as a standalone API server.
- In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
-
- ### Usage
- Launch a server
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
- ```
-
- Send a request
- ```
- curl http://localhost:30000/generate \
-   -H "Content-Type: application/json" \
-   -d '{
-     "text": "Once upon a time,",
-     "sampling_params": {
-       "max_new_tokens": 16,
-       "temperature": 0
-     }
-   }'
- ```
- Learn more about the argument format [here](docs/sampling_params.md).
-
- ### OpenAI Compatible API
- In addition, the server supports an experimental OpenAI-compatible API.
-
- ```python
- import openai
- client = openai.Client(
-     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
-
- # Text completion
- response = client.completions.create(
-     model="default",
-     prompt="The capital of France is",
-     temperature=0,
-     max_tokens=32,
- )
- print(response)
-
- # Chat completion
- response = client.chat.completions.create(
-     model="default",
-     messages=[
-         {"role": "system", "content": "You are a helpful AI assistant"},
-         {"role": "user", "content": "List 3 countries and their capitals."},
-     ],
-     temperature=0,
-     max_tokens=64,
- )
- print(response)
- ```
-
- By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
-
- If needed, you can also override the chat template when launching the server:
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
- ```
-
- If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporarily register your chat template as follows:
-
- ```json
- {
-   "name": "my_model",
-   "system": "<|im_start|>system",
-   "user": "<|im_start|>user",
-   "assistant": "<|im_start|>assistant",
-   "sep_style": "CHATML",
-   "sep": "<|im_end|>",
-   "stop_str": ["<|im_end|>", "<|im_start|>"]
- }
- ```
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
- ```
-
- ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
- ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
- ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
- ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-
- ### Supported Models
- - Llama
- - Mistral
- - Mixtral
- - Qwen / Qwen 2 / Qwen 2 MoE
- - Gemma / Gemma 2
-   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
- - LLaVA
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
- - LLaVA-NeXT-Video
-   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
- - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - StableLM
- - Command-R
- - DBRX
- - Grok
- - ChatGLM
- - AWQ/GPTQ/Marlin quantization
-
- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
-
  ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)
@@ -659,18 +667,8 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
- https://github.com/sgl-project/sglang/issues/157
+ [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)

  ## Citation And Acknowledgment
- ```
- @misc{zheng2024sglang,
-   title={SGLang: Efficient Execution of Structured Language Model Programs},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2024},
-   eprint={2312.07104},
-   archivePrefix={arXiv},
-   primaryClass={cs.AI}
- }
- ```
-
- We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
sglang-0.1.22.dist-info/RECORD

@@ -0,0 +1,103 @@
+ sglang/__init__.py,sha256=7-tQgpOarxM1MfYy5nCbpqhqSKB_hKRAI4tekucmYz4,1141
+ sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
+ sglang/bench.py,sha256=p34wnfMRdiedOUf9GKGZkkNxehmyTzK6Q1O20q_SGjY,21841
+ sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
+ sglang/bench_serving.py,sha256=IebHhb0AM_4FhA74Xu13QK1-KXpkRZ_k3ohwKiot9mU,26116
+ sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
+ sglang/global_config.py,sha256=6WAMjRR1lDeGFdFu-18xUAbWVM2Vj0_L5ExvQ5wofus,1711
+ sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
+ sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
+ sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+ sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+ sglang/backend/runtime_endpoint.py,sha256=PAdnQBj3yQNtgw8GH9F1ecGE7HhxGa2T7Tz_c--H2aE,9203
+ sglang/backend/vertexai.py,sha256=98toR-L0OTi4dYHaSmmzJdlQ2qN_0lImoKZFlVgYLRE,4850
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+ sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+ sglang/lang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/lang/backend/litellm.py,sha256=QsaLRh0KVyuaxRZGAvLOdCCSStIMs-V0XyMX0PR6y0w,2452
+ sglang/lang/backend/openai.py,sha256=-ScfI2TFALB_FTYBur9ab0gNYxK1ogHkhdLxX19t6-Y,14808
+ sglang/lang/backend/runtime_endpoint.py,sha256=TZ0NV89or5_3MIZZFnc1JXAAjnv7tCfeQmHDla8R0e0,9208
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+ sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
+ sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+ sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+ sglang/srt/memory_pool.py,sha256=rzJq9-kgO9ON5mgHcLT5GKiQCWBCFaczPE8-9M6ckaU,3680
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+ sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+ sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+ sglang/srt/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
+ sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
+ sglang/srt/server.py,sha256=c0Ldp-10tvTroJI0msHWorrqObR90FuNK6SM4KP-qeU,13682
+ sglang/srt/server_args.py,sha256=6pMKJN0S1QoTcVAstmxc5Laub2OAxMYpMykQky-Ym10,12959
+ sglang/srt/utils.py,sha256=GFO0K-BnpAGi1_Cp4cSKOVTjfILz8qNltF-feZPR7yE,16804
+ sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+ sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+ sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWHyBVotywosE-dOiPtaGY8,4615
+ sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
+ sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+ sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
+ sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+ sglang/srt/layers/radix_attention.py,sha256=xdj4v0L5DEcQDDHSbfo_VFqdvHLAWpiT2oU8wKqz3Gk,6212
+ sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj6e1Zrww,7386
+ sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
+ sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
+ sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
+ sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
+ sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
+ sglang/srt/managers/controller/cuda_graph_runner.py,sha256=xWyLPg7vG2EAsgmSG1AI0aEk_AueyOD0-aNbK3Mt_DE,7043
+ sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+ sglang/srt/managers/controller/infer_batch.py,sha256=phXzANqBUFyqFwRVl06bd5yBnGK2hem6qzf5i0lrTq0,33086
+ sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
+ sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
+ sglang/srt/managers/controller/model_runner.py,sha256=UBvaHShjBWWFMWSEKeDh2tNqd0zWTwtfK37BbYR7c6w,13864
+ sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=tw9WEiA_pzL4dkPnoS34SYhhQ3hJXBL6K03zRm2n_g8,2482
+ sglang/srt/managers/controller/tp_worker.py,sha256=uyoAW4O08UPciRYxGBPK6U5jaVuwEOvKBjaeJNNAe8s,30531
+ sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
+ sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
+ sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
+ sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+ sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
+ sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+ sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+ sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
+ sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
+ sglang/srt/models/llama2.py,sha256=i97Ib4zq0-AbW7Wwp_ctFWnK528vipmlZVD_a7gB8L8,13819
+ sglang/srt/models/llama_classification.py,sha256=foCPvNyP2bTZ0YcRBF-qkmBv-gT24lhLNCXP30Oq4VU,4370
+ sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
+ sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
+ sglang/srt/models/minicpm.py,sha256=9uE8D-NopAj-sfaKJ7d-0x-PuCTEevQPoHPZvZlwstA,13277
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+ sglang/srt/models/mixtral.py,sha256=LWOIu3okC_30RWTy2Yh2xDjQzbiEBMEpZquleDMU1Y8,20831
+ sglang/srt/models/mixtral_quant.py,sha256=ObxdI5thDuy-7ljLMwWdmkuirhI1ESoA_h_mTYE5BE4,13656
+ sglang/srt/models/qwen.py,sha256=AUf9L6tkdFXn6VTlBariplMH7yM-o96JH0xLLoM4YgI,9440
+ sglang/srt/models/qwen2.py,sha256=87Tt1Bti-Py3AGudcf7k5ni-OHhtDKPj_Hke44YGw4U,11718
+ sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI8,17581
+ sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
+ sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+ sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
+ sglang/srt/openai_api/api_adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
+ sglang/srt/openai_api/openai_api_adapter.py,sha256=5pDaktIEteHxp3qN89U_U3ndd7N0FIfUZAM06YeziUY,15687
+ sglang/srt/openai_api/openai_protocol.py,sha256=lGBhfxG6jmgUkMOh2NpBK9w9TUTRZKrsfHdW7XYhKKI,5700
+ sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
+ sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
+ sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
+ sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
+ sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
+ sglang-0.1.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.22.dist-info/METADATA,sha256=O1pihQWf_523B_fgluftctwOxcou6oj13_Wuquj7ztU,30691
+ sglang-0.1.22.dist-info/WHEEL,sha256=rWxmBtp7hEUqVLOnTaDOPpR-cZpCDkzhhcBce-Zyd5k,91
+ sglang-0.1.22.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.22.dist-info/RECORD,,
{sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (70.3.0)
+ Generator: setuptools (71.0.4)
  Root-Is-Purelib: true
  Tag: py3-none-any

sglang-0.1.20.dist-info/RECORD

@@ -1,82 +0,0 @@
- sglang/__init__.py,sha256=M8FawCAF4B2Fbj3fJTm3kLu0YDgYRVC7MMRcBS_QwM0,1116
- sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
- sglang/bench_latency.py,sha256=r2F3TcxlvpCz4hCiK41dnmpxxU_sS37x1F7md5HJNIQ,10410
- sglang/global_config.py,sha256=8ImBH5mECeCFVi8TtewNRFKHlFCrYG3qof5cSS7aZUY,1670
- sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
- sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
- sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
- sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
- sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
- sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
- sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
- sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
- sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
- sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
- sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
- sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
- sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
- sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
- sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
- sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
- sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
- sglang/srt/memory_pool.py,sha256=3ftXky9baIrgDzYJAywBOO2YOJXQ7RgCG-usMFK6QaQ,4418
- sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
- sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
- sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
- sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
- sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
- sglang/srt/server.py,sha256=iVrP9G8ljMGY8Tbg64RrPwbcmyl6og66fBatdXIx6TE,13268
- sglang/srt/server_args.py,sha256=NPsNq_FuOU4cQVne4XoqHNoeQgGDCwBkAqUKn3joMDk,12492
- sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
- sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
- sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
- sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
- sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
- sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
- sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
- sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
- sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
- sglang/srt/layers/radix_attention.py,sha256=2WgUw39eC2wv61OcGimnSf-Jps4M7mAO5hqomszukvY,5735
- sglang/srt/layers/token_attention.py,sha256=skkKJCNblFDP7Vqc9oGgK6493A50r6sOHZlPXFfokVM,8667
- sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
- sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
- sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
- sglang/srt/managers/controller/cuda_graph_runner.py,sha256=Z-BbHyMupRYU0W844F75Puiuk3MIyKD2grqK-GpE2qk,6691
- sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
- sglang/srt/managers/controller/infer_batch.py,sha256=ZZEAsh5UoeTDOyrNiRmRYsivUCoI-pjtQVra3N0x2a8,33071
- sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
- sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
- sglang/srt/managers/controller/model_runner.py,sha256=hql_1aaSvsq1-AQTITUmyPIHd6RZAwXTBHC3QLLtbho,13244
- sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
- sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
- sglang/srt/managers/controller/tp_worker.py,sha256=_jqg5RyMarrzD3SJY-Qs5GaLfnzv2pnzI8_BySmmRos,31926
- sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
- sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
- sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
- sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
- sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
- sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
- sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
- sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
- sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
- sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
- sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
- sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
- sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
- sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
- sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
- sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
- sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
- sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
- sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
- sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
- sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
- sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
- sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
- sglang-0.1.20.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.1.20.dist-info/METADATA,sha256=0m4UL_uHZotvQ2fIohPKob2f-HQev5BJHhEUWUEcqQ4,30262
- sglang-0.1.20.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
- sglang-0.1.20.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.1.20.dist-info/RECORD,,