sglang 0.1.14__tar.gz → 0.1.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {sglang-0.1.14/sglang.egg-info → sglang-0.1.16}/PKG-INFO +20 -18
  2. {sglang-0.1.14 → sglang-0.1.16}/README.md +13 -11
  3. {sglang-0.1.14 → sglang-0.1.16}/pyproject.toml +4 -4
  4. sglang-0.1.16/sglang/__init__.py +59 -0
  5. {sglang-0.1.14 → sglang-0.1.16}/sglang/api.py +8 -5
  6. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/anthropic.py +18 -4
  7. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/openai.py +2 -1
  8. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/runtime_endpoint.py +18 -5
  9. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/vertexai.py +1 -0
  10. {sglang-0.1.14 → sglang-0.1.16}/sglang/global_config.py +5 -1
  11. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/chat_template.py +83 -2
  12. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/interpreter.py +92 -35
  13. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/ir.py +12 -9
  14. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/tracer.py +6 -4
  15. sglang-0.1.16/sglang/launch_server_llavavid.py +31 -0
  16. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/fsm_cache.py +1 -0
  17. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/jump_forward.py +1 -0
  18. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/conversation.py +2 -2
  19. sglang-0.1.16/sglang/srt/flush_cache.py +16 -0
  20. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/hf_transformers_utils.py +10 -2
  21. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/context_flashattention_nopad.py +1 -0
  22. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/extend_attention.py +1 -0
  23. sglang-0.1.16/sglang/srt/layers/logits_processor.py +175 -0
  24. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/radix_attention.py +2 -1
  25. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/token_attention.py +1 -0
  26. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/detokenizer_manager.py +5 -1
  27. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/io_struct.py +27 -3
  28. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/infer_batch.py +97 -48
  29. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/manager.py +11 -8
  30. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/model_rpc.py +169 -90
  31. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/model_runner.py +110 -166
  32. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/radix_cache.py +89 -51
  33. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/scheduler.py +17 -28
  34. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/tokenizer_manager.py +110 -33
  35. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/memory_pool.py +5 -14
  36. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/model_config.py +11 -0
  37. sglang-0.1.16/sglang/srt/models/commandr.py +372 -0
  38. sglang-0.1.16/sglang/srt/models/dbrx.py +412 -0
  39. sglang-0.1.16/sglang/srt/models/dbrx_config.py +281 -0
  40. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/gemma.py +24 -25
  41. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/llama2.py +25 -26
  42. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/llava.py +8 -10
  43. sglang-0.1.16/sglang/srt/models/llavavid.py +307 -0
  44. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/mixtral.py +29 -33
  45. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/qwen.py +34 -25
  46. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/qwen2.py +25 -26
  47. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/stablelm.py +26 -26
  48. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/yivl.py +3 -5
  49. sglang-0.1.16/sglang/srt/openai_api_adapter.py +356 -0
  50. {sglang-0.1.14/sglang/srt/managers → sglang-0.1.16/sglang/srt}/openai_protocol.py +36 -20
  51. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/sampling_params.py +2 -0
  52. sglang-0.1.16/sglang/srt/server.py +331 -0
  53. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/server_args.py +79 -49
  54. sglang-0.1.16/sglang/srt/utils.py +426 -0
  55. sglang-0.1.16/sglang/srt/weight_utils.py +417 -0
  56. {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_programs.py +8 -7
  57. sglang-0.1.16/sglang/test/test_utils.py +350 -0
  58. {sglang-0.1.14 → sglang-0.1.16}/sglang/utils.py +77 -26
  59. {sglang-0.1.14 → sglang-0.1.16/sglang.egg-info}/PKG-INFO +20 -18
  60. {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/SOURCES.txt +9 -1
  61. {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/requires.txt +7 -7
  62. sglang-0.1.14/sglang/__init__.py +0 -4
  63. sglang-0.1.14/sglang/srt/layers/logits_processor.py +0 -115
  64. sglang-0.1.14/sglang/srt/server.py +0 -696
  65. sglang-0.1.14/sglang/srt/utils.py +0 -261
  66. sglang-0.1.14/sglang/test/test_utils.py +0 -162
  67. {sglang-0.1.14 → sglang-0.1.16}/LICENSE +0 -0
  68. {sglang-0.1.14 → sglang-0.1.16}/setup.cfg +0 -0
  69. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/__init__.py +0 -0
  70. {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/base_backend.py +0 -0
  71. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/__init__.py +0 -0
  72. {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/compiler.py +0 -0
  73. {sglang-0.1.14 → sglang-0.1.16}/sglang/launch_server.py +0 -0
  74. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/backend_config.py +0 -0
  75. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/__init__.py +0 -0
  76. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/base_cache.py +0 -0
  77. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/mm_utils.py +0 -0
  78. {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/mistral.py +0 -0
  79. {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_conversation.py +0 -0
  80. {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_openai_protocol.py +0 -0
  81. {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/dependency_links.txt +0 -0
  82. {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.14/sglang.egg-info → sglang-0.1.16}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.14
+ Version: 0.1.16
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -212,6 +212,7 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
+ Requires-Dist: tqdm
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
@@ -221,19 +222,18 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm>=0.3.3; extra == "srt"
+ Requires-Dist: vllm>=0.4.2; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: lark; extra == "srt"
- Requires-Dist: numba; extra == "srt"
  Requires-Dist: pydantic; extra == "srt"
- Requires-Dist: referencing; extra == "srt"
- Requires-Dist: diskcache; extra == "srt"
- Requires-Dist: cloudpickle; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
- Requires-Dist: outlines>=0.0.27; extra == "srt"
+ Requires-Dist: packaging; extra == "srt"
+ Requires-Dist: huggingface_hub; extra == "srt"
+ Requires-Dist: hf_transfer; extra == "srt"
+ Requires-Dist: outlines>=0.0.34; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: numpy; extra == "openai"
+ Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Requires-Dist: numpy; extra == "anthropic"
@@ -541,7 +541,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).

  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.

  ```python
@@ -571,15 +570,17 @@ response = client.chat.completions.create(
  print(response)
  ```

- In above example, the server uses the chat template specified in the model tokenizer.
- You can override the chat template if needed when launching the server:
+
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+ If needed, you can also override the chat template when launching the server:

  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
  ```

  If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporary register your chat template as follows:
+ Meanwhile, you can also temporarily register your chat template as follows:

  ```json
  {
@@ -606,7 +607,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to acclerate the inference by using highly optimized CUDA kernels.
+ - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.

  ### Supported Models
  - Llama
@@ -622,10 +623,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
  - Yi-VL
    - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - AWQ/GPTQ/Marlin quantization

- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)

@@ -649,7 +654,4 @@ https://github.com/sgl-project/sglang/issues/157
  }
  ```

- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
  We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.14 → sglang-0.1.16}/README.md

@@ -297,7 +297,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).

  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.

  ```python
@@ -327,15 +326,17 @@ response = client.chat.completions.create(
  print(response)
  ```

- In above example, the server uses the chat template specified in the model tokenizer.
- You can override the chat template if needed when launching the server:
+
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+ If needed, you can also override the chat template when launching the server:

  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
  ```

  If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporary register your chat template as follows:
+ Meanwhile, you can also temporarily register your chat template as follows:

  ```json
  {
@@ -362,7 +363,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to acclerate the inference by using highly optimized CUDA kernels.
+ - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.

  ### Supported Models
  - Llama
@@ -378,10 +379,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
  - Yi-VL
    - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - AWQ/GPTQ/Marlin quantization

- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)

@@ -405,7 +410,4 @@ https://github.com/sgl-project/sglang/issues/157
  }
  ```

- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
- We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+ We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.14 → sglang-0.1.16}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.14"
+ version = "0.1.16"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -15,13 +15,13 @@ classifiers = [
  ]
  dependencies = [
      "requests",
+     "tqdm",
  ]

  [project.optional-dependencies]
  srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-        "zmq", "vllm>=0.3.3", "interegular", "lark", "numba",
-        "pydantic", "referencing", "diskcache", "cloudpickle", "pillow", "outlines>=0.0.27"]
- openai = ["openai>=1.0", "numpy"]
+        "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
+ openai = ["openai>=1.0", "numpy", "tiktoken"]
  anthropic = ["anthropic>=0.20.0", "numpy"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
sglang-0.1.16/sglang/__init__.py (new file)

@@ -0,0 +1,59 @@
+ __version__ = "0.1.16"
+
+ # SGL API Components
+ from sglang.api import (
+     Runtime,
+     assistant,
+     assistant_begin,
+     assistant_end,
+     flush_cache,
+     function,
+     gen,
+     gen_int,
+     gen_string,
+     get_server_args,
+     image,
+     select,
+     set_default_backend,
+     system,
+     user,
+     user_begin,
+     user_end,
+     video,
+ )
+
+ # SGL Backends
+ from sglang.backend.anthropic import Anthropic
+ from sglang.backend.openai import OpenAI
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.backend.vertexai import VertexAI
+
+ # Global Configurations
+ from sglang.global_config import global_config
+
+ # public APIs management
+ __all__ = [
+     "global_config",
+     "Anthropic",
+     "OpenAI",
+     "RuntimeEndpoint",
+     "VertexAI",
+     "function",
+     "Runtime",
+     "set_default_backend",
+     "flush_cache",
+     "get_server_args",
+     "gen",
+     "gen_int",
+     "gen_string",
+     "image",
+     "video",
+     "select",
+     "system",
+     "user",
+     "assistant",
+     "user_begin",
+     "user_end",
+     "assistant_begin",
+     "assistant_end",
+ ]
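For orientation, everything re-exported by the new `sglang/__init__.py` above is meant to be used through the `sglang` top-level namespace. The snippet below is a minimal usage sketch rather than code from the package: it assumes an SRT server is already running at `http://localhost:30000`, and the prompt, variable names, and `max_tokens` value are illustrative only.

```python
# Minimal sketch of the top-level API re-exported by sglang/__init__.py.
# Assumes a local SRT server is already running (the endpoint is a placeholder).
import sglang as sgl


@sgl.function
def tool_use(s, question):
    s += "To answer this question: " + question + ". "
    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
    s += "Explanation: " + sgl.gen("explanation", max_tokens=64)


sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = tool_use.run(question="What is 2 ** 16?")
print(state["tool"], "|", state["explanation"])
```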
{sglang-0.1.14 → sglang-0.1.16}/sglang/api.py

@@ -1,13 +1,10 @@
- """Public API"""
+ """Some Public API Definitions"""

+ import os
  import re
  from typing import Callable, List, Optional, Union

- from sglang.backend.anthropic import Anthropic
  from sglang.backend.base_backend import BaseBackend
- from sglang.backend.openai import OpenAI
- from sglang.backend.runtime_endpoint import RuntimeEndpoint
- from sglang.backend.vertexai import VertexAI
  from sglang.global_config import global_config
  from sglang.lang.ir import (
      SglExpr,
@@ -18,6 +15,7 @@ from sglang.lang.ir import (
      SglRoleBegin,
      SglRoleEnd,
      SglSelect,
+     SglVideo,
  )


@@ -35,6 +33,7 @@ def function(

  def Runtime(*args, **kwargs):
      # Avoid importing unnecessary dependency
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
      from sglang.srt.server import Runtime

      return Runtime(*args, **kwargs)
@@ -153,6 +152,10 @@ def image(expr: SglExpr):
      return SglImage(expr)


+ def video(path: str, num_frames: int):
+     return SglVideo(path, num_frames)
+
+
  def select(
      name: Optional[str] = None,
      choices: List[str] = None,
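The new `video()` helper added to `api.py` mirrors `image()`: it wraps a file path and a frame count into an `SglVideo` expression that can be embedded in a prompt. Below is a hedged sketch of how it might be combined with the chat-role helpers; the path, frame count, and prompt are placeholders, and it assumes a video-capable model (such as the new LLaVA-video support in this release) is being served.

```python
# Sketch: embedding a video into an SGL program with the new sgl.video() primitive.
# video_path, num_frames, and the prompt text below are placeholders.
import sglang as sgl


@sgl.function
def describe_clip(s, video_path):
    s += sgl.user(sgl.video(video_path, num_frames=16) + "Describe this clip briefly.")
    s += sgl.assistant(sgl.gen("description", max_tokens=128))
```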
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/anthropic.py

@@ -1,6 +1,7 @@
  from typing import List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:


  class Anthropic(BaseBackend):
-     def __init__(self, model_name):
+     def __init__(self, model_name, *args, **kwargs):
          super().__init__()


@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):

          self.model_name = model_name
          self.chat_template = get_chat_template("claude")
+         self.client = anthropic.Anthropic(*args, **kwargs)

      def get_chat_template(self):
          return self.chat_template
@@ -35,8 +37,14 @@
          else:
              messages = [{"role": "user", "content": s.text_}]

-         ret = anthropic.Anthropic().messages.create(
+         if messages and messages[0]["role"] == "system":
+             system = messages.pop(0)["content"]
+         else:
+             system = ""
+
+         ret = self.client.messages.create(
              model=self.model_name,
+             system=system,
              messages=messages,
              **sampling_params.to_anthropic_kwargs(),
          )
@@ -54,10 +62,16 @@
          else:
              messages = [{"role": "user", "content": s.text_}]

-         with anthropic.Anthropic().messages.stream(
+         if messages and messages[0]["role"] == "system":
+             system = messages.pop(0)["content"]
+         else:
+             system = ""
+
+         with self.client.messages.stream(
              model=self.model_name,
+             system=system,
              messages=messages,
              **sampling_params.to_anthropic_kwargs(),
          ) as stream:
              for text in stream.text_stream:
-                 yield text, {}
+                 yield text, {}
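The backend change above tracks how the Anthropic Messages API treats system prompts: they are supplied through the top-level `system=` argument rather than as a message with role `system`, which is why the leading system message is popped out of `messages` before the call. As a rough, hedged illustration of the underlying SDK pattern (the model id, prompts, and `max_tokens` are placeholders; in the backend those values come from `sampling_params.to_anthropic_kwargs()`):

```python
# Sketch of the Anthropic Messages API call shape the backend now uses.
# The model id, prompts, and max_tokens below are placeholders.
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
resp = client.messages.create(
    model="claude-3-haiku-20240307",
    system="You are a terse assistant.",  # system prompt goes here, not in messages
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=32,
)
print(resp.content[0].text)
```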
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/openai.py

@@ -3,6 +3,7 @@ import time
  from typing import Callable, List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
  from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
              prompt_tokens.append(ret_token)

          decision = choices[np.argmax(scores)]
-         return decision, scores, scores
+         return decision, scores, None, None


  def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/runtime_endpoint.py

@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union

  import numpy as np
  import requests
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.global_config import global_config
  from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
          assert res.status_code == 200

      def commit_lazy_operations(self, s: StreamExecutor):
+         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+         self._add_images(s, data)
          res = http_request(
              self.base_url + "/generate",
-             json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
+             json=data,
              auth_token=self.auth_token,
              api_key=self.api_key,
              verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  **sampling_params.to_srt_kwargs(),
              },
          }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  "dtype": "int",
                  **sampling_params.to_srt_kwargs(),
              },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  **sampling_params.to_srt_kwargs(),
              },
          }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  "dtype": "int",
                  **sampling_params.to_srt_kwargs(),
              },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
          )
          assert res.status_code == 200
          obj = res.json()
-         normalized_prompt_logprob = [
+         normalized_prompt_logprobs = [
              r["meta_info"]["normalized_prompt_logprob"] for r in obj
          ]
-         prompt_logprob = [r["meta_info"]["prompt_logprob"] for r in obj]
+         decision = choices[np.argmax(normalized_prompt_logprobs)]
+         prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+         decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]

-         decision = choices[np.argmax(normalized_prompt_logprob)]
-         return decision, normalized_prompt_logprob, prompt_logprob
+         return (
+             decision,
+             normalized_prompt_logprobs,
+             prefill_token_logprobs,
+             decode_token_logprobs,
+         )

      def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
          res = http_request(
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/vertexai.py

@@ -3,6 +3,7 @@ import warnings
  from typing import List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.interpreter import StreamExecutor
{sglang-0.1.14 → sglang-0.1.16}/sglang/global_config.py

@@ -12,10 +12,11 @@ class GlobalConfig:

          # Output configs
          self.skip_special_tokens_in_output = True
+         self.spaces_between_special_tokens_in_out = True

          # Optimization configs
          self.eager_fill_image = False
-         self.enable_prefix_sharing = True
+         self.enable_precache_with_tracing = True
          self.enable_parallel_encoding = True
          self.enable_parallel_decoding = True

@@ -24,5 +25,8 @@
          # adjust_cache: Adjust the position embedding of KV cache.
          self.concate_and_append_mode = "no_adjust"

+         # Request dependency time due to network delay
+         self.request_dependency_time = 0.03
+

  global_config = GlobalConfig()
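Both new fields live on the module-level `global_config` object next to the existing output flags, so they can be adjusted at runtime before a program is executed. A small hedged sketch (the values chosen are purely illustrative):

```python
# Sketch: adjusting the detokenization flags around this release.
from sglang import global_config

global_config.skip_special_tokens_in_output = True
global_config.spaces_between_special_tokens_in_out = False  # new flag in 0.1.16
```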
{sglang-0.1.14 → sglang-0.1.16}/sglang/lang/chat_template.py

@@ -162,6 +162,28 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="llama-3-instruct",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "<|start_header_id|>system<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+             "user": (
+                 "<|start_header_id|>user<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+             "assistant": (
+                 "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+         },
+         stop_str=("<|eot_id|>",),
+     )
+ )
+
  # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
  register_chat_template(
      ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="dbrx-instruct",
+         default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+         role_prefix_and_suffix={
+             "system": ("<|im_start|>system\n", "<|im_end|>"),
+             "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+             "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+         },
+         stop_str=("<|im_end|>",),
+     )
+ )
+
+ register_chat_template(
+     ChatTemplate(
+         name="c4ai-command-r",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                 "<|END_OF_TURN_TOKEN|>",
+             ),
+             "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+             "assistant": (
+                 "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                 "<|END_OF_TURN_TOKEN|>",
+             ),
+         },
+         style=ChatTemplateStyle.PLAIN,
+     )
+ )
+
+
+ @register_chat_template_matching_function
+ def match_dbrx(model_path: str):
+     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+         return get_chat_template("dbrx-instruct")
+

  @register_chat_template_matching_function
  def match_vicuna(model_path: str):
@@ -199,6 +259,8 @@ def match_vicuna(model_path: str):
          return get_chat_template("vicuna_v1.1")
      if "llava-v1.5" in model_path.lower():
          return get_chat_template("vicuna_v1.1")
+     if "llava-next-video-7b" in model_path.lower():
+         return get_chat_template("vicuna_v1.1")


  @register_chat_template_matching_function
@@ -214,21 +276,33 @@ def match_llama2_chat(model_path: str):
      return get_chat_template("llama-2-chat")


+ @register_chat_template_matching_function
+ def match_llama3_instruct(model_path: str):
+     model_path = model_path.lower()
+     if "llama-3" in model_path and "instruct" in model_path:
+         return get_chat_template("llama-3-instruct")
+
+
  @register_chat_template_matching_function
  def match_chat_ml(model_path: str):
+     # import pdb;pdb.set_trace()
      model_path = model_path.lower()
      if "tinyllama" in model_path:
          return get_chat_template("chatml")
      if "qwen" in model_path and "chat" in model_path:
          return get_chat_template("chatml")
-     if "llava-v1.6-34b" in model_path:
+     if (
+         "llava-v1.6-34b" in model_path
+         or "llava-v1.6-yi-34b" in model_path
+         or "llava-next-video-34b" in model_path
+     ):
          return get_chat_template("chatml-llava")


  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
      model_path = model_path.lower()
-     if "yi" in model_path:
+     if "yi" in model_path and "llava" not in model_path:
          return get_chat_template("yi")


@@ -239,6 +313,13 @@ def match_gemma_it(model_path: str):
      return get_chat_template("gemma-it")


+ @register_chat_template_matching_function
+ def match_c4ai_command_r(model_path: str):
+     model_path = model_path.lower()
+     if "c4ai-command-r" in model_path:
+         return get_chat_template("c4ai-command-r")
+
+
  if __name__ == "__main__":
      messages = [
          {"role": "system", "content": None},  # None means default