sglang 0.1.14__tar.gz → 0.1.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.14/sglang.egg-info → sglang-0.1.16}/PKG-INFO +20 -18
- {sglang-0.1.14 → sglang-0.1.16}/README.md +13 -11
- {sglang-0.1.14 → sglang-0.1.16}/pyproject.toml +4 -4
- sglang-0.1.16/sglang/__init__.py +59 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/api.py +8 -5
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/anthropic.py +18 -4
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/openai.py +2 -1
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/runtime_endpoint.py +18 -5
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/vertexai.py +1 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/global_config.py +5 -1
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/chat_template.py +83 -2
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/interpreter.py +92 -35
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/ir.py +12 -9
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/tracer.py +6 -4
- sglang-0.1.16/sglang/launch_server_llavavid.py +31 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/fsm_cache.py +1 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/jump_forward.py +1 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/conversation.py +2 -2
- sglang-0.1.16/sglang/srt/flush_cache.py +16 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/hf_transformers_utils.py +10 -2
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/context_flashattention_nopad.py +1 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/extend_attention.py +1 -0
- sglang-0.1.16/sglang/srt/layers/logits_processor.py +175 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/radix_attention.py +2 -1
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/layers/token_attention.py +1 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/detokenizer_manager.py +5 -1
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/io_struct.py +27 -3
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/infer_batch.py +97 -48
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/manager.py +11 -8
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/model_rpc.py +169 -90
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/model_runner.py +110 -166
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/radix_cache.py +89 -51
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/router/scheduler.py +17 -28
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/managers/tokenizer_manager.py +110 -33
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/memory_pool.py +5 -14
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/model_config.py +11 -0
- sglang-0.1.16/sglang/srt/models/commandr.py +372 -0
- sglang-0.1.16/sglang/srt/models/dbrx.py +412 -0
- sglang-0.1.16/sglang/srt/models/dbrx_config.py +281 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/gemma.py +24 -25
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/llama2.py +25 -26
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/llava.py +8 -10
- sglang-0.1.16/sglang/srt/models/llavavid.py +307 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/mixtral.py +29 -33
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/qwen.py +34 -25
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/qwen2.py +25 -26
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/stablelm.py +26 -26
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/yivl.py +3 -5
- sglang-0.1.16/sglang/srt/openai_api_adapter.py +356 -0
- {sglang-0.1.14/sglang/srt/managers → sglang-0.1.16/sglang/srt}/openai_protocol.py +36 -20
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/sampling_params.py +2 -0
- sglang-0.1.16/sglang/srt/server.py +331 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/server_args.py +79 -49
- sglang-0.1.16/sglang/srt/utils.py +426 -0
- sglang-0.1.16/sglang/srt/weight_utils.py +417 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_programs.py +8 -7
- sglang-0.1.16/sglang/test/test_utils.py +350 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/utils.py +77 -26
- {sglang-0.1.14 → sglang-0.1.16/sglang.egg-info}/PKG-INFO +20 -18
- {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/SOURCES.txt +9 -1
- {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/requires.txt +7 -7
- sglang-0.1.14/sglang/__init__.py +0 -4
- sglang-0.1.14/sglang/srt/layers/logits_processor.py +0 -115
- sglang-0.1.14/sglang/srt/server.py +0 -696
- sglang-0.1.14/sglang/srt/utils.py +0 -261
- sglang-0.1.14/sglang/test/test_utils.py +0 -162
- {sglang-0.1.14 → sglang-0.1.16}/LICENSE +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/setup.cfg +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/launch_server.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/backend_config.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.14 → sglang-0.1.16}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.14/sglang.egg-info → sglang-0.1.16}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.14
+Version: 0.1.16
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -212,6 +212,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
+Requires-Dist: tqdm
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
@@ -221,19 +222,18 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm>=0.
+Requires-Dist: vllm>=0.4.2; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: lark; extra == "srt"
-Requires-Dist: numba; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: referencing; extra == "srt"
-Requires-Dist: diskcache; extra == "srt"
-Requires-Dist: cloudpickle; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
-Requires-Dist:
+Requires-Dist: packaging; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: outlines>=0.0.34; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Requires-Dist: numpy; extra == "anthropic"
@@ -541,7 +541,6 @@ curl http://localhost:30000/generate \
 Learn more about the argument format [here](docs/sampling_params.md).
 
 ### OpenAI Compatible API
-
 In addition, the server supports an experimental OpenAI-compatible API.
 
 ```python
@@ -571,15 +570,17 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
-
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
 
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
 ```
 
 If the chat template you are looking for is missing, you are welcome to contribute it.
-Meanwhile, you can also
+Meanwhile, you can also temporarily register your chat template as follows:
 
 ```json
 {
@@ -606,7 +607,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- You can turn on [flashinfer](docs/flashinfer.md) to
+- You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
 
 ### Supported Models
 - Llama
@@ -622,10 +623,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
--
+- StableLM
+- Command-R
+- DBRX
+- AWQ/GPTQ/Marlin quantization
 
-
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+## Benchmark And Performance
 - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
 
@@ -649,7 +654,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```
 
-[](https://huggingface.co/papers/2312.07104)
-
-
 We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.14 → sglang-0.1.16}/README.md

@@ -297,7 +297,6 @@ curl http://localhost:30000/generate \
 Learn more about the argument format [here](docs/sampling_params.md).
 
 ### OpenAI Compatible API
-
 In addition, the server supports an experimental OpenAI-compatible API.
 
 ```python
@@ -327,15 +326,17 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
-
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
 
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
 ```
 
 If the chat template you are looking for is missing, you are welcome to contribute it.
-Meanwhile, you can also
+Meanwhile, you can also temporarily register your chat template as follows:
 
 ```json
 {
@@ -362,7 +363,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- You can turn on [flashinfer](docs/flashinfer.md) to
+- You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
 
 ### Supported Models
 - Llama
@@ -378,10 +379,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
--
+- StableLM
+- Command-R
+- DBRX
+- AWQ/GPTQ/Marlin quantization
 
-
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+## Benchmark And Performance
 - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
 
@@ -405,7 +410,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```
 
-[
-
-
-We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
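For reference, the experimental OpenAI-compatible endpoint mentioned in the README hunk above can be exercised with the standard `openai` Python client. The snippet below is a minimal sketch; the port 30000 and the `"default"` model name mirror the README examples, and the prompt is illustrative only.

```python
import openai

# Point the client at a local sglang server started with `python -m sglang.launch_server`.
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=16,
)
print(response.choices[0].message.content)
```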
{sglang-0.1.14 → sglang-0.1.16}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.14"
+version = "0.1.16"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -15,13 +15,13 @@ classifiers = [
 ]
 dependencies = [
     "requests",
+    "tqdm",
 ]
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm>=0.
-
-openai = ["openai>=1.0", "numpy"]
+       "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
+openai = ["openai>=1.0", "numpy", "tiktoken"]
 anthropic = ["anthropic>=0.20.0", "numpy"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
 
sglang-0.1.16/sglang/__init__.py (new file)

@@ -0,0 +1,59 @@
+__version__ = "0.1.16"
+
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+    video,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
+from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "video",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
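The rewritten package root re-exports the frontend primitives and backends directly. A minimal sketch of the typical usage these exports enable (the backend model name and prompt are illustrative, and an OpenAI API key is assumed to be configured):

```python
import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.system("You are a concise assistant.")
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# Any of the exported backends (OpenAI, Anthropic, VertexAI, RuntimeEndpoint) works here.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
state = qa.run(question="What does sglang provide?")
print(state["answer"])
```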
{sglang-0.1.14 → sglang-0.1.16}/sglang/api.py

@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -18,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglVideo,
 )
 
 
@@ -35,6 +33,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     from sglang.srt.server import Runtime
 
     return Runtime(*args, **kwargs)
@@ -153,6 +152,10 @@ def image(expr: SglExpr):
     return SglImage(expr)
 
 
+def video(path: str, num_frames: int):
+    return SglVideo(path, num_frames)
+
+
 def select(
     name: Optional[str] = None,
     choices: List[str] = None,
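The new `video()` primitive mirrors `image()` for video-capable models such as the LLaVA-NeXT-Video checkpoints served by the new `launch_server_llavavid.py` entry point. A minimal, untested sketch of how it might compose inside a program (the path, frame count, and prompt are illustrative assumptions):

```python
import sglang as sgl

@sgl.function
def describe_clip(s, clip_path):
    # video() attaches sampled frames to the prompt, similar to image() for a single image.
    s += sgl.user(sgl.video(clip_path, num_frames=16) + "Describe what happens in this clip.")
    s += sgl.assistant(sgl.gen("description", max_tokens=128))
```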
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/anthropic.py

@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -35,8 +37,14 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
@@ -54,10 +62,16 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         ) as stream:
             for text in stream.text_stream:
-                yield text, {}
+                yield text, {}
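With this change, constructor arguments are forwarded to `anthropic.Anthropic(...)` and a leading system message is sent through the Messages API `system` field instead of the message list. A hedged usage sketch (the model name and prompts are illustrative; the Anthropic API key is assumed to be set in the environment or passed as a kwarg):

```python
import sglang as sgl

# Extra kwargs (e.g. api_key=..., base_url=...) are now forwarded to anthropic.Anthropic().
sgl.set_default_backend(sgl.Anthropic("claude-3-haiku-20240307"))

@sgl.function
def ask(s, question):
    s += sgl.system("Answer in one short sentence.")  # routed to the `system=` field
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

print(ask.run(question="What is structured generation?")["answer"])
```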
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/openai.py

@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores,
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
{sglang-0.1.14 → sglang-0.1.16}/sglang/backend/runtime_endpoint.py

@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json=
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
         )
         assert res.status_code == 200
         obj = res.json()
-
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-
-
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
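For orientation, the `sampling_params` payload that the client now sends to the SRT `/generate` endpoint carries the new `spaces_between_special_tokens` switch alongside `skip_special_tokens`. A rough sketch of the equivalent raw request (host, port, prompt, and values are illustrative):

```python
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {
        "max_new_tokens": 16,
        "skip_special_tokens": True,
        "spaces_between_special_tokens": True,  # new field in 0.1.16
    },
}
res = requests.post("http://localhost:30000/generate", json=payload)
print(res.json())
```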
{sglang-0.1.14 → sglang-0.1.16}/sglang/global_config.py

@@ -12,10 +12,11 @@ class GlobalConfig:
 
         # Output configs
         self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
 
         # Optimization configs
         self.eager_fill_image = False
-        self.
+        self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True
 
@@ -24,5 +25,8 @@ class GlobalConfig:
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"
 
+        # Request dependency time due to network delay
+        self.request_dependency_time = 0.03
+
 
 global_config = GlobalConfig()
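Both new knobs live on the module-level singleton, so they can be flipped at runtime before launching programs; a minimal sketch (the values shown are assumptions, not recommendations):

```python
from sglang.global_config import global_config

# Whether the detokenizer inserts spaces between special tokens in outputs.
global_config.spaces_between_special_tokens_in_out = False

# Slack (in seconds) added for request dependencies to absorb network delay.
global_config.request_dependency_time = 0.05
```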
{sglang-0.1.14 → sglang-0.1.16}/sglang/lang/chat_template.py

@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
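The prefix and suffix strings above are the whole template; before the remaining chat_template.py hunks, here is a small sketch of assembling a prompt directly from `role_prefix_and_suffix` (the messages are illustrative):

```python
from sglang.lang.chat_template import get_chat_template

tmpl = get_chat_template("llama-3-instruct")
sys_pre, sys_suf = tmpl.role_prefix_and_suffix["system"]
usr_pre, usr_suf = tmpl.role_prefix_and_suffix["user"]
asst_pre, _ = tmpl.role_prefix_and_suffix["assistant"]

# Manually rendered prompt for one user turn, ready for the assistant to continue.
prompt = (
    sys_pre + "You are a helpful assistant." + sys_suf
    + usr_pre + "Hello!" + usr_suf
    + asst_pre
)
```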
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -199,6 +259,8 @@ def match_vicuna(model_path: str):
         return get_chat_template("vicuna_v1.1")
     if "llava-v1.5" in model_path.lower():
         return get_chat_template("vicuna_v1.1")
+    if "llava-next-video-7b" in model_path.lower():
+        return get_chat_template("vicuna_v1.1")
 
 
 @register_chat_template_matching_function
@@ -214,21 +276,33 @@ def match_llama2_chat(model_path: str):
         return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
     model_path = model_path.lower()
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     if "qwen" in model_path and "chat" in model_path:
         return get_chat_template("chatml")
-    if
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+    ):
         return get_chat_template("chatml-llava")
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
-    if "yi" in model_path:
+    if "yi" in model_path and "llava" not in model_path:
         return get_chat_template("yi")
 
 
@@ -239,6 +313,13 @@ def match_gemma_it(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
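Template selection stays automatic: the matching functions registered above are consulted with the model path when the server starts. A small sketch of exercising them directly (the model paths are illustrative):

```python
from sglang.lang.chat_template import get_chat_template_by_model_path

for path in [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "databricks/dbrx-instruct",
    "CohereForAI/c4ai-command-r-v01",
]:
    tmpl = get_chat_template_by_model_path(path)
    print(path, "->", tmpl.name)
# Expected names: llama-3-instruct, dbrx-instruct, c4ai-command-r
```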