sglang 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.13/sglang.egg-info → sglang-0.1.15}/PKG-INFO +13 -15
- {sglang-0.1.13 → sglang-0.1.15}/README.md +7 -7
- {sglang-0.1.13 → sglang-0.1.15}/pyproject.toml +5 -5
- sglang-0.1.15/sglang/__init__.py +57 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/api.py +3 -5
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/anthropic.py +33 -13
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/openai.py +2 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/runtime_endpoint.py +18 -5
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/vertexai.py +1 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/global_config.py +1 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/chat_template.py +74 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/interpreter.py +40 -16
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/ir.py +1 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/tracer.py +6 -4
- {sglang-0.1.13 → sglang-0.1.15}/sglang/launch_server.py +2 -1
- sglang-0.1.15/sglang/srt/constrained/fsm_cache.py +25 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/jump_forward.py +1 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/conversation.py +2 -2
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/hf_transformers_utils.py +2 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/context_flashattention_nopad.py +1 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/extend_attention.py +1 -0
- sglang-0.1.15/sglang/srt/layers/logits_processor.py +175 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/radix_attention.py +2 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/token_attention.py +1 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/detokenizer_manager.py +5 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/io_struct.py +12 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/infer_batch.py +70 -33
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/manager.py +7 -2
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/model_rpc.py +116 -73
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/model_runner.py +121 -155
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/radix_cache.py +46 -38
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/tokenizer_manager.py +56 -11
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/memory_pool.py +5 -14
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/model_config.py +7 -0
- sglang-0.1.15/sglang/srt/models/commandr.py +376 -0
- sglang-0.1.15/sglang/srt/models/dbrx.py +413 -0
- sglang-0.1.15/sglang/srt/models/dbrx_config.py +281 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/gemma.py +22 -20
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/llama2.py +23 -21
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/llava.py +12 -10
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/mixtral.py +27 -25
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/qwen.py +23 -21
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/qwen2.py +23 -21
- sglang-0.1.15/sglang/srt/models/stablelm.py +292 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/yivl.py +6 -5
- sglang-0.1.15/sglang/srt/openai_api_adapter.py +356 -0
- {sglang-0.1.13/sglang/srt/managers → sglang-0.1.15/sglang/srt}/openai_protocol.py +36 -20
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/sampling_params.py +2 -0
- sglang-0.1.15/sglang/srt/server.py +317 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/server_args.py +76 -49
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/utils.py +88 -32
- sglang-0.1.15/sglang/srt/weight_utils.py +402 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_programs.py +8 -7
- sglang-0.1.15/sglang/test/test_utils.py +350 -0
- {sglang-0.1.13 → sglang-0.1.15/sglang.egg-info}/PKG-INFO +13 -15
- {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/SOURCES.txt +7 -1
- {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/requires.txt +5 -7
- sglang-0.1.13/sglang/__init__.py +0 -4
- sglang-0.1.13/sglang/srt/constrained/fsm_cache.py +0 -13
- sglang-0.1.13/sglang/srt/layers/logits_processor.py +0 -115
- sglang-0.1.13/sglang/srt/server.py +0 -688
- sglang-0.1.13/sglang/test/test_utils.py +0 -162
- {sglang-0.1.13 → sglang-0.1.15}/LICENSE +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/setup.cfg +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/backend_config.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/scheduler.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang/utils.py +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.1.13/sglang.egg-info/PKG-INFO
+++ sglang-0.1.15/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.13
+Version: 0.1.15
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -212,6 +212,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
+Requires-Dist: tqdm
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
@@ -221,21 +222,18 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm>=0.
+Requires-Dist: vllm>=0.4.2; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: lark; extra == "srt"
-Requires-Dist: numba; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: referencing; extra == "srt"
-Requires-Dist: diskcache; extra == "srt"
-Requires-Dist: cloudpickle; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: outlines>=0.0.27; extra == "srt"
+Requires-Dist: packaging; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
+Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
-Requires-Dist: anthropic; extra == "anthropic"
+Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Requires-Dist: numpy; extra == "anthropic"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
@@ -541,7 +539,6 @@ curl http://localhost:30000/generate \
 Learn more about the argument format [here](docs/sampling_params.md).
 
 ### OpenAI Compatible API
-
 In addition, the server supports an experimental OpenAI-compatible API.
 
 ```python
@@ -606,7 +603,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- You can turn on [flashinfer](docs/flashinfer.md) to
+- You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
 
 ### Supported Models
 - Llama
@@ -622,10 +619,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
--
+- StableLM
+- Command-R
+- DBRX
+- AWQ/GPTQ/Marlin quantization
 
-
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+## Benchmark And Performance
 - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
 
 
@@ -649,7 +650,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```
 
-[](https://huggingface.co/papers/2312.07104)
-
-
 We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
--- sglang-0.1.13/README.md
+++ sglang-0.1.15/README.md
@@ -297,7 +297,6 @@ curl http://localhost:30000/generate \
 Learn more about the argument format [here](docs/sampling_params.md).
 
 ### OpenAI Compatible API
-
 In addition, the server supports an experimental OpenAI-compatible API.
 
 ```python
@@ -362,7 +361,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- You can turn on [flashinfer](docs/flashinfer.md) to
+- You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
 
 ### Supported Models
 - Llama
@@ -378,10 +377,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
--
+- StableLM
+- Command-R
+- DBRX
+- AWQ/GPTQ/Marlin quantization
 
-
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
+## Benchmark And Performance
 - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
 
 
@@ -405,7 +408,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```
 
-[](https://huggingface.co/papers/2312.07104)
-
-
 We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
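Note on the README hunks above: the exact Python snippet under "### OpenAI Compatible API" is not included in this diff, so the following is only a rough sketch of calling the experimental OpenAI-compatible endpoint of a locally launched server. The `/v1` base path, the `"default"` model alias, and the prompt are assumptions, not content from the package.

```python
# Hedged sketch: assumes `python -m sglang.launch_server ... --port 30000` is already
# running and exposes an OpenAI-compatible endpoint under /v1 (assumption).
import openai

client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
response = client.completions.create(
    model="default",          # assumed model alias
    prompt="The capital of France is",
    temperature=0,
    max_tokens=32,
)
print(response.choices[0].text)
```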
--- sglang-0.1.13/pyproject.toml
+++ sglang-0.1.15/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.13"
+version = "0.1.15"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -15,14 +15,14 @@ classifiers = [
 ]
 dependencies = [
     "requests",
+    "tqdm",
 ]
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm>=0.
-
-
-anthropic = ["anthropic", "numpy"]
+       "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "packaging"]
+openai = ["openai>=1.0", "numpy", "tiktoken"]
+anthropic = ["anthropic>=0.20.0", "numpy"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
 
 [project.urls]
--- /dev/null
+++ sglang-0.1.15/sglang/__init__.py
@@ -0,0 +1,57 @@
+__version__ = "0.1.15"
+
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
+from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
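The new top-level `sglang/__init__.py` pins `__version__` and re-exports the language primitives and backends as the public API. A minimal sketch of how that surface is typically used (the endpoint URL and generation lengths are assumptions, and a running backend is required):

```python
import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=64))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=64))

# Any exported backend (OpenAI, Anthropic, VertexAI, RuntimeEndpoint) can be used;
# the URL below is an assumption for illustration.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of France?",
    question_2="Roughly how many people live there?",
)
print(state["answer_1"])
print(state["answer_2"])
```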
--- sglang-0.1.13/sglang/api.py
+++ sglang-0.1.15/sglang/api.py
@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -35,6 +32,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     from sglang.srt.server import Runtime
 
     return Runtime(*args, **kwargs)
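`Runtime` remains a thin factory that defers the heavy `sglang.srt.server` import until it is actually called, and now also sets `TF_CPP_MIN_LOG_LEVEL=3` first to quiet TensorFlow logging pulled in by transitive dependencies. A hedged sketch of the in-process usage it enables; the model path is an assumed example:

```python
import sglang as sgl

# Spins up the SRT server in-process; the model path is an assumption.
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
try:
    pass  # run sgl.function programs here
finally:
    runtime.shutdown()
```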
--- sglang-0.1.13/sglang/backend/anthropic.py
+++ sglang-0.1.15/sglang/backend/anthropic.py
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -30,13 +32,23 @@ class Anthropic(BaseBackend):
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-
-
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
-
+            system=system,
+            messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
-        comp = ret.
+        comp = ret.content[0].text
 
         return comp, {}
 
@@ -45,13 +57,21 @@ class Anthropic(BaseBackend):
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
     ):
-
-
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
-
-
+            system=system,
+            messages=messages,
             **sampling_params.to_anthropic_kwargs(),
-        )
-
-
-        yield ret.completion, {}
+        ) as stream:
+            for text in stream.text_stream:
+                yield text, {}
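The rewritten Anthropic backend targets the Messages API: chat history goes to `messages=`, and a leading system turn is hoisted into the separate `system=` argument for both the blocking and the streaming paths. A self-contained sketch of that same split, using a hypothetical helper name that is not part of the package:

```python
def split_system_message(messages):
    """Pop a leading system turn so it can be passed as `system=` to the Messages API."""
    if messages and messages[0]["role"] == "system":
        return messages[0]["content"], messages[1:]
    return "", messages

history = [
    {"role": "system", "content": "You answer in one word."},
    {"role": "user", "content": "Name a prime number."},
]
system, messages = split_system_message(history)
print(system)    # "You answer in one word."
print(messages)  # only the user turn remains
# client.messages.create(model=..., system=system, messages=messages, **kwargs)
```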
--- sglang-0.1.13/sglang/backend/openai.py
+++ sglang-0.1.15/sglang/backend/openai.py
@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores,
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
--- sglang-0.1.13/sglang/backend/runtime_endpoint.py
+++ sglang-0.1.15/sglang/backend/runtime_endpoint.py
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json=
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
         )
         assert res.status_code == 200
         obj = res.json()
-
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-
-
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
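`select` on the runtime endpoint now returns the chosen option together with the log-probability data it was ranked by (a 4-tuple) instead of a bare decision. A toy, self-contained sketch of that contract; the backend class and the logprob values below are made up for illustration:

```python
import numpy as np

class ToySelectBackend:
    """Stand-in that mimics the updated select() return shape."""
    def select(self, s, choices, temperature):
        normalized_prompt_logprobs = [-1.2, -0.3]  # one score per choice (made up)
        prefill_token_logprobs = [[], []]          # per-choice token logprobs (shape illustrative)
        decode_token_logprobs = [[], []]
        decision = choices[int(np.argmax(normalized_prompt_logprobs))]
        return decision, normalized_prompt_logprobs, prefill_token_logprobs, decode_token_logprobs

decision, norm, prefill, decode = ToySelectBackend().select(None, ["yes", "no"], 0.0)
print(decision)  # "no", the choice with the higher normalized prompt logprob
```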
--- sglang-0.1.13/sglang/lang/chat_template.py
+++ sglang-0.1.15/sglang/lang/chat_template.py
@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -214,6 +274,13 @@ def match_llama2_chat(model_path: str):
     return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
     model_path = model_path.lower()
@@ -239,6 +306,13 @@ def match_gemma_it(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
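The chat templates added above are plain data: each role maps to a `(prefix, suffix)` pair plus optional stop strings, and the matching functions pick a template from the model path. A minimal sketch that renders a conversation with the new `llama-3-instruct` entry by concatenating those pairs by hand (manual rendering for illustration; it is not necessarily byte-for-byte what the interpreter produces):

```python
from sglang.lang.chat_template import get_chat_template

template = get_chat_template("llama-3-instruct")
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about version diffs."},
]

prompt = ""
for turn in conversation:
    prefix, suffix = template.role_prefix_and_suffix[turn["role"]]
    prompt += prefix + turn["content"] + suffix
# Leave the assistant turn open so the model completes it.
prompt += template.role_prefix_and_suffix["assistant"][0]
print(prompt)
```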
--- sglang-0.1.13/sglang/lang/interpreter.py
+++ sglang-0.1.15/sglang/lang/interpreter.py
@@ -1,6 +1,7 @@
 """The interpreter that executes SGL programs"""
 
 import asyncio
+import contextvars
 import multiprocessing
 import queue
 import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import tqdm
+
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglCommitLazy,
@@ -217,7 +219,13 @@
         self.use_thread = use_thread
         if self.use_thread:
             self.queue = queue.Queue()
-
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
             self.worker.start()
 
         # For streaming
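The executor's worker thread is now started through `contextvars.copy_context().run`, so context-local values set by the submitting thread (tracing state, async-local configuration, and so on) remain visible inside the worker. A self-contained sketch of that pattern, independent of the sglang classes:

```python
import contextvars
import threading

request_id = contextvars.ContextVar("request_id", default="unset")

def worker():
    # Sees the value captured when copy_context() was called, not the default.
    print("worker sees:", request_id.get())

request_id.set("req-42")
t = threading.Thread(target=contextvars.copy_context().run, args=(worker,))
t.start()
t.join()
```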
@@ -248,17 +256,24 @@
     def set_var(self, name, value):
         self.variables[name] = value
 
-    def get_meta_info(self, name):
+    def get_meta_info(self, name, timeout=None):
         if name in self.variable_event:
-            self.variable_event[name].wait()
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
         ret = self.meta_info.get(name, None)
         return ret
 
-    def fork(
-        self
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1:
+            self.submit(SglCommitLazy())
 
-
+        self.sync()
+        size = int(size)
 
         exes = [
             StreamExecutor(
@@ -268,14 +283,15 @@
                 self.chat_template,
                 self.stream,
             )
-            for _ in range(
+            for _ in range(size)
         ]
-        for i in range(
+        for i in range(size):
             exes[i].variables = dict(self.variables)
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
             exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
 
         return exes
 
@@ -454,15 +470,19 @@
             self.stream_var_event[name].set()
 
     def _execute_select(self, expr: SglSelect):
-
-
-
+        (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
-                "
-                "
+                "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                "prefill_token_logprobs": prefill_token_logprobs,
+                "decode_token_logprobs": decode_token_logprobs,
             }
             self.variable_event[name].set()
         self.text_ += decision
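`get_meta_info` now takes an optional timeout and raises `TimeoutError` instead of blocking forever; under the hood this is just `threading.Event.wait(timeout)`, which returns `False` on timeout. A self-contained sketch of that pattern with a hypothetical event name:

```python
import threading
from typing import Optional

def wait_for(event: threading.Event, name: str, timeout: Optional[float] = None) -> None:
    # Event.wait returns False on timeout rather than raising, so convert it here.
    if not event.wait(timeout):
        raise TimeoutError(f"Timeout while waiting for event '{name}'")

pending = threading.Event()
try:
    wait_for(pending, "answer", timeout=0.1)  # nothing sets the event, so this raises
except TimeoutError as exc:
    print(exc)
```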
@@ -634,8 +654,12 @@ class ProgramState:
         yield
         self.stream_executor.submit(SglVarScopeEnd(name))
 
-    def fork(
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
         states = [ProgramState(x) for x in stream_executors]
         state_group = ProgramStateGroup(states, self)
         return state_group
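`fork` now takes an explicit `size` (plus an optional `position_ids_offset`) on both `StreamExecutor` and `ProgramState`, and each branch also copies the parent's images. A hedged sketch of the frontend pattern this supports, modeled on the project's README-style fork example rather than code shown in this diff, and assuming a default backend has already been set:

```python
import sglang as sgl

@sgl.function
def two_tips(s, topic):
    s += f"Here are two tips about {topic}.\n"
    forks = s.fork(2)  # two branches, each with its own copy of text/messages/images
    for i, f in enumerate(forks):
        f += f"Tip {i + 1}:"
        f += sgl.gen("tip", max_tokens=64, stop="\n")
    s += "Tip 1:" + forks[0]["tip"] + "\n"
    s += "Tip 2:" + forks[1]["tip"] + "\n"
```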
--- sglang-0.1.13/sglang/lang/ir.py
+++ sglang-0.1.15/sglang/lang/ir.py
@@ -73,7 +73,7 @@ class SglSamplingParams:
                 "Regular expression is not supported in the Anthropic backend."
             )
         return {
-            "
+            "max_tokens": self.max_new_tokens,
             "stop_sequences": (
                 self.stop if isinstance(self.stop, (list, tuple)) else [self.stop]
             ),
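For the Anthropic backend, `to_anthropic_kwargs` now emits the Messages-API parameter name `max_tokens` in place of the older completions-era key. Roughly, the keys visible in this hunk translate as below; the concrete values are illustrative and other sampling fields may also be included:

```python
# Illustrative only: keys shown in the hunk above, with made-up values.
kwargs = {
    "max_tokens": 128,           # from SglSamplingParams.max_new_tokens
    "stop_sequences": ["\n\n"],  # from SglSamplingParams.stop
}
# client.messages.create(model=..., system=system, messages=messages, **kwargs)
```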