sglang 0.1.4__tar.gz → 0.1.5__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {sglang-0.1.4/sglang.egg-info → sglang-0.1.5}/PKG-INFO +26 -8
- {sglang-0.1.4 → sglang-0.1.5}/README.md +25 -7
- {sglang-0.1.4 → sglang-0.1.5}/pyproject.toml +1 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/__init__.py +1 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/api.py +1 -0
- sglang-0.1.5/sglang/backend/vertexai.py +147 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/interpreter.py +8 -9
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/ir.py +21 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/context_flashattention_nopad.py +0 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/extend_attention.py +0 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/manager.py +2 -2
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/model_rpc.py +6 -3
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/model_runner.py +1 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/server_args.py +22 -4
- {sglang-0.1.4 → sglang-0.1.5}/sglang/test/test_programs.py +4 -1
- {sglang-0.1.4 → sglang-0.1.5/sglang.egg-info}/PKG-INFO +26 -8
- {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/SOURCES.txt +1 -2
- sglang-0.1.4/sglang/backend/huggingface.py +0 -349
- sglang-0.1.4/sglang/backend/tgi.py +0 -190
- {sglang-0.1.4 → sglang-0.1.5}/LICENSE +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/setup.cfg +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/openai.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/flush_cache.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/global_config.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/launch_server.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/backend_config.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/fsm.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/regex.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/tokenizer.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/get_selected_logprob.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/openai_protocol.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/infer_batch.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/radix_cache.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/scheduler.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/model_config.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/server.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/utils.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang/utils.py +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.4/sglang.egg-info → sglang-0.1.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.4
+Version: 0.1.5
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                        Version 2.0, January 2004
@@ -234,6 +234,7 @@ Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 
 # SGLang
+| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -277,7 +278,7 @@ The example below shows how to use sglang to answer a mulit-turn question.
 ### Using OpenAI Models
 Set the OpenAI API Key
 ```
-export OPENAI_API_KEY=sk
+export OPENAI_API_KEY=sk-******
 ```
 
 Then, answer a multi-turn question.
@@ -335,6 +336,7 @@ for m in state.messages():
 
 ### More Examples
 
+Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Langauge (SGLang)
@@ -350,13 +352,14 @@ You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
 
 ### Control Flow
+You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
 ```python
 @sgl.function
 def control_flow(s, question):
     s += "To answer this question: " + question + ", "
     s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
 
-    # You can use if or nested function calls
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
     elif s["tool"] == "web browser":
@@ -364,6 +367,9 @@ def control_flow(s, question):
 ```
 
 ### Parallelism
+Use `fork` to launch parallel prompts.
+Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
 ```python
 @sgl.function
 def tip_suggestion(s):
@@ -372,7 +378,7 @@ def tip_suggestion(s):
         "1. Balanced Diet. 2. Regular Exercise.\n\n"
     )
 
-    forks = s.fork(2)
+    forks = s.fork(2)
     for i, f in enumerate(forks):
         f += f"Now, expand tip {i+1} into a paragraph:\n"
         f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -383,6 +389,8 @@ def tip_suggestion(s):
 ```
 
 ### Multi Modality
+Use `sgl.image` to pass an image as input.
+
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
@@ -391,6 +399,8 @@ def image_qa(s, image_file, question):
 ```
 
 ### Constrained Decoding
+Use `regex=` to specify a regular expression as a decoding constraint.
+
 ```python
 @sgl.function
 def regular_expression_gen(s):
@@ -403,6 +413,8 @@ def regular_expression_gen(s):
 ```
 
 ### Batching
+Use `run_batch` to run a batch of requests with continuous batching.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -415,10 +427,13 @@ states = text_qa.run_batch(
         {"question": "What is the capital of France?"},
         {"question": "What is the capital of Japan?"},
     ],
+    progress_bar=True
 )
 ```
 
 ### Streaming
+Add `stream=True` to enable streaming.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -427,7 +442,9 @@ def text_qa(s, question):
 
 states = text_qa.run(
     question="What is the capital of France?",
-    temperature=0.1
+    temperature=0.1,
+    stream=True
+)
 
 for out in state.text_iter():
     print(out, end="", flush=True)
@@ -471,6 +488,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+- AWQ quantization
 
 ## Benchmark And Performance
 
@@ -483,10 +501,10 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 Learn more [here](docs/benchmark_results.md).
 
 ## Roadmap
-- [ ] Function call
-- [ ] Quantization
+- [ ] Function call APIs
 - [ ] S-LoRA
-- [ ]
+- [ ] Support more models
+- [ ] Support more hardware backends
 
 ## Citation And Acknowledgment
 ```
{sglang-0.1.4 → sglang-0.1.5}/README.md

@@ -1,4 +1,5 @@
 # SGLang
+| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -42,7 +43,7 @@ The example below shows how to use sglang to answer a mulit-turn question.
 ### Using OpenAI Models
 Set the OpenAI API Key
 ```
-export OPENAI_API_KEY=sk
+export OPENAI_API_KEY=sk-******
 ```
 
 Then, answer a multi-turn question.
@@ -100,6 +101,7 @@ for m in state.messages():
 
 ### More Examples
 
+Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Langauge (SGLang)
@@ -115,13 +117,14 @@ You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
 
 ### Control Flow
+You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
 ```python
 @sgl.function
 def control_flow(s, question):
     s += "To answer this question: " + question + ", "
     s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
 
-    # You can use if or nested function calls
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
     elif s["tool"] == "web browser":
@@ -129,6 +132,9 @@ def control_flow(s, question):
 ```
 
 ### Parallelism
+Use `fork` to launch parallel prompts.
+Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
 ```python
 @sgl.function
 def tip_suggestion(s):
@@ -137,7 +143,7 @@ def tip_suggestion(s):
         "1. Balanced Diet. 2. Regular Exercise.\n\n"
     )
 
-    forks = s.fork(2)
+    forks = s.fork(2)
     for i, f in enumerate(forks):
         f += f"Now, expand tip {i+1} into a paragraph:\n"
         f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -148,6 +154,8 @@ def tip_suggestion(s):
 ```
 
 ### Multi Modality
+Use `sgl.image` to pass an image as input.
+
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
@@ -156,6 +164,8 @@ def image_qa(s, image_file, question):
 ```
 
 ### Constrained Decoding
+Use `regex=` to specify a regular expression as a decoding constraint.
+
 ```python
 @sgl.function
 def regular_expression_gen(s):
@@ -168,6 +178,8 @@ def regular_expression_gen(s):
 ```
 
 ### Batching
+Use `run_batch` to run a batch of requests with continuous batching.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -180,10 +192,13 @@ states = text_qa.run_batch(
         {"question": "What is the capital of France?"},
         {"question": "What is the capital of Japan?"},
     ],
+    progress_bar=True
 )
 ```
 
 ### Streaming
+Add `stream=True` to enable streaming.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -192,7 +207,9 @@ def text_qa(s, question):
 
 states = text_qa.run(
     question="What is the capital of France?",
-    temperature=0.1
+    temperature=0.1,
+    stream=True
+)
 
 for out in state.text_iter():
     print(out, end="", flush=True)
@@ -236,6 +253,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+- AWQ quantization
 
 ## Benchmark And Performance
 
@@ -248,10 +266,10 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 Learn more [here](docs/benchmark_results.md).
 
 ## Roadmap
-- [ ] Function call
-- [ ] Quantization
+- [ ] Function call APIs
 - [ ] S-LoRA
-- [ ]
+- [ ] Support more models
+- [ ] Support more hardware backends
 
 ## Citation And Acknowledgment
 ```
{sglang-0.1.4 → sglang-0.1.5}/sglang/api.py

@@ -6,6 +6,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
sglang-0.1.5/sglang/backend/vertexai.py (new file)

@@ -0,0 +1,147 @@
+import os
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import vertexai
+    from vertexai.preview.generative_models import (
+        GenerationConfig,
+        GenerativeModel,
+        Image,
+    )
+except ImportError as e:
+    GenerativeModel = e
+
+
+class VertexAI(BaseBackend):
+    def __init__(self, model_name):
+        super().__init__()
+
+        if isinstance(GenerativeModel, Exception):
+            raise GenerativeModel
+
+        project_id = os.environ["GCP_PROJECT_ID"]
+        location = os.environ.get("GCP_LOCATION")
+        vertexai.init(project=project_id, location=location)
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("default")
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        ret = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+
+        comp = ret.text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        generator = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            stream=True,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+        for ret in generator:
+            yield ret.text, {}
+
+    def text_to_vertexai_input(self, text, images):
+        input = []
+        # split with image token
+        text_segs = text.split(self.chat_template.image_token)
+        for image_path, image_base64_data in images:
+            text_seg = text_segs.pop(0)
+            if text_seg != "":
+                input.append(text_seg)
+            input.append(Image.from_bytes(image_base64_data))
+        text_seg = text_segs.pop(0)
+        if text_seg != "":
+            input.append(text_seg)
+        return input
+
+    def messages_to_vertexai_input(self, messages):
+        vertexai_message = []
+        # from openai message format to vertexai message format
+        for msg in messages:
+            if isinstance(msg["content"], str):
+                text = msg["content"]
+            else:
+                text = msg["content"][0]["text"]
+
+            if msg["role"] == "system":
+                warnings.warn("Warning: system prompt is not supported in VertexAI.")
+                vertexai_message.append(
+                    {
+                        "role": "user",
+                        "parts": [{"text": "System prompt: " + text}],
+                    }
+                )
+                vertexai_message.append(
+                    {
+                        "role": "model",
+                        "parts": [{"text": "Understood."}],
+                    }
+                )
+                continue
+            if msg["role"] == "user":
+                vertexai_msg = {
+                    "role": "user",
+                    "parts": [{"text": text}],
+                }
+            elif msg["role"] == "assistant":
+                vertexai_msg = {
+                    "role": "model",
+                    "parts": [{"text": text}],
+                }
+
+            # images
+            if isinstance(msg["content"], list) and len(msg["content"]) > 1:
+                for image in msg["content"][1:]:
+                    assert image["type"] == "image_url"
+                    vertexai_msg["parts"].append(
+                        {
+                            "inline_data": {
+                                "data": image["image_url"]["url"].split(",")[1],
+                                "mime_type": "image/jpeg",
+                            }
+                        }
+                    )
+
+            vertexai_message.append(vertexai_msg)
+        return vertexai_message
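For orientation, here is a minimal sketch of driving the new backend from the frontend language. It assumes the `VertexAI` class is re-exported at the package top level (as the import added to `sglang/api.py` above suggests), the `set_default_backend` pattern used in the README's OpenAI example, an illustrative `gemini-pro` model name, and the `GCP_PROJECT_ID`/`GCP_LOCATION` environment variables read in `__init__`; it is not taken from the diff itself.

```python
# Hedged usage sketch for the new VertexAI backend; model name and program are illustrative.
import sglang as sgl

# VertexAI.__init__ reads GCP_PROJECT_ID (required) and GCP_LOCATION (optional)
# from the environment and calls vertexai.init() before any generation.
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))

@sgl.function
def capital_qa(s, country):
    s += "Q: What is the capital of " + country + "?\n"
    s += "A:" + sgl.gen("answer", max_tokens=32, temperature=0)

state = capital_qa.run(country="France")
print(state["answer"])
```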
{sglang-0.1.4 → sglang-0.1.5}/sglang/lang/interpreter.py

@@ -365,11 +365,10 @@ class StreamExecutor:
            for comp, meta_info in generator:
                self.text_ += comp
                self.variables[name] += comp
+               self.meta_info[name] = meta_info
                self.stream_var_event[name].set()
                self.stream_text_event.set()
 
-           self.meta_info[name] = meta_info
-
            self.variable_event[name].set()
            self.stream_var_event[name].set()
 
@@ -428,6 +427,7 @@ class StreamExecutor:
            self.messages_.append(last_msg)
            self.cur_images = []
        else:
+           # OpenAI chat API format
            self.messages_.append({"role": expr.role, "content": new_text})
 
        self.cur_role = None
@@ -582,7 +582,7 @@ class ProgramState:
        else:
            yield self.get_var(name)
 
-   async def text_async_iter(self, var_name=None):
+   async def text_async_iter(self, var_name=None, return_meta_data=False):
        loop = asyncio.get_running_loop()
 
        if self.stream_executor.stream:
@@ -606,7 +606,10 @@ class ProgramState:
                    out = str(self.stream_executor.variables[var_name][prev:])
                    prev += len(out)
                    if out:
-                       yield out
+                       if return_meta_data:
+                           yield out, self.stream_executor.meta_info[var_name]
+                       else:
+                           yield out
                    if self.stream_executor.variable_event[var_name].is_set():
                        break
        else:
@@ -632,11 +635,7 @@ class ProgramState:
        self.stream_executor.end()
 
    def __repr__(self) -> str:
-       msgs = self.messages()
-       ret = ""
-       for msg in msgs:
-           ret += msg["role"] + ":\n" + msg["content"] + "\n"
-       return ret
+       return f"ProgramState({self.text()})"
 
 
 class ProgramStateGroup:
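The first and fourth hunks above record `meta_info` for a variable on every streamed chunk and let `text_async_iter` hand it back to the caller. Below is a minimal sketch of consuming that from async code, assuming an OpenAI default backend as in the README; the program and the `answer` variable name are illustrative, not from the diff.

```python
# Hedged sketch: backend choice, program, and variable name are illustrative.
import asyncio
import sglang as sgl

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

async def main():
    state = text_qa.run(question="What is the capital of France?", stream=True)
    # With return_meta_data=True, each chunk arrives together with the per-variable
    # meta_info that the interpreter now updates inside its streaming loop.
    async for chunk, meta in state.text_async_iter("answer", return_meta_data=True):
        print(chunk, end="", flush=True)

asyncio.run(main())
```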
{sglang-0.1.4 → sglang-0.1.5}/sglang/lang/ir.py

@@ -2,6 +2,7 @@
 
 import dataclasses
 import inspect
+import warnings
 from typing import List, Optional, Union
 
 from sglang.global_config import global_config
@@ -40,6 +41,8 @@ class SglSamplingParams:
 
    def to_openai_kwargs(self):
        # OpenAI does not support top_k, so we drop it here
+       if self.regex is not None:
+           warnings.warn("Regular expression is not supported in the OpenAI backend.")
        return {
            "max_tokens": self.max_new_tokens,
            "stop": self.stop or None,
@@ -49,8 +52,26 @@ class SglSamplingParams:
            "presence_penalty": self.presence_penalty,
        }
 
+   def to_vertexai_kwargs(self):
+       if self.regex is not None:
+           warnings.warn(
+               "Regular expression is not supported in the VertexAI backend."
+           )
+       return {
+           "candidate_count": 1,
+           "max_output_tokens": self.max_new_tokens,
+           "stop_sequences": self.stop,
+           "temperature": self.temperature,
+           "top_p": self.top_p,
+           "top_k": self.top_k if self.top_k > 0 else None,
+       }
+
    def to_anthropic_kwargs(self):
        # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
+       if self.regex is not None:
+           warnings.warn(
+               "Regular expression is not supported in the Anthropic backend."
+           )
        return {
            "max_tokens_to_sample": self.max_new_tokens,
            "stop_sequences": self.stop,
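With these checks, passing a `regex=` constraint to the OpenAI, VertexAI, or Anthropic backend now emits a warning instead of being silently ignored (the constraint itself is still only enforced by the SRT runtime backend). A minimal sketch of code that would trigger the OpenAI warning; the function and pattern are illustrative, not from the diff.

```python
# Hedged sketch: illustrative program; the regex is dropped for hosted-API backends,
# and to_openai_kwargs() now warns that it is not supported.
import sglang as sgl

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

@sgl.function
def us_phone(s):
    s += "A US phone number: " + sgl.gen("number", regex=r"\(\d{3}\) \d{3}-\d{4}")

state = us_phone.run()
print(state["number"])
```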
{sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/manager.py

@@ -28,7 +28,7 @@ class RouterManager:
        self.model_client = model_client
        self.recv_reqs = []
 
-       # Init
+       # Init some configs
        self.extend_dependency_time = GLOBAL_BACKEND_CONFIG.extend_dependency_time
 
    async def loop_for_forward(self):
@@ -46,7 +46,7 @@ class RouterManager:
            if has_finished:
                await asyncio.sleep(self.extend_dependency_time)
 
-           await asyncio.sleep(0.
+           await asyncio.sleep(0.0006)
 
    async def loop_for_recv_requests(self):
        while True:
{sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/model_rpc.py

@@ -2,10 +2,10 @@ import asyncio
 import logging
 import multiprocessing
 import time
+import warnings
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum, auto
 from typing import Dict, List, Optional, Tuple, Union
-import warnings
 
 import numpy as np
 import rpyc
@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service):
        self.tp_rank = tp_rank
        self.tp_size = server_args.tp_size
        self.schedule_heuristic = server_args.schedule_heuristic
+       self.schedule_conservativeness = server_args.schedule_conservativeness
 
        # Init model and tokenizer
        self.model_config = ModelConfig(
@@ -108,7 +109,7 @@ class ModelRpcServer(rpyc.Service):
        self.running_batch: Batch = None
        self.out_pyobjs = []
        self.decode_forward_ct = 0
-       self.stream_interval =
+       self.stream_interval = server_args.stream_interval
 
        # Init the FSM cache for constrained generation
        self.regex_fsm_cache = FSMCache(self.tokenizer)
@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service):
        available_size = (
            self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
        )
-       new_ratio =
+       new_ratio = (
+           self.scheduler.new_token_estimation_ratio() * self.schedule_conservativeness
+       )
        if self.running_batch:
            available_size -= sum(
                [
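The two new attributes read above (`stream_interval` and `schedule_conservativeness`) come from `server_args.py` (+22 -4 in the file list, its hunk not shown in this section). A hedged sketch of setting them programmatically, assuming the server arguments live in a keyword-constructible `ServerArgs` dataclass with exactly these field names; the class name, the `model_path` field, and the values are assumptions, not taken from the diff.

```python
# Hedged sketch: field names are inferred from the attribute reads in model_rpc.py above;
# the ServerArgs class name, the model_path field, and the chosen values are assumptions.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-2-7b-chat-hf",
    stream_interval=8,              # decode steps between streamed outputs
    schedule_conservativeness=1.0,  # scales the scheduler's new-token estimation ratio
)
```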
{sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/mixtral.py

@@ -355,7 +355,7 @@ class MixtralForCausalLM(nn.Module):
        ):
            if "rotary_emb.inv_freq" in name:
                continue
-           for
+           for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)