sglang 0.1.4__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {sglang-0.1.4/sglang.egg-info → sglang-0.1.5}/PKG-INFO +26 -8
  2. {sglang-0.1.4 → sglang-0.1.5}/README.md +25 -7
  3. {sglang-0.1.4 → sglang-0.1.5}/pyproject.toml +1 -1
  4. {sglang-0.1.4 → sglang-0.1.5}/sglang/__init__.py +1 -1
  5. {sglang-0.1.4 → sglang-0.1.5}/sglang/api.py +1 -0
  6. sglang-0.1.5/sglang/backend/vertexai.py +147 -0
  7. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/interpreter.py +8 -9
  8. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/ir.py +21 -0
  9. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/context_flashattention_nopad.py +0 -1
  10. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/extend_attention.py +0 -1
  11. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/manager.py +2 -2
  12. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/model_rpc.py +6 -3
  13. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/model_runner.py +1 -1
  14. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/mixtral.py +1 -1
  15. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/server_args.py +22 -4
  16. {sglang-0.1.4 → sglang-0.1.5}/sglang/test/test_programs.py +4 -1
  17. {sglang-0.1.4 → sglang-0.1.5/sglang.egg-info}/PKG-INFO +26 -8
  18. {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/SOURCES.txt +1 -2
  19. sglang-0.1.4/sglang/backend/huggingface.py +0 -349
  20. sglang-0.1.4/sglang/backend/tgi.py +0 -190
  21. {sglang-0.1.4 → sglang-0.1.5}/LICENSE +0 -0
  22. {sglang-0.1.4 → sglang-0.1.5}/setup.cfg +0 -0
  23. {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/__init__.py +0 -0
  24. {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/anthropic.py +0 -0
  25. {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/base_backend.py +0 -0
  26. {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/openai.py +0 -0
  27. {sglang-0.1.4 → sglang-0.1.5}/sglang/backend/runtime_endpoint.py +0 -0
  28. {sglang-0.1.4 → sglang-0.1.5}/sglang/flush_cache.py +0 -0
  29. {sglang-0.1.4 → sglang-0.1.5}/sglang/global_config.py +0 -0
  30. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/__init__.py +0 -0
  31. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/chat_template.py +0 -0
  32. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/compiler.py +0 -0
  33. {sglang-0.1.4 → sglang-0.1.5}/sglang/lang/tracer.py +0 -0
  34. {sglang-0.1.4 → sglang-0.1.5}/sglang/launch_server.py +0 -0
  35. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/backend_config.py +0 -0
  36. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/fsm.py +0 -0
  37. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/fsm_cache.py +0 -0
  38. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/regex.py +0 -0
  39. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/constrained/tokenizer.py +0 -0
  40. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/hf_transformers_utils.py +0 -0
  41. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/get_selected_logprob.py +0 -0
  42. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/logits_processor.py +0 -0
  43. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/radix_attention.py +0 -0
  44. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/layers/token_attention.py +0 -0
  45. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/detokenizer_manager.py +0 -0
  46. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/io_struct.py +0 -0
  47. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/openai_protocol.py +0 -0
  48. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/infer_batch.py +0 -0
  49. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/radix_cache.py +0 -0
  50. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/router/scheduler.py +0 -0
  51. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/managers/tokenizer_manager.py +0 -0
  52. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/memory_pool.py +0 -0
  53. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/model_config.py +0 -0
  54. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/llama2.py +0 -0
  55. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/models/llava.py +0 -0
  56. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/sampling_params.py +0 -0
  57. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/server.py +0 -0
  58. {sglang-0.1.4 → sglang-0.1.5}/sglang/srt/utils.py +0 -0
  59. {sglang-0.1.4 → sglang-0.1.5}/sglang/test/test_utils.py +0 -0
  60. {sglang-0.1.4 → sglang-0.1.5}/sglang/utils.py +0 -0
  61. {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/dependency_links.txt +0 -0
  62. {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/requires.txt +0 -0
  63. {sglang-0.1.4 → sglang-0.1.5}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.4
+ Version: 0.1.5
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -234,6 +234,7 @@ Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"

  # SGLang
+ | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |

  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -277,7 +278,7 @@ The example below shows how to use sglang to answer a mulit-turn question.
  ### Using OpenAI Models
  Set the OpenAI API Key
  ```
- export OPENAI_API_KEY=sk-xxxxxx
+ export OPENAI_API_KEY=sk-******
  ```

  Then, answer a multi-turn question.
@@ -335,6 +336,7 @@ for m in state.messages():

  ### More Examples

+ Anthropic and VertexAI (Gemini) models are also supported.
  You can find more examples at [examples/quick_start](examples/quick_start).

  ## Frontend: Structured Generation Langauge (SGLang)
@@ -350,13 +352,14 @@ You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, and parallelism for you.

  ### Control Flow
+ You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
  ```python
  @sgl.function
  def control_flow(s, question):
  s += "To answer this question: " + question + ", "
  s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "

- # You can use if or nested function calls
  if s["tool"] == "calculator":
  s += "The math expression is" + sgl.gen("expression")
  elif s["tool"] == "web browser":
@@ -364,6 +367,9 @@ def control_flow(s, question):
  ```

  ### Parallelism
+ Use `fork` to launch parallel prompts.
+ Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
  ```python
  @sgl.function
  def tip_suggestion(s):
@@ -372,7 +378,7 @@ def tip_suggestion(s):
  "1. Balanced Diet. 2. Regular Exercise.\n\n"
  )

- forks = s.fork(2) # Launch parallel prompts
+ forks = s.fork(2)
  for i, f in enumerate(forks):
  f += f"Now, expand tip {i+1} into a paragraph:\n"
  f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -383,6 +389,8 @@ def tip_suggestion(s):
  ```

  ### Multi Modality
+ Use `sgl.image` to pass an image as input.
+
  ```python
  @sgl.function
  def image_qa(s, image_file, question):
@@ -391,6 +399,8 @@ def image_qa(s, image_file, question):
  ```

  ### Constrained Decoding
+ Use `regex=` to specify a regular expression as a decoding constraint.
+
  ```python
  @sgl.function
  def regular_expression_gen(s):
@@ -403,6 +413,8 @@ def regular_expression_gen(s):
  ```

  ### Batching
+ Use `run_batch` to run a batch of requests with continuous batching.
+
  ```python
  @sgl.function
  def text_qa(s, question):
@@ -415,10 +427,13 @@ states = text_qa.run_batch(
  {"question": "What is the capital of France?"},
  {"question": "What is the capital of Japan?"},
  ],
+ progress_bar=True
  )
  ```

  ### Streaming
+ Add `stream=True` to enable streaming.
+
  ```python
  @sgl.function
  def text_qa(s, question):
@@ -427,7 +442,9 @@ def text_qa(s, question):

  states = text_qa.run(
  question="What is the capital of France?",
- temperature=0.1)
+ temperature=0.1,
+ stream=True
+ )

  for out in state.text_iter():
  print(out, end="", flush=True)
@@ -471,6 +488,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - Mixtral
  - LLaVA
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+ - AWQ quantization

  ## Benchmark And Performance

@@ -483,10 +501,10 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  Learn more [here](docs/benchmark_results.md).

  ## Roadmap
- - [ ] Function call
- - [ ] Quantization
+ - [ ] Function call APIs
  - [ ] S-LoRA
- - [ ] More models
+ - [ ] Support more models
+ - [ ] Support more hardware backends

  ## Citation And Acknowledgment
  ```
@@ -1,4 +1,5 @@
  # SGLang
+ | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |

  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -42,7 +43,7 @@ The example below shows how to use sglang to answer a mulit-turn question.
  ### Using OpenAI Models
  Set the OpenAI API Key
  ```
- export OPENAI_API_KEY=sk-xxxxxx
+ export OPENAI_API_KEY=sk-******
  ```

  Then, answer a multi-turn question.
@@ -100,6 +101,7 @@ for m in state.messages():

  ### More Examples

+ Anthropic and VertexAI (Gemini) models are also supported.
  You can find more examples at [examples/quick_start](examples/quick_start).

  ## Frontend: Structured Generation Langauge (SGLang)
@@ -115,13 +117,14 @@ You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, and parallelism for you.

  ### Control Flow
+ You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
  ```python
  @sgl.function
  def control_flow(s, question):
  s += "To answer this question: " + question + ", "
  s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "

- # You can use if or nested function calls
  if s["tool"] == "calculator":
  s += "The math expression is" + sgl.gen("expression")
  elif s["tool"] == "web browser":
@@ -129,6 +132,9 @@ def control_flow(s, question):
  ```

  ### Parallelism
+ Use `fork` to launch parallel prompts.
+ Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
  ```python
  @sgl.function
  def tip_suggestion(s):
@@ -137,7 +143,7 @@ def tip_suggestion(s):
  "1. Balanced Diet. 2. Regular Exercise.\n\n"
  )

- forks = s.fork(2) # Launch parallel prompts
+ forks = s.fork(2)
  for i, f in enumerate(forks):
  f += f"Now, expand tip {i+1} into a paragraph:\n"
  f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -148,6 +154,8 @@ def tip_suggestion(s):
  ```

  ### Multi Modality
+ Use `sgl.image` to pass an image as input.
+
  ```python
  @sgl.function
  def image_qa(s, image_file, question):
@@ -156,6 +164,8 @@ def image_qa(s, image_file, question):
  ```

  ### Constrained Decoding
+ Use `regex=` to specify a regular expression as a decoding constraint.
+
  ```python
  @sgl.function
  def regular_expression_gen(s):
@@ -168,6 +178,8 @@ def regular_expression_gen(s):
  ```

  ### Batching
+ Use `run_batch` to run a batch of requests with continuous batching.
+
  ```python
  @sgl.function
  def text_qa(s, question):
@@ -180,10 +192,13 @@ states = text_qa.run_batch(
  {"question": "What is the capital of France?"},
  {"question": "What is the capital of Japan?"},
  ],
+ progress_bar=True
  )
  ```

  ### Streaming
+ Add `stream=True` to enable streaming.
+
  ```python
  @sgl.function
  def text_qa(s, question):
@@ -192,7 +207,9 @@ def text_qa(s, question):

  states = text_qa.run(
  question="What is the capital of France?",
- temperature=0.1)
+ temperature=0.1,
+ stream=True
+ )

  for out in state.text_iter():
  print(out, end="", flush=True)
@@ -236,6 +253,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - Mixtral
  - LLaVA
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+ - AWQ quantization

  ## Benchmark And Performance

@@ -248,10 +266,10 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  Learn more [here](docs/benchmark_results.md).

  ## Roadmap
- - [ ] Function call
- - [ ] Quantization
+ - [ ] Function call APIs
  - [ ] S-LoRA
- - [ ] More models
+ - [ ] Support more models
+ - [ ] Support more hardware backends

  ## Citation And Acknowledgment
  ```
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.4"
+ version = "0.1.5"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -1,4 +1,4 @@
- __version__ = "0.1.4"
+ __version__ = "0.1.5"

  from sglang.api import *
  from sglang.global_config import global_config
@@ -6,6 +6,7 @@ from sglang.backend.anthropic import Anthropic
  from sglang.backend.base_backend import BaseBackend
  from sglang.backend.openai import OpenAI
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.backend.vertexai import VertexAI
  from sglang.global_config import global_config
  from sglang.lang.ir import (
  SglExpr,
@@ -0,0 +1,147 @@
+ import os
+ import warnings
+ from typing import List, Optional, Union
+
+ import numpy as np
+ from sglang.backend.base_backend import BaseBackend
+ from sglang.lang.chat_template import get_chat_template
+ from sglang.lang.interpreter import StreamExecutor
+ from sglang.lang.ir import SglSamplingParams
+
+ try:
+ import vertexai
+ from vertexai.preview.generative_models import (
+ GenerationConfig,
+ GenerativeModel,
+ Image,
+ )
+ except ImportError as e:
+ GenerativeModel = e
+
+
+ class VertexAI(BaseBackend):
+ def __init__(self, model_name):
+ super().__init__()
+
+ if isinstance(GenerativeModel, Exception):
+ raise GenerativeModel
+
+ project_id = os.environ["GCP_PROJECT_ID"]
+ location = os.environ.get("GCP_LOCATION")
+ vertexai.init(project=project_id, location=location)
+
+ self.model_name = model_name
+ self.chat_template = get_chat_template("default")
+
+ def get_chat_template(self):
+ return self.chat_template
+
+ def generate(
+ self,
+ s: StreamExecutor,
+ sampling_params: SglSamplingParams,
+ ):
+ if s.messages_:
+ prompt = self.messages_to_vertexai_input(s.messages_)
+ else:
+ # single-turn
+ prompt = (
+ self.text_to_vertexai_input(s.text_, s.cur_images)
+ if s.cur_images
+ else s.text_
+ )
+ ret = GenerativeModel(self.model_name).generate_content(
+ prompt,
+ generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+ )
+
+ comp = ret.text
+
+ return comp, {}
+
+ def generate_stream(
+ self,
+ s: StreamExecutor,
+ sampling_params: SglSamplingParams,
+ ):
+ if s.messages_:
+ prompt = self.messages_to_vertexai_input(s.messages_)
+ else:
+ # single-turn
+ prompt = (
+ self.text_to_vertexai_input(s.text_, s.cur_images)
+ if s.cur_images
+ else s.text_
+ )
+ generator = GenerativeModel(self.model_name).generate_content(
+ prompt,
+ stream=True,
+ generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+ )
+ for ret in generator:
+ yield ret.text, {}
+
+ def text_to_vertexai_input(self, text, images):
+ input = []
+ # split with image token
+ text_segs = text.split(self.chat_template.image_token)
+ for image_path, image_base64_data in images:
+ text_seg = text_segs.pop(0)
+ if text_seg != "":
+ input.append(text_seg)
+ input.append(Image.from_bytes(image_base64_data))
+ text_seg = text_segs.pop(0)
+ if text_seg != "":
+ input.append(text_seg)
+ return input
+
+ def messages_to_vertexai_input(self, messages):
+ vertexai_message = []
+ # from openai message format to vertexai message format
+ for msg in messages:
+ if isinstance(msg["content"], str):
+ text = msg["content"]
+ else:
+ text = msg["content"][0]["text"]
+
+ if msg["role"] == "system":
+ warnings.warn("Warning: system prompt is not supported in VertexAI.")
+ vertexai_message.append(
+ {
+ "role": "user",
+ "parts": [{"text": "System prompt: " + text}],
+ }
+ )
+ vertexai_message.append(
+ {
+ "role": "model",
+ "parts": [{"text": "Understood."}],
+ }
+ )
+ continue
+ if msg["role"] == "user":
+ vertexai_msg = {
+ "role": "user",
+ "parts": [{"text": text}],
+ }
+ elif msg["role"] == "assistant":
+ vertexai_msg = {
+ "role": "model",
+ "parts": [{"text": text}],
+ }
+
+ # images
+ if isinstance(msg["content"], list) and len(msg["content"]) > 1:
+ for image in msg["content"][1:]:
+ assert image["type"] == "image_url"
+ vertexai_msg["parts"].append(
+ {
+ "inline_data": {
+ "data": image["image_url"]["url"].split(",")[1],
+ "mime_type": "image/jpeg",
+ }
+ }
+ )
+
+ vertexai_message.append(vertexai_msg)
+ return vertexai_message
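The new `sglang/backend/vertexai.py` backend above is selected like any other sglang backend. A minimal usage sketch, assuming the usual `sgl.set_default_backend` pattern; the model name `gemini-pro` and the function below are illustrative and not part of this diff:

```python
# Illustrative sketch: run an sglang program against the new VertexAI backend.
# Requires GCP_PROJECT_ID (and optionally GCP_LOCATION) in the environment,
# as read by VertexAI.__init__ above.
import sglang as sgl

@sgl.function
def multi_turn_qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))

sgl.set_default_backend(sgl.VertexAI("gemini-pro"))

state = multi_turn_qa.run(question="What is the capital of France?")
print(state["answer"])
```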
@@ -365,11 +365,10 @@ class StreamExecutor:
  for comp, meta_info in generator:
  self.text_ += comp
  self.variables[name] += comp
+ self.meta_info[name] = meta_info
  self.stream_var_event[name].set()
  self.stream_text_event.set()

- self.meta_info[name] = meta_info
-
  self.variable_event[name].set()
  self.stream_var_event[name].set()

@@ -428,6 +427,7 @@ class StreamExecutor:
  self.messages_.append(last_msg)
  self.cur_images = []
  else:
+ # OpenAI chat API format
  self.messages_.append({"role": expr.role, "content": new_text})

  self.cur_role = None
@@ -582,7 +582,7 @@ class ProgramState:
  else:
  yield self.get_var(name)

- async def text_async_iter(self, var_name=None):
+ async def text_async_iter(self, var_name=None, return_meta_data=False):
  loop = asyncio.get_running_loop()

  if self.stream_executor.stream:
@@ -606,7 +606,10 @@ class ProgramState:
  out = str(self.stream_executor.variables[var_name][prev:])
  prev += len(out)
  if out:
- yield out
+ if return_meta_data:
+ yield out, self.stream_executor.meta_info[var_name]
+ else:
+ yield out
  if self.stream_executor.variable_event[var_name].is_set():
  break
  else:
@@ -632,11 +635,7 @@ class ProgramState:
  self.stream_executor.end()

  def __repr__(self) -> str:
- msgs = self.messages()
- ret = ""
- for msg in msgs:
- ret += msg["role"] + ":\n" + msg["content"] + "\n"
- return ret
+ return f"ProgramState({self.text()})"


  class ProgramStateGroup:
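The `return_meta_data` flag added to `text_async_iter` above lets async consumers receive the per-chunk `meta_info` that the stream executor now records while streaming. A hedged sketch of how it might be consumed; the program, the `"answer"` variable, and the backend setup are illustrative:

```python
# Hedged sketch of the new return_meta_data flag on text_async_iter.
import asyncio
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

async def main():
    state = text_qa.run(question="What is the capital of France?", stream=True)
    # Yields (text_chunk, meta_info) pairs instead of bare text chunks.
    async for chunk, meta in state.text_async_iter("answer", return_meta_data=True):
        print(chunk, end="", flush=True)

asyncio.run(main())
```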
@@ -2,6 +2,7 @@

  import dataclasses
  import inspect
+ import warnings
  from typing import List, Optional, Union

  from sglang.global_config import global_config
@@ -40,6 +41,8 @@ class SglSamplingParams:

  def to_openai_kwargs(self):
  # OpenAI does not support top_k, so we drop it here
+ if self.regex is not None:
+ warnings.warn("Regular expression is not supported in the OpenAI backend.")
  return {
  "max_tokens": self.max_new_tokens,
  "stop": self.stop or None,
@@ -49,8 +52,26 @@ class SglSamplingParams:
  "presence_penalty": self.presence_penalty,
  }

+ def to_vertexai_kwargs(self):
+ if self.regex is not None:
+ warnings.warn(
+ "Regular expression is not supported in the VertexAI backend."
+ )
+ return {
+ "candidate_count": 1,
+ "max_output_tokens": self.max_new_tokens,
+ "stop_sequences": self.stop,
+ "temperature": self.temperature,
+ "top_p": self.top_p,
+ "top_k": self.top_k if self.top_k > 0 else None,
+ }
+
  def to_anthropic_kwargs(self):
  # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
+ if self.regex is not None:
+ warnings.warn(
+ "Regular expression is not supported in the Anthropic backend."
+ )
  return {
  "max_tokens_to_sample": self.max_new_tokens,
  "stop_sequences": self.stop,
@@ -5,7 +5,6 @@ import triton
  import triton.language as tl
  from sglang.srt.utils import wrap_kernel_launcher

-
  CUDA_CAPABILITY = torch.cuda.get_device_capability()


@@ -4,7 +4,6 @@ import triton.language as tl
  from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
  from sglang.srt.utils import wrap_kernel_launcher

-
  CUDA_CAPABILITY = torch.cuda.get_device_capability()


@@ -28,7 +28,7 @@ class RouterManager:
  self.model_client = model_client
  self.recv_reqs = []

- # Init Some Configs
+ # Init some configs
  self.extend_dependency_time = GLOBAL_BACKEND_CONFIG.extend_dependency_time

  async def loop_for_forward(self):
@@ -46,7 +46,7 @@ class RouterManager:
  if has_finished:
  await asyncio.sleep(self.extend_dependency_time)

- await asyncio.sleep(0.001)
+ await asyncio.sleep(0.0006)

  async def loop_for_recv_requests(self):
  while True:
@@ -2,10 +2,10 @@ import asyncio
  import logging
  import multiprocessing
  import time
+ import warnings
  from concurrent.futures import ThreadPoolExecutor
  from enum import Enum, auto
  from typing import Dict, List, Optional, Tuple, Union
- import warnings

  import numpy as np
  import rpyc
@@ -45,6 +45,7 @@ class ModelRpcServer(rpyc.Service):
  self.tp_rank = tp_rank
  self.tp_size = server_args.tp_size
  self.schedule_heuristic = server_args.schedule_heuristic
+ self.schedule_conservativeness = server_args.schedule_conservativeness

  # Init model and tokenizer
  self.model_config = ModelConfig(
@@ -108,7 +109,7 @@ class ModelRpcServer(rpyc.Service):
  self.running_batch: Batch = None
  self.out_pyobjs = []
  self.decode_forward_ct = 0
- self.stream_interval = 2
+ self.stream_interval = server_args.stream_interval

  # Init the FSM cache for constrained generation
  self.regex_fsm_cache = FSMCache(self.tokenizer)
@@ -248,7 +249,9 @@ class ModelRpcServer(rpyc.Service):
  available_size = (
  self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
  )
- new_ratio = self.scheduler.new_token_estimation_ratio()
+ new_ratio = (
+ self.scheduler.new_token_estimation_ratio() * self.schedule_conservativeness
+ )
  if self.running_batch:
  available_size -= sum(
  [
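The two new knobs read by `ModelRpcServer` above, `schedule_conservativeness` and `stream_interval`, come from `ServerArgs` (the `sglang/srt/server_args.py` diff of +22 -4 is not expanded here). A hedged sketch of setting them programmatically; the exact defaults and CLI flag names live in `server_args.py`:

```python
# Hedged sketch: the scheduling knobs are assumed to be plain ServerArgs fields.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-2-7b-chat-hf",
    stream_interval=8,              # decode steps between streamed outputs
    schedule_conservativeness=1.0,  # scales the new-token estimation ratio; >1 admits fewer new requests
)
```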
@@ -278,7 +278,7 @@ class ModelRunner:
  load_format=self.load_format,
  revision=None,
  )
- self.model = model
+ self.model = model.eval()

  def profile_max_num_token(self, total_gpu_memory):
  available_gpu_memory = get_available_gpu_memory(
@@ -355,7 +355,7 @@ class MixtralForCausalLM(nn.Module):
  ):
  if "rotary_emb.inv_freq" in name:
  continue
- for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ for param_name, weight_name, shard_id in stacked_params_mapping:
  if weight_name not in name:
  continue
  name = name.replace(weight_name, param_name)