sglang 0.1.3.tar.gz → 0.1.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. {sglang-0.1.3/sglang.egg-info → sglang-0.1.5}/PKG-INFO +44 -12
  2. {sglang-0.1.3 → sglang-0.1.5}/README.md +43 -11
  3. {sglang-0.1.3 → sglang-0.1.5}/pyproject.toml +1 -1
  4. {sglang-0.1.3 → sglang-0.1.5}/sglang/__init__.py +1 -1
  5. {sglang-0.1.3 → sglang-0.1.5}/sglang/api.py +1 -0
  6. sglang-0.1.5/sglang/backend/vertexai.py +147 -0
  7. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/interpreter.py +8 -9
  8. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/ir.py +21 -0
  9. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/context_flashattention_nopad.py +7 -1
  10. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/extend_attention.py +46 -1
  11. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/manager.py +2 -2
  12. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/model_rpc.py +7 -3
  13. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/model_runner.py +1 -1
  14. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/mixtral.py +1 -1
  15. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/server_args.py +22 -4
  16. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/utils.py +1 -1
  17. {sglang-0.1.3 → sglang-0.1.5}/sglang/test/test_programs.py +4 -1
  18. {sglang-0.1.3 → sglang-0.1.5/sglang.egg-info}/PKG-INFO +44 -12
  19. {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/SOURCES.txt +1 -2
  20. sglang-0.1.3/sglang/backend/huggingface.py +0 -349
  21. sglang-0.1.3/sglang/backend/tgi.py +0 -190
  22. {sglang-0.1.3 → sglang-0.1.5}/LICENSE +0 -0
  23. {sglang-0.1.3 → sglang-0.1.5}/setup.cfg +0 -0
  24. {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/__init__.py +0 -0
  25. {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/anthropic.py +0 -0
  26. {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/base_backend.py +0 -0
  27. {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/openai.py +0 -0
  28. {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/runtime_endpoint.py +0 -0
  29. {sglang-0.1.3 → sglang-0.1.5}/sglang/flush_cache.py +0 -0
  30. {sglang-0.1.3 → sglang-0.1.5}/sglang/global_config.py +0 -0
  31. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/__init__.py +0 -0
  32. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/chat_template.py +0 -0
  33. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/compiler.py +0 -0
  34. {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/tracer.py +0 -0
  35. {sglang-0.1.3 → sglang-0.1.5}/sglang/launch_server.py +0 -0
  36. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/backend_config.py +0 -0
  37. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/fsm.py +0 -0
  38. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/fsm_cache.py +0 -0
  39. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/regex.py +0 -0
  40. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/tokenizer.py +0 -0
  41. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/hf_transformers_utils.py +0 -0
  42. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/get_selected_logprob.py +0 -0
  43. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/logits_processor.py +0 -0
  44. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/radix_attention.py +0 -0
  45. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/token_attention.py +0 -0
  46. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/detokenizer_manager.py +0 -0
  47. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/io_struct.py +0 -0
  48. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/openai_protocol.py +0 -0
  49. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/infer_batch.py +0 -0
  50. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/radix_cache.py +0 -0
  51. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/scheduler.py +0 -0
  52. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/tokenizer_manager.py +0 -0
  53. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/memory_pool.py +0 -0
  54. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/model_config.py +0 -0
  55. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/llama2.py +0 -0
  56. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/llava.py +0 -0
  57. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/sampling_params.py +0 -0
  58. {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/server.py +0 -0
  59. {sglang-0.1.3 → sglang-0.1.5}/sglang/test/test_utils.py +0 -0
  60. {sglang-0.1.3 → sglang-0.1.5}/sglang/utils.py +0 -0
  61. {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/dependency_links.txt +0 -0
  62. {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/requires.txt +0 -0
  63. {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.1.3/sglang.egg-info/PKG-INFO
+++ sglang-0.1.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.3
+Version: 0.1.5
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
[The remaining PKG-INFO hunks embed the package's long description (the README body) and are identical to the README.md changes shown below, so they are not repeated here.]
--- sglang-0.1.3/README.md
+++ sglang-0.1.5/README.md
@@ -1,4 +1,5 @@
 # SGLang
+| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -32,10 +33,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```
 
+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
+
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-******
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
@@ -90,6 +101,7 @@ for m in state.messages():
 
 ### More Examples
 
+Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Langauge (SGLang)
@@ -99,19 +111,20 @@ To begin with, import sglang.
 import sglang as sgl
 ```
 
-`sglang` provides some simple primitives such as `gen`, `select`, `fork`.
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
 
 ### Control Flow
+You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
 ```python
 @sgl.function
 def control_flow(s, question):
     s += "To answer this question: " + question + ", "
     s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
 
-    # You can use if or nested function calls
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
     elif s["tool"] == "web browser":
@@ -119,6 +132,9 @@ def control_flow(s, question):
 ```
 
 ### Parallelism
+Use `fork` to launch parallel prompts.
+Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
 ```python
 @sgl.function
 def tip_suggestion(s):
@@ -127,7 +143,7 @@ def tip_suggestion(s):
         "1. Balanced Diet. 2. Regular Exercise.\n\n"
     )
 
-    forks = s.fork(2) # Launch parallel prompts
+    forks = s.fork(2)
     for i, f in enumerate(forks):
         f += f"Now, expand tip {i+1} into a paragraph:\n"
         f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -138,6 +154,8 @@ def tip_suggestion(s):
 ```
 
 ### Multi Modality
+Use `sgl.image` to pass an image as input.
+
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
@@ -146,11 +164,13 @@ def image_qa(s, image_file, question):
 ```
 
 ### Constrained Decoding
+Use `regex=` to specify a regular expression as a decoding constraint.
+
 ```python
-@function
+@sgl.function
 def regular_expression_gen(s):
     s += "Q: What is the IP address of the Google DNS servers?\n"
-    s += "A: " + gen(
+    s += "A: " + sgl.gen(
         "answer",
         temperature=0,
         regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
@@ -158,6 +178,8 @@ def regular_expression_gen(s):
 ```
 
 ### Batching
+Use `run_batch` to run a batch of requests with continuous batching.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -170,10 +192,13 @@ states = text_qa.run_batch(
         {"question": "What is the capital of France?"},
         {"question": "What is the capital of Japan?"},
     ],
+    progress_bar=True
 )
 ```
 
 ### Streaming
+Add `stream=True` to enable streaming.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -182,7 +207,9 @@ def text_qa(s, question):
 
 states = text_qa.run(
     question="What is the capital of France?",
-    temperature=0.1)
+    temperature=0.1,
+    stream=True
+)
 
 for out in state.text_iter():
     print(out, end="", flush=True)
@@ -191,7 +218,7 @@ for out in state.text_iter():
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
 
 ### Usage
 Launch a server
@@ -215,6 +242,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```
 
 ### Supported Models
 - Llama
@@ -222,6 +253,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+- AWQ quantization
 
 ## Benchmark And Performance
 
@@ -231,13 +263,13 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
 
-Learn more [here]().
+Learn more [here](docs/benchmark_results.md).
 
 ## Roadmap
-- [ ] Function call
-- [ ] Quantization
+- [ ] Function call APIs
 - [ ] S-LoRA
-- [ ] More models
+- [ ] Support more models
+- [ ] Support more hardware backends
 
 ## Citation And Acknowledgment
 ```
--- sglang-0.1.3/pyproject.toml
+++ sglang-0.1.5/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.3"
+version = "0.1.5"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
--- sglang-0.1.3/sglang/__init__.py
+++ sglang-0.1.5/sglang/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.1.3"
+__version__ = "0.1.5"
 
 from sglang.api import *
 from sglang.global_config import global_config
--- sglang-0.1.3/sglang/api.py
+++ sglang-0.1.5/sglang/api.py
@@ -6,6 +6,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
--- /dev/null
+++ sglang-0.1.5/sglang/backend/vertexai.py
@@ -0,0 +1,147 @@
+import os
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import vertexai
+    from vertexai.preview.generative_models import (
+        GenerationConfig,
+        GenerativeModel,
+        Image,
+    )
+except ImportError as e:
+    GenerativeModel = e
+
+
+class VertexAI(BaseBackend):
+    def __init__(self, model_name):
+        super().__init__()
+
+        if isinstance(GenerativeModel, Exception):
+            raise GenerativeModel
+
+        project_id = os.environ["GCP_PROJECT_ID"]
+        location = os.environ.get("GCP_LOCATION")
+        vertexai.init(project=project_id, location=location)
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("default")
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        ret = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+
+        comp = ret.text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        generator = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            stream=True,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+        for ret in generator:
+            yield ret.text, {}
+
+    def text_to_vertexai_input(self, text, images):
+        input = []
+        # split with image token
+        text_segs = text.split(self.chat_template.image_token)
+        for image_path, image_base64_data in images:
+            text_seg = text_segs.pop(0)
+            if text_seg != "":
+                input.append(text_seg)
+            input.append(Image.from_bytes(image_base64_data))
+        text_seg = text_segs.pop(0)
+        if text_seg != "":
+            input.append(text_seg)
+        return input
+
+    def messages_to_vertexai_input(self, messages):
+        vertexai_message = []
+        # from openai message format to vertexai message format
+        for msg in messages:
+            if isinstance(msg["content"], str):
+                text = msg["content"]
+            else:
+                text = msg["content"][0]["text"]
+
+            if msg["role"] == "system":
+                warnings.warn("Warning: system prompt is not supported in VertexAI.")
+                vertexai_message.append(
+                    {
+                        "role": "user",
+                        "parts": [{"text": "System prompt: " + text}],
+                    }
+                )
+                vertexai_message.append(
+                    {
+                        "role": "model",
+                        "parts": [{"text": "Understood."}],
+                    }
+                )
+                continue
+            if msg["role"] == "user":
+                vertexai_msg = {
+                    "role": "user",
+                    "parts": [{"text": text}],
+                }
+            elif msg["role"] == "assistant":
+                vertexai_msg = {
+                    "role": "model",
+                    "parts": [{"text": text}],
+                }
+
+            # images
+            if isinstance(msg["content"], list) and len(msg["content"]) > 1:
+                for image in msg["content"][1:]:
+                    assert image["type"] == "image_url"
+                    vertexai_msg["parts"].append(
+                        {
+                            "inline_data": {
+                                "data": image["image_url"]["url"].split(",")[1],
+                                "mime_type": "image/jpeg",
+                            }
+                        }
+                    )
+
+            vertexai_message.append(vertexai_msg)
+        return vertexai_message
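
The new backend is re-exported through `sglang/api.py` (see the hunk above), so it can be used like the other frontend backends. Below is a minimal, hypothetical usage sketch; the model name `"gemini-pro"` and the prompt are illustrative assumptions, and it presumes the `GCP_PROJECT_ID` (and optionally `GCP_LOCATION`) environment variables required by the constructor are set.

```python
# Hypothetical quick test of the new VertexAI backend in sglang 0.1.5.
# Assumes Google Cloud credentials are configured and GCP_PROJECT_ID is exported;
# the model name "gemini-pro" is an assumption, not something stated in this diff.
import sglang as sgl
from sglang import VertexAI

sgl.set_default_backend(VertexAI("gemini-pro"))

@sgl.function
def capital_qa(s, country):
    # Chat-style roles are converted by messages_to_vertexai_input()
    s += sgl.user(f"What is the capital of {country}?")
    s += sgl.assistant(sgl.gen("answer", max_tokens=32, temperature=0))

state = capital_qa.run(country="France")
print(state["answer"])
```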
--- sglang-0.1.3/sglang/lang/interpreter.py
+++ sglang-0.1.5/sglang/lang/interpreter.py
@@ -365,11 +365,10 @@ class StreamExecutor:
             for comp, meta_info in generator:
                 self.text_ += comp
                 self.variables[name] += comp
+                self.meta_info[name] = meta_info
                 self.stream_var_event[name].set()
                 self.stream_text_event.set()
 
-            self.meta_info[name] = meta_info
-
             self.variable_event[name].set()
             self.stream_var_event[name].set()
 
@@ -428,6 +427,7 @@ class StreamExecutor:
             self.messages_.append(last_msg)
             self.cur_images = []
         else:
+            # OpenAI chat API format
             self.messages_.append({"role": expr.role, "content": new_text})
 
         self.cur_role = None
@@ -582,7 +582,7 @@ class ProgramState:
             else:
                 yield self.get_var(name)
 
-    async def text_async_iter(self, var_name=None):
+    async def text_async_iter(self, var_name=None, return_meta_data=False):
        loop = asyncio.get_running_loop()
 
        if self.stream_executor.stream:
@@ -606,7 +606,10 @@
                 out = str(self.stream_executor.variables[var_name][prev:])
                 prev += len(out)
                 if out:
-                    yield out
+                    if return_meta_data:
+                        yield out, self.stream_executor.meta_info[var_name]
+                    else:
+                        yield out
                 if self.stream_executor.variable_event[var_name].is_set():
                     break
         else:
@@ -632,11 +635,7 @@
         self.stream_executor.end()
 
     def __repr__(self) -> str:
-        msgs = self.messages()
-        ret = ""
-        for msg in msgs:
-            ret += msg["role"] + ":\n" + msg["content"] + "\n"
-        return ret
+        return f"ProgramState({self.text()})"
 
 
 class ProgramStateGroup:
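
These interpreter changes publish `meta_info` per streamed variable and let `text_async_iter` optionally return it alongside each text chunk. A rough sketch of how a caller might consume the extended iterator is shown below; the OpenAI backend, model name, and prompt are assumptions for illustration, not part of this diff.

```python
# Hypothetical consumer of ProgramState.text_async_iter(..., return_meta_data=True).
# The backend/model choice is an assumption; any streaming-capable backend should work.
import asyncio

import sglang as sgl
from sglang import OpenAI

sgl.set_default_backend(OpenAI("gpt-3.5-turbo"))

@sgl.function
def tell_joke(s):
    s += sgl.user("Tell me a short joke.")
    s += sgl.assistant(sgl.gen("joke", max_tokens=64))

async def main():
    state = tell_joke.run(stream=True)
    meta = None
    # With return_meta_data=True the iterator yields (chunk, meta_info) pairs
    async for chunk, meta in state.text_async_iter("joke", return_meta_data=True):
        print(chunk, end="", flush=True)
    print("\nmeta info:", meta)

asyncio.run(main())
```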
--- sglang-0.1.3/sglang/lang/ir.py
+++ sglang-0.1.5/sglang/lang/ir.py
@@ -2,6 +2,7 @@
 
 import dataclasses
 import inspect
+import warnings
 from typing import List, Optional, Union
 
 from sglang.global_config import global_config
@@ -40,6 +41,8 @@ class SglSamplingParams:
 
     def to_openai_kwargs(self):
         # OpenAI does not support top_k, so we drop it here
+        if self.regex is not None:
+            warnings.warn("Regular expression is not supported in the OpenAI backend.")
         return {
             "max_tokens": self.max_new_tokens,
             "stop": self.stop or None,
@@ -49,8 +52,26 @@
             "presence_penalty": self.presence_penalty,
         }
 
+    def to_vertexai_kwargs(self):
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the VertexAI backend."
+            )
+        return {
+            "candidate_count": 1,
+            "max_output_tokens": self.max_new_tokens,
+            "stop_sequences": self.stop,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k if self.top_k > 0 else None,
+        }
+
     def to_anthropic_kwargs(self):
         # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the Anthropic backend."
+            )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
             "stop_sequences": self.stop,
--- sglang-0.1.3/sglang/srt/layers/context_flashattention_nopad.py
+++ sglang-0.1.5/sglang/srt/layers/context_flashattention_nopad.py
@@ -5,6 +5,8 @@ import triton
 import triton.language as tl
 from sglang.srt.utils import wrap_kernel_launcher
 
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
 
 @triton.jit
 def _fwd_kernel(
@@ -120,7 +122,11 @@ cached_kernel = None
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    BLOCK = 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}