sglang 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {sglang-0.1.2 → sglang-0.1.3}/PKG-INFO +86 -4
  2. {sglang-0.1.2 → sglang-0.1.3}/README.md +85 -3
  3. {sglang-0.1.2 → sglang-0.1.3}/pyproject.toml +1 -1
  4. {sglang-0.1.2 → sglang-0.1.3}/sglang/__init__.py +1 -1
  5. {sglang-0.1.2 → sglang-0.1.3}/sglang.egg-info/PKG-INFO +86 -4
  6. {sglang-0.1.2 → sglang-0.1.3}/LICENSE +0 -0
  7. {sglang-0.1.2 → sglang-0.1.3}/setup.cfg +0 -0
  8. {sglang-0.1.2 → sglang-0.1.3}/sglang/api.py +0 -0
  9. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/__init__.py +0 -0
  10. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/anthropic.py +0 -0
  11. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/base_backend.py +0 -0
  12. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/huggingface.py +0 -0
  13. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/openai.py +0 -0
  14. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/runtime_endpoint.py +0 -0
  15. {sglang-0.1.2 → sglang-0.1.3}/sglang/backend/tgi.py +0 -0
  16. {sglang-0.1.2 → sglang-0.1.3}/sglang/flush_cache.py +0 -0
  17. {sglang-0.1.2 → sglang-0.1.3}/sglang/global_config.py +0 -0
  18. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/__init__.py +0 -0
  19. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/chat_template.py +0 -0
  20. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/compiler.py +0 -0
  21. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/interpreter.py +0 -0
  22. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/ir.py +0 -0
  23. {sglang-0.1.2 → sglang-0.1.3}/sglang/lang/tracer.py +0 -0
  24. {sglang-0.1.2 → sglang-0.1.3}/sglang/launch_server.py +0 -0
  25. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/backend_config.py +0 -0
  26. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/constrained/fsm.py +0 -0
  27. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/constrained/fsm_cache.py +0 -0
  28. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/constrained/regex.py +0 -0
  29. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/constrained/tokenizer.py +0 -0
  30. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/hf_transformers_utils.py +0 -0
  31. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  32. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/extend_attention.py +0 -0
  33. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/get_selected_logprob.py +0 -0
  34. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/logits_processor.py +0 -0
  35. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/radix_attention.py +0 -0
  36. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/layers/token_attention.py +0 -0
  37. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/detokenizer_manager.py +0 -0
  38. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/io_struct.py +0 -0
  39. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/openai_protocol.py +0 -0
  40. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/infer_batch.py +0 -0
  41. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/manager.py +0 -0
  42. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/model_rpc.py +0 -0
  43. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/model_runner.py +0 -0
  44. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/radix_cache.py +0 -0
  45. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/router/scheduler.py +0 -0
  46. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/managers/tokenizer_manager.py +0 -0
  47. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/memory_pool.py +0 -0
  48. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/model_config.py +0 -0
  49. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/models/llama2.py +0 -0
  50. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/models/llava.py +0 -0
  51. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/models/mixtral.py +0 -0
  52. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/sampling_params.py +0 -0
  53. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/server.py +0 -0
  54. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/server_args.py +0 -0
  55. {sglang-0.1.2 → sglang-0.1.3}/sglang/srt/utils.py +0 -0
  56. {sglang-0.1.2 → sglang-0.1.3}/sglang/test/test_programs.py +0 -0
  57. {sglang-0.1.2 → sglang-0.1.3}/sglang/test/test_utils.py +0 -0
  58. {sglang-0.1.2 → sglang-0.1.3}/sglang/utils.py +0 -0
  59. {sglang-0.1.2 → sglang-0.1.3}/sglang.egg-info/SOURCES.txt +0 -0
  60. {sglang-0.1.2 → sglang-0.1.3}/sglang.egg-info/dependency_links.txt +0 -0
  61. {sglang-0.1.2 → sglang-0.1.3}/sglang.egg-info/requires.txt +0 -0
  62. {sglang-0.1.2 → sglang-0.1.3}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A structured generation language for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -329,25 +329,99 @@ You can find more examples at [examples/quick_start](examples/quick_start).
329
329
 
330
330
  ## Frontend: Structured Generation Language (SGLang)
331
331
 
332
+ To begin with, import sglang.
333
+ ```python
334
+ import sglang as sgl
335
+ ```
336
+
337
+ `sglang` provides some simple primitives such as `gen`, `select`, `fork`.
338
+ You can implement your prompt flow in a function decorated by `sgl.function`.
339
+ You can then invoke the function with `run` or `run_batch`.
340
+ The system will manage the state, chat template, and parallelism for you.
341
+
332
342
  ### Control Flow
343
+ ```python
344
+ @sgl.function
345
+ def control_flow(s, question):
346
+ s += "To answer this question: " + question + ", "
347
+ s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
348
+
349
+ # You can use if or nested function calls
350
+ if s["tool"] == "calculator":
351
+ s += "The math expression is" + sgl.gen("expression")
352
+ elif s["tool"] == "web browser":
353
+ s += "The website url is" + sgl.gen("url")
354
+ ```
333
355
 
334
356
  ### Parallelism
357
+ ```python
358
+ @sgl.function
359
+ def tip_suggestion(s):
360
+ s += (
361
+ "Here are two tips for staying healthy: "
362
+ "1. Balanced Diet. 2. Regular Exercise.\n\n"
363
+ )
364
+
365
+ forks = s.fork(2) # Launch parallel prompts
366
+ for i, f in enumerate(forks):
367
+ f += f"Now, expand tip {i+1} into a paragraph:\n"
368
+ f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
369
+
370
+ s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
371
+ s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
372
+ s += "In summary" + sgl.gen("summary")
373
+ ```
335
374
 
336
375
  ### Multi Modality
337
376
  ```python
338
377
  @sgl.function
339
378
  def image_qa(s, image_file, question):
340
379
  s += sgl.user(sgl.image(image_file) + question)
341
- s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
380
+ s += sgl.assistant(sgl.gen("answer", max_tokens=256))
342
381
  ```
343
382
 
344
- ### Constrained decoding
383
+ ### Constrained Decoding
384
+ ```python
385
+ @function
386
+ def regular_expression_gen(s):
387
+ s += "Q: What is the IP address of the Google DNS servers?\n"
388
+ s += "A: " + gen(
389
+ "answer",
390
+ temperature=0,
391
+ regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
392
+ )
393
+ ```
345
394
 
346
395
  ### Batching
396
+ ```python
397
+ @sgl.function
398
+ def text_qa(s, question):
399
+ s += "Q: " + question + "\n"
400
+ s += "A:" + sgl.gen("answer", stop="\n")
401
+
402
+ states = text_qa.run_batch(
403
+ [
404
+ {"question": "What is the capital of the United Kingdom?"},
405
+ {"question": "What is the capital of France?"},
406
+ {"question": "What is the capital of Japan?"},
407
+ ],
408
+ )
409
+ ```
347
410
 
348
411
  ### Streaming
412
+ ```python
413
+ @sgl.function
414
+ def text_qa(s, question):
415
+ s += "Q: " + question + "\n"
416
+ s += "A:" + sgl.gen("answer", stop="\n")
417
+
418
+ state = text_qa.run(
419
+ question="What is the capital of France?",
420
+ temperature=0.1)
349
421
 
350
- ### Other Backends
422
+ for out in state.text_iter():
423
+ print(out, end="", flush=True)
424
+ ```
351
425
 
352
426
  ## Backend: SGLang Runtime (SRT)
353
427
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -386,6 +460,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
386
460
 
387
461
  ## Benchmark And Performance
388
462
 
463
+ - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
464
+ ![llama_7b](assets/llama_7b.jpg)
465
+
466
+ - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
467
+ ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
468
+
469
+ Learn more [here]().
470
+
389
471
  ## Roadmap
390
472
  - [ ] Function call
391
473
  - [ ] Quantization
@@ -94,25 +94,99 @@ You can find more examples at [examples/quick_start](examples/quick_start).
94
94
 
95
95
  ## Frontend: Structured Generation Language (SGLang)
96
96
 
97
+ To begin with, import sglang.
98
+ ```python
99
+ import sglang as sgl
100
+ ```
101
+
102
+ `sglang` provides some simple primitives such as `gen`, `select`, `fork`.
103
+ You can implement your prompt flow in a function decorated by `sgl.function`.
104
+ You can then invoke the function with `run` or `run_batch`.
105
+ The system will manage the state, chat template, and parallelism for you.
106
+
97
107
  ### Control Flow
108
+ ```python
109
+ @sgl.function
110
+ def control_flow(s, question):
111
+ s += "To answer this question: " + question + ", "
112
+ s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
113
+
114
+ # You can use if or nested function calls
115
+ if s["tool"] == "calculator":
116
+ s += "The math expression is" + sgl.gen("expression")
117
+ elif s["tool"] == "web browser":
118
+ s += "The website url is" + sgl.gen("url")
119
+ ```
98
120
 
99
121
  ### Parallelism
122
+ ```python
123
+ @sgl.function
124
+ def tip_suggestion(s):
125
+ s += (
126
+ "Here are two tips for staying healthy: "
127
+ "1. Balanced Diet. 2. Regular Exercise.\n\n"
128
+ )
129
+
130
+ forks = s.fork(2) # Launch parallel prompts
131
+ for i, f in enumerate(forks):
132
+ f += f"Now, expand tip {i+1} into a paragraph:\n"
133
+ f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
134
+
135
+ s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
136
+ s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
137
+ s += "In summary" + sgl.gen("summary")
138
+ ```
100
139
 
101
140
  ### Multi Modality
102
141
  ```python
103
142
  @sgl.function
104
143
  def image_qa(s, image_file, question):
105
144
  s += sgl.user(sgl.image(image_file) + question)
106
- s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
145
+ s += sgl.assistant(sgl.gen("answer", max_tokens=256))
107
146
  ```
108
147
 
109
- ### Constrained decoding
148
+ ### Constrained Decoding
149
+ ```python
150
+ @function
151
+ def regular_expression_gen(s):
152
+ s += "Q: What is the IP address of the Google DNS servers?\n"
153
+ s += "A: " + gen(
154
+ "answer",
155
+ temperature=0,
156
+ regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
157
+ )
158
+ ```
110
159
 
111
160
  ### Batching
161
+ ```python
162
+ @sgl.function
163
+ def text_qa(s, question):
164
+ s += "Q: " + question + "\n"
165
+ s += "A:" + sgl.gen("answer", stop="\n")
166
+
167
+ states = text_qa.run_batch(
168
+ [
169
+ {"question": "What is the capital of the United Kingdom?"},
170
+ {"question": "What is the capital of France?"},
171
+ {"question": "What is the capital of Japan?"},
172
+ ],
173
+ )
174
+ ```
112
175
 
113
176
  ### Streaming
177
+ ```python
178
+ @sgl.function
179
+ def text_qa(s, question):
180
+ s += "Q: " + question + "\n"
181
+ s += "A:" + sgl.gen("answer", stop="\n")
182
+
183
+ state = text_qa.run(
184
+ question="What is the capital of France?",
185
+ temperature=0.1)
114
186
 
115
- ### Other Backends
187
+ for out in state.text_iter():
188
+ print(out, end="", flush=True)
189
+ ```
116
190
 
117
191
  ## Backend: SGLang Runtime (SRT)
118
192
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -151,6 +225,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
151
225
 
152
226
  ## Benchmark And Performance
153
227
 
228
+ - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
229
+ ![llama_7b](assets/llama_7b.jpg)
230
+
231
+ - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
232
+ ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
233
+
234
+ Learn more [here]().
235
+
154
236
  ## Roadmap
155
237
  - [ ] Function call
156
238
  - [ ] Quantization
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.1.2"
7
+ version = "0.1.3"
8
8
  description = "A structured generation language for LLMs."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.2"
1
+ __version__ = "0.1.3"
2
2
 
3
3
  from sglang.api import *
4
4
  from sglang.global_config import global_config
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A structured generation language for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -329,25 +329,99 @@ You can find more examples at [examples/quick_start](examples/quick_start).
329
329
 
330
330
  ## Frontend: Structured Generation Language (SGLang)
331
331
 
332
+ To begin with, import sglang.
333
+ ```python
334
+ import sglang as sgl
335
+ ```
336
+
337
+ `sglang` provides some simple primitives such as `gen`, `select`, `fork`.
338
+ You can implement your prompt flow in a function decorated by `sgl.function`.
339
+ You can then invoke the function with `run` or `run_batch`.
340
+ The system will manage the state, chat template, and parallelism for you.
341
+
332
342
  ### Control Flow
343
+ ```python
344
+ @sgl.function
345
+ def control_flow(s, question):
346
+ s += "To answer this question: " + question + ", "
347
+ s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
348
+
349
+ # You can use if or nested function calls
350
+ if s["tool"] == "calculator":
351
+ s += "The math expression is" + sgl.gen("expression")
352
+ elif s["tool"] == "web browser":
353
+ s += "The website url is" + sgl.gen("url")
354
+ ```
333
355
 
334
356
  ### Parallelism
357
+ ```python
358
+ @sgl.function
359
+ def tip_suggestion(s):
360
+ s += (
361
+ "Here are two tips for staying healthy: "
362
+ "1. Balanced Diet. 2. Regular Exercise.\n\n"
363
+ )
364
+
365
+ forks = s.fork(2) # Launch parallel prompts
366
+ for i, f in enumerate(forks):
367
+ f += f"Now, expand tip {i+1} into a paragraph:\n"
368
+ f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
369
+
370
+ s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
371
+ s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
372
+ s += "In summary" + sgl.gen("summary")
373
+ ```
335
374
 
336
375
  ### Multi Modality
337
376
  ```python
338
377
  @sgl.function
339
378
  def image_qa(s, image_file, question):
340
379
  s += sgl.user(sgl.image(image_file) + question)
341
- s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
380
+ s += sgl.assistant(sgl.gen("answer", max_tokens=256))
342
381
  ```
343
382
 
344
- ### Constrained decoding
383
+ ### Constrained Decoding
384
+ ```python
385
+ @function
386
+ def regular_expression_gen(s):
387
+ s += "Q: What is the IP address of the Google DNS servers?\n"
388
+ s += "A: " + gen(
389
+ "answer",
390
+ temperature=0,
391
+ regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
392
+ )
393
+ ```
345
394
 
346
395
  ### Batching
396
+ ```python
397
+ @sgl.function
398
+ def text_qa(s, question):
399
+ s += "Q: " + question + "\n"
400
+ s += "A:" + sgl.gen("answer", stop="\n")
401
+
402
+ states = text_qa.run_batch(
403
+ [
404
+ {"question": "What is the capital of the United Kingdom?"},
405
+ {"question": "What is the capital of France?"},
406
+ {"question": "What is the capital of Japan?"},
407
+ ],
408
+ )
409
+ ```
347
410
 
348
411
  ### Streaming
412
+ ```python
413
+ @sgl.function
414
+ def text_qa(s, question):
415
+ s += "Q: " + question + "\n"
416
+ s += "A:" + sgl.gen("answer", stop="\n")
417
+
418
+ state = text_qa.run(
419
+ question="What is the capital of France?",
420
+ temperature=0.1)
349
421
 
350
- ### Other Backends
422
+ for out in state.text_iter():
423
+ print(out, end="", flush=True)
424
+ ```
351
425
 
352
426
  ## Backend: SGLang Runtime (SRT)
353
427
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -386,6 +460,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
386
460
 
387
461
  ## Benchmark And Performance
388
462
 
463
+ - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
464
+ ![llama_7b](assets/llama_7b.jpg)
465
+
466
+ - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
467
+ ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
468
+
469
+ Learn more [here]().
470
+
389
471
  ## Roadmap
390
472
  - [ ] Function call
391
473
  - [ ] Quantization
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes