sglang 0.1.13__tar.gz → 0.1.15__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (79)
  1. {sglang-0.1.13/sglang.egg-info → sglang-0.1.15}/PKG-INFO +13 -15
  2. {sglang-0.1.13 → sglang-0.1.15}/README.md +7 -7
  3. {sglang-0.1.13 → sglang-0.1.15}/pyproject.toml +5 -5
  4. sglang-0.1.15/sglang/__init__.py +57 -0
  5. {sglang-0.1.13 → sglang-0.1.15}/sglang/api.py +3 -5
  6. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/anthropic.py +33 -13
  7. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/openai.py +2 -1
  8. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/runtime_endpoint.py +18 -5
  9. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/vertexai.py +1 -0
  10. {sglang-0.1.13 → sglang-0.1.15}/sglang/global_config.py +1 -0
  11. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/chat_template.py +74 -0
  12. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/interpreter.py +40 -16
  13. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/ir.py +1 -1
  14. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/tracer.py +6 -4
  15. {sglang-0.1.13 → sglang-0.1.15}/sglang/launch_server.py +2 -1
  16. sglang-0.1.15/sglang/srt/constrained/fsm_cache.py +25 -0
  17. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/jump_forward.py +1 -0
  18. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/conversation.py +2 -2
  19. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/hf_transformers_utils.py +2 -1
  20. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/context_flashattention_nopad.py +1 -0
  21. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/extend_attention.py +1 -0
  22. sglang-0.1.15/sglang/srt/layers/logits_processor.py +175 -0
  23. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/radix_attention.py +2 -1
  24. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/layers/token_attention.py +1 -0
  25. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/detokenizer_manager.py +5 -1
  26. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/io_struct.py +12 -0
  27. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/infer_batch.py +70 -33
  28. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/manager.py +7 -2
  29. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/model_rpc.py +116 -73
  30. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/model_runner.py +121 -155
  31. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/radix_cache.py +46 -38
  32. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/tokenizer_manager.py +56 -11
  33. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/memory_pool.py +5 -14
  34. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/model_config.py +7 -0
  35. sglang-0.1.15/sglang/srt/models/commandr.py +376 -0
  36. sglang-0.1.15/sglang/srt/models/dbrx.py +413 -0
  37. sglang-0.1.15/sglang/srt/models/dbrx_config.py +281 -0
  38. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/gemma.py +22 -20
  39. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/llama2.py +23 -21
  40. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/llava.py +12 -10
  41. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/mixtral.py +27 -25
  42. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/qwen.py +23 -21
  43. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/qwen2.py +23 -21
  44. sglang-0.1.15/sglang/srt/models/stablelm.py +292 -0
  45. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/yivl.py +6 -5
  46. sglang-0.1.15/sglang/srt/openai_api_adapter.py +356 -0
  47. {sglang-0.1.13/sglang/srt/managers → sglang-0.1.15/sglang/srt}/openai_protocol.py +36 -20
  48. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/sampling_params.py +2 -0
  49. sglang-0.1.15/sglang/srt/server.py +317 -0
  50. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/server_args.py +76 -49
  51. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/utils.py +88 -32
  52. sglang-0.1.15/sglang/srt/weight_utils.py +402 -0
  53. {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_programs.py +8 -7
  54. sglang-0.1.15/sglang/test/test_utils.py +350 -0
  55. {sglang-0.1.13 → sglang-0.1.15/sglang.egg-info}/PKG-INFO +13 -15
  56. {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/SOURCES.txt +7 -1
  57. {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/requires.txt +5 -7
  58. sglang-0.1.13/sglang/__init__.py +0 -4
  59. sglang-0.1.13/sglang/srt/constrained/fsm_cache.py +0 -13
  60. sglang-0.1.13/sglang/srt/layers/logits_processor.py +0 -115
  61. sglang-0.1.13/sglang/srt/server.py +0 -688
  62. sglang-0.1.13/sglang/test/test_utils.py +0 -162
  63. {sglang-0.1.13 → sglang-0.1.15}/LICENSE +0 -0
  64. {sglang-0.1.13 → sglang-0.1.15}/setup.cfg +0 -0
  65. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/__init__.py +0 -0
  66. {sglang-0.1.13 → sglang-0.1.15}/sglang/backend/base_backend.py +0 -0
  67. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/__init__.py +0 -0
  68. {sglang-0.1.13 → sglang-0.1.15}/sglang/lang/compiler.py +0 -0
  69. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/backend_config.py +0 -0
  70. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/__init__.py +0 -0
  71. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/constrained/base_cache.py +0 -0
  72. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/managers/router/scheduler.py +0 -0
  73. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/mm_utils.py +0 -0
  74. {sglang-0.1.13 → sglang-0.1.15}/sglang/srt/models/mistral.py +0 -0
  75. {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_conversation.py +0 -0
  76. {sglang-0.1.13 → sglang-0.1.15}/sglang/test/test_openai_protocol.py +0 -0
  77. {sglang-0.1.13 → sglang-0.1.15}/sglang/utils.py +0 -0
  78. {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/dependency_links.txt +0 -0
  79. {sglang-0.1.13 → sglang-0.1.15}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.13/sglang.egg-info → sglang-0.1.15}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.13
+ Version: 0.1.15
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -212,6 +212,7 @@ Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
+ Requires-Dist: tqdm
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
@@ -221,21 +222,18 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm>=0.3.3; extra == "srt"
+ Requires-Dist: vllm>=0.4.2; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: lark; extra == "srt"
- Requires-Dist: numba; extra == "srt"
  Requires-Dist: pydantic; extra == "srt"
- Requires-Dist: referencing; extra == "srt"
- Requires-Dist: diskcache; extra == "srt"
- Requires-Dist: cloudpickle; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: outlines>=0.0.27; extra == "srt"
+ Requires-Dist: packaging; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: numpy; extra == "openai"
+ Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: anthropic
- Requires-Dist: anthropic; extra == "anthropic"
+ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Requires-Dist: numpy; extra == "anthropic"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
@@ -541,7 +539,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).

  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.

  ```python
@@ -606,7 +603,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to acclerate the inference by using highly optimized CUDA kernels.
+ - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.

  ### Supported Models
  - Llama
@@ -622,10 +619,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
  - Yi-VL
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - AWQ/GPTQ/Marlin quantization

- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)

@@ -649,7 +650,4 @@ https://github.com/sgl-project/sglang/issues/157
  }
  ```

- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
  We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.13 → sglang-0.1.15}/README.md
@@ -297,7 +297,6 @@ curl http://localhost:30000/generate \
  Learn more about the argument format [here](docs/sampling_params.md).

  ### OpenAI Compatible API
-
  In addition, the server supports an experimental OpenAI-compatible API.

  ```python
@@ -362,7 +361,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to acclerate the inference by using highly optimized CUDA kernels.
+ - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.

  ### Supported Models
  - Llama
@@ -378,10 +377,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
  - Yi-VL
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - AWQ/GPTQ quantization
+ - StableLM
+ - Command-R
+ - DBRX
+ - AWQ/GPTQ/Marlin quantization

- ## Benchmark And Performance
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

+ ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)

@@ -405,7 +408,4 @@ https://github.com/sgl-project/sglang/issues/157
  }
  ```

- [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
  We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.13 → sglang-0.1.15}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.13"
+ version = "0.1.15"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -15,14 +15,14 @@ classifiers = [
  ]
  dependencies = [
      "requests",
+     "tqdm",
  ]

  [project.optional-dependencies]
  srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-        "zmq", "vllm>=0.3.3", "interegular", "lark", "numba",
-        "pydantic", "referencing", "diskcache", "cloudpickle", "pillow", "outlines>=0.0.27"]
- openai = ["openai>=1.0", "numpy"]
- anthropic = ["anthropic", "numpy"]
+        "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "packaging"]
+ openai = ["openai>=1.0", "numpy", "tiktoken"]
+ anthropic = ["anthropic>=0.20.0", "numpy"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]

  [project.urls]
sglang-0.1.15/sglang/__init__.py
@@ -0,0 +1,57 @@
+ __version__ = "0.1.15"
+
+ # SGL API Components
+ from sglang.api import (
+     Runtime,
+     assistant,
+     assistant_begin,
+     assistant_end,
+     flush_cache,
+     function,
+     gen,
+     gen_int,
+     gen_string,
+     get_server_args,
+     image,
+     select,
+     set_default_backend,
+     system,
+     user,
+     user_begin,
+     user_end,
+ )
+
+ # SGL Backends
+ from sglang.backend.anthropic import Anthropic
+ from sglang.backend.openai import OpenAI
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.backend.vertexai import VertexAI
+
+ # Global Configurations
+ from sglang.global_config import global_config
+
+ # public APIs management
+ __all__ = [
+     "global_config",
+     "Anthropic",
+     "OpenAI",
+     "RuntimeEndpoint",
+     "VertexAI",
+     "function",
+     "Runtime",
+     "set_default_backend",
+     "flush_cache",
+     "get_server_args",
+     "gen",
+     "gen_int",
+     "gen_string",
+     "image",
+     "select",
+     "system",
+     "user",
+     "assistant",
+     "user_begin",
+     "user_end",
+     "assistant_begin",
+     "assistant_end",
+ ]
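The new top-level `__init__.py` pins `__version__` and re-exports the language primitives and backends under one namespace. A minimal sketch of how that surface is typically used, assuming a locally running SRT server; the endpoint URL and prompt are placeholders:

```python
import sglang as sgl

@sgl.function
def qa(s, question):
    # Build a chat turn with the re-exported primitives.
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# Placeholder endpoint; any backend exported above (OpenAI, Anthropic, VertexAI) works too.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = qa.run(question="What is the capital of France?")
print(state["answer"])
```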
{sglang-0.1.13 → sglang-0.1.15}/sglang/api.py
@@ -1,13 +1,10 @@
- """Public API"""
+ """Some Public API Definitions"""

+ import os
  import re
  from typing import Callable, List, Optional, Union

- from sglang.backend.anthropic import Anthropic
  from sglang.backend.base_backend import BaseBackend
- from sglang.backend.openai import OpenAI
- from sglang.backend.runtime_endpoint import RuntimeEndpoint
- from sglang.backend.vertexai import VertexAI
  from sglang.global_config import global_config
  from sglang.lang.ir import (
      SglExpr,
@@ -35,6 +32,7 @@ def function(

  def Runtime(*args, **kwargs):
      # Avoid importing unnecessary dependency
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
      from sglang.srt.server import Runtime

      return Runtime(*args, **kwargs)
{sglang-0.1.13 → sglang-0.1.15}/sglang/backend/anthropic.py
@@ -1,6 +1,7 @@
  from typing import List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:


  class Anthropic(BaseBackend):
-     def __init__(self, model_name):
+     def __init__(self, model_name, *args, **kwargs):
          super().__init__()

          if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):

          self.model_name = model_name
          self.chat_template = get_chat_template("claude")
+         self.client = anthropic.Anthropic(*args, **kwargs)

      def get_chat_template(self):
          return self.chat_template
@@ -30,13 +32,23 @@ class Anthropic(BaseBackend):
          s: StreamExecutor,
          sampling_params: SglSamplingParams,
      ):
-         prompt = s.text_
-         ret = anthropic.Anthropic().completions.create(
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+
+         if messages and messages[0]["role"] == "system":
+             system = messages.pop(0)["content"]
+         else:
+             system = ""
+
+         ret = self.client.messages.create(
              model=self.model_name,
-             prompt=prompt,
+             system=system,
+             messages=messages,
              **sampling_params.to_anthropic_kwargs(),
          )
-         comp = ret.completion
+         comp = ret.content[0].text

          return comp, {}

@@ -45,13 +57,21 @@ class Anthropic(BaseBackend):
          s: StreamExecutor,
          sampling_params: SglSamplingParams,
      ):
-         prompt = s.text_
-         generator = anthropic.Anthropic().completions.create(
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+
+         if messages and messages[0]["role"] == "system":
+             system = messages.pop(0)["content"]
+         else:
+             system = ""
+
+         with self.client.messages.stream(
              model=self.model_name,
-             prompt=prompt,
-             stream=True,
+             system=system,
+             messages=messages,
              **sampling_params.to_anthropic_kwargs(),
-         )
-
-         for ret in generator:
-             yield ret.completion, {}
+         ) as stream:
+             for text in stream.text_stream:
+                 yield text, {}
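The Anthropic backend moves from the legacy completions API to the Messages API and keeps a client built from the constructor's extra arguments, with the system turn passed separately from the user/assistant turns. A minimal sketch of the call shape it now issues; the model name is a placeholder and `ANTHROPIC_API_KEY` is assumed to be set:

```python
import anthropic  # anthropic>=0.20.0, matching the bumped requirement above

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
resp = client.messages.create(
    model="claude-3-haiku-20240307",  # placeholder model name
    system="You are a concise assistant.",
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=16,
)
print(resp.content[0].text)
```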
{sglang-0.1.13 → sglang-0.1.15}/sglang/backend/openai.py
@@ -3,6 +3,7 @@ import time
  from typing import Callable, List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
  from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
          prompt_tokens.append(ret_token)

          decision = choices[np.argmax(scores)]
-         return decision, scores, scores
+         return decision, scores, None, None


  def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
{sglang-0.1.13 → sglang-0.1.15}/sglang/backend/runtime_endpoint.py
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union

  import numpy as np
  import requests
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.global_config import global_config
  from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
          assert res.status_code == 200

      def commit_lazy_operations(self, s: StreamExecutor):
+         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+         self._add_images(s, data)
          res = http_request(
              self.base_url + "/generate",
-             json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
+             json=data,
              auth_token=self.auth_token,
              api_key=self.api_key,
              verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  **sampling_params.to_srt_kwargs(),
              },
          }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  "dtype": "int",
                  **sampling_params.to_srt_kwargs(),
              },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  **sampling_params.to_srt_kwargs(),
              },
          }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
              "text": s.text_,
              "sampling_params": {
                  "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                 "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                  "dtype": "int",
                  **sampling_params.to_srt_kwargs(),
              },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
          )
          assert res.status_code == 200
          obj = res.json()
-         normalized_prompt_logprob = [
+         normalized_prompt_logprobs = [
              r["meta_info"]["normalized_prompt_logprob"] for r in obj
          ]
-         prompt_logprob = [r["meta_info"]["prompt_logprob"] for r in obj]
+         decision = choices[np.argmax(normalized_prompt_logprobs)]
+         prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+         decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]

-         decision = choices[np.argmax(normalized_prompt_logprob)]
-         return decision, normalized_prompt_logprob, prompt_logprob
+         return (
+             decision,
+             normalized_prompt_logprobs,
+             prefill_token_logprobs,
+             decode_token_logprobs,
+         )

      def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
          res = http_request(
{sglang-0.1.13 → sglang-0.1.15}/sglang/backend/vertexai.py
@@ -3,6 +3,7 @@ import warnings
  from typing import List, Optional, Union

  import numpy as np
+
  from sglang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.interpreter import StreamExecutor
{sglang-0.1.13 → sglang-0.1.15}/sglang/global_config.py
@@ -12,6 +12,7 @@ class GlobalConfig:

          # Output configs
          self.skip_special_tokens_in_output = True
+         self.spaces_between_special_tokens_in_out = True

          # Optimization configs
          self.eager_fill_image = False
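The new flag sits next to `skip_special_tokens_in_output` and is forwarded to the runtime as `spaces_between_special_tokens` in the `RuntimeEndpoint` requests above. A one-line sketch of toggling it, assuming the package-level `global_config` export from the new `__init__.py`:

```python
import sglang as sgl

# Disable inserting spaces between special tokens in detokenized output.
sgl.global_config.spaces_between_special_tokens_in_out = False
```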
{sglang-0.1.13 → sglang-0.1.15}/sglang/lang/chat_template.py
@@ -162,6 +162,28 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="llama-3-instruct",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "<|start_header_id|>system<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+             "user": (
+                 "<|start_header_id|>user<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+             "assistant": (
+                 "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                 "<|eot_id|>",
+             ),
+         },
+         stop_str=("<|eot_id|>",),
+     )
+ )
+
  # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
  register_chat_template(
      ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
      )
  )

+ register_chat_template(
+     ChatTemplate(
+         name="dbrx-instruct",
+         default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+         role_prefix_and_suffix={
+             "system": ("<|im_start|>system\n", "<|im_end|>"),
+             "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+             "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+         },
+         stop_str=("<|im_end|>",),
+     )
+ )
+
+ register_chat_template(
+     ChatTemplate(
+         name="c4ai-command-r",
+         default_system_prompt=None,
+         role_prefix_and_suffix={
+             "system": (
+                 "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                 "<|END_OF_TURN_TOKEN|>",
+             ),
+             "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+             "assistant": (
+                 "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                 "<|END_OF_TURN_TOKEN|>",
+             ),
+         },
+         style=ChatTemplateStyle.PLAIN,
+     )
+ )
+
+
+ @register_chat_template_matching_function
+ def match_dbrx(model_path: str):
+     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+         return get_chat_template("dbrx-instruct")
+

  @register_chat_template_matching_function
  def match_vicuna(model_path: str):
@@ -214,6 +274,13 @@ def match_llama2_chat(model_path: str):
      return get_chat_template("llama-2-chat")


+ @register_chat_template_matching_function
+ def match_llama3_instruct(model_path: str):
+     model_path = model_path.lower()
+     if "llama-3" in model_path and "instruct" in model_path:
+         return get_chat_template("llama-3-instruct")
+
+
  @register_chat_template_matching_function
  def match_chat_ml(model_path: str):
      model_path = model_path.lower()
@@ -239,6 +306,13 @@ def match_gemma_it(model_path: str):
      return get_chat_template("gemma-it")


+ @register_chat_template_matching_function
+ def match_c4ai_command_r(model_path: str):
+     model_path = model_path.lower()
+     if "c4ai-command-r" in model_path:
+         return get_chat_template("c4ai-command-r")
+
+
  if __name__ == "__main__":
      messages = [
          {"role": "system", "content": None},  # None means default
{sglang-0.1.13 → sglang-0.1.15}/sglang/lang/interpreter.py
@@ -1,6 +1,7 @@
  """The interpreter that executes SGL programs"""

  import asyncio
+ import contextvars
  import multiprocessing
  import queue
  import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
  from typing import Any, Callable, Dict, List, Optional, Union

  import tqdm
+
  from sglang.global_config import global_config
  from sglang.lang.ir import (
      SglCommitLazy,
@@ -217,7 +219,13 @@ class StreamExecutor:
          self.use_thread = use_thread
          if self.use_thread:
              self.queue = queue.Queue()
-             self.worker = threading.Thread(target=self._thread_worker_func)
+
+             def _run_worker_in_context():
+                 self._thread_worker_func()
+
+             self.worker = threading.Thread(
+                 target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+             )
              self.worker.start()

          # For streaming
@@ -248,17 +256,24 @@ class StreamExecutor:
      def set_var(self, name, value):
          self.variables[name] = value

-     def get_meta_info(self, name):
+     def get_meta_info(self, name, timeout=None):
          if name in self.variable_event:
-             self.variable_event[name].wait()
+             got = self.variable_event[name].wait(timeout)
+             if not got:
+                 raise TimeoutError(f"Timeout while waiting for event '{name}'")
          ret = self.meta_info.get(name, None)
          return ret

-     def fork(self, number: int, position_ids_offset: Optional[List[int]] = None):
-         self.submit(SglCommitLazy())
-         self.sync()
+     def fork(
+         self,
+         size: int = 1,
+         position_ids_offset: Optional[List[int]] = None,
+     ):
+         if size > 1:
+             self.submit(SglCommitLazy())

-         number = int(number)
+         self.sync()
+         size = int(size)

          exes = [
              StreamExecutor(
@@ -268,14 +283,15 @@ class StreamExecutor:
                  self.chat_template,
                  self.stream,
              )
-             for _ in range(number)
+             for _ in range(size)
          ]
-         for i in range(number):
+         for i in range(size):
              exes[i].variables = dict(self.variables)
              exes[i].text_ = str(self.text_)
              exes[i].messages_ = list(self.messages_)
              exes[i].cur_role = self.cur_role
              exes[i].fork_start_text_pos = len(self.text_)
+             exes[i].images_ = list(self.images_)

          return exes

@@ -454,15 +470,19 @@ class StreamExecutor:
          self.stream_var_event[name].set()

      def _execute_select(self, expr: SglSelect):
-         decision, normalized_prompt_logprob, prompt_logprob = self.backend.select(
-             self, expr.choices, expr.temperature
-         )
+         (
+             decision,
+             normalized_prompt_logprobs,
+             prefill_token_logprobs,
+             decode_token_logprobs,
+         ) = self.backend.select(self, expr.choices, expr.temperature)
          if expr.name is not None:
              name = expr.name
              self.variables[name] = decision
              self.meta_info[name] = {
-                 "normalized_prompt_logprob": normalized_prompt_logprob,
-                 "prompt_logprob": prompt_logprob,
+                 "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                 "prefill_token_logprobs": prefill_token_logprobs,
+                 "decode_token_logprobs": decode_token_logprobs,
              }
              self.variable_event[name].set()
              self.text_ += decision
@@ -634,8 +654,12 @@ class ProgramState:
          yield
          self.stream_executor.submit(SglVarScopeEnd(name))

-     def fork(self, number: int = 1, position_ids_offset: Optional[List[int]] = None):
-         stream_executors = self.stream_executor.fork(number, position_ids_offset)
+     def fork(
+         self,
+         size: int = 1,
+         position_ids_offset: Optional[List[int]] = None,
+     ):
+         stream_executors = self.stream_executor.fork(size, position_ids_offset)
          states = [ProgramState(x) for x in stream_executors]
          state_group = ProgramStateGroup(states, self)
          return state_group
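`fork()` now takes `size` instead of `number`, only commits lazy operations when more than one branch is requested, and copies `images_` into each child executor. A rough sketch of the call site inside an `@sgl.function` body; the branch prompts are illustrative and `forks.join()` follows the usual state-group pattern:

```python
import sglang as sgl

@sgl.function
def branch_demo(s, text):
    s += "Text: " + text + "\n"
    forks = s.fork(size=2)  # renamed parameter
    forks[0] += "Summarize the text in one sentence.\n"
    forks[1] += "List three keywords from the text.\n"
    for f in forks:
        f += sgl.gen("out", max_tokens=32)
    forks.join()
```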
{sglang-0.1.13 → sglang-0.1.15}/sglang/lang/ir.py
@@ -73,7 +73,7 @@ class SglSamplingParams:
                  "Regular expression is not supported in the Anthropic backend."
              )
          return {
-             "max_tokens_to_sample": self.max_new_tokens,
+             "max_tokens": self.max_new_tokens,
              "stop_sequences": (
                  self.stop if isinstance(self.stop, (list, tuple)) else [self.stop]
              ),
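With the Messages API, the kwarg is `max_tokens` rather than the legacy `max_tokens_to_sample`. A quick sketch of the mapping; field names follow the `SglSamplingParams` dataclass in this file:

```python
from sglang.lang.ir import SglSamplingParams

params = SglSamplingParams(max_new_tokens=128, stop="\n")
kwargs = params.to_anthropic_kwargs()
# The renamed key feeds client.messages.create() in the Anthropic backend above.
assert kwargs["max_tokens"] == 128
assert kwargs["stop_sequences"] == ["\n"]
```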