sglang 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (61)
  1. sglang/__init__.py +57 -2
  2. sglang/api.py +8 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +83 -2
  9. sglang/lang/interpreter.py +92 -35
  10. sglang/lang/ir.py +12 -9
  11. sglang/lang/tracer.py +6 -4
  12. sglang/launch_server_llavavid.py +31 -0
  13. sglang/srt/constrained/fsm_cache.py +1 -0
  14. sglang/srt/constrained/jump_forward.py +1 -0
  15. sglang/srt/conversation.py +2 -2
  16. sglang/srt/flush_cache.py +16 -0
  17. sglang/srt/hf_transformers_utils.py +10 -2
  18. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  19. sglang/srt/layers/extend_attention.py +1 -0
  20. sglang/srt/layers/logits_processor.py +114 -54
  21. sglang/srt/layers/radix_attention.py +2 -1
  22. sglang/srt/layers/token_attention.py +1 -0
  23. sglang/srt/managers/detokenizer_manager.py +5 -1
  24. sglang/srt/managers/io_struct.py +27 -3
  25. sglang/srt/managers/router/infer_batch.py +97 -48
  26. sglang/srt/managers/router/manager.py +11 -8
  27. sglang/srt/managers/router/model_rpc.py +169 -90
  28. sglang/srt/managers/router/model_runner.py +110 -166
  29. sglang/srt/managers/router/radix_cache.py +89 -51
  30. sglang/srt/managers/router/scheduler.py +17 -28
  31. sglang/srt/managers/tokenizer_manager.py +110 -33
  32. sglang/srt/memory_pool.py +5 -14
  33. sglang/srt/model_config.py +11 -0
  34. sglang/srt/models/commandr.py +372 -0
  35. sglang/srt/models/dbrx.py +412 -0
  36. sglang/srt/models/dbrx_config.py +281 -0
  37. sglang/srt/models/gemma.py +24 -25
  38. sglang/srt/models/llama2.py +25 -26
  39. sglang/srt/models/llava.py +8 -10
  40. sglang/srt/models/llavavid.py +307 -0
  41. sglang/srt/models/mixtral.py +29 -33
  42. sglang/srt/models/qwen.py +34 -25
  43. sglang/srt/models/qwen2.py +25 -26
  44. sglang/srt/models/stablelm.py +26 -26
  45. sglang/srt/models/yivl.py +3 -5
  46. sglang/srt/openai_api_adapter.py +356 -0
  47. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  48. sglang/srt/sampling_params.py +2 -0
  49. sglang/srt/server.py +91 -456
  50. sglang/srt/server_args.py +79 -49
  51. sglang/srt/utils.py +212 -47
  52. sglang/srt/weight_utils.py +417 -0
  53. sglang/test/test_programs.py +8 -7
  54. sglang/test/test_utils.py +195 -7
  55. sglang/utils.py +77 -26
  56. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA +20 -18
  57. sglang-0.1.16.dist-info/RECORD +72 -0
  58. sglang-0.1.14.dist-info/RECORD +0 -64
  59. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/LICENSE +0 -0
  60. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/WHEEL +0 -0
  61. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -1,13 +1,20 @@
 """Common utilities for testing and benchmarking"""

+import asyncio
+from functools import partial
+
 import numpy as np
 import requests
+
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.global_config import global_config
+from sglang.srt.utils import get_exception_traceback


-def call_generate_lightllm(prompt, temperature, max_tokens, stop, url):
+def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
+    assert url is not None
+
     data = {
         "inputs": prompt,
         "parameters": {
@@ -22,7 +29,9 @@ def call_generate_lightllm(prompt, temperature, max_tokens, stop, url):
     return pred


-def call_generate_vllm(prompt, temperature, max_tokens, stop, url, n=1):
+def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None):
+    assert url is not None
+
     data = {
         "prompt": prompt,
         "temperature": temperature,
@@ -40,8 +49,10 @@ def call_generate_vllm(prompt, temperature, max_tokens, stop, url, n=1):


 def call_generate_outlines(
-    prompt, temperature, max_tokens, url, stop=[], regex=None, n=1
+    prompt, temperature, max_tokens, stop=[], regex=None, n=1, url=None
 ):
+    assert url is not None
+
     data = {
         "prompt": prompt,
         "temperature": temperature,
@@ -59,7 +70,9 @@ def call_generate_outlines(
     return pred


-def call_generate_srt_raw(prompt, temperature, max_tokens, stop, url):
+def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
+    assert url is not None
+
     data = {
         "text": prompt,
         "sampling_params": {
@@ -75,7 +88,71 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop, url):
     return pred


-def call_select_lightllm(context, choices, url):
+def call_generate_guidance(
+    prompt, temperature, max_tokens, stop=None, n=1, regex=None, model=None
+):
+    assert model is not None
+    from guidance import gen
+
+    rets = []
+    for _ in range(n):
+        out = (
+            model
+            + prompt
+            + gen(
+                name="answer",
+                max_tokens=max_tokens,
+                temperature=temperature,
+                stop=stop,
+                regex=regex,
+            )
+        )
+        rets.append(out["answer"])
+    return rets if n > 1 else rets[0]
+
+
+async def call_generate_lmql(
+    prompt, temperature, max_tokens, stop=None, n=1, max_len=4096, model=None, **kwargs
+):
+    assert model is not None
+    import lmql
+
+    if stop != None:
+
+        @lmql.query(model=model)
+        async def program(question, max_tokens, stop):
+            '''lmql
+            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and STOPS_AT(ANSWER, stop)
+            return ANSWER
+            '''
+
+    else:
+
+        @lmql.query(model=model)
+        async def program(question, max_tokens):
+            '''lmql
+            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens
+            return ANSWER
+            '''
+
+    tasks = [
+        program(
+            question=prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            stop=stop,
+            max_len=max_len,
+            **kwargs,
+        )
+        for _ in range(n)
+    ]
+    rets = await asyncio.gather(*tasks)
+    return rets if n > 1 else rets[0]
+
+
+def call_select_lightllm(context, choices, url=None):
+    assert url is not None
+
     scores = []
     for i in range(len(choices)):
         data = {
@@ -90,7 +167,9 @@ def call_select_lightllm(context, choices, url):
     return np.argmax(scores)


-def call_select_vllm(context, choices, url):
+def call_select_vllm(context, choices, url=None):
+    assert url is not None
+
     scores = []
     for i in range(len(choices)):
         data = {
@@ -112,6 +191,31 @@ def call_select_vllm(context, choices, url):
     """


+def call_select_guidance(context, choices, model=None):
+    assert model is not None
+    from guidance import select
+
+    out = model + context + select(choices, name="answer")
+    return choices.index(out["answer"])
+
+
+async def call_select_lmql(context, choices, temperature=0, max_len=4096, model=None):
+    assert model is not None
+    import lmql
+
+    @lmql.query(model=model)
+    async def program(ctx, choices):
+        '''lmql
+        """{ctx}[ANSWER]""" where ANSWER in set(choices)
+        return ANSWER
+        '''
+
+    answer = await program(
+        ctx=context, choices=choices, temperature=temperature, max_len=max_len
+    )
+    return choices.index(answer)
+
+
 def add_common_other_args_and_parse(parser):
     parser.add_argument("--parallel", type=int, default=64)
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
@@ -120,8 +224,17 @@ def add_common_other_args_and_parse(parser):
         "--backend",
         type=str,
         required=True,
-        choices=["vllm", "lightllm", "guidance", "lmql", "srt-raw", "llama.cpp"],
+        choices=[
+            "vllm",
+            "outlines",
+            "lightllm",
+            "guidance",
+            "lmql",
+            "srt-raw",
+            "llama.cpp",
+        ],
     )
+    parser.add_argument("--n-ctx", type=int, default=4096)
     parser.add_argument(
         "--model-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
     )
@@ -131,6 +244,7 @@ def add_common_other_args_and_parse(parser):
     if args.port is None:
         default_port = {
             "vllm": 21000,
+            "outlines": 21000,
             "lightllm": 22000,
             "lmql": 23000,
             "srt-raw": 30000,
@@ -160,3 +274,77 @@ def select_sglang_backend(args):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
     return backend
+
+
+def _get_call_generate(args):
+    if args.backend == "lightllm":
+        return partial(call_generate_lightllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "vllm":
+        return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "srt-raw":
+        return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "outlines":
+        return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "guidance":
+        from guidance import models
+
+        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)
+        call_generate = partial(call_generate_guidance, model=model)
+        call_generate("Hello,", 1.0, 8, ".")
+        return call_generate
+    elif args.backend == "lmql":
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        return partial(call_generate_lmql, model=model)
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+
+def _get_call_select(args):
+    if args.backend == "lightllm":
+        return partial(call_select_lightllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "vllm":
+        return partial(call_select_vllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "guidance":
+        from guidance import models
+
+        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)
+        call_select = partial(call_select_guidance, model=model)
+
+        call_select("Hello,", ["world", "earth"])
+        return call_select
+
+    elif args.backend == "lmql":
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        return partial(call_select_lmql, model=model)
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+
+def get_call_generate(args):
+    call_generate = _get_call_generate(args)
+
+    def func(*args, **kwargs):
+        try:
+            return call_generate(*args, **kwargs)
+        except Exception:
+            print("Exception in call_generate:\n" + get_exception_traceback())
+            raise
+
+    return func
+
+
+def get_call_select(args):
+    call_select = _get_call_select(args)
+
+    def func(*args, **kwargs):
+        try:
+            return call_select(*args, **kwargs)
+        except Exception:
+            print("Exception in call_select:\n" + get_exception_traceback())
+            raise
+
+    return func
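Taken together, the new helpers give every benchmark script a uniform calling convention: `_get_call_generate` binds the backend-specific URL or model with `functools.partial`, and `get_call_generate` wraps the result so any failure prints a full traceback before re-raising. Below is a minimal usage sketch, assuming a vLLM-compatible server is already running on its default port and that `add_common_other_args_and_parse` returns the parsed arguments (as its internal use of `args.port` suggests):

```python
import argparse

from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate

parser = argparse.ArgumentParser()
args = add_common_other_args_and_parse(parser)  # e.g. invoked with: --backend vllm

# The endpoint URL (or local model) is already bound via functools.partial,
# so callers pass only the prompt and sampling arguments.
call_generate = get_call_generate(args)
answer = call_generate("The capital of France is", temperature=0, max_tokens=16)
print(answer)
```

Note that the lmql variants are coroutines, so a caller would need `asyncio.run` (or a running event loop) around them rather than the plain call shown here.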
sglang/utils.py CHANGED
@@ -2,40 +2,23 @@

 import base64
 import json
+import os
+import sys
 import threading
+import traceback
 import urllib.request
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from json import dumps

+import numpy as np
 import requests


-def get_available_gpu_memory(gpu_id, distributed=True):
-    """
-    Get available memory for cuda:gpu_id device.
-    When distributed is True, the available memory is the minimum available memory of all GPUs.
-    """
-    import torch
-
-    num_gpus = torch.cuda.device_count()
-    assert gpu_id < num_gpus
-
-    if torch.cuda.current_device() != gpu_id:
-        print(
-            f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
-            "which may cause useless memory allocation for torch CUDA context.",
-        )
-
-    free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
-
-    if distributed:
-        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
-            torch.device("cuda", gpu_id)
-        )
-        torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
-        free_gpu_memory = tensor.item()
-
-    return free_gpu_memory / (1 << 30)
+def get_exception_traceback():
+    etype, value, tb = sys.exc_info()
+    err_str = "".join(traceback.format_exception(etype, value, tb))
+    return err_str


 def is_same_type(values):
@@ -130,6 +113,74 @@ def encode_image_base64(image_path):
     return base64.b64encode(buffered.getvalue()).decode("utf-8")


+def encode_frame(frame):
+    import cv2  # pip install opencv-python-headless
+    from PIL import Image
+
+    # Convert the frame to RGB (OpenCV uses BGR by default)
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+    # Convert the frame to PIL Image to easily convert to bytes
+    im_pil = Image.fromarray(frame)
+
+    # Convert to bytes
+    buffered = BytesIO()
+
+    # frame_format = str(os.getenv('FRAME_FORMAT', "JPEG"))
+
+    im_pil.save(buffered, format="PNG")
+
+    frame_bytes = buffered.getvalue()
+
+    # Return the bytes of the frame
+    return frame_bytes
+
+
+def encode_video_base64(video_path, num_frames=16):
+    import cv2
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise IOError(f"Could not open video file:{video_path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    print(f"target_frames: {num_frames}")
+
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+
+    frames = []
+    for i in range(total_frames):
+        ret, frame = cap.read()
+        if ret:
+            frames.append(frame)
+        else:
+            # Handle the case where the frame could not be read
+            # print(f"Warning: Could not read frame at index {i}.")
+            pass
+
+    cap.release()
+
+    # Safely select frames based on frame_indices, avoiding IndexError
+    frames = [frames[i] for i in frame_indices if i < len(frames)]
+
+    # If there are not enough frames, duplicate the last frame until we reach the target
+    while len(frames) < num_frames:
+        frames.append(frames[-1])
+
+    # Use ThreadPoolExecutor to process and encode frames in parallel
+    with ThreadPoolExecutor() as executor:
+        encoded_frames = list(executor.map(encode_frame, frames))
+
+    # encoded_frames = list(map(encode_frame, frames))
+
+    # Concatenate all frames bytes
+    video_bytes = b"".join(encoded_frames)
+
+    # Encode the concatenated bytes to base64
+    video_base64 = "video:" + base64.b64encode(video_bytes).decode("utf-8")
+
+    return video_base64
+
+
 def _is_chinese_char(cp):
     """Checks whether CP is the codepoint of a CJK character."""
     # This defines a "chinese character" as anything in the CJK Unicode block:
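These video helpers back the new `llavavid` support added elsewhere in this release: `encode_video_base64` samples `num_frames` evenly spaced frames with OpenCV, PNG-encodes them in a thread pool via `encode_frame`, and returns a single base64 payload with a `video:` prefix. A minimal usage sketch (the file name is hypothetical; `opencv-python-headless`, Pillow, and NumPy must be installed):

```python
from sglang.utils import encode_video_base64

# Hypothetical input file; any container OpenCV can open works.
video_b64 = encode_video_base64("demo.mp4", num_frames=16)

# The payload is "video:" followed by base64 of the concatenated
# PNG-encoded frames, so consumers must re-split the PNG stream.
assert video_b64.startswith("video:")
print(len(video_b64))
```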
{sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.14
+Version: 0.1.16
 Summary: A structured generation langauge for LLMs.
 License: Apache License
 Version 2.0, January 2004
@@ -212,6 +212,7 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
+Requires-Dist: tqdm
 Provides-Extra: all
 Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
@@ -222,6 +223,7 @@ Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
 Requires-Dist: numpy ; extra == 'openai'
+Requires-Dist: tiktoken ; extra == 'openai'
 Provides-Extra: srt
 Requires-Dist: aiohttp ; extra == 'srt'
 Requires-Dist: fastapi ; extra == 'srt'
@@ -231,16 +233,14 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm >=0.3.3 ; extra == 'srt'
+Requires-Dist: vllm >=0.4.2 ; extra == 'srt'
 Requires-Dist: interegular ; extra == 'srt'
-Requires-Dist: lark ; extra == 'srt'
-Requires-Dist: numba ; extra == 'srt'
 Requires-Dist: pydantic ; extra == 'srt'
-Requires-Dist: referencing ; extra == 'srt'
-Requires-Dist: diskcache ; extra == 'srt'
-Requires-Dist: cloudpickle ; extra == 'srt'
 Requires-Dist: pillow ; extra == 'srt'
-Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
+Requires-Dist: packaging ; extra == 'srt'
+Requires-Dist: huggingface-hub ; extra == 'srt'
+Requires-Dist: hf-transfer ; extra == 'srt'
+Requires-Dist: outlines >=0.0.34 ; extra == 'srt'

 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -541,7 +541,6 @@ curl http://localhost:30000/generate \
 Learn more about the argument format [here](docs/sampling_params.md).

 ### OpenAI Compatible API
-
 In addition, the server supports an experimental OpenAI-compatible API.

 ```python
@@ -571,15 +570,17 @@ response = client.chat.completions.create(
 print(response)
 ```

-In above example, the server uses the chat template specified in the model tokenizer.
-You can override the chat template if needed when launching the server:
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:

 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
 ```

 If the chat template you are looking for is missing, you are welcome to contribute it.
-Meanwhile, you can also temporary register your chat template as follows:
+Meanwhile, you can also temporarily register your chat template as follows:

 ```json
 {
@@ -606,7 +607,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- You can turn on [flashinfer](docs/flashinfer.md) to acclerate the inference by using highly optimized CUDA kernels.
+- You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.

 ### Supported Models
 - Llama
@@ -622,10 +623,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
-- AWQ/GPTQ quantization
+- StableLM
+- Command-R
+- DBRX
+- AWQ/GPTQ/Marlin quantization

-## Benchmark And Performance
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).

+## Benchmark And Performance
 - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
 ![llama_7b](assets/llama_7b.jpg)

@@ -649,7 +654,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```

-[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/2312.07104)
-
-
 We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
sglang-0.1.16.dist-info/RECORD ADDED
@@ -0,0 +1,72 @@
+sglang/__init__.py,sha256=lKabCNZM2OhtymVLUuW4bpt-Jdxwk81wP1TkhVqIJEg,1058
+sglang/api.py,sha256=hnVPt_p2ALLrraAKpVbkGocVtgb0MqgOH5NUQKOA6sY,4548
+sglang/global_config.py,sha256=LxoF7VGCYszeEafC8zBbzUQ5PPFdv2rPzw2zEGPLgfg,961
+sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
+sglang/launch_server_llavavid.py,sha256=UWo_qUCJ9yknp1TVPzrz4B_aZtEuQpLQq0l96FMgynI,1058
+sglang/utils.py,sha256=Xp5mmhLoXNLB5U0NmCg-WMkfV0Ov4KVqzOvGZa3XKmc,7610
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=gpxYWNRKDiRs1-dUUA53tuBH6TT2mSVgi-J9iOKuNNo,2075
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/openai.py,sha256=QQS09WHqMpgg70r-uB1LocqxUZ7vhv4R3FHlt7NNaKg,9583
+sglang/backend/runtime_endpoint.py,sha256=ZnQ4DtbNIUr_Me5F6iYwMYsYhom8ZCs6A5kRjWwAANA,8695
+sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
+sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
+sglang/lang/interpreter.py,sha256=GSIbO9N6ThfchdURb7XzQMZ9U6p1xirKHgXGmqLxKtg,28434
+sglang/lang/ir.py,sha256=NxvIWlUidvtpQpPG4GAXZEN64Y2vLOBjN2Z2JkZVG1U,13350
+sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
+sglang/srt/conversation.py,sha256=NwTVuQXd3NqPq5WCllaYUgPLG2w2pMMbzIKDQfJMMO0,15491
+sglang/srt/flush_cache.py,sha256=JOXLH4pmweVbuEWDPu3SEDrLYFG82nR2SpzbslW4b-A,381
+sglang/srt/hf_transformers_utils.py,sha256=UneOMsw3w7taH9EKIi6uHZ-GNUZG0vbZIWN-ZoQZ5gM,5417
+sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=843L1KxEPZcEk1uwQH10BwSX9L5DYJ3OGUUBo8wMdZg,1695
+sglang/srt/openai_api_adapter.py,sha256=w3zvahyzvCnQd2pphQ6ViRBgHJmyI-TyIul6Q-CBY5Q,13214
+sglang/srt/openai_protocol.py,sha256=87pLM0hxocd5LUvhYopnL61cEKz3iu8TKdJtHbk3C5o,5211
+sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+sglang/srt/server.py,sha256=YAUiniJs9ebNrJ0Lweg2TnUL_yZ0P3PtWoT0Z_3d8vk,10371
+sglang/srt/server_args.py,sha256=TQxIEdF0crqtY6WfZ6q7SKOQcCSomBEVjJ5K4HyTSvQ,9539
+sglang/srt/utils.py,sha256=cr2uZmEB-Exq-wi3Y8B3yQu7kFUiyV4PAvzouvKYkWg,13090
+sglang/srt/weight_utils.py,sha256=bFNh9-T8gseB0zKeu1qsMww8FpyrGFxbPcOFSeJtL5Q,15505
+sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=B9FPtpqzm4jKqciXTbfgNJL44hV2-rUG6-omDECN7iA,902
+sglang/srt/constrained/jump_forward.py,sha256=fUa4AlnGX40gYiWTLuICTJfq4b7wA3AL5dydTqT3jz4,2483
+sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+sglang/srt/layers/extend_attention.py,sha256=5gvRggy6qPLrLvjctoMMsYh1w70mOGxiPjxstHqjqsY,12623
+sglang/srt/layers/logits_processor.py,sha256=Vbkr6ANNfiBGkkNobqjNm1KQTqtuYQWZvmPjhhIWnS8,7267
+sglang/srt/layers/radix_attention.py,sha256=PBucvAdGI27Z1qQOUxUi-YJp-tKGm6LX3L2kp99pOV4,5598
+sglang/srt/layers/token_attention.py,sha256=Wm-Gj0VdmFE8krZeHjDWic9dmVxRvg1WRAIHbbA3M34,8517
+sglang/srt/managers/detokenizer_manager.py,sha256=-zuI2ZLyLD3wf21u8xWZm91JkcZZ57DwUFbFxnP2vFI,3462
+sglang/srt/managers/io_struct.py,sha256=fFfUQtC-D31xGYdCAfuNVuX3QyaNDgGpfzC8qnKt0YA,4294
+sglang/srt/managers/tokenizer_manager.py,sha256=TlGyFhWz1b24vkeUVvCwKFBERffi-esxGRhoukBnET8,13116
+sglang/srt/managers/router/infer_batch.py,sha256=a1F3EjSBdER5pbgZFifuTdrE2Xom8Mt4aT9rmB8n35M,20204
+sglang/srt/managers/router/manager.py,sha256=tdvYmwGHMeG2MMYZ4ZThdAJ_b4fp94UpemISFWOddno,2697
+sglang/srt/managers/router/model_rpc.py,sha256=FJFgf1KAJ0Z8Yq4EPyczxZkCmZBjwNwCwXcjwyhU0k4,29775
+sglang/srt/managers/router/model_runner.py,sha256=fp9wPh4sQY6Q-5PVtv_e9p5GgkkixSDUIqfFt7lVlV8,16527
+sglang/srt/managers/router/radix_cache.py,sha256=GE6oY8bppRJCIxZWiDKO4P6al58zcqLQe605Y1d2bdo,7924
+sglang/srt/managers/router/scheduler.py,sha256=pvlKSyCyIXmu14eyy1mvP9-QdG78eLUqMlr4cnfes2Y,2259
+sglang/srt/models/commandr.py,sha256=DVdUF5C5etm82RoXJTNjYqlS2W2_9awzxzXNMubRoVg,13579
+sglang/srt/models/dbrx.py,sha256=NIhlJp2B_y_L1ltK_Y7SEenAiHTUUp3p1rf8LIydC0o,14173
+sglang/srt/models/dbrx_config.py,sha256=6EKMCAP1kS4pkQ9Ycr39PeEeTCPG4JhKRm2rtA4jS2s,11071
+sglang/srt/models/gemma.py,sha256=Wk25zFkqkdG62xVVJEzeIjDES1LnoO0EY2W2p9XMvbA,11637
+sglang/srt/models/llama2.py,sha256=Y2XwS5XXG77OfPAvbju7zp53CP5izzee_4-laVqu5ZM,11655
+sglang/srt/models/llava.py,sha256=HtR7lUnAYW39vWw6xmDZkbG7AueswZDJxXeu6rQfpSU,14921
+sglang/srt/models/llavavid.py,sha256=ueImEwOR4ZlNFUoBvXbwZPNRcrYWg54sPNK7pmGnrp0,13219
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=1aggGw0P0MVQu5C5D3pMaZpRpY_PmrK_nwBOygOlPEM,13839
+sglang/srt/models/qwen.py,sha256=cakvxjghKdGg5iGq9TJ_nGlVQaJ4-9V91EyyZnV4rmc,9390
+sglang/srt/models/qwen2.py,sha256=PyOA8-RA_frRVLXfh8d1Ui1hUd1YmM3GfsPw2q5rCDI,11351
+sglang/srt/models/stablelm.py,sha256=TCfQumj0acu2lCGujJj_PuzHFp3kFIwENQEfT-hnHUA,10867
+sglang/srt/models/yivl.py,sha256=q8MUvIFIWpKCQ4pSZBoFpw-pnbdjkfr-M8jBJfGFu7E,4393
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=-2AoddzOOmXoj3muVUKX6Uih63UNTm3MFg2fcNnsy7Y,11498
+sglang/test/test_utils.py,sha256=9VFNGUMW0LBvmtDEHZ7ponakv5ZVF7B2Lg3xX353DXw,10083
+sglang-0.1.16.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.16.dist-info/METADATA,sha256=yiziPDpVr6NPPhX58sA0GaLYKCut4FnBKD7TE50HH6k,28911
+sglang-0.1.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sglang-0.1.16.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.16.dist-info/RECORD,,
sglang-0.1.14.dist-info/RECORD DELETED
@@ -1,64 +0,0 @@
-sglang/__init__.py,sha256=Nxa2M7XCh2-e6I7VrCg7OSBL6BvEW3gyRD14ZdykpRM,96
-sglang/api.py,sha256=0-Eh7c41hWKjPXrzzvLFdLAUVkvmPGJGLAsrG9evDTE,4576
-sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
-sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
-sglang/utils.py,sha256=2dUXLMPz9VhhzbIRQABmfZnVW5yz61F3UVtb6yKyevM,6237
-sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/backend/anthropic.py,sha256=GJ_T1Jg0VOtajgkgczPKt5sjuVYdbAiWd2jXlJRNRmg,1677
-sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
-sglang/backend/openai.py,sha256=nPdA88A5GISJTH88svJdww3qHWIHZcGG2NEn0XjMkLU,9578
-sglang/backend/runtime_endpoint.py,sha256=r7dTazselaudlFx8hqk-PQLYDHZhpbAKjyFF1zLuM_E,8022
-sglang/backend/vertexai.py,sha256=BLfWf_tEgoHY9srCufJM5PLe3tql2j0G6ia7cPykxCM,4713
-sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/lang/chat_template.py,sha256=MaCF0fvNky0nJC9OvmAeApeHYgM6Lr03mtRhF0lS31U,8000
-sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
-sglang/lang/interpreter.py,sha256=ahRxuEJZ7b1Tts2Lr7wViWIqL-Z12T3anvgj0XdvMN8,26666
-sglang/lang/ir.py,sha256=8Ap-uEUz6K9eNQTOKtMixePuLwRFHFKcN0Z5Yn44nKk,13320
-sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
-sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
-sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
-sglang/srt/hf_transformers_utils.py,sha256=soRyYLoCn7GxgxvonufGFkdFBA3eH5i3Izk_wi7p1l0,5285
-sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,3609
-sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
-sglang/srt/model_config.py,sha256=ned-odjmKBKBhVPo04FEpus9gJsUWxrFLrLxahLwSaw,1328
-sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
-sglang/srt/server.py,sha256=WLXissKuXQI7JFb2V8D47QSF-PPHnW-JZCiQm4YW0xE,24070
-sglang/srt/server_args.py,sha256=bvbi-Rb_JudqztFFfRsuXBYtUsG9hq4zMFt7X97uDhA,8954
-sglang/srt/utils.py,sha256=IEqpmWx_hl4eXn_KoHM0EPXmxeN2wKkgK7H01_t0x5Q,7355
-sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
-sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
-sglang/srt/constrained/fsm_cache.py,sha256=20mEgtDXU1Zeoicl5KBQC3arkg-RhRWiYnchJc00m1g,901
-sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
-sglang/srt/layers/context_flashattention_nopad.py,sha256=TVYQ6IjftWVXORmKpEROMqQxDOnF6n2g0G1Ci4LquYM,5209
-sglang/srt/layers/extend_attention.py,sha256=KGqQOA5mel9qScXMAQP_3Qyhp3BNbiQ7Y_6wi38Lxcs,12622
-sglang/srt/layers/logits_processor.py,sha256=MW2bpqSXyghODMojqeMSYWZhUHuAFPk_gUkyyLw9HkM,4827
-sglang/srt/layers/radix_attention.py,sha256=bqrb8H8K8RbKTr1PzVmpnUxRzMj0H-OWCi1JYZKuRDw,5597
-sglang/srt/layers/token_attention.py,sha256=waOjGsWZlvf6epFhYerRJlAaMwvDTy_Z3uzPaXsVQUU,8516
-sglang/srt/managers/detokenizer_manager.py,sha256=1lPNh_Pe6Pr0v-TzlCBBREbvz4uFWxyw31SmnEZh0s8,3292
-sglang/srt/managers/io_struct.py,sha256=nXJh3CrOvv9MdAfIFoo6SCXuNQTG3KswmRKkwF61Tek,3141
-sglang/srt/managers/openai_protocol.py,sha256=cttqg9iv3de8fhtCqDI4cYoPPZ_gULedMXstV1ok6WA,4563
-sglang/srt/managers/tokenizer_manager.py,sha256=hgsR9AMj6ic9S3-2WiELh7Hnp8Xnb_bzp7kpbjHwHtM,9733
-sglang/srt/managers/router/infer_batch.py,sha256=U-Ckt9ad1WaOQF_dW6Eo9AMIRQoOJQ-Pm-MMXnEmPP8,18399
-sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
-sglang/srt/managers/router/model_rpc.py,sha256=VlwLNpHZ92bnteQl4PhVKoAXM0C8Y4_2LBBVaffeu3g,26766
-sglang/srt/managers/router/model_runner.py,sha256=-wWv00EbB_UkkLpio6VKGBTagfzxLHfY-eKDDQ0rZQc,18292
-sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
-sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
-sglang/srt/models/gemma.py,sha256=8XlfHPtVixPYYjz5F9T4DOAuoordWFStmyFFWGfny1k,11582
-sglang/srt/models/llama2.py,sha256=VL4iN8R3wyTNr0bDxxKdLNnVGEvdXF6iGvA768YeakA,11611
-sglang/srt/models/llava.py,sha256=42sn-AgI-6dMaTEU4aEbi4Js5epy0J3JVQoMooUOKt8,14922
-sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
-sglang/srt/models/mixtral.py,sha256=wqIwKfR90ih0gDiTZkFZcQD4PIYpZFD3CmzxRcuKIqw,13915
-sglang/srt/models/qwen.py,sha256=CvdbcF90aI1tJPSQ-3OMUaQGMuaxCGe0y29m5nU_Yj0,9225
-sglang/srt/models/qwen2.py,sha256=myPc0wvgf5ZzJyGhUGN49YjY-tMf4t8Jn_Imjg8D7Mk,11307
-sglang/srt/models/stablelm.py,sha256=vMZUNgwXKPGYr5FcdYHw5g3QifVu9owKqq51_-EBOY0,10817
-sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
-sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
-sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
-sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
-sglang/test/test_utils.py,sha256=6PhTRi8UnR-BRNjit6aGu0M5lO0RebNQwEcDt712hE4,4830
-sglang-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.14.dist-info/METADATA,sha256=C5N0VOYRHixdJcsf4dExIvP-Q099kYBMKs_dA4LBXSM,28809
-sglang-0.1.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-sglang-0.1.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.14.dist-info/RECORD,,