sglang 0.1.21__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
Files changed (79)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +976 -0
  7. sglang/check_env.py +171 -0
  8. sglang/global_config.py +3 -2
  9. sglang/lang/backend/__init__.py +0 -0
  10. sglang/lang/backend/anthropic.py +77 -0
  11. sglang/lang/backend/base_backend.py +80 -0
  12. sglang/lang/backend/litellm.py +90 -0
  13. sglang/lang/backend/openai.py +438 -0
  14. sglang/lang/backend/runtime_endpoint.py +283 -0
  15. sglang/lang/backend/vertexai.py +149 -0
  16. sglang/lang/interpreter.py +1 -0
  17. sglang/lang/tracer.py +1 -1
  18. sglang/launch_server.py +1 -1
  19. sglang/launch_server_llavavid.py +1 -4
  20. sglang/srt/conversation.py +1 -1
  21. sglang/srt/hf_transformers_utils.py +13 -1
  22. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  23. sglang/srt/layers/extend_attention.py +0 -39
  24. sglang/srt/layers/linear.py +869 -0
  25. sglang/srt/layers/logits_processor.py +4 -5
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +39 -24
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +72 -28
  31. sglang/srt/managers/controller/infer_batch.py +90 -63
  32. sglang/srt/managers/controller/manager_multi.py +107 -100
  33. sglang/srt/managers/controller/manager_single.py +76 -96
  34. sglang/srt/managers/controller/model_runner.py +41 -26
  35. sglang/srt/managers/controller/schedule_heuristic.py +8 -3
  36. sglang/srt/managers/controller/tp_worker.py +136 -149
  37. sglang/srt/managers/detokenizer_manager.py +49 -5
  38. sglang/srt/managers/io_struct.py +36 -17
  39. sglang/srt/managers/tokenizer_manager.py +228 -125
  40. sglang/srt/memory_pool.py +32 -11
  41. sglang/srt/model_loader/model_loader.py +277 -0
  42. sglang/srt/model_loader/utils.py +260 -0
  43. sglang/srt/models/chatglm.py +1 -0
  44. sglang/srt/models/dbrx.py +1 -0
  45. sglang/srt/models/deepseek.py +430 -0
  46. sglang/srt/models/gpt_bigcode.py +282 -0
  47. sglang/srt/models/grok.py +1 -0
  48. sglang/srt/models/internlm2.py +317 -0
  49. sglang/srt/models/llama2.py +81 -23
  50. sglang/srt/models/llama_classification.py +1 -0
  51. sglang/srt/models/llava.py +1 -0
  52. sglang/srt/models/llavavid.py +1 -0
  53. sglang/srt/models/minicpm.py +1 -0
  54. sglang/srt/models/mixtral.py +1 -0
  55. sglang/srt/models/mixtral_quant.py +1 -0
  56. sglang/srt/models/qwen.py +1 -0
  57. sglang/srt/models/qwen2.py +6 -0
  58. sglang/srt/models/qwen2_moe.py +7 -4
  59. sglang/srt/models/stablelm.py +1 -0
  60. sglang/srt/openai_api/adapter.py +432 -0
  61. sglang/srt/openai_api/api_adapter.py +432 -0
  62. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  63. sglang/srt/openai_api/openai_protocol.py +207 -0
  64. sglang/srt/openai_api/protocol.py +208 -0
  65. sglang/srt/openai_protocol.py +17 -0
  66. sglang/srt/sampling_params.py +2 -0
  67. sglang/srt/server.py +132 -84
  68. sglang/srt/server_args.py +35 -21
  69. sglang/srt/utils.py +65 -117
  70. sglang/test/test_conversation.py +1 -1
  71. sglang/test/test_openai_protocol.py +1 -1
  72. sglang/test/test_programs.py +1 -1
  73. sglang/test/test_utils.py +2 -2
  74. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/METADATA +162 -168
  75. sglang-0.1.24.dist-info/RECORD +105 -0
  76. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/WHEEL +1 -1
  77. sglang-0.1.21.dist-info/RECORD +0 -82
  78. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/LICENSE +0 -0
  79. {sglang-0.1.21.dist-info → sglang-0.1.24.dist-info}/top_level.txt +0 -0
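Notable additions in this list: new frontend backend modules under sglang/lang/backend/, a new sglang/srt/openai_api/ package (OpenAI-compatible adapter and protocol models), and standalone benchmarking/diagnostic scripts (sglang/bench_serving.py, sglang/check_env.py). As a quick post-upgrade sanity check, the new environment checker can likely be run as a module; the snippet below is a hypothetical invocation and assumes check_env exposes a `__main__` entry point, which is not shown in this diff.

```python
# Hypothetical post-upgrade check: run the newly added sglang.check_env module
# in a subprocess. Assumes it is executable via `python -m sglang.check_env`,
# which this diff does not confirm.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "sglang.check_env"], check=True)
```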
@@ -355,6 +355,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()
 
+    @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -400,9 +401,11 @@ class Qwen2MoeForCausalLM(nn.Module):
             # These are the weights for the experts
             # (param_name, weight_name, expert_id, shard_id)
             (
-                "experts.w13_weight"
-                if weight_name in ["gate_proj", "up_proj"]
-                else "experts.w2_weight",
+                (
+                    "experts.w13_weight"
+                    if weight_name in ["gate_proj", "up_proj"]
+                    else "experts.w2_weight"
+                ),
                 f"experts.{expert_id}.{weight_name}.weight",
                 expert_id,
                 shard_id,
@@ -417,7 +420,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
@@ -235,6 +235,7 @@ class StableLmForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
+    @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
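Both model changes above decorate `forward` with `@torch.no_grad()`, so no autograd graph is built during inference. The sketch below is a minimal, hypothetical module (not sglang code) illustrating the effect of the decorator.

```python
# Minimal sketch (hypothetical module, not sglang code): decorating a forward
# method with @torch.no_grad() means activations produced inside the call
# carry no gradient history.
import torch
import torch.nn as nn


class TinyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


out = TinyLM()(torch.randn(2, 8))
print(out.requires_grad)  # False: no autograd graph was recorded
```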
@@ -0,0 +1,432 @@
+"""Conversion between OpenAI APIs and native SRT APIs"""
+
+import asyncio
+import json
+import os
+from http import HTTPStatus
+
+from fastapi import Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from sglang.srt.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    generate_chat_conv,
+    register_conv_template,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.openai_api.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
+    ErrorResponse,
+    LogProbs,
+    UsageInfo,
+)
+
+chat_template_name = None
+
+
+def create_error_response(
+    message: str,
+    err_type: str = "BadRequestError",
+    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+):
+    error = ErrorResponse(message=message, type=err_type, code=status_code.value)
+    return JSONResponse(content=error.model_dump(), status_code=error.code)
+
+
+def create_streaming_error_response(
+    message: str,
+    err_type: str = "BadRequestError",
+    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+) -> str:
+    error = ErrorResponse(message=message, type=err_type, code=status_code.value)
+    json_str = json.dumps({"error": error.model_dump()})
+    return json_str
+
+
+def load_chat_template_for_openai_api(chat_template_arg):
+    global chat_template_name
+
+    print(f"Use chat template: {chat_template_arg}")
+    if not chat_template_exists(chat_template_arg):
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+        with open(chat_template_arg, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+            register_conv_template(
+                Conversation(
+                    name=template["name"],
+                    system_template=template["system"] + "\n{system_message}",
+                    system_message=template.get("system_message", ""),
+                    roles=(template["user"], template["assistant"]),
+                    sep_style=sep_style,
+                    sep=template.get("sep", "\n"),
+                    stop_str=template["stop_str"],
+                ),
+                override=True,
+            )
+        chat_template_name = template["name"]
+    else:
+        chat_template_name = chat_template_arg
+
+
+async def v1_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = CompletionRequest(**request_json)
+
+    adapted_request = GenerateReqInput(
+        text=request.prompt,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": request.stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+            "n": request.n,
+            "ignore_eos": request.ignore_eos,
+        },
+        return_logprob=request.logprobs is not None and request.logprobs > 0,
+        top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
+        return_text_in_logprobs=True,
+        stream=request.stream,
+    )
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            stream_buffer = ""
+            n_prev_token = 0
+            try:
+                async for content in tokenizer_manager.generate_request(
+                    adapted_request, raw_request
+                ):
+                    text = content["text"]
+                    prompt_tokens = content["meta_info"]["prompt_tokens"]
+                    completion_tokens = content["meta_info"]["completion_tokens"]
+
+                    if not stream_buffer:  # The first chunk
+                        if request.echo:
+                            # Prepend prompt in response text.
+                            text = request.prompt + text
+
+                    if request.logprobs:
+                        # The first chunk and echo is enabled.
+                        if not stream_buffer and request.echo:
+                            prefill_token_logprobs = content["meta_info"][
+                                "prefill_token_logprobs"
+                            ]
+                            prefill_top_logprobs = content["meta_info"][
+                                "prefill_top_logprobs"
+                            ]
+                        else:
+                            prefill_token_logprobs = None
+                            prefill_top_logprobs = None
+
+                        logprobs = to_openai_style_logprobs(
+                            prefill_token_logprobs=prefill_token_logprobs,
+                            prefill_top_logprobs=prefill_top_logprobs,
+                            decode_token_logprobs=content["meta_info"][
+                                "decode_token_logprobs"
+                            ][n_prev_token:],
+                            decode_top_logprobs=content["meta_info"][
+                                "decode_top_logprobs"
+                            ][n_prev_token:],
+                        )
+
+                        n_prev_token = len(
+                            content["meta_info"]["decode_token_logprobs"]
+                        )
+                    else:
+                        logprobs = None
+
+                    delta = text[len(stream_buffer) :]
+                    stream_buffer = stream_buffer + delta
+                    choice_data = CompletionResponseStreamChoice(
+                        index=0,
+                        text=delta,
+                        logprobs=logprobs,
+                        finish_reason=content["meta_info"]["finish_reason"],
+                    )
+                    chunk = CompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        object="text_completion",
+                        choices=[choice_data],
+                        model=request.model,
+                        usage=UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        ),
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+            except ValueError as e:
+                error = create_streaming_error_response(str(e))
+                yield f"data: {error}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(
+            generate_stream_resp(),
+            media_type="text/event-stream",
+            background=tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    # Non-streaming response.
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request
+        ).__anext__()
+    except ValueError as e:
+        return create_error_response(str(e))
+
+    if not isinstance(ret, list):
+        ret = [ret]
+    choices = []
+
+    for idx, ret_item in enumerate(ret):
+        text = ret_item["text"]
+
+        if request.echo:
+            text = request.prompt + text
+
+        if request.logprobs:
+            if request.echo:
+                prefill_token_logprobs = ret_item["meta_info"]["prefill_token_logprobs"]
+                prefill_top_logprobs = ret_item["meta_info"]["prefill_top_logprobs"]
+            else:
+                prefill_token_logprobs = None
+                prefill_top_logprobs = None
+
+            logprobs = to_openai_style_logprobs(
+                prefill_token_logprobs=prefill_token_logprobs,
+                prefill_top_logprobs=prefill_top_logprobs,
+                decode_token_logprobs=ret_item["meta_info"]["decode_token_logprobs"],
+                decode_top_logprobs=ret_item["meta_info"]["decode_top_logprobs"],
+            )
+        else:
+            logprobs = None
+
+        choice_data = CompletionResponseChoice(
+            index=idx,
+            text=text,
+            logprobs=logprobs,
+            finish_reason=ret_item["meta_info"]["finish_reason"],
+        )
+
+        choices.append(choice_data)
+
+    response = CompletionResponse(
+        id=ret[0]["meta_info"]["id"],
+        model=request.model,
+        choices=choices,
+        usage=UsageInfo(
+            prompt_tokens=ret[0]["meta_info"]["prompt_tokens"],
+            completion_tokens=sum(
+                item["meta_info"]["completion_tokens"] for item in ret
+            ),
+            total_tokens=ret[0]["meta_info"]["prompt_tokens"]
+            + sum(item["meta_info"]["completion_tokens"] for item in ret),
+        ),
+    )
+
+    return response
+
+
+async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = ChatCompletionRequest(**request_json)
+
+    # Prep the data needed for the underlying GenerateReqInput:
+    #  - prompt: The full prompt string.
+    #  - stop: Custom stop tokens.
+    #  - image_data: None or a list of image strings (URLs or base64 strings).
+    #    None skips any image processing in GenerateReqInput.
+    if not isinstance(request.messages, str):
+        # Apply chat template and its stop strings.
+        if chat_template_name is None:
+            prompt = tokenizer_manager.tokenizer.apply_chat_template(
+                request.messages, tokenize=False, add_generation_prompt=True
+            )
+            stop = request.stop
+            image_data = None
+        else:
+            conv = generate_chat_conv(request, chat_template_name)
+            prompt = conv.get_prompt()
+            image_data = conv.image_data
+            stop = conv.stop_str or []
+            if request.stop:
+                if isinstance(request.stop, str):
+                    stop.append(request.stop)
+                else:
+                    stop.extend(request.stop)
+    else:
+        # Use the raw prompt and stop strings if the messages is already a string.
+        prompt = request.messages
+        stop = request.stop
+        image_data = None
+
+    adapted_request = GenerateReqInput(
+        text=prompt,
+        image_data=image_data,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+            "n": request.n,
+        },
+        stream=request.stream,
+    )
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            is_first = True
+
+            stream_buffer = ""
+            try:
+                async for content in tokenizer_manager.generate_request(
+                    adapted_request, raw_request
+                ):
+                    if is_first:
+                        # First chunk with role
+                        is_first = False
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=0,
+                            delta=DeltaMessage(role="assistant"),
+                            finish_reason=content["meta_info"]["finish_reason"],
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+                    text = content["text"]
+                    delta = text[len(stream_buffer) :]
+                    stream_buffer = stream_buffer + delta
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=0,
+                        delta=DeltaMessage(content=delta),
+                        finish_reason=content["meta_info"]["finish_reason"],
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+            except ValueError as e:
+                error = create_streaming_error_response(str(e))
+                yield f"data: {error}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(
+            generate_stream_resp(),
+            media_type="text/event-stream",
+            background=tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    # Non-streaming response.
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request
+        ).__anext__()
+    except ValueError as e:
+        return create_error_response(str(e))
+
+    if not isinstance(ret, list):
+        ret = [ret]
+    choices = []
+    total_prompt_tokens = 0
+    total_completion_tokens = 0
+
+    for idx, ret_item in enumerate(ret):
+        prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
+        completion_tokens = ret_item["meta_info"]["completion_tokens"]
+
+        choice_data = ChatCompletionResponseChoice(
+            index=idx,
+            message=ChatMessage(role="assistant", content=ret_item["text"]),
+            finish_reason=ret_item["meta_info"]["finish_reason"],
+        )
+
+        choices.append(choice_data)
+        total_prompt_tokens = prompt_tokens
+        total_completion_tokens += completion_tokens
+
+    response = ChatCompletionResponse(
+        id=ret[0]["meta_info"]["id"],
+        model=request.model,
+        choices=choices,
+        usage=UsageInfo(
+            prompt_tokens=total_prompt_tokens,
+            completion_tokens=total_completion_tokens,
+            total_tokens=total_prompt_tokens + total_completion_tokens,
+        ),
+    )
+
+    return response
+
+
+def to_openai_style_logprobs(
+    prefill_token_logprobs=None,
+    decode_token_logprobs=None,
+    prefill_top_logprobs=None,
+    decode_top_logprobs=None,
+):
+    ret_logprobs = LogProbs()
+
+    def append_token_logprobs(token_logprobs):
+        for logprob, _, token_text in token_logprobs:
+            ret_logprobs.tokens.append(token_text)
+            ret_logprobs.token_logprobs.append(logprob)
+
+            # Not supported yet
+            ret_logprobs.text_offset.append(-1)
+
+    def append_top_logprobs(top_logprobs):
+        for tokens in top_logprobs:
+            if tokens is not None:
+                ret_logprobs.top_logprobs.append(
+                    {token[2]: token[0] for token in tokens}
+                )
+            else:
+                ret_logprobs.top_logprobs.append(None)
+
+    if prefill_token_logprobs is not None:
+        append_token_logprobs(prefill_token_logprobs)
+    if decode_token_logprobs is not None:
+        append_token_logprobs(decode_token_logprobs)
+    if prefill_top_logprobs is not None:
+        append_top_logprobs(prefill_top_logprobs)
+    if decode_top_logprobs is not None:
+        append_top_logprobs(decode_top_logprobs)
+
+    return ret_logprobs
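The adapter above converts OpenAI-style completion and chat-completion requests into sglang GenerateReqInput calls. The snippet below is a hypothetical client-side check of the chat endpoint; it assumes a running sglang server that mounts v1_chat_completions at the standard /v1/chat/completions route on the default port 30000 (neither the route registration nor the port appears in this diff).

```python
# Hypothetical client for the OpenAI-compatible chat endpoint added in this
# release. Assumes an sglang server listening on localhost:30000 that routes
# /v1/chat/completions to v1_chat_completions; both are assumptions here.
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32,
        "temperature": 0.7,
        "stream": False,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```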