sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/bench_latency.py +17 -8
  2. sglang/bench_serving.py +33 -38
  3. sglang/global_config.py +5 -17
  4. sglang/lang/backend/runtime_endpoint.py +5 -2
  5. sglang/lang/interpreter.py +1 -4
  6. sglang/launch_server.py +3 -6
  7. sglang/launch_server_llavavid.py +7 -8
  8. sglang/srt/{model_config.py → configs/model_config.py} +5 -0
  9. sglang/srt/constrained/__init__.py +2 -0
  10. sglang/srt/constrained/fsm_cache.py +33 -38
  11. sglang/srt/constrained/jump_forward.py +0 -1
  12. sglang/srt/conversation.py +4 -1
  13. sglang/srt/hf_transformers_utils.py +1 -3
  14. sglang/srt/layers/activation.py +12 -0
  15. sglang/srt/layers/attention_backend.py +480 -0
  16. sglang/srt/layers/flashinfer_utils.py +235 -0
  17. sglang/srt/layers/fused_moe/layer.py +27 -7
  18. sglang/srt/layers/layernorm.py +12 -0
  19. sglang/srt/layers/logits_processor.py +64 -77
  20. sglang/srt/layers/radix_attention.py +11 -161
  21. sglang/srt/layers/sampler.py +38 -122
  22. sglang/srt/layers/torchao_utils.py +75 -0
  23. sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
  24. sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
  25. sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
  26. sglang/srt/lora/lora.py +403 -0
  27. sglang/srt/lora/lora_config.py +43 -0
  28. sglang/srt/lora/lora_manager.py +259 -0
  29. sglang/srt/managers/controller_multi.py +1 -5
  30. sglang/srt/managers/controller_single.py +0 -5
  31. sglang/srt/managers/io_struct.py +16 -1
  32. sglang/srt/managers/policy_scheduler.py +122 -5
  33. sglang/srt/managers/schedule_batch.py +105 -71
  34. sglang/srt/managers/tokenizer_manager.py +17 -8
  35. sglang/srt/managers/tp_worker.py +188 -121
  36. sglang/srt/model_executor/cuda_graph_runner.py +69 -133
  37. sglang/srt/model_executor/forward_batch_info.py +35 -312
  38. sglang/srt/model_executor/model_runner.py +123 -154
  39. sglang/srt/models/baichuan.py +416 -0
  40. sglang/srt/models/chatglm.py +1 -5
  41. sglang/srt/models/commandr.py +1 -5
  42. sglang/srt/models/dbrx.py +1 -5
  43. sglang/srt/models/deepseek.py +1 -5
  44. sglang/srt/models/deepseek_v2.py +7 -6
  45. sglang/srt/models/exaone.py +1 -5
  46. sglang/srt/models/gemma.py +1 -5
  47. sglang/srt/models/gemma2.py +1 -5
  48. sglang/srt/models/gpt_bigcode.py +1 -5
  49. sglang/srt/models/grok.py +1 -5
  50. sglang/srt/models/internlm2.py +1 -5
  51. sglang/srt/models/llama.py +51 -5
  52. sglang/srt/models/llama_classification.py +1 -20
  53. sglang/srt/models/llava.py +30 -5
  54. sglang/srt/models/llavavid.py +2 -2
  55. sglang/srt/models/minicpm.py +1 -5
  56. sglang/srt/models/minicpm3.py +669 -0
  57. sglang/srt/models/mixtral.py +6 -5
  58. sglang/srt/models/mixtral_quant.py +1 -5
  59. sglang/srt/models/olmoe.py +415 -0
  60. sglang/srt/models/qwen.py +1 -5
  61. sglang/srt/models/qwen2.py +1 -5
  62. sglang/srt/models/qwen2_moe.py +6 -5
  63. sglang/srt/models/stablelm.py +1 -5
  64. sglang/srt/models/xverse.py +375 -0
  65. sglang/srt/models/xverse_moe.py +445 -0
  66. sglang/srt/openai_api/adapter.py +65 -46
  67. sglang/srt/openai_api/protocol.py +11 -3
  68. sglang/srt/sampling/sampling_batch_info.py +46 -80
  69. sglang/srt/server.py +30 -15
  70. sglang/srt/server_args.py +163 -28
  71. sglang/srt/utils.py +19 -51
  72. sglang/test/few_shot_gsm8k.py +132 -0
  73. sglang/test/runners.py +114 -22
  74. sglang/test/test_programs.py +7 -5
  75. sglang/test/test_utils.py +85 -2
  76. sglang/utils.py +32 -37
  77. sglang/version.py +1 -1
  78. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
  79. sglang-0.3.1.post1.dist-info/RECORD +130 -0
  80. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
  81. sglang-0.3.0.dist-info/RECORD +0 -118
  82. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
  83. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
@@ -18,6 +18,7 @@ limitations under the License.
18
18
  import asyncio
19
19
  import concurrent.futures
20
20
  import dataclasses
21
+ import json
21
22
  import logging
22
23
  import multiprocessing as mp
23
24
  import os
@@ -77,7 +78,6 @@ class TokenizerManager:
77
78
  self,
78
79
  server_args: ServerArgs,
79
80
  port_args: PortArgs,
80
- model_override_args: dict = None,
81
81
  ):
82
82
  self.server_args = server_args
83
83
 
@@ -95,7 +95,7 @@ class TokenizerManager:
95
95
  self.hf_config = get_config(
96
96
  self.model_path,
97
97
  trust_remote_code=server_args.trust_remote_code,
98
- model_override_args=model_override_args,
98
+ model_override_args=json.loads(server_args.json_model_override_args),
99
99
  )
100
100
  self.is_generation = is_generation_model(
101
101
  self.hf_config.architectures, self.server_args.is_embedding
@@ -188,6 +188,7 @@ class TokenizerManager:
188
188
  pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
189
189
  obj.image_data if not_use_index else obj.image_data[index]
190
190
  )
191
+ modalities = obj.modalities
191
192
  return_logprob = (
192
193
  obj.return_logprob if not_use_index else obj.return_logprob[index]
193
194
  )
@@ -196,8 +197,6 @@ class TokenizerManager:
196
197
  if not_use_index
197
198
  else obj.logprob_start_len[index]
198
199
  )
199
- if return_logprob and logprob_start_len == -1:
200
- logprob_start_len = len(input_ids) - 1
201
200
  top_logprobs_num = (
202
201
  obj.top_logprobs_num
203
202
  if not_use_index
@@ -243,14 +242,13 @@ class TokenizerManager:
243
242
  pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
244
243
  obj.image_data[0]
245
244
  )
245
+ modalities = obj.modalities
246
246
  return_logprob = obj.return_logprob[0]
247
247
  logprob_start_len = obj.logprob_start_len[0]
248
248
  top_logprobs_num = obj.top_logprobs_num[0]
249
249
 
250
250
  # Send to the controller
251
251
  if self.is_generation:
252
- if return_logprob and logprob_start_len == -1:
253
- logprob_start_len = len(input_ids) - 1
254
252
  tokenized_obj = TokenizedGenerateReqInput(
255
253
  rid,
256
254
  input_text,
@@ -263,6 +261,12 @@ class TokenizerManager:
263
261
  logprob_start_len,
264
262
  top_logprobs_num,
265
263
  obj.stream,
264
+ modalities,
265
+ (
266
+ obj.lora_path[index]
267
+ if isinstance(obj.lora_path, list)
268
+ else obj.lora_path
269
+ ),
266
270
  )
267
271
  else: # is embedding
268
272
  tokenized_obj = TokenizedEmbeddingReqInput(
@@ -341,11 +345,10 @@ class TokenizerManager:
341
345
  sampling_params = self._get_sampling_params(obj.sampling_params[index])
342
346
 
343
347
  if self.is_generation:
344
- if obj.return_logprob[index] and obj.logprob_start_len[index] == -1:
345
- obj.logprob_start_len[index] = len(input_ids) - 1
346
348
  pixel_values, image_hashes, image_sizes = (
347
349
  await self._get_pixel_values(obj.image_data[index])
348
350
  )
351
+ modalities = obj.modalities
349
352
 
350
353
  tokenized_obj = TokenizedGenerateReqInput(
351
354
  rid,
@@ -359,6 +362,12 @@ class TokenizerManager:
359
362
  obj.logprob_start_len[index],
360
363
  obj.top_logprobs_num[index],
361
364
  obj.stream,
365
+ modalities,
366
+ (
367
+ obj.lora_path[index]
368
+ if isinstance(obj.lora_path, list)
369
+ else obj.lora_path
370
+ ),
362
371
  )
363
372
  else:
364
373
  tokenized_obj = TokenizedEmbeddingReqInput(