sglang 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78):
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/runtime_endpoint.py +14 -4
  4. sglang/backend/vertexai.py +5 -4
  5. sglang/bench.py +627 -0
  6. sglang/bench_latency.py +22 -20
  7. sglang/bench_serving.py +758 -0
  8. sglang/check_env.py +171 -0
  9. sglang/global_config.py +3 -1
  10. sglang/lang/backend/__init__.py +0 -0
  11. sglang/lang/backend/anthropic.py +77 -0
  12. sglang/lang/backend/base_backend.py +80 -0
  13. sglang/lang/backend/litellm.py +90 -0
  14. sglang/lang/backend/openai.py +438 -0
  15. sglang/lang/backend/runtime_endpoint.py +283 -0
  16. sglang/lang/backend/vertexai.py +149 -0
  17. sglang/lang/chat_template.py +2 -2
  18. sglang/lang/ir.py +3 -3
  19. sglang/lang/tracer.py +1 -1
  20. sglang/launch_server.py +1 -1
  21. sglang/launch_server_llavavid.py +1 -4
  22. sglang/srt/conversation.py +1 -1
  23. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  24. sglang/srt/layers/extend_attention.py +0 -39
  25. sglang/srt/layers/linear.py +869 -0
  26. sglang/srt/layers/quantization/__init__.py +49 -0
  27. sglang/srt/layers/quantization/fp8.py +662 -0
  28. sglang/srt/layers/radix_attention.py +31 -5
  29. sglang/srt/layers/token_attention.py +1 -51
  30. sglang/srt/managers/controller/cuda_graph_runner.py +44 -18
  31. sglang/srt/managers/controller/infer_batch.py +76 -72
  32. sglang/srt/managers/controller/manager_multi.py +109 -98
  33. sglang/srt/managers/controller/manager_single.py +105 -50
  34. sglang/srt/managers/controller/model_runner.py +42 -18
  35. sglang/srt/managers/controller/radix_cache.py +4 -3
  36. sglang/srt/managers/controller/schedule_heuristic.py +4 -0
  37. sglang/srt/managers/controller/tp_worker.py +143 -156
  38. sglang/srt/managers/detokenizer_manager.py +49 -5
  39. sglang/srt/managers/io_struct.py +36 -17
  40. sglang/srt/managers/tokenizer_manager.py +228 -125
  41. sglang/srt/memory_pool.py +46 -58
  42. sglang/srt/model_loader/model_loader.py +277 -0
  43. sglang/srt/model_loader/utils.py +260 -0
  44. sglang/srt/models/chatglm.py +1 -0
  45. sglang/srt/models/dbrx.py +1 -0
  46. sglang/srt/models/grok.py +1 -0
  47. sglang/srt/models/internlm2.py +317 -0
  48. sglang/srt/models/llama2.py +65 -16
  49. sglang/srt/models/llama_classification.py +1 -0
  50. sglang/srt/models/llava.py +1 -0
  51. sglang/srt/models/llavavid.py +1 -0
  52. sglang/srt/models/minicpm.py +2 -8
  53. sglang/srt/models/mixtral.py +1 -0
  54. sglang/srt/models/mixtral_quant.py +1 -0
  55. sglang/srt/models/qwen.py +1 -0
  56. sglang/srt/models/qwen2.py +6 -0
  57. sglang/srt/models/qwen2_moe.py +130 -108
  58. sglang/srt/models/stablelm.py +1 -0
  59. sglang/srt/openai_api/adapter.py +432 -0
  60. sglang/srt/openai_api/api_adapter.py +432 -0
  61. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  62. sglang/srt/openai_api/openai_protocol.py +207 -0
  63. sglang/srt/openai_api/protocol.py +208 -0
  64. sglang/srt/openai_protocol.py +17 -0
  65. sglang/srt/sampling_params.py +2 -0
  66. sglang/srt/server.py +114 -90
  67. sglang/srt/server_args.py +27 -17
  68. sglang/srt/utils.py +17 -118
  69. sglang/test/test_conversation.py +1 -1
  70. sglang/test/test_openai_protocol.py +1 -1
  71. sglang/test/test_programs.py +1 -1
  72. sglang/test/test_utils.py +2 -2
  73. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -159
  74. sglang-0.1.22.dist-info/RECORD +103 -0
  75. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
  76. sglang-0.1.20.dist-info/RECORD +0 -82
  77. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
  78. {sglang-0.1.20.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.20"
1
+ __version__ = "0.1.22"
2
2
 
3
3
  # SGL API Components
4
4
  from sglang.api import (
@@ -22,16 +22,16 @@ from sglang.api import (
22
22
  video,
23
23
  )
24
24
 
25
- # SGL Backends
26
- from sglang.backend.anthropic import Anthropic
27
- from sglang.backend.litellm import LiteLLM
28
- from sglang.backend.openai import OpenAI
29
- from sglang.backend.runtime_endpoint import RuntimeEndpoint
30
- from sglang.backend.vertexai import VertexAI
31
-
32
25
  # Global Configurations
33
26
  from sglang.global_config import global_config
34
27
 
28
+ # SGL Backends
29
+ from sglang.lang.backend.anthropic import Anthropic
30
+ from sglang.lang.backend.litellm import LiteLLM
31
+ from sglang.lang.backend.openai import OpenAI
32
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
33
+ from sglang.lang.backend.vertexai import VertexAI
34
+
35
35
  # public APIs management
36
36
  __all__ = [
37
37
  "global_config",
sglang/api.py CHANGED
@@ -4,8 +4,8 @@ import os
4
4
  import re
5
5
  from typing import Callable, List, Optional, Union
6
6
 
7
- from sglang.backend.base_backend import BaseBackend
8
7
  from sglang.global_config import global_config
8
+ from sglang.lang.backend.base_backend import BaseBackend
9
9
  from sglang.lang.ir import (
10
10
  SglExpr,
11
11
  SglExprList,
@@ -12,7 +12,6 @@ from sglang.utils import http_request
12
12
 
13
13
 
14
14
  class RuntimeEndpoint(BaseBackend):
15
-
16
15
  def __init__(
17
16
  self,
18
17
  base_url: str,
@@ -38,7 +37,8 @@ class RuntimeEndpoint(BaseBackend):
38
37
  self.model_info = res.json()
39
38
 
40
39
  self.chat_template = get_chat_template_by_model_path(
41
- self.model_info["model_path"])
40
+ self.model_info["model_path"]
41
+ )
42
42
 
43
43
  def get_model_name(self):
44
44
  return self.model_info["model_path"]
@@ -124,7 +124,12 @@ class RuntimeEndpoint(BaseBackend):
124
124
  else:
125
125
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
126
126
 
127
- for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
127
+ for item in [
128
+ "return_logprob",
129
+ "logprob_start_len",
130
+ "top_logprobs_num",
131
+ "return_text_in_logprobs",
132
+ ]:
128
133
  value = getattr(sampling_params, item, None)
129
134
  if value is not None:
130
135
  data[item] = value
@@ -171,7 +176,12 @@ class RuntimeEndpoint(BaseBackend):
171
176
  else:
172
177
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
173
178
 
174
- for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
179
+ for item in [
180
+ "return_logprob",
181
+ "logprob_start_len",
182
+ "top_logprobs_num",
183
+ "return_text_in_logprobs",
184
+ ]:
175
185
  value = getattr(sampling_params, item, None)
176
186
  if value is not None:
177
187
  data[item] = value
@@ -1,8 +1,6 @@
1
1
  import os
2
2
  import warnings
3
- from typing import List, Optional, Union
4
-
5
- import numpy as np
3
+ from typing import Optional
6
4
 
7
5
  from sglang.backend.base_backend import BaseBackend
8
6
  from sglang.lang.chat_template import get_chat_template
@@ -21,7 +19,7 @@ except ImportError as e:
21
19
 
22
20
 
23
21
  class VertexAI(BaseBackend):
24
- def __init__(self, model_name):
22
+ def __init__(self, model_name, safety_settings=None):
25
23
  super().__init__()
26
24
 
27
25
  if isinstance(GenerativeModel, Exception):
@@ -33,6 +31,7 @@ class VertexAI(BaseBackend):
33
31
 
34
32
  self.model_name = model_name
35
33
  self.chat_template = get_chat_template("default")
34
+ self.safety_settings = safety_settings
36
35
 
37
36
  def get_chat_template(self):
38
37
  return self.chat_template
@@ -54,6 +53,7 @@ class VertexAI(BaseBackend):
54
53
  ret = GenerativeModel(self.model_name).generate_content(
55
54
  prompt,
56
55
  generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
56
+ safety_settings=self.safety_settings,
57
57
  )
58
58
 
59
59
  comp = ret.text
@@ -78,6 +78,7 @@ class VertexAI(BaseBackend):
78
78
  prompt,
79
79
  stream=True,
80
80
  generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
81
+ safety_settings=self.safety_settings,
81
82
  )
82
83
  for ret in generator:
83
84
  yield ret.text, {}