sglang 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. {sglang-0.1.24/sglang.egg-info → sglang-0.1.26}/PKG-INFO +4 -4
  2. {sglang-0.1.24 → sglang-0.1.26}/README.md +3 -3
  3. {sglang-0.1.24 → sglang-0.1.26}/pyproject.toml +1 -1
  4. {sglang-0.1.24 → sglang-0.1.26}/sglang/__init__.py +2 -2
  5. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/model_runner.py +51 -0
  6. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server.py +6 -0
  7. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/utils.py +44 -1
  8. sglang-0.1.26/sglang/version.py +1 -0
  9. {sglang-0.1.24 → sglang-0.1.26/sglang.egg-info}/PKG-INFO +4 -4
  10. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/SOURCES.txt +1 -14
  11. sglang-0.1.24/sglang/backend/anthropic.py +0 -77
  12. sglang-0.1.24/sglang/backend/litellm.py +0 -90
  13. sglang-0.1.24/sglang/backend/openai.py +0 -438
  14. sglang-0.1.24/sglang/backend/runtime_endpoint.py +0 -283
  15. sglang-0.1.24/sglang/backend/vertexai.py +0 -149
  16. sglang-0.1.24/sglang/bench.py +0 -627
  17. sglang-0.1.24/sglang/lang/backend/__init__.py +0 -0
  18. sglang-0.1.24/sglang/lang/backend/base_backend.py +0 -80
  19. sglang-0.1.24/sglang/srt/managers/controller/dp_worker.py +0 -113
  20. sglang-0.1.24/sglang/srt/openai_api/api_adapter.py +0 -432
  21. sglang-0.1.24/sglang/srt/openai_api/openai_api_adapter.py +0 -431
  22. sglang-0.1.24/sglang/srt/openai_api/openai_protocol.py +0 -207
  23. sglang-0.1.24/sglang/srt/openai_api_adapter.py +0 -411
  24. sglang-0.1.24/sglang/srt/openai_protocol.py +0 -207
  25. {sglang-0.1.24 → sglang-0.1.26}/LICENSE +0 -0
  26. {sglang-0.1.24 → sglang-0.1.26}/setup.cfg +0 -0
  27. {sglang-0.1.24 → sglang-0.1.26}/sglang/api.py +0 -0
  28. {sglang-0.1.24 → sglang-0.1.26}/sglang/bench_latency.py +0 -0
  29. {sglang-0.1.24 → sglang-0.1.26}/sglang/bench_serving.py +0 -0
  30. {sglang-0.1.24 → sglang-0.1.26}/sglang/check_env.py +0 -0
  31. {sglang-0.1.24 → sglang-0.1.26}/sglang/global_config.py +0 -0
  32. {sglang-0.1.24/sglang/backend → sglang-0.1.26/sglang/lang}/__init__.py +0 -0
  33. {sglang-0.1.24/sglang/lang → sglang-0.1.26/sglang/lang/backend}/__init__.py +0 -0
  34. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/anthropic.py +0 -0
  35. {sglang-0.1.24/sglang → sglang-0.1.26/sglang/lang}/backend/base_backend.py +0 -0
  36. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/litellm.py +0 -0
  37. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/openai.py +0 -0
  38. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/runtime_endpoint.py +0 -0
  39. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/backend/vertexai.py +0 -0
  40. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/chat_template.py +0 -0
  41. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/compiler.py +0 -0
  42. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/interpreter.py +0 -0
  43. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/ir.py +0 -0
  44. {sglang-0.1.24 → sglang-0.1.26}/sglang/lang/tracer.py +0 -0
  45. {sglang-0.1.24 → sglang-0.1.26}/sglang/launch_server.py +0 -0
  46. {sglang-0.1.24 → sglang-0.1.26}/sglang/launch_server_llavavid.py +0 -0
  47. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/__init__.py +0 -0
  48. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/base_cache.py +0 -0
  49. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/fsm_cache.py +0 -0
  50. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/constrained/jump_forward.py +0 -0
  51. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/conversation.py +0 -0
  52. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/flush_cache.py +0 -0
  53. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/hf_transformers_utils.py +0 -0
  54. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  55. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/extend_attention.py +0 -0
  56. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/fused_moe.py +0 -0
  57. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/linear.py +0 -0
  58. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/logits_processor.py +0 -0
  59. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/quantization/__init__.py +0 -0
  60. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/quantization/fp8.py +0 -0
  61. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/radix_attention.py +0 -0
  62. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/layers/token_attention.py +0 -0
  63. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/cuda_graph_runner.py +0 -0
  64. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/infer_batch.py +0 -0
  65. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/manager_multi.py +0 -0
  66. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/manager_single.py +0 -0
  67. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/radix_cache.py +0 -0
  68. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
  69. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/tp_worker.py +0 -0
  70. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/detokenizer_manager.py +0 -0
  71. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/io_struct.py +0 -0
  72. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/tokenizer_manager.py +0 -0
  73. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/memory_pool.py +0 -0
  74. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/mm_utils.py +0 -0
  75. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_config.py +0 -0
  76. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_loader/model_loader.py +0 -0
  77. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/model_loader/utils.py +0 -0
  78. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/chatglm.py +0 -0
  79. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/commandr.py +0 -0
  80. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/dbrx.py +0 -0
  81. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/deepseek.py +0 -0
  82. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gemma.py +0 -0
  83. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gemma2.py +0 -0
  84. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/gpt_bigcode.py +0 -0
  85. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/grok.py +0 -0
  86. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/internlm2.py +0 -0
  87. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llama2.py +0 -0
  88. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llama_classification.py +0 -0
  89. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llava.py +0 -0
  90. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/llavavid.py +0 -0
  91. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/minicpm.py +0 -0
  92. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mistral.py +0 -0
  93. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mixtral.py +0 -0
  94. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/mixtral_quant.py +0 -0
  95. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen.py +0 -0
  96. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen2.py +0 -0
  97. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/qwen2_moe.py +0 -0
  98. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/stablelm.py +0 -0
  99. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/models/yivl.py +0 -0
  100. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/openai_api/adapter.py +0 -0
  101. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/openai_api/protocol.py +0 -0
  102. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/sampling_params.py +0 -0
  103. {sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server_args.py +0 -0
  104. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_conversation.py +0 -0
  105. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_openai_protocol.py +0 -0
  106. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_programs.py +0 -0
  107. {sglang-0.1.24 → sglang-0.1.26}/sglang/test/test_utils.py +0 -0
  108. {sglang-0.1.24 → sglang-0.1.26}/sglang/utils.py +0 -0
  109. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/dependency_links.txt +0 -0
  110. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/requires.txt +0 -0
  111. {sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.24/sglang.egg-info → sglang-0.1.26}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.24
+Version: 0.1.26
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -244,7 +244,7 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -282,7 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -405,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -37,7 +37,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -160,7 +160,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.24"
+version = "0.1.26"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.1.24 → sglang-0.1.26}/sglang/__init__.py
@@ -1,5 +1,3 @@
-__version__ = "0.1.24"
-
 # SGL API Components
 from sglang.api import (
     Runtime,
@@ -32,6 +30,8 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.backend.vertexai import VertexAI
 
+from .version import __version__
+
 # public APIs management
 __all__ = [
     "global_config",
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/managers/controller/model_runner.py
@@ -15,6 +15,7 @@ from flashinfer import (
     BatchPrefillWithRaggedKVCacheWrapper,
 )
 from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+from torch.nn.parameter import Parameter
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
 from vllm.distributed import (
@@ -22,6 +23,7 @@ from vllm.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
 )
+from vllm.model_executor.layers.linear import QKVParallelLinear
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
@@ -38,6 +40,18 @@ from sglang.srt.utils import (
 logger = logging.getLogger("srt.model_runner")
 
 
+def is_llama3_405b_fp8(model_config):
+    if (
+        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
+        and model_config.hf_config.hidden_size == 16384
+        and model_config.hf_config.intermediate_size == 53248
+        and model_config.hf_config.num_hidden_layers == 126
+        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
+    ):
+        return True
+    return False
+
+
 class ModelRunner:
     def __init__(
         self,
@@ -118,6 +132,9 @@ class ModelRunner:
             seed=42,
             skip_tokenizer_init=True,
         )
+        if is_llama3_405b_fp8(self.model_config):
+            self.model_config.hf_config.num_key_value_heads = 8
+            vllm_model_config.hf_config.num_key_value_heads = 8
         self.dtype = vllm_model_config.dtype
         if self.model_config.model_overide_args is not None:
             vllm_model_config.hf_config.update(self.model_config.model_overide_args)
@@ -370,5 +387,39 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
     return model_arch_name_to_cls[model_arch]
 
 
+def get_original_weight(loaded_weight, head_dim):
+    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
+    dim = loaded_weight.shape[1]
+    for i in range(n_kv_head):
+        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
+            2 * i * head_dim : (2 * i + 1) * head_dim, :
+        ]
+    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
+    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
+    return original_kv_weight
+
+
+def get_weight_loader_srt(weight_loader):
+    def weight_loader_srt(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if (
+            loaded_shard_id in ["k", "v"]
+            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
+        ):
+            loaded_weight = get_original_weight(loaded_weight, self.head_size)
+
+        weight_loader(self, param, loaded_weight, loaded_shard_id)
+
+    return weight_loader_srt
+
+
 # Monkey patch model loader
 setattr(ModelRegistry, "load_model_cls", load_model_cls_srt)
+original_weight_loader = QKVParallelLinear.weight_loader
+setattr(
+    QKVParallelLinear, "weight_loader", get_weight_loader_srt(original_weight_loader)
+)
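Note on the model_runner.py change above: the new `is_llama3_405b_fp8` check and the `QKVParallelLinear.weight_loader` monkey patch handle FBGEMM-FP8 checkpoints of Llama 3.1 405B, whose fused K/V shards arrive with every key-value head stored twice (hence the `head_size * total_num_kv_heads * 2` shape test and the `num_key_value_heads = 8` override). `get_original_weight` compacts the shard back to one copy per head before the stock vLLM loader runs. Below is a minimal sketch of that compaction on a toy tensor, assuming the duplicated head layout implied by the shape check; the demo scaffolding is not part of the diff.

```python
import torch

def get_original_weight(loaded_weight, head_dim):
    # Copied from the hunk above: keep one copy of each duplicated KV head.
    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
    dim = loaded_weight.shape[1]
    for i in range(n_kv_head):
        # Move head i's first copy into its compacted slot.
        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
            2 * i * head_dim : (2 * i + 1) * head_dim, :
        ]
    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
    return original_kv_weight

head_dim, dim = 2, 3
h0 = torch.zeros(head_dim, dim)
h1 = torch.ones(head_dim, dim)
duplicated = torch.cat([h0, h0, h1, h1])             # each head stored twice
compacted = get_original_weight(duplicated.clone(), head_dim)
assert torch.equal(compacted, torch.cat([h0, h1]))   # one copy per head remains
```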
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/server.py
@@ -52,6 +52,7 @@ from sglang.srt.utils import (
     allocate_init_ports,
     assert_pkg_version,
     enable_show_time_cost,
+    maybe_set_triton_cache_manager,
     set_ulimit,
 )
 from sglang.utils import get_exception_traceback
@@ -201,6 +202,11 @@ def launch_server(
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
+
+    if server_args.tp_size // server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
{sglang-0.1.24 → sglang-0.1.26}/sglang/srt/utils.py
@@ -18,10 +18,15 @@ import psutil
 import requests
 import torch
 import torch.distributed as dist
-import triton
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from triton.runtime.cache import (
+    FileCacheManager,
+    default_cache_dir,
+    default_dump_dir,
+    default_override_dir,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -460,6 +465,44 @@ def monkey_patch_vllm_all_gather(reverse: bool = False):
     setattr(GroupCoordinator, "all_gather", all_gather)
 
 
+def maybe_set_triton_cache_manager() -> None:
+    """Set environment variable to tell Triton to use a
+    custom cache manager"""
+    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+    if cache_manger is None:
+        manager = "sglang.srt.utils:CustomCacheManager"
+        logger.info("Setting Triton cache manager to: %s", manager)
+        os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
+    def __init__(self, key, override=False, dump=False):
+
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = (
+                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            )
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
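Note on the utils.py change above: `TRITON_CACHE_MANAGER` is the hook Triton reads (in `module:Class` form) to choose its cache-manager class, and `CustomCacheManager` differs from the stock `FileCacheManager` only in suffixing the cache directory with the process ID, so concurrent tensor-parallel workers compiling the same kernel no longer race on one directory. A rough sketch of the resulting layout, assuming Triton's default cache root of `~/.triton/cache` (illustrative, not part of the diff):

```python
import os

# What maybe_set_triton_cache_manager() sets before workers are spawned:
os.environ.setdefault("TRITON_CACHE_MANAGER", "sglang.srt.utils:CustomCacheManager")

# Stock FileCacheManager: every rank shares <cache_root>/<key>.
# CustomCacheManager:     each rank gets <cache_root>_<pid>/<key>.
cache_root = os.path.expanduser("~/.triton/cache")  # assumed default_cache_dir()
key = "deadbeef"  # hypothetical kernel hash
shared = os.path.join(cache_root, key)
private = os.path.join(f"{cache_root}_{os.getpid()}", key)
print(shared)   # same path in every process -> potential compilation race
print(private)  # unique per process -> no race
```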
sglang-0.1.26/sglang/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.26"
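Note: with version.py split out, the version string lives in one place and the package root re-exports it (`from .version import __version__` in the __init__.py hunk above). An illustrative check:

```python
import sglang
from sglang.version import __version__

assert sglang.__version__ == __version__ == "0.1.26"
```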
{sglang-0.1.24 → sglang-0.1.26/sglang.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.24
+Version: 0.1.26
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -244,7 +244,7 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
-<img src="assets/logo.png" alt="logo" width="400"></img>
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
 </div>
 
 --------------------------------------------------------------------------------
@@ -282,7 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
-pip install --upgrade pip setuptools wheel
+pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -405,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
-- Llama / Llama 2 / Llama 3
+- Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
{sglang-0.1.24 → sglang-0.1.26}/sglang.egg-info/SOURCES.txt
@@ -3,7 +3,6 @@ README.md
 pyproject.toml
 sglang/__init__.py
 sglang/api.py
-sglang/bench.py
 sglang/bench_latency.py
 sglang/bench_serving.py
 sglang/check_env.py
@@ -11,18 +10,12 @@ sglang/global_config.py
 sglang/launch_server.py
 sglang/launch_server_llavavid.py
 sglang/utils.py
+sglang/version.py
 sglang.egg-info/PKG-INFO
 sglang.egg-info/SOURCES.txt
 sglang.egg-info/dependency_links.txt
 sglang.egg-info/requires.txt
 sglang.egg-info/top_level.txt
-sglang/backend/__init__.py
-sglang/backend/anthropic.py
-sglang/backend/base_backend.py
-sglang/backend/litellm.py
-sglang/backend/openai.py
-sglang/backend/runtime_endpoint.py
-sglang/backend/vertexai.py
 sglang/lang/__init__.py
 sglang/lang/chat_template.py
 sglang/lang/compiler.py
@@ -42,8 +35,6 @@ sglang/srt/hf_transformers_utils.py
 sglang/srt/memory_pool.py
 sglang/srt/mm_utils.py
 sglang/srt/model_config.py
-sglang/srt/openai_api_adapter.py
-sglang/srt/openai_protocol.py
 sglang/srt/sampling_params.py
 sglang/srt/server.py
 sglang/srt/server_args.py
@@ -65,7 +56,6 @@ sglang/srt/managers/detokenizer_manager.py
 sglang/srt/managers/io_struct.py
 sglang/srt/managers/tokenizer_manager.py
 sglang/srt/managers/controller/cuda_graph_runner.py
-sglang/srt/managers/controller/dp_worker.py
 sglang/srt/managers/controller/infer_batch.py
 sglang/srt/managers/controller/manager_multi.py
 sglang/srt/managers/controller/manager_single.py
@@ -98,9 +88,6 @@ sglang/srt/models/qwen2_moe.py
 sglang/srt/models/stablelm.py
 sglang/srt/models/yivl.py
 sglang/srt/openai_api/adapter.py
-sglang/srt/openai_api/api_adapter.py
-sglang/srt/openai_api/openai_api_adapter.py
-sglang/srt/openai_api/openai_protocol.py
 sglang/srt/openai_api/protocol.py
 sglang/test/test_conversation.py
 sglang/test/test_openai_protocol.py
sglang-0.1.24/sglang/backend/anthropic.py
@@ -1,77 +0,0 @@
-from typing import List, Optional, Union
-
-import numpy as np
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import anthropic
-except ImportError as e:
-    anthropic = e
-
-
-class Anthropic(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
-        super().__init__()
-
-        if isinstance(anthropic, Exception):
-            raise anthropic
-
-        self.model_name = model_name
-        self.chat_template = get_chat_template("claude")
-        self.client = anthropic.Anthropic(*args, **kwargs)
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        ret = self.client.messages.create(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.content[0].text
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        if messages and messages[0]["role"] == "system":
-            system = messages.pop(0)["content"]
-        else:
-            system = ""
-
-        with self.client.messages.stream(
-            model=self.model_name,
-            system=system,
-            messages=messages,
-            **sampling_params.to_anthropic_kwargs(),
-        ) as stream:
-            for text in stream.text_stream:
-                yield text, {}
sglang-0.1.24/sglang/backend/litellm.py
@@ -1,90 +0,0 @@
-from typing import Mapping, Optional
-
-from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path
-from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglSamplingParams
-
-try:
-    import litellm
-except ImportError as e:
-    litellm = e
-litellm.num_retries = 1
-
-
-class LiteLLM(BaseBackend):
-    def __init__(
-        self,
-        model_name,
-        chat_template=None,
-        api_key=None,
-        organization: Optional[str] = None,
-        base_url: Optional[str] = None,
-        timeout: Optional[float] = 600,
-        max_retries: Optional[int] = litellm.num_retries,
-        default_headers: Optional[Mapping[str, str]] = None,
-    ):
-        super().__init__()
-
-        if isinstance(litellm, Exception):
-            raise litellm
-
-        self.model_name = model_name
-
-        self.chat_template = chat_template or get_chat_template_by_model_path(
-            model_name
-        )
-
-        self.client_params = {
-            "api_key": api_key,
-            "organization": organization,
-            "base_url": base_url,
-            "timeout": timeout,
-            "max_retries": max_retries,
-            "default_headers": default_headers,
-        }
-
-    def get_chat_template(self):
-        return self.chat_template
-
-    def generate(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        ret = litellm.completion(
-            model=self.model_name,
-            messages=messages,
-            **self.client_params,
-            **sampling_params.to_anthropic_kwargs(),
-        )
-        comp = ret.choices[0].message.content
-
-        return comp, {}
-
-    def generate_stream(
-        self,
-        s: StreamExecutor,
-        sampling_params: SglSamplingParams,
-    ):
-        if s.messages_:
-            messages = s.messages_
-        else:
-            messages = [{"role": "user", "content": s.text_}]
-
-        ret = litellm.completion(
-            model=self.model_name,
-            messages=messages,
-            stream=True,
-            **self.client_params,
-            **sampling_params.to_litellm_kwargs(),
-        )
-        for chunk in ret:
-            text = chunk.choices[0].delta.content
-            if text is not None:
-                yield text, {}
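Note on the two deletions above: sglang/backend/anthropic.py and sglang/backend/litellm.py (like the rest of the old sglang/backend package) were duplicates of the modules under sglang/lang/backend/, which the file list above (items 34-39) shows as the surviving copies. Code importing the old path needs updating; an illustrative migration, assuming only the module path changed:

```python
# from sglang.backend.anthropic import Anthropic    # ImportError as of 0.1.26
from sglang.lang.backend.anthropic import Anthropic  # surviving location

# The package root also re-exports the common backends, per the
# __init__.py context lines above:
from sglang import OpenAI, RuntimeEndpoint, VertexAI
```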