sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/bench_serving.py +18 -1
  3. sglang/lang/interpreter.py +71 -1
  4. sglang/lang/ir.py +2 -0
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/chatglm.py +78 -0
  7. sglang/srt/configs/dbrx.py +279 -0
  8. sglang/srt/configs/model_config.py +1 -1
  9. sglang/srt/hf_transformers_utils.py +9 -14
  10. sglang/srt/layers/attention/__init__.py +22 -6
  11. sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  12. sglang/srt/layers/attention/flashinfer_backend.py +215 -83
  13. sglang/srt/layers/attention/torch_native_backend.py +1 -38
  14. sglang/srt/layers/attention/triton_backend.py +20 -11
  15. sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  16. sglang/srt/layers/linear.py +159 -55
  17. sglang/srt/layers/logits_processor.py +170 -215
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
  41. sglang/srt/layers/parameter.py +431 -0
  42. sglang/srt/layers/quantization/__init__.py +3 -2
  43. sglang/srt/layers/quantization/fp8.py +3 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  45. sglang/srt/layers/sampler.py +57 -21
  46. sglang/srt/layers/torchao_utils.py +17 -3
  47. sglang/srt/layers/vocab_parallel_embedding.py +1 -1
  48. sglang/srt/managers/cache_controller.py +307 -0
  49. sglang/srt/managers/data_parallel_controller.py +2 -0
  50. sglang/srt/managers/io_struct.py +1 -2
  51. sglang/srt/managers/schedule_batch.py +33 -3
  52. sglang/srt/managers/schedule_policy.py +159 -90
  53. sglang/srt/managers/scheduler.py +68 -28
  54. sglang/srt/managers/session_controller.py +1 -1
  55. sglang/srt/managers/tokenizer_manager.py +27 -21
  56. sglang/srt/managers/tp_worker.py +16 -4
  57. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  58. sglang/srt/mem_cache/memory_pool.py +206 -1
  59. sglang/srt/metrics/collector.py +22 -30
  60. sglang/srt/model_executor/cuda_graph_runner.py +129 -77
  61. sglang/srt/model_executor/forward_batch_info.py +51 -21
  62. sglang/srt/model_executor/model_runner.py +72 -64
  63. sglang/srt/models/chatglm.py +1 -1
  64. sglang/srt/models/dbrx.py +1 -1
  65. sglang/srt/models/deepseek_v2.py +34 -7
  66. sglang/srt/models/grok.py +109 -29
  67. sglang/srt/models/llama.py +9 -2
  68. sglang/srt/openai_api/adapter.py +0 -17
  69. sglang/srt/openai_api/protocol.py +3 -3
  70. sglang/srt/sampling/sampling_batch_info.py +22 -0
  71. sglang/srt/sampling/sampling_params.py +9 -1
  72. sglang/srt/server.py +20 -13
  73. sglang/srt/server_args.py +120 -58
  74. sglang/srt/speculative/build_eagle_tree.py +347 -0
  75. sglang/srt/speculative/eagle_utils.py +626 -0
  76. sglang/srt/speculative/eagle_worker.py +184 -0
  77. sglang/srt/speculative/spec_info.py +5 -0
  78. sglang/srt/utils.py +47 -7
  79. sglang/test/test_programs.py +23 -1
  80. sglang/test/test_utils.py +36 -7
  81. sglang/version.py +1 -1
  82. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +12 -12
  83. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +86 -57
  84. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
  85. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
  86. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py ADDED
@@ -0,0 +1,184 @@
+from typing import List, Optional, Union
+
+import torch
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
+
+
+class EAGLEWorker(TpModelWorker):
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+        target_worker: TpModelWorker,
+    ):
+        # Do not capture the cuda graph in `super().__init__()`;
+        # we will capture it later.
+        backup_disable_cuda_graph = server_args.disable_cuda_graph
+        server_args.disable_cuda_graph = True
+        super().__init__(
+            gpu_id=gpu_id,
+            tp_rank=tp_rank,
+            server_args=server_args,
+            nccl_port=nccl_port,
+            dp_rank=dp_rank,
+            is_draft_worker=True,
+        )
+        self.target_worker = target_worker
+        self.server_args = server_args
+
+        # Share the embedding and lm_head
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+        self.model_runner.model.set_embed_and_head(embed, head)
+        self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+        self.model_runner.init_cuda_graphs()
+
+    def forward_draft_decode(self, batch: ScheduleBatch):
+        batch.spec_info.prepare_for_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+
+    def forward_draft_extend(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.spec_info.prepare_for_extend(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def forward_batch_speculative_generation(self, batch: ScheduleBatch):
+        if batch.forward_mode.is_decode():
+            # Draft
+            self._set_mem_pool(batch, self.model_runner)
+            for i in range(self.server_args.speculative_num_steps):
+                self.forward_draft_decode(batch)
+            batch.spec_info.clear_draft_cache(batch)
+            self._set_mem_pool(batch, self.target_worker.model_runner)
+
+            # Verify
+            (
+                next_draft_input,
+                logits_output,
+                verified_id,
+                self.finish_extend_len,
+                accept_length_cpu,
+                model_worker_batch,
+            ) = self.verify(batch)
+            next_draft_input.load_server_args(self.server_args)
+            batch.spec_info = next_draft_input
+            # If it is None, it means all requests are finished
+            if batch.spec_info.verified_id is not None:
+                self.forward_draft_extend_after_decode(batch)
+            return (
+                logits_output,
+                verified_id,
+                model_worker_batch,
+                sum(accept_length_cpu),
+            )
+
+        else:
+            # Forward with the target model and get hidden states.
+            # We need the full hidden states to prefill the KV cache of the draft model.
+            model_worker_batch = batch.get_model_worker_batch()
+            model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
+            )
+
+            # Forward with the draft model.
+            spec_info = EAGLEDraftInput()
+            spec_info.load_server_args(self.server_args)
+            spec_info.hidden_states = logits_output.hidden_states
+            spec_info.verified_id = next_token_ids
+            batch.spec_info = spec_info
+            self.forward_draft_extend(batch)
+            return logits_output, next_token_ids, model_worker_batch, 0
+
+    def verify(self, batch: ScheduleBatch):
+        verify_input = batch.spec_info.prepare_for_verify(batch)
+        verify_input.prepare_for_verify(batch)
+        batch.forward_mode = ForwardMode.TARGET_VERIFY
+        batch.spec_info = verify_input
+        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+        model_worker_batch = batch.get_model_worker_batch()
+        logits_output, _ = self.target_worker.forward_batch_generation(
+            model_worker_batch, skip_sample=True
+        )
+        verify_input.hidden_states = logits_output.hidden_states
+        res = verify_input.verify(batch, logits_output)
+        batch.forward_mode = ForwardMode.DECODE
+        return res + (model_worker_batch,)
+
+    def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
+        batch.token_to_kv_pool = runner.token_to_kv_pool
+        batch.req_to_token_pool = runner.req_to_token_pool
+
+    def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.forward_mode = ForwardMode.DRAFT_EXTEND
+        if batch.spec_info.has_finished:
+            index = batch.spec_info.unfinished_index
+            seq_lens = batch.seq_lens
+            batch.seq_lens = batch.seq_lens[index]
+
+        batch.spec_info.prepare_extend_after_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+
+        batch.spec_info.hidden_states = logits_output.hidden_states
+        self.capture_for_decode(logits_output, forward_batch)
+        batch.forward_mode = ForwardMode.DECODE
+        if batch.spec_info.has_finished:
+            batch.seq_lens = seq_lens
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def capture_for_decode(
+        self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
+    ):
+        sample_output = torch.softmax(
+            logits_output.next_token_logits, dim=-1
+        )  # TODO(kavioyu): Support more sampling methods
+        spec_info = forward_batch.spec_info
+        spec_info.sample_output = sample_output
+        spec_info.hidden_states = logits_output.hidden_states
+        spec_info.prev_mode = forward_batch.forward_mode
+
+    # Prefix sharing is not supported for now.
+    def finish_request(self, reqs: Union[Req, List[Req]]):
+        if not isinstance(reqs, List):
+            reqs = [reqs]
+        for req in reqs:
+            if req.rid not in self.finish_extend_len:
+                continue
+            req_len = (
+                len(req.origin_input_ids)
+                + len(req.output_ids)
+                - self.finish_extend_len[req.rid]
+                - 1
+            )
+            kv_indices = self.model_runner.req_to_token_pool.req_to_token[
+                req.req_pool_idx
+            ][:req_len]
+            self.model_runner.token_to_kv_pool.free(kv_indices)
+            self.model_runner.req_to_token_pool.free(req.req_pool_idx)
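The decode path of the new worker amounts to: run the draft model for `speculative_num_steps` steps, then hand the proposed tokens to the target model for a single verification pass. The prefill path instead runs the target model first so its full hidden states can seed the draft model's KV cache. A toy, self-contained sketch of that control flow (stand-in classes only, not the sglang API):

from typing import List

class ToyDraftModel:
    def decode_step(self, proposed: List[str]) -> None:
        # Stand-in for forward_draft_decode: propose one token per step.
        proposed.append(f"draft_token_{len(proposed)}")

class ToyTargetModel:
    def verify(self, proposed: List[str]) -> List[str]:
        # Stand-in for EAGLEWorker.verify: accept a prefix of the proposals.
        return proposed[:2]

def speculative_decode_step(draft, target, num_steps: int = 4) -> List[str]:
    proposed: List[str] = []
    for _ in range(num_steps):      # draft phase
        draft.decode_step(proposed)
    return target.verify(proposed)  # verify phase: one target-model pass

print(speculative_decode_step(ToyDraftModel(), ToyTargetModel()))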
sglang/srt/speculative/spec_info.py CHANGED
@@ -2,8 +2,12 @@ from enum import IntEnum, auto
 
 
 class SpeculativeAlgorithm(IntEnum):
+    NONE = auto()
     EAGLE = auto()
 
+    def is_none(self):
+        return self == SpeculativeAlgorithm.NONE
+
     def is_eagle(self):
         return self == SpeculativeAlgorithm.EAGLE
 
@@ -11,6 +15,7 @@ class SpeculativeAlgorithm(IntEnum):
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
+            None: SpeculativeAlgorithm.NONE,
        }
         return name_map[name]
 
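With the NONE member in place, an unset speculative algorithm maps cleanly through `from_string` instead of raising a KeyError. A standalone, illustrative copy of the enum (the real definition lives in sglang/srt/speculative/spec_info.py as shown above; the @staticmethod decorator is assumed here so the snippet runs on its own):

from enum import IntEnum, auto

class SpeculativeAlgorithm(IntEnum):
    NONE = auto()
    EAGLE = auto()

    def is_none(self):
        return self == SpeculativeAlgorithm.NONE

    def is_eagle(self):
        return self == SpeculativeAlgorithm.EAGLE

    @staticmethod
    def from_string(name):
        # None (no speculative algorithm configured) now resolves to NONE.
        return {"EAGLE": SpeculativeAlgorithm.EAGLE, None: SpeculativeAlgorithm.NONE}[name]

assert SpeculativeAlgorithm.from_string(None).is_none()
assert SpeculativeAlgorithm.from_string("EAGLE").is_eagle()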
sglang/srt/utils.py CHANGED
@@ -15,6 +15,7 @@
 
 import base64
 import dataclasses
+import io
 import ipaddress
 import itertools
 import json
@@ -34,6 +35,7 @@ import warnings
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
+from multiprocessing.reduction import ForkingPickler
 from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
 
 import numpy as np
@@ -60,7 +62,6 @@ from triton.runtime.cache import (
 
 logger = logging.getLogger(__name__)
 
-
 show_time_cost = False
 time_infos = {}
 
@@ -334,6 +335,8 @@ def is_port_available(port):
             return True
         except socket.error:
             return False
+        except OverflowError:
+            return False
 
 
 def decode_video_base64(video_base64):
@@ -708,13 +711,14 @@ def broadcast_pyobj(
     data: List[Any],
     rank: int,
     dist_group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
 ):
     """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""
 
     if rank == 0:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long)
-            dist.broadcast(tensor_size, src=0, group=dist_group)
+            dist.broadcast(tensor_size, src=src, group=dist_group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
@@ -723,19 +727,19 @@ def broadcast_pyobj(
             )
             tensor_size = torch.tensor([size], dtype=torch.long)
 
-            dist.broadcast(tensor_size, src=0, group=dist_group)
-            dist.broadcast(tensor_data, src=0, group=dist_group)
+            dist.broadcast(tensor_size, src=src, group=dist_group)
+            dist.broadcast(tensor_data, src=src, group=dist_group)
         return data
     else:
         tensor_size = torch.tensor([0], dtype=torch.long)
-        dist.broadcast(tensor_size, src=0, group=dist_group)
+        dist.broadcast(tensor_size, src=src, group=dist_group)
         size = tensor_size.item()
 
         if size == 0:
             return []
 
         tensor_data = torch.empty(size, dtype=torch.uint8)
-        dist.broadcast(tensor_data, src=0, group=dist_group)
+        dist.broadcast(tensor_data, src=src, group=dist_group)
 
         serialized_data = bytes(tensor_data.cpu().numpy())
         data = pickle.loads(serialized_data)
@@ -1206,7 +1210,6 @@ def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) ->
     # https://github.com/pytorch/pytorch/blob/
     # c1cd946818442aca8c7f812b16d187ce1586c3bc/
     # torch/cuda/__init__.py#L831C1-L831C17
-    import torch.cuda
    import torch.version
 
     if not torch.cuda._is_compiled():
@@ -1335,3 +1338,40 @@ def parse_tool_response(text, tools, **kwargs):
         for call_info in call_info_list
     ]
     return text, call_info_list
+
+
+class MultiprocessingSerializer:
+    @staticmethod
+    def serialize(obj):
+        buf = io.BytesIO()
+        ForkingPickler(buf).dump(obj)
+        buf.seek(0)
+        return buf.read()
+
+    @staticmethod
+    def deserialize(data):
+        return ForkingPickler.loads(data)
+
+
+def debug_timing(func):
+    # todo: replace with a more organized instrumentation
+    def wrapper(*args, **kwargs):
+        if logger.isEnabledFor(logging.DEBUG):
+            tic = torch.cuda.Event(enable_timing=True)
+            toc = torch.cuda.Event(enable_timing=True)
+            tic.record()
+            result = func(*args, **kwargs)
+            toc.record()
+            torch.cuda.synchronize()  # Ensure all CUDA operations are complete
+            elapsed = tic.elapsed_time(toc)
+            indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
+            num_tokens = len(indices) if indices is not None else 0
+            throughput = num_tokens / elapsed * 1000 if elapsed > 0 else 0
+            logger.debug(
+                f"Transfer time: {elapsed} ms, throughput: {throughput} tokens/s"
+            )
+            return result
+        else:
+            return func(*args, **kwargs)
+
+    return wrapper
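A short usage sketch for the two helpers added above. The class, decorator, and module path come from this diff; the payload and the decorated function are illustrative, and the timing branch of debug_timing only runs with CUDA and DEBUG-level logging (otherwise the wrapper just calls through):

import torch

from sglang.srt.utils import MultiprocessingSerializer, debug_timing

# Round-trip a Python object through the ForkingPickler-based serializer
# (in practice this carries objects, including tensors, between processes).
payload = {"rid": "req-0", "input_ids": [1, 2, 3]}
blob = MultiprocessingSerializer.serialize(payload)
assert MultiprocessingSerializer.deserialize(blob) == payload

# At the default log level this is equivalent to calling the function directly.
@debug_timing
def copy_rows(src: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
    return src[indices]

copy_rows(torch.zeros(16, 4), torch.tensor([0, 3, 7]))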
sglang/test/test_programs.py CHANGED
@@ -509,13 +509,35 @@ def test_hellaswag_select():
         temperature=0,
         num_threads=64,
         progress_bar=True,
+        generator_style=False,
     )
-    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    preds = []
+    for i, ret in enumerate(rets):
+        preds.append(choices[i].index(ret["answer"]))
     latency = time.time() - tic
 
     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))
 
+    # Test generator style of run_batch
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+        generator_style=True,
+    )
+    preds_gen = []
+    for i, ret in enumerate(rets):
+        preds_gen.append(choices[i].index(ret["answer"]))
+    latency_gen = time.time() - tic
+
+    # Compute accuracy
+    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    assert np.abs(accuracy_gen - accuracy) < 0.01
+    assert np.abs(latency_gen - latency) < 1
+
     return accuracy, latency
 
 
sglang/test/test_utils.py CHANGED
@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
@@ -532,6 +532,8 @@ def run_bench_serving(
     request_rate,
     other_server_args,
     dataset_name="random",
+    dataset_path="",
+    tokenizer=None,
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
@@ -553,9 +555,9 @@ def run_bench_serving(
         host=None,
         port=None,
         dataset_name=dataset_name,
-        dataset_path="",
+        dataset_path=dataset_path,
         model=None,
-        tokenizer=None,
+        tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
         random_input_len=random_input_len,
@@ -657,16 +659,16 @@ STDERR_FILENAME = "stderr.txt"
 STDOUT_FILENAME = "stdout.txt"
 
 
-def read_output(output_lines):
+def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
     """Print the output in real time with another thread."""
-    while not os.path.exists(STDERR_FILENAME):
+    while not os.path.exists(filename):
         time.sleep(1)
 
     pt = 0
     while pt >= 0:
-        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+        if pt > 0 and not os.path.exists(filename):
             break
-        lines = open(STDERR_FILENAME).readlines()
+        lines = open(filename).readlines()
         for line in lines[pt:]:
             print(line, end="", flush=True)
             output_lines.append(line)
@@ -747,6 +749,33 @@ def run_and_check_memory_leak(
     assert has_abort
 
 
+def run_command_and_capture_output(command, env: Optional[dict] = None):
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = subprocess.Popen(
+        command, stdout=stdout, stderr=stderr, env=env, text=True
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines, STDOUT_FILENAME))
+    t.start()
+
+    # Join the process
+    process.wait()
+
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
+    t.join()
+
+    return output_lines
+
+
 def run_mmlu_test(
     disable_radix_cache=False,
     enable_mixed_chunk=False,
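A hypothetical call to the new helper above: it spawns the command, tees its stdout into stdout.txt, streams the lines through read_output, and returns them once the process exits (the command shown is only an example):

import sys

from sglang.test.test_utils import run_command_and_capture_output

output_lines = run_command_and_capture_output(
    [sys.executable, "-c", "print('server ready')"]
)
print("captured:", output_lines)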
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post3"
+__version__ = "0.4.1.post5"
{sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post3
+Version: 0.4.1.post5
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -239,15 +239,15 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
+Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
-Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
+Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
 | [**Documentation**](https://sgl-project.github.io/)
-| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA)
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
 | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
@@ -347,12 +347,13 @@ The core features include:
 
 ## Getting Started
 - [Install SGLang](https://sgl-project.github.io/start/install.html)
-- [Send requests](https://sgl-project.github.io/start/send_request.html)
-- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
-- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
+- [Quick Start](https://sgl-project.github.io/start/send_request.html)
+- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)
 
 ## Benchmark and Performance
-Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -361,5 +362,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
 
 ## Acknowledgment and Citation
-We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.