sglang 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/lang/interpreter.py CHANGED
@@ -553,7 +553,8 @@ class StreamExecutor:
             "output_token_logprobs": output_token_logprobs,
         }
         self.variable_event[name].set()
-        self.stream_var_event[name].set()
+        if self.stream_var_event:
+            self.stream_var_event[name].set()
         self.text_ += decision
 
     def _execute_variable(self, expr: SglVariable):
sglang/lang/ir.py CHANGED
@@ -99,7 +99,6 @@ class SglSamplingParams:
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
-            "top_k": self.top_k,
             "frequency_penalty": self.frequency_penalty,
             "presence_penalty": self.presence_penalty,
         }
sglang/srt/constrained/base_cache.py → sglang/srt/constrained/base_tool_cache.py RENAMED
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""Base cache class."""
+"""Base tool cache for constrained decoding tools."""
 
 import time
 
 
-class BaseCache:
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -16,10 +16,10 @@ limitations under the License.
 """Cache for the compressed finite state machine."""
 
 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache
 
 
-class FSMCache(BaseCache):
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)
 
sglang/srt/constrained/jump_forward.py CHANGED
@@ -30,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache
 
 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
 
@@ -151,7 +151,7 @@ class JumpForwardMap:
         )
 
 
-class JumpForwardCache(BaseCache):
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()
 
sglang/srt/managers/schedule_batch.py CHANGED
@@ -28,6 +28,7 @@ from flashinfer.sampling import top_k_top_p_sampling_from_probs
 from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.mem_cache.radix_cache import RadixCache
 
@@ -486,15 +487,33 @@ class Batch:
             req = self.reqs[idx]
             retracted_reqs.append(req)
 
-            # TODO: apply more fine-grained retraction
-            last_uncached_pos = len(req.prefix_indices)
-            token_indices = self.req_to_token_pool.req_to_token[
-                req_pool_indices_cpu[idx]
-            ][last_uncached_pos : seq_lens_cpu[idx]]
-            self.token_to_kv_pool.free(token_indices)
-
-            # release the last node
-            self.tree_cache.dec_lock_ref(req.last_node)
+            if isinstance(self.tree_cache, ChunkCache):
+                # ChunkCache does not have eviction
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][: seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                del self.tree_cache.entries[req.rid]
+            else:
+                # TODO: apply more fine-grained retraction
+                last_uncached_pos = len(req.prefix_indices)
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][last_uncached_pos : seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+
+                # release the last node
+                self.tree_cache.dec_lock_ref(req.last_node)
+
+                # NOTE(lsyin): we should use the newly evictable memory instantly.
+                residual_size = (
+                    len(sorted_indices) * global_config.retract_decode_steps
+                    - self.token_to_kv_pool.available_size()
+                )
+                residual_size = max(0, residual_size)
+                self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)
 
             req.prefix_indices = None
             req.last_node = None
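The retraction path above now differs by cache type: with ChunkCache every KV slot of the request is freed immediately, while with RadixCache the newly evictable prefix memory is reclaimed right away so the surviving requests can keep decoding. A minimal sketch of that reservation arithmetic, using hypothetical standalone names rather than the Batch internals:

```python
# Hedged sketch of the eviction budget computed after a retraction round;
# the function name and arguments are illustrative, not part of sglang.
def tokens_to_evict(num_remaining_reqs: int, retract_decode_steps: int, free_kv_slots: int) -> int:
    # Each surviving request should be able to decode `retract_decode_steps`
    # more tokens before the next retraction check.
    needed = num_remaining_reqs * retract_decode_steps
    return max(0, needed - free_kv_slots)


# Example: 10 surviving requests, 32 reserved steps, 200 free slots -> evict 120.
assert tokens_to_evict(10, 32, 200) == 120
```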
@@ -575,6 +594,7 @@ class Batch:
             if req_pool_indices_cpu is None:
                 req_pool_indices_cpu = self.req_pool_indices.tolist()
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=cur_all_ids,
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
sglang/srt/managers/tp_worker.py CHANGED
@@ -43,6 +43,7 @@ from sglang.srt.managers.schedule_batch import (
     ForwardMode,
     Req,
 )
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.model_runner import ModelRunner
@@ -144,11 +145,20 @@ class ModelTpServer:
         )
 
         # Init cache
-        self.tree_cache = RadixCache(
-            req_to_token_pool=self.model_runner.req_to_token_pool,
-            token_to_kv_pool=self.model_runner.token_to_kv_pool,
-            disable=server_args.disable_radix_cache,
-        )
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            self.tree_cache = ChunkCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            )
+        else:
+            self.tree_cache = RadixCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                disable=server_args.disable_radix_cache,
+            )
         self.tree_cache_metrics = {"total": 0, "hit": 0}
         self.scheduler = PolicyScheduler(
             self.schedule_policy,
@@ -280,6 +290,14 @@ class ModelTpServer:
                 "KV cache pool leak detected!"
             )
 
+        if self.req_to_token_pool.can_use_mem_size != self.req_to_token_pool.size:
+            warnings.warn(
+                "Warning: "
+                f"available req slots={self.req_to_token_pool.can_use_mem_size}, "
+                f"total slots={self.req_to_token_pool.size}\n"
+                "Memory pool leak detected!"
+            )
+
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
@@ -346,7 +364,10 @@
         # Compute matched prefix length
         for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
-            prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
+            prefix_indices, last_node = self.tree_cache.match_prefix(
+                rid=req.rid,
+                key=req.input_ids,
+            )
             if req.return_logprob:
                 prefix_indices = prefix_indices[: req.logprob_start_len]
             req.extend_input_len = len(req.input_ids) - len(prefix_indices)
@@ -606,6 +627,7 @@
         req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
@@ -763,6 +785,7 @@
         for i in finished_indices:
             req = batch.reqs[i]
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
sglang/srt/mem_cache/base_cache.py ADDED
@@ -0,0 +1,43 @@
+from abc import ABC, abstractmethod
+
+
+class BasePrefixCache(ABC):
+    """Cache can be indexed by either rid or key."""
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def insert(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_req(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def evictable_size(self):
+        pass
+
+    def total_size(self):
+        raise NotImplementedError
+
+    def pretty_print(self):
+        raise NotImplementedError
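BasePrefixCache is the new common interface that both RadixCache and ChunkCache implement. A minimal sketch of a conforming subclass, assuming the 0.2.8 wheel is installed; the class itself is hypothetical and only illustrates the contract:

```python
from sglang.srt.mem_cache.base_cache import BasePrefixCache


class NoopPrefixCache(BasePrefixCache):
    """Illustrative no-op implementation: never caches, never evicts."""

    def reset(self):
        pass

    def match_prefix(self, **kwargs):
        return [], None  # no cached prefix, no last node

    def insert(self, **kwargs):
        raise NotImplementedError

    def cache_req(self, **kwargs):
        pass

    def evict(self, num_tokens, evict_callback):
        pass

    def inc_lock_ref(self, node):
        return 0

    def dec_lock_ref(self, node):
        return 0

    def evictable_size(self):
        return 0


cache = NoopPrefixCache()
print(cache.match_prefix(rid="req-0", key=[1, 2, 3]))  # ([], None)
```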
sglang/srt/mem_cache/chunk_cache.py ADDED
@@ -0,0 +1,60 @@
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
+
+class ChunkCacheEntry:
+    def __init__(self, rid, value):
+        self.rid = rid
+        self.value = value
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(self, req_to_token_pool, token_to_kv_pool):
+        self.disable = True
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+
+        self.reset()
+
+    def reset(self):
+        self.entries = {}
+
+    def match_prefix(self, rid, **kwargs):
+        if rid not in self.entries:
+            return [], None
+
+        entry = self.entries[rid]
+        return entry.value, entry
+
+    def cache_req(
+        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
+    ):
+        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
+        if del_in_memory_pool:
+            assert rid in self.entries
+            self.req_to_token_pool.free(req_pool_idx)
+            self.token_to_kv_pool.free(indices)
+            return
+
+        if rid not in self.entries:
+            self.entries[rid] = ChunkCacheEntry(rid, indices)
+
+        entry = self.entries[rid]
+        entry.value = indices
+        return indices, entry
+
+    def insert(self):
+        raise NotImplementedError
+
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    def inc_lock_ref(self, node):
+        return 0
+
+    def dec_lock_ref(self, node):
+        return 0
+
+    def evictable_size(self):
+        return 0
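A rough usage sketch of ChunkCache, assuming the 0.2.8 wheel is installed; the two pool objects below are simple stand-ins for ReqToTokenPool and TokenToKVPool, not the real classes:

```python
import numpy as np

from sglang.srt.mem_cache.chunk_cache import ChunkCache


class _StubReqToTokenPool:
    def __init__(self):
        self.req_to_token = np.arange(8 * 128).reshape(8, 128)  # fake slot table

    def free(self, idx):
        print("freed request slot", idx)


class _StubTokenToKVPool:
    def free(self, indices):
        print("freed", len(indices), "KV slots")


cache = ChunkCache(
    req_to_token_pool=_StubReqToTokenPool(),
    token_to_kv_pool=_StubTokenToKVPool(),
)

# During chunked prefill: remember the KV indices seen so far for this request.
cache.cache_req(rid="req-0", token_ids=[1, 2, 3], req_pool_idx=0, del_in_memory_pool=False)
prefix, entry = cache.match_prefix(rid="req-0")
print(len(prefix))  # 3

# When the request finishes: release both pools (del_in_memory_pool defaults to True).
cache.cache_req(rid="req-0", token_ids=[1, 2, 3, 4], req_pool_idx=0)
```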
sglang/srt/mem_cache/radix_cache.py CHANGED
@@ -23,6 +23,8 @@ from collections import defaultdict
 
 import torch
 
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
 
 class TreeNode:
     def __init__(self):
@@ -46,7 +48,7 @@ def _key_match(key0, key1):
     return i
 
 
-class RadixCache:
+class RadixCache(BasePrefixCache):
     def __init__(self, req_to_token_pool, token_to_kv_pool, disable: bool = False):
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
@@ -62,7 +64,7 @@ class RadixCache:
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0
 
-    def match_prefix(self, key):
+    def match_prefix(self, key, **kwargs):
         if self.disable:
             return [], self.root_node
 
@@ -90,6 +92,7 @@
         req_pool_idx,
         del_in_memory_pool=True,
         old_last_node=None,
+        **kwargs,
     ):
         # Insert the request into radix cache
         indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
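Because ChunkCache keys on the request id while RadixCache keys on the token ids, both match_prefix and cache_req now swallow extra keyword arguments, so callers such as ModelTpServer can use one call shape for either backend. A tiny, hedged demonstration (no memory pools are needed when the radix cache is disabled and the chunk cache is empty):

```python
from sglang.srt.mem_cache.chunk_cache import ChunkCache
from sglang.srt.mem_cache.radix_cache import RadixCache

radix = RadixCache(req_to_token_pool=None, token_to_kv_pool=None, disable=True)
chunk = ChunkCache(req_to_token_pool=None, token_to_kv_pool=None)

for cache in (radix, chunk):
    # The same keyword call works for both implementations; each one simply
    # ignores the argument it does not use.
    prefix_indices, last_node = cache.match_prefix(rid="req-0", key=[1, 2, 3])
    print(type(cache).__name__, list(prefix_indices))
```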
sglang/srt/model_executor/model_runner.py CHANGED
@@ -19,6 +19,7 @@ import importlib
 import importlib.resources
 import logging
 import pkgutil
+import warnings
 from functools import lru_cache
 from typing import Optional, Type
 
@@ -121,7 +122,11 @@
 
         # Load the model and create memory pool
         self.load_model()
-        self.init_memory_pool(total_gpu_memory, server_args.max_num_reqs)
+        self.init_memory_pool(
+            total_gpu_memory,
+            server_args.max_num_reqs,
+            server_args.max_total_tokens,
+        )
         self.init_cublas()
         self.init_flash_infer()
 
@@ -203,8 +208,18 @@
         max_num_token = int(rest_memory * (1 << 30) // cell_size)
         return max_num_token
 
-    def init_memory_pool(self, total_gpu_memory, max_num_reqs=None):
+    def init_memory_pool(
+        self, total_gpu_memory, max_num_reqs=None, max_total_tokens=None
+    ):
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
+        if max_total_tokens is not None:
+            if max_total_tokens > self.max_total_num_tokens:
+                warnings.warn(
+                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
+                    f"{self.max_total_num_tokens}. "
+                    f"Use the profiled value instead."
+                )
+            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)
 
         if self.max_total_num_tokens <= 0:
             raise RuntimeError(
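The new cap is applied conservatively: a user-supplied max_total_tokens can only shrink the profiled KV pool, never grow it. A standalone sketch of that rule (the function name is hypothetical):

```python
import warnings
from typing import Optional


def effective_max_total_tokens(profiled: int, requested: Optional[int]) -> int:
    """Clamp the requested pool size to the profiled capacity."""
    if requested is None:
        return profiled
    if requested > profiled:
        warnings.warn(
            f"max_total_tokens={requested} is larger than the profiled value {profiled}. "
            "Use the profiled value instead."
        )
    return min(profiled, requested)


assert effective_max_total_tokens(120_000, None) == 120_000
assert effective_max_total_tokens(120_000, 50_000) == 50_000
assert effective_max_total_tokens(120_000, 200_000) == 120_000
```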
sglang/srt/models/llama2.py CHANGED
@@ -26,6 +26,11 @@ from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -38,10 +43,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import InputMetadata
 
-MergedColumnParallelLinear = None
-QKVParallelLinear = None
-RowParallelLinear = None
-
 
 class LlamaMLP(nn.Module):
     def __init__(
@@ -295,23 +296,6 @@ class LlamaForCausalLM(nn.Module):
         cache_config: Optional[CacheConfig] = None,
         efficient_weight_load=False,
     ) -> None:
-        global MergedColumnParallelLinear
-        global QKVParallelLinear
-        global RowParallelLinear
-
-        if efficient_weight_load:
-            from sglang.srt.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-        else:
-            from vllm.model_executor.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-
         super().__init__()
         self.config = config
         self.quant_config = quant_config
sglang/srt/openai_api/adapter.py CHANGED
@@ -43,7 +43,9 @@ from sglang.srt.openai_api.protocol import (
     ChatCompletionResponseChoice,
     ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
     ChatMessage,
+    ChoiceLogprobs,
     CompletionRequest,
     CompletionResponse,
     CompletionResponseChoice,
@@ -54,6 +56,7 @@ from sglang.srt.openai_api.protocol import (
     FileRequest,
     FileResponse,
     LogProbs,
+    TopLogprob,
     UsageInfo,
 )
 
@@ -70,7 +73,7 @@ class FileMetadata:
 batch_storage: Dict[str, BatchResponse] = {}
 file_id_request: Dict[str, FileMetadata] = {}
 file_id_response: Dict[str, FileResponse] = {}
-## map file id to file path in SGlang backend
+# map file id to file path in SGlang backend
 file_id_storage: Dict[str, str] = {}
 
 
@@ -261,7 +264,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
             failed_requests += len(file_request_list)
 
         for idx, response in enumerate(responses):
-            ## the batch_req here can be changed to be named within a batch granularity
+            # the batch_req here can be changed to be named within a batch granularity
             response_json = {
                 "id": f"batch_req_{uuid.uuid4()}",
                 "custom_id": file_request_list[idx].get("custom_id"),
@@ -333,6 +336,8 @@ def v1_generate_request(all_requests):
 
     prompts = []
     sampling_params_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     first_prompt_type = type(all_requests[0].prompt)
     for request in all_requests:
         prompt = request.prompt
@@ -340,6 +345,10 @@ def v1_generate_request(all_requests):
             type(prompt) == first_prompt_type
         ), "All prompts must be of the same type in file input settings"
         prompts.append(prompt)
+        return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
+        top_logprobs_nums.append(
+            request.logprobs if request.logprobs is not None else 0
+        )
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -361,7 +370,9 @@ def v1_generate_request(all_requests):
     if len(all_requests) == 1:
         prompt = prompts[0]
         sampling_params_list = sampling_params_list[0]
-        if isinstance(prompts, str) or isinstance(prompts[0], str):
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
+        if isinstance(prompt, str) or isinstance(prompt[0], str):
             prompt_kwargs = {"text": prompt}
         else:
             prompt_kwargs = {"input_ids": prompt}
@@ -370,15 +381,11 @@ def v1_generate_request(all_requests):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
-
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         sampling_params=sampling_params_list,
-        return_logprob=all_requests[0].logprobs is not None
-        and all_requests[0].logprobs > 0,
-        top_logprobs_num=(
-            all_requests[0].logprobs if all_requests[0].logprobs is not None else 0
-        ),
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
     )
@@ -430,7 +437,7 @@ def v1_generate_response(request, ret, to_file=False):
             logprobs = None
 
         if to_file:
-            ## to make the choise data json serializable
+            # to make the choise data json serializable
             choice_data = {
                 "index": 0,
                 "text": text,
@@ -454,7 +461,7 @@ def v1_generate_response(request, ret, to_file=False):
                 "status_code": 200,
                 "request_id": ret[i]["meta_info"]["id"],
                 "body": {
-                    ## remain the same but if needed we can change that
+                    # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "text_completion",
                     "created": int(time.time()),
@@ -590,6 +597,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
     texts = []
     sampling_params_list = []
     image_data_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         #  - prompt: The full prompt string.
@@ -620,6 +629,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
             stop = request.stop
             image_data = None
         texts.append(prompt)
+        return_logprobs.append(request.logprobs)
+        top_logprobs_nums.append(request.top_logprobs)
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -637,11 +648,16 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
         texts = texts[0]
         sampling_params_list = sampling_params_list[0]
         image_data = image_data_list[0]
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
     adapted_request = GenerateReqInput(
         text=texts,
         image_data=image_data,
         sampling_params=sampling_params_list,
-        stream=request.stream,
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
+        stream=all_requests[0].stream,
+        return_text_in_logprobs=True,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
@@ -654,26 +670,63 @@ def v1_chat_generate_response(request, ret, to_file=False):
     total_completion_tokens = 0
 
     for idx, ret_item in enumerate(ret):
+        logprobs = False
+        if isinstance(request, List) and request[idx].logprobs:
+            logprobs = True
+        elif (not isinstance(request, List)) and request.logprobs:
+            logprobs = True
+        if logprobs:
+            logprobs = to_openai_style_logprobs(
+                output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+            )
+            token_logprobs = []
+            for token, logprob in zip(logprobs.tokens, logprobs.token_logprobs):
+                token_bytes = list(token.encode("utf-8"))
+                top_logprobs = []
+                if logprobs.top_logprobs:
+                    for top_token, top_logprob in logprobs.top_logprobs[0].items():
+                        top_token_bytes = list(top_token.encode("utf-8"))
+                        top_logprobs.append(
+                            TopLogprob(
+                                token=top_token,
+                                bytes=top_token_bytes,
+                                logprob=top_logprob,
+                            )
+                        )
+                token_logprobs.append(
+                    ChatCompletionTokenLogprob(
+                        token=token,
+                        bytes=token_bytes,
+                        logprob=logprob,
+                        top_logprobs=top_logprobs,
+                    )
+                )
+
+            choice_logprobs = ChoiceLogprobs(content=token_logprobs)
+        else:
+            choice_logprobs = None
         prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
         completion_tokens = ret_item["meta_info"]["completion_tokens"]
 
         if to_file:
-            ## to make the choice data json serializable
+            # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "message": {"role": "assistant", "content": ret_item["text"]},
-                "logprobs": None,
+                "logprobs": choice_logprobs,
                 "finish_reason": ret_item["meta_info"]["finish_reason"],
             }
         else:
            choice_data = ChatCompletionResponseChoice(
                 index=idx,
                 message=ChatMessage(role="assistant", content=ret_item["text"]),
+                logprobs=choice_logprobs,
                 finish_reason=ret_item["meta_info"]["finish_reason"],
             )
 
         choices.append(choice_data)
-        total_prompt_tokens = prompt_tokens
+        total_prompt_tokens += prompt_tokens
         total_completion_tokens += completion_tokens
     if to_file:
         responses = []
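With the changes above, /v1/chat/completions can now return OpenAI-style per-token logprobs. A hypothetical client-side request against a locally running server (the port, model name, and prompt are assumptions, not taken from this diff):

```python
import openai

# sglang's default server port is 30000; adjust to your deployment.
client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name one prime number."}],
    logprobs=True,
    top_logprobs=2,
)

# Each token now carries its logprob plus the requested top alternatives.
first = resp.choices[0].logprobs.content[0]
print(first.token, first.logprob, [t.token for t in first.top_logprobs])
```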
@@ -683,7 +736,7 @@ def v1_chat_generate_response(request, ret, to_file=False):
                 "status_code": 200,
                 "request_id": ret[i]["meta_info"]["id"],
                 "body": {
-                    ## remain the same but if needed we can change that
+                    # remain the same but if needed we can change that
                     "id": ret[i]["meta_info"]["id"],
                     "object": "chat.completion",
                     "created": int(time.time()),
sglang/srt/openai_api/protocol.py CHANGED
@@ -54,6 +54,24 @@ class LogProbs(BaseModel):
     top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
 
 
+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
 class UsageInfo(BaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
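The three new pydantic models mirror OpenAI's chat logprob schema. A small illustrative construction (the values are made up):

```python
from sglang.srt.openai_api.protocol import (
    ChatCompletionTokenLogprob,
    ChoiceLogprobs,
    TopLogprob,
)

token = "Hello"
entry = ChatCompletionTokenLogprob(
    token=token,
    bytes=list(token.encode("utf-8")),
    logprob=-0.12,
    top_logprobs=[
        TopLogprob(token="Hello", bytes=list(b"Hello"), logprob=-0.12),
        TopLogprob(token="Hi", bytes=list(b"Hi"), logprob=-2.3),
    ],
)
choice_logprobs = ChoiceLogprobs(content=[entry])
print(choice_logprobs)
```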
@@ -239,8 +257,8 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[LogProbs] = None
-    finish_reason: Optional[str] = None
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: str
 
 
 class ChatCompletionResponse(BaseModel):
sglang/srt/server.py CHANGED
@@ -260,7 +260,7 @@ def launch_server(
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.2",
+            "0.1.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -479,6 +479,9 @@ class Runtime:
             parent.wait(timeout=5)
         self.pid = None
 
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
     def get_tokenizer(self):
         return get_tokenizer(
             self.server_args.tokenizer_path,
sglang/srt/server_args.py CHANGED
@@ -44,6 +44,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
+    max_total_tokens: Optional[int] = None
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
 
@@ -231,6 +232,12 @@
             default=ServerArgs.max_num_reqs,
             help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
         )
+        parser.add_argument(
+            "--max-total-tokens",
+            type=int,
+            default=ServerArgs.max_total_tokens,
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+        )
         parser.add_argument(
             "--schedule-policy",
             type=str,
@@ -412,10 +419,6 @@
             self.dp_size > 1 and self.node_rank is not None
         ), "multi-node data parallel is not supported"
 
-        assert not (
-            self.chunked_prefill_size is not None and self.disable_radix_cache
-        ), "chunked prefill is not supported with radix cache disabled currently"
-
 
 @dataclasses.dataclass
 class PortArgs:
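Dropping the assertion above means chunked prefill can now be combined with a disabled radix cache (served by the new ChunkCache), and the new --max-total-tokens flag caps the KV pool. A hedged launch sketch using the Python Runtime wrapper, assuming sgl.Runtime forwards keyword arguments to ServerArgs as usual; the model path is an assumption, not taken from this diff:

```python
import sglang as sgl

# Launch kwargs mirror the ServerArgs fields shown in this release.
runtime = sgl.Runtime(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # any local or HF model path
    chunked_prefill_size=4096,   # chunked prefill on
    disable_radix_cache=True,    # combined with chunked prefill -> ChunkCache backend
    max_total_tokens=20000,      # new debug-style cap on the KV token pool
)
# ... run workloads against the OpenAI-compatible endpoint ...
runtime.shutdown()
```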
sglang/test/test_programs.py CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
         s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-        s += ' "country": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-        s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+        s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
         s += "}"
 
-    ret = decode_json.run()
+    ret = decode_json.run(temperature=0.0)
     try:
         js_obj = json.loads(ret["json_output"])
     except json.decoder.JSONDecodeError:
-        print(ret["json_output"])
+        print("JSONDecodeError", ret["json_output"])
         raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
         s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
         s += "}"
 
-    ret = decode_json.run()
-    js_obj = json.loads(ret["json_output"])
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
 
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.2.7"
+__version__ = "0.2.8"
sglang-0.2.7.dist-info/METADATA → sglang-0.2.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.7
+Version: 0.2.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -299,8 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-# Use the stable release branch
-git clone -b release https://github.com/sgl-project/sglang.git
+# Use the stable v0.2.8 branch
+git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
-Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
sglang-0.2.7.dist-info/RECORD → sglang-0.2.8.dist-info/RECORD RENAMED
@@ -7,12 +7,12 @@ sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=r0Z7hY_bFFk-b6WeQJir9br-hCW2-p7n5E7Et2WziaQ,8776
-sglang/version.py,sha256=XHypfHSPdgXFKmOdoewn7czU670gt8InhHhzlP5j_aA,22
+sglang/version.py,sha256=G6Dbxq2ws-1ZAXwDD8q0KWueYtso_Y6Uyvtj8sRWsPI,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=dt_NAAMv2oSYxwSMjhMr2pIGTe5_d12cSR91SUWvpCQ,30298
-sglang/lang/ir.py,sha256=THa6hwnuTVXVYxnovNQP_o7A9v5O8uXE4eLXH9vDRLA,16648
+sglang/lang/interpreter.py,sha256=_MbvYB0vweCgALklpM2DlofiCXuITCmX_fl8rPPcp5U,30340
+sglang/lang/ir.py,sha256=0r-mhA4aO-uuS97Dvkw99ERTcJXfzuV6jJQMmuCwHEg,16615
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
@@ -26,13 +26,13 @@ sglang/srt/hf_transformers_utils.py,sha256=Fg-3panb6lsqOhHmAYA0ivkXyBjdnvY5mqvil
 sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
 sglang/srt/model_config.py,sha256=DO7m84WiT3dzPWmyKz_UXDAHEdqEjq8Lq5wCjzjYMME,6023
 sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
-sglang/srt/server.py,sha256=2qgluP7_6-e36PDK_mr-rLK9us3_9KvXLG3255h-tS4,16022
-sglang/srt/server_args.py,sha256=0cV-r5QTV_9Arl3hVf9mc20BlOhYhWSkICU0T3dS180,15412
+sglang/srt/server.py,sha256=8uDMWGAp2EZ8bywQumEa6T2G2k78-oYXgLfk6qBkv8o,16107
+sglang/srt/server_args.py,sha256=zGAbZqKKN4dkn5BDcZdjxLM-jIFsHX2ThAEfvPKUm6c,15645
 sglang/srt/utils.py,sha256=uIatocIFzqi6fWSscz2MjF3jUcIRBJlqLgYeicM_W9s,22950
 sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
-sglang/srt/constrained/base_cache.py,sha256=Aeu2HMPhXMPNQNEwPJ19sECN0PYPZKjisrZiCcocHiw,1970
-sglang/srt/constrained/fsm_cache.py,sha256=Q7wfGx7VOghErqcC_0kK4aI-lBEO9TxoFPyUiBxEGVE,2626
-sglang/srt/constrained/jump_forward.py,sha256=SYKj5Pd3d7oym5fAI8zUzj3zKk-lV30m_ksAy0ubgO8,6180
+sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
+sglang/srt/constrained/fsm_cache.py,sha256=GoPBr_9ZdJizF2PKbYoQw2I4ckfrUYwCeMZxB9sY3TM,2639
+sglang/srt/constrained/jump_forward.py,sha256=IgZ8D0woy5FLIQvXkE8wZRYejDsfVkjU0sqUlkiv_f4,6193
 sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1bebtY-p9MBndBaoIE2VXrk,5180
 sglang/srt/layers/extend_attention.py,sha256=zuNnAdL_wF6BX0Mwn1dgDJvh3YJjYwqa5Fbzp8muOVc,12573
 sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
@@ -47,14 +47,16 @@ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk
 sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
 sglang/srt/managers/io_struct.py,sha256=Rz7Ur9Yw6prDGdy6XjsSiUmVBccS6cef-G_9TW7HA_4,7105
 sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
-sglang/srt/managers/schedule_batch.py,sha256=tbos5i4KSfk1K8VH5HCNm2pQGlJMKVAE_mZ8haVMelc,36620
+sglang/srt/managers/schedule_batch.py,sha256=LIoVCPNivh0u1dOrrWRgFD6a4ywq3nrG_4dNgCK0kIw,37697
 sglang/srt/managers/tokenizer_manager.py,sha256=tEct3shjjw_7ickj_cmt9IxoBHfgbryQHI7DZS0m4TA,20511
-sglang/srt/managers/tp_worker.py,sha256=91gbWi7hSuyTC3Qvo7EXKmHM6GKWTK0Nqpda001jOw0,34349
+sglang/srt/managers/tp_worker.py,sha256=JPLneFwcPlmPXZX1QxZHWgcdau8FC8wNuVqfCqsgOkU,35234
+sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
+sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=wkhjyYLbAZrl2FB5i4ODkxgMufBuDpe4N0kbXhu6ZO0,4509
-sglang/srt/mem_cache/radix_cache.py,sha256=Xk0c8nwyPHEUsobVJQrr7edwyzUMk9MBYTQBprN8a0Y,8775
+sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=OdmO6R7nHWrRJCtZOxYkt0KNdGoX7Md4knsypwPYjaQ,9365
-sglang/srt/model_executor/model_runner.py,sha256=WyPsO73MD3ziKAk76j4HemePYZluXjs9WGYeajUgfQA,15507
+sglang/srt/model_executor/model_runner.py,sha256=fo3fbnNaHkcHz2UDkyvFjU7sGvdClhmhdelQh0n9PgA,16079
 sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
 sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
 sglang/srt/models/chatglm.py,sha256=vYWooqyPmcSFZNjxj_g5I_FgHJlDytbEiz6vyv3JBNM,13856
@@ -67,7 +69,7 @@ sglang/srt/models/gemma2.py,sha256=kTjZcsptgtYaO8BL_NlygjVSMSloq2Mc4Rf3FKvEhbs,1
 sglang/srt/models/gpt_bigcode.py,sha256=U7GmHKywSu12D-EwvuWv3RwHkx6bPawaRIjlFIpQkfs,10194
 sglang/srt/models/grok.py,sha256=NfZdsRVErDIUWFqjhtNf2pqC9G4cRdYHBFpgDq1IZ2A,27855
 sglang/srt/models/internlm2.py,sha256=Ld2GUxZeqqqJ2vd4QiX2s1y2AceJLA1nVnUYY88GMQk,12219
-sglang/srt/models/llama2.py,sha256=zhoCUh_3dNC7FOzDnaoHcHF3-y7vTVYDZzHKqIsUJgs,14764
+sglang/srt/models/llama2.py,sha256=zfOk3OK1_B6s6yuXsZFmNCf07RsfytVD72GunLBt8Cc,14282
 sglang/srt/models/llama_classification.py,sha256=4r_orFZqBR3U_yC4bus1K3Z3-ADscYGSzgA82_VDN0g,4926
 sglang/srt/models/llava.py,sha256=BJphgyQGdo7uTpJcKGEfWwdpH9GTMDnyiznLSSgmvm8,18476
 sglang/srt/models/llavavid.py,sha256=-7vaVqaIfukCvMkNakEPblpwjIHC6ezrAvmpE5RzlUY,13602
@@ -80,14 +82,14 @@ sglang/srt/models/qwen2.py,sha256=mXlVd6UTCXY3VdgodFpQnlaY-NYLIbA-SknxdA9R13w,12
 sglang/srt/models/qwen2_moe.py,sha256=YYdJEezic7GyW-_bXlNIaqBa0C4IHQpz_vuRBLxms4k,18141
 sglang/srt/models/stablelm.py,sha256=b3d-ZwLQoLjZ6CupnkIq7d-z9tzGSxAyIcgSmZiZxZw,11362
 sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
-sglang/srt/openai_api/adapter.py,sha256=Jn8Awi93zkb3Wq5gqK698kOhmqYdtxZlRePciA50Ud4,30213
-sglang/srt/openai_api/protocol.py,sha256=_mBNdxb_4ZRIeP0wmW8tMTc2x7zu4foVxBDCuCWkaiw,7822
+sglang/srt/openai_api/adapter.py,sha256=MaWz78cvkk5RdotRMCIf_K5xYAClX7TonjxH_dzUrVI,32495
+sglang/srt/openai_api/protocol.py,sha256=JXLnnQ63I-bJv93ICPfP0cBpyomQA5IYE_mkUg5X4Es,8177
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
-sglang/test/test_programs.py,sha256=s4WGpTmYP4Yx5g8JYZpbkeF9RN5iUnlKdi8FGAZovTc,13756
+sglang/test/test_programs.py,sha256=0M8blaIy--eEE2dQnG4FyjIETT_wa7eEG3S9UWna6_4,13851
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.2.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.2.7.dist-info/METADATA,sha256=NU4S55-t6q87AKPkgbDORvX_Om0XbAJ9K67_p30JnQ0,33216
-sglang-0.2.7.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-sglang-0.2.7.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.2.7.dist-info/RECORD,,
+sglang-0.2.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.8.dist-info/METADATA,sha256=FRkxB6W7NQlj9ar65-oppfES5tc1pS8LRPJXU-43hsQ,33214
+sglang-0.2.8.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+sglang-0.2.8.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.8.dist-info/RECORD,,