sglang 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/interpreter.py +2 -1
- sglang/lang/ir.py +0 -1
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +2 -2
- sglang/srt/constrained/fsm_cache.py +2 -2
- sglang/srt/constrained/jump_forward.py +2 -2
- sglang/srt/managers/schedule_batch.py +29 -9
- sglang/srt/managers/tp_worker.py +29 -6
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/radix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +17 -2
- sglang/srt/models/llama2.py +5 -21
- sglang/srt/openai_api/adapter.py +69 -16
- sglang/srt/openai_api/protocol.py +20 -2
- sglang/srt/server.py +4 -1
- sglang/srt/server_args.py +7 -4
- sglang/test/test_programs.py +9 -6
- sglang/version.py +1 -1
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/METADATA +4 -4
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/RECORD +23 -21
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/WHEEL +0 -0
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/lang/interpreter.py
CHANGED
@@ -553,7 +553,8 @@ class StreamExecutor:
                     "output_token_logprobs": output_token_logprobs,
                 }
                 self.variable_event[name].set()
-                self.stream_var_event[name].set()
+                if self.stream_var_event:
+                    self.stream_var_event[name].set()
         self.text_ += decision

     def _execute_variable(self, expr: SglVariable):
sglang/lang/ir.py
CHANGED
sglang/srt/constrained/{base_cache.py → base_tool_cache.py}
RENAMED
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-"""Base cache…
+"""Base tool cache for constrained decoding tools."""

 import time


-class BaseCache:
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -16,10 +16,10 @@ limitations under the License.
 """Cache for the compressed finite state machine."""

 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache


-class FSMCache(BaseCache):
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)

sglang/srt/constrained/jump_forward.py
CHANGED
@@ -30,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

@@ -151,7 +151,7 @@ class JumpForwardMap:
     )


-class JumpForwardCache(BaseCache):
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()

sglang/srt/managers/schedule_batch.py
CHANGED
@@ -28,6 +28,7 @@ from flashinfer.sampling import top_k_top_p_sampling_from_probs
 from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.mem_cache.radix_cache import RadixCache

@@ -486,15 +487,33 @@ class Batch:
             req = self.reqs[idx]
             retracted_reqs.append(req)

-            … (9 removed lines are not rendered in the source diff view)
+            if isinstance(self.tree_cache, ChunkCache):
+                # ChunkCache does not have eviction
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][: seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                del self.tree_cache.entries[req.rid]
+            else:
+                # TODO: apply more fine-grained retraction
+                last_uncached_pos = len(req.prefix_indices)
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][last_uncached_pos : seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+
+                # release the last node
+                self.tree_cache.dec_lock_ref(req.last_node)
+
+                # NOTE(lsyin): we should use the newly evictable memory instantly.
+                residual_size = (
+                    len(sorted_indices) * global_config.retract_decode_steps
+                    - self.token_to_kv_pool.available_size()
+                )
+                residual_size = max(0, residual_size)
+                self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)

             req.prefix_indices = None
             req.last_node = None
@@ -575,6 +594,7 @@ class Batch:
             if req_pool_indices_cpu is None:
                 req_pool_indices_cpu = self.req_pool_indices.tolist()
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=cur_all_ids,
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
sglang/srt/managers/tp_worker.py
CHANGED
@@ -43,6 +43,7 @@ from sglang.srt.managers.schedule_batch import (
     ForwardMode,
     Req,
 )
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.model_runner import ModelRunner
@@ -144,11 +145,20 @@ class ModelTpServer:
         )

         # Init cache
-        self.tree_cache = RadixCache(
-            req_to_token_pool=self.model_runner.req_to_token_pool,
-            token_to_kv_pool=self.model_runner.token_to_kv_pool,
-            disable=server_args.disable_radix_cache,
-        )
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            self.tree_cache = ChunkCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            )
+        else:
+            self.tree_cache = RadixCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                disable=server_args.disable_radix_cache,
+            )
         self.tree_cache_metrics = {"total": 0, "hit": 0}
         self.scheduler = PolicyScheduler(
             self.schedule_policy,
@@ -280,6 +290,14 @@ class ModelTpServer:
                 "KV cache pool leak detected!"
             )

+        if self.req_to_token_pool.can_use_mem_size != self.req_to_token_pool.size:
+            warnings.warn(
+                "Warning: "
+                f"available req slots={self.req_to_token_pool.can_use_mem_size}, "
+                f"total slots={self.req_to_token_pool.size}\n"
+                "Memory pool leak detected!"
+            )
+
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
@@ -346,7 +364,10 @@
         # Compute matched prefix length
         for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
-            prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
+            prefix_indices, last_node = self.tree_cache.match_prefix(
+                rid=req.rid,
+                key=req.input_ids,
+            )
             if req.return_logprob:
                 prefix_indices = prefix_indices[: req.logprob_start_len]
             req.extend_input_len = len(req.input_ids) - len(prefix_indices)
@@ -606,6 +627,7 @@
         req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
@@ -763,6 +785,7 @@
         for i in finished_indices:
             req = batch.reqs[i]
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
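
For context, the cache selection above only builds a ChunkCache when chunked prefill is enabled and the radix cache is disabled; otherwise the RadixCache path is unchanged. A minimal launch sketch, assuming `Runtime` forwards these keyword arguments to `ServerArgs` (the model path is a placeholder):

```python
import sglang as sgl

# Hypothetical launch: chunked prefill together with a disabled radix cache is
# the combination that makes ModelTpServer build a ChunkCache instead of a RadixCache.
runtime = sgl.Runtime(
    model_path="meta-llama/Llama-2-7b-chat-hf",  # placeholder model
    chunked_prefill_size=4096,   # enable chunked prefill
    disable_radix_cache=True,    # disable the radix cache
)
runtime.shutdown()
```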
sglang/srt/mem_cache/base_cache.py
ADDED
@@ -0,0 +1,43 @@
+from abc import ABC, abstractmethod
+
+
+class BasePrefixCache(ABC):
+    """Cache can be indexed by either rid or key."""
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def insert(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_req(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def evictable_size(self):
+        pass
+
+    def total_size(self):
+        raise NotImplementedError
+
+    def pretty_print(self):
+        raise NotImplementedError
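
`BasePrefixCache` is the new abstract interface shared by `RadixCache` and `ChunkCache`. As a rough, illustrative sketch of the contract (not part of the package), a do-nothing subclass only needs the abstract methods:

```python
from sglang.srt.mem_cache.base_cache import BasePrefixCache


class NoOpPrefixCache(BasePrefixCache):
    """Illustrative stub: caches nothing, so no prefix is ever reused."""

    def reset(self):
        pass

    def match_prefix(self, **kwargs):
        # No stored prefixes, so nothing ever matches.
        return [], None

    def insert(self, **kwargs):
        return 0

    def cache_req(self, **kwargs):
        return None

    def evict(self, num_tokens, evict_callback):
        pass

    def inc_lock_ref(self, node):
        return 0

    def dec_lock_ref(self, node):
        return 0

    def evictable_size(self):
        return 0
```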
sglang/srt/mem_cache/chunk_cache.py
ADDED
@@ -0,0 +1,60 @@
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
+
+class ChunkCacheEntry:
+    def __init__(self, rid, value):
+        self.rid = rid
+        self.value = value
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(self, req_to_token_pool, token_to_kv_pool):
+        self.disable = True
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+
+        self.reset()
+
+    def reset(self):
+        self.entries = {}
+
+    def match_prefix(self, rid, **kwargs):
+        if rid not in self.entries:
+            return [], None
+
+        entry = self.entries[rid]
+        return entry.value, entry
+
+    def cache_req(
+        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
+    ):
+        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
+        if del_in_memory_pool:
+            assert rid in self.entries
+            self.req_to_token_pool.free(req_pool_idx)
+            self.token_to_kv_pool.free(indices)
+            return
+
+        if rid not in self.entries:
+            self.entries[rid] = ChunkCacheEntry(rid, indices)
+
+        entry = self.entries[rid]
+        entry.value = indices
+        return indices, entry
+
+    def insert(self):
+        raise NotImplementedError
+
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    def inc_lock_ref(self, node):
+        return 0
+
+    def dec_lock_ref(self, node):
+        return 0
+
+    def evictable_size(self):
+        return 0
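
A rough sketch of the rid-keyed flow `ChunkCache` implements; the two tiny pools below are hypothetical stand-ins for `ReqToTokenPool` and `TokenToKVPool`, kept only to make the example self-contained:

```python
import numpy as np

from sglang.srt.mem_cache.chunk_cache import ChunkCache


class _FakeReqToTokenPool:
    """Stand-in pool: one row of token indices per request slot."""

    def __init__(self):
        self.req_to_token = np.arange(64).reshape(4, 16)

    def free(self, req_pool_idx):
        pass  # illustrative no-op


class _FakeTokenToKVPool:
    def free(self, indices):
        pass  # illustrative no-op


cache = ChunkCache(_FakeReqToTokenPool(), _FakeTokenToKVPool())

# While request "abc" is being chunk-prefilled, remember its token indices
# without freeing anything (del_in_memory_pool=False).
indices, entry = cache.cache_req(
    rid="abc", token_ids=list(range(8)), req_pool_idx=0, del_in_memory_pool=False
)

# Later scheduling rounds look the chunk up by rid rather than by token key.
prefix, entry = cache.match_prefix(rid="abc")

# When the request finishes, the default del_in_memory_pool=True path frees
# the request slot and its KV indices.
cache.cache_req(rid="abc", token_ids=list(range(8)), req_pool_idx=0)
```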
sglang/srt/mem_cache/radix_cache.py
CHANGED
@@ -23,6 +23,8 @@ from collections import defaultdict

 import torch

+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+

 class TreeNode:
     def __init__(self):
@@ -46,7 +48,7 @@ def _key_match(key0, key1):
     return i


-class RadixCache:
+class RadixCache(BasePrefixCache):
     def __init__(self, req_to_token_pool, token_to_kv_pool, disable: bool = False):
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
@@ -62,7 +64,7 @@ class RadixCache:
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0

-    def match_prefix(self, key):
+    def match_prefix(self, key, **kwargs):
         if self.disable:
             return [], self.root_node

@@ -90,6 +92,7 @@ class RadixCache:
         req_pool_idx,
         del_in_memory_pool=True,
         old_last_node=None,
+        **kwargs,
     ):
         # Insert the request into radix cache
         indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -19,6 +19,7 @@ import importlib
 import importlib.resources
 import logging
 import pkgutil
+import warnings
 from functools import lru_cache
 from typing import Optional, Type

@@ -121,7 +122,11 @@ class ModelRunner:

         # Load the model and create memory pool
         self.load_model()
-        self.init_memory_pool(total_gpu_memory, server_args.max_num_reqs)
+        self.init_memory_pool(
+            total_gpu_memory,
+            server_args.max_num_reqs,
+            server_args.max_total_tokens,
+        )
         self.init_cublas()
         self.init_flash_infer()

@@ -203,8 +208,18 @@ class ModelRunner:
         max_num_token = int(rest_memory * (1 << 30) // cell_size)
         return max_num_token

-    def init_memory_pool(self, total_gpu_memory, max_num_reqs=None):
+    def init_memory_pool(
+        self, total_gpu_memory, max_num_reqs=None, max_total_tokens=None
+    ):
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
+        if max_total_tokens is not None:
+            if max_total_tokens > self.max_total_num_tokens:
+                warnings.warn(
+                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
+                    f"{self.max_total_num_tokens}. "
+                    f"Use the profiled value instead."
+                )
+            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)

         if self.max_total_num_tokens <= 0:
             raise RuntimeError(
sglang/srt/models/llama2.py
CHANGED
@@ -26,6 +26,11 @@ from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -38,10 +43,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import InputMetadata

-MergedColumnParallelLinear = None
-QKVParallelLinear = None
-RowParallelLinear = None
-

 class LlamaMLP(nn.Module):
     def __init__(
@@ -295,23 +296,6 @@ class LlamaForCausalLM(nn.Module):
         cache_config: Optional[CacheConfig] = None,
         efficient_weight_load=False,
     ) -> None:
-        global MergedColumnParallelLinear
-        global QKVParallelLinear
-        global RowParallelLinear
-
-        if efficient_weight_load:
-            from sglang.srt.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-        else:
-            from vllm.model_executor.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-
         super().__init__()
         self.config = config
         self.quant_config = quant_config
sglang/srt/openai_api/adapter.py
CHANGED
@@ -43,7 +43,9 @@ from sglang.srt.openai_api.protocol import (
     ChatCompletionResponseChoice,
     ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
     ChatMessage,
+    ChoiceLogprobs,
     CompletionRequest,
     CompletionResponse,
     CompletionResponseChoice,
@@ -54,6 +56,7 @@ from sglang.srt.openai_api.protocol import (
     FileRequest,
     FileResponse,
     LogProbs,
+    TopLogprob,
     UsageInfo,
 )

@@ -70,7 +73,7 @@ class FileMetadata:
 batch_storage: Dict[str, BatchResponse] = {}
 file_id_request: Dict[str, FileMetadata] = {}
 file_id_response: Dict[str, FileResponse] = {}
-
+# map file id to file path in SGlang backend
 file_id_storage: Dict[str, str] = {}


@@ -261,7 +264,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRequest):
         failed_requests += len(file_request_list)

     for idx, response in enumerate(responses):
-
+        # the batch_req here can be changed to be named within a batch granularity
         response_json = {
             "id": f"batch_req_{uuid.uuid4()}",
             "custom_id": file_request_list[idx].get("custom_id"),
@@ -333,6 +336,8 @@ def v1_generate_request(all_requests):

     prompts = []
     sampling_params_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     first_prompt_type = type(all_requests[0].prompt)
     for request in all_requests:
         prompt = request.prompt
@@ -340,6 +345,10 @@ def v1_generate_request(all_requests):
             type(prompt) == first_prompt_type
         ), "All prompts must be of the same type in file input settings"
         prompts.append(prompt)
+        return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
+        top_logprobs_nums.append(
+            request.logprobs if request.logprobs is not None else 0
+        )
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -361,7 +370,9 @@ def v1_generate_request(all_requests):
     if len(all_requests) == 1:
         prompt = prompts[0]
         sampling_params_list = sampling_params_list[0]
-
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
+        if isinstance(prompt, str) or isinstance(prompt[0], str):
             prompt_kwargs = {"text": prompt}
         else:
             prompt_kwargs = {"input_ids": prompt}
@@ -370,15 +381,11 @@ def v1_generate_request(all_requests):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
-
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         sampling_params=sampling_params_list,
-        return_logprob=…
-
-        top_logprobs_num=(
-            all_requests[0].logprobs if all_requests[0].logprobs is not None else 0
-        ),
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
     )
@@ -430,7 +437,7 @@ def v1_generate_response(request, ret, to_file=False):
         logprobs = None

     if to_file:
-
+        # to make the choise data json serializable
         choice_data = {
             "index": 0,
             "text": text,
@@ -454,7 +461,7 @@ def v1_generate_response(request, ret, to_file=False):
             "status_code": 200,
             "request_id": ret[i]["meta_info"]["id"],
             "body": {
-
+                # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
                 "created": int(time.time()),
@@ -590,6 +597,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
     texts = []
     sampling_params_list = []
     image_data_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         #  - prompt: The full prompt string.
@@ -620,6 +629,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
             stop = request.stop
             image_data = None
         texts.append(prompt)
+        return_logprobs.append(request.logprobs)
+        top_logprobs_nums.append(request.top_logprobs)
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -637,11 +648,16 @@
         texts = texts[0]
         sampling_params_list = sampling_params_list[0]
         image_data = image_data_list[0]
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
     adapted_request = GenerateReqInput(
         text=texts,
         image_data=image_data,
         sampling_params=sampling_params_list,
-
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
+        stream=all_requests[0].stream,
+        return_text_in_logprobs=True,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
@@ -654,26 +670,63 @@ def v1_chat_generate_response(request, ret, to_file=False):
     total_completion_tokens = 0

     for idx, ret_item in enumerate(ret):
+        logprobs = False
+        if isinstance(request, List) and request[idx].logprobs:
+            logprobs = True
+        elif (not isinstance(request, List)) and request.logprobs:
+            logprobs = True
+        if logprobs:
+            logprobs = to_openai_style_logprobs(
+                output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+            )
+            token_logprobs = []
+            for token, logprob in zip(logprobs.tokens, logprobs.token_logprobs):
+                token_bytes = list(token.encode("utf-8"))
+                top_logprobs = []
+                if logprobs.top_logprobs:
+                    for top_token, top_logprob in logprobs.top_logprobs[0].items():
+                        top_token_bytes = list(top_token.encode("utf-8"))
+                        top_logprobs.append(
+                            TopLogprob(
+                                token=top_token,
+                                bytes=top_token_bytes,
+                                logprob=top_logprob,
+                            )
+                        )
+                token_logprobs.append(
+                    ChatCompletionTokenLogprob(
+                        token=token,
+                        bytes=token_bytes,
+                        logprob=logprob,
+                        top_logprobs=top_logprobs,
+                    )
+                )
+
+            choice_logprobs = ChoiceLogprobs(content=token_logprobs)
+        else:
+            choice_logprobs = None
         prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
         completion_tokens = ret_item["meta_info"]["completion_tokens"]

         if to_file:
-
+            # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "message": {"role": "assistant", "content": ret_item["text"]},
-                "logprobs": …
+                "logprobs": choice_logprobs,
                 "finish_reason": ret_item["meta_info"]["finish_reason"],
             }
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
                 message=ChatMessage(role="assistant", content=ret_item["text"]),
+                logprobs=choice_logprobs,
                 finish_reason=ret_item["meta_info"]["finish_reason"],
             )

         choices.append(choice_data)
-        total_prompt_tokens = prompt_tokens
+        total_prompt_tokens += prompt_tokens
         total_completion_tokens += completion_tokens
     if to_file:
         responses = []
@@ -683,7 +736,7 @@ def v1_chat_generate_response(request, ret, to_file=False):
             "status_code": 200,
             "request_id": ret[i]["meta_info"]["id"],
             "body": {
-
+                # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
                 "created": int(time.time()),
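
With the adapter changes above, `/v1/chat/completions` can return OpenAI-style per-token logprobs. A hedged usage sketch against a locally launched server (port, model name, and API key are placeholders):

```python
import openai

# Assumes an sglang server is already running locally, e.g. on port 30000.
client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",  # placeholder model name
    messages=[{"role": "user", "content": "Name one prime number."}],
    logprobs=True,
    top_logprobs=2,
    max_tokens=8,
)

choice = resp.choices[0]
print(choice.message.content)
# Each entry carries token, bytes, logprob, and its top_logprobs alternatives.
for tok in choice.logprobs.content:
    print(tok.token, tok.logprob, [t.token for t in tok.top_logprobs])
```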
sglang/srt/openai_api/protocol.py
CHANGED
@@ -54,6 +54,24 @@ class LogProbs(BaseModel):
     top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
 class UsageInfo(BaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
@@ -239,8 +257,8 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[LogProbs] = None
-    finish_reason: …
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: str


 class ChatCompletionResponse(BaseModel):
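
The new response-side models nest as `ChoiceLogprobs -> ChatCompletionTokenLogprob -> TopLogprob`. A small standalone sketch of one token entry (the serialization call assumes pydantic v2; use `.dict()` on v1):

```python
from sglang.srt.openai_api.protocol import (
    ChatCompletionTokenLogprob,
    ChoiceLogprobs,
    TopLogprob,
)

entry = ChatCompletionTokenLogprob(
    token="Hello",
    bytes=list("Hello".encode("utf-8")),
    logprob=-0.12,
    top_logprobs=[
        TopLogprob(token="Hello", bytes=list("Hello".encode("utf-8")), logprob=-0.12),
        TopLogprob(token="Hi", bytes=list("Hi".encode("utf-8")), logprob=-2.50),
    ],
)
choice_logprobs = ChoiceLogprobs(content=[entry])
print(choice_logprobs.model_dump())  # assumes pydantic v2
```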
sglang/srt/server.py
CHANGED
@@ -260,7 +260,7 @@ def launch_server(
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.…",
+            "0.1.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -479,6 +479,9 @@ class Runtime:
         parent.wait(timeout=5)
         self.pid = None

+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
     def get_tokenizer(self):
         return get_tokenizer(
             self.server_args.tokenizer_path,
sglang/srt/server_args.py
CHANGED
@@ -44,6 +44,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
+    max_total_tokens: Optional[int] = None
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0

@@ -231,6 +232,12 @@
         default=ServerArgs.max_num_reqs,
         help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
     )
+    parser.add_argument(
+        "--max-total-tokens",
+        type=int,
+        default=ServerArgs.max_total_tokens,
+        help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+    )
     parser.add_argument(
         "--schedule-policy",
         type=str,
@@ -412,10 +419,6 @@
             self.dp_size > 1 and self.node_rank is not None
         ), "multi-node data parallel is not supported"

-        assert not (
-            self.chunked_prefill_size is not None and self.disable_radix_cache
-        ), "chunked prefill is not supported with radix cache disabled currently"
-

 @dataclasses.dataclass
 class PortArgs:
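
The new `--max-total-tokens` flag caps the KV token pool; `init_memory_pool` clamps it to the profiled maximum, so an oversized value only triggers a warning. A hedged sketch (the CLI form is shown as a comment; constructing `ServerArgs` directly like this is an assumption for illustration):

```python
# CLI form (model path is a placeholder):
#   python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf \
#       --max-total-tokens 200000

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-2-7b-chat-hf",  # placeholder
    max_total_tokens=200_000,
)
# ModelRunner.init_memory_pool() takes min(profiled_max, max_total_tokens),
# so a value larger than what fits in GPU memory falls back to the profiled size.
print(args.max_total_tokens)
```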
sglang/test/test_programs.py
CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
         s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-        s += ' "country": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-        s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+        s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
         s += "}"

-    ret = decode_json.run()
+    ret = decode_json.run(temperature=0.0)
     try:
         js_obj = json.loads(ret["json_output"])
     except json.decoder.JSONDecodeError:
-        print(ret["json_output"])
+        print("JSONDecodeError", ret["json_output"])
         raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
         s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
         s += "}"

-    ret = decode_json.run()
-    js_obj = json.loads(ret["json_output"])
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)

sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.7"
+__version__ = "0.2.8"
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.7
+Version: 0.2.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -299,8 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

 ### Method 2: From source
 ```
-# Use the stable v0.2.7 branch
-git clone -b v0.2.7 https://github.com/sgl-project/sglang.git
+# Use the stable v0.2.8 branch
+git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
-
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

 ```bash
 docker run --gpus all \
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/RECORD
CHANGED
@@ -7,12 +7,12 @@ sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=r0Z7hY_bFFk-b6WeQJir9br-hCW2-p7n5E7Et2WziaQ,8776
-sglang/version.py,sha256=…
+sglang/version.py,sha256=G6Dbxq2ws-1ZAXwDD8q0KWueYtso_Y6Uyvtj8sRWsPI,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=…
-sglang/lang/ir.py,sha256=…
+sglang/lang/interpreter.py,sha256=_MbvYB0vweCgALklpM2DlofiCXuITCmX_fl8rPPcp5U,30340
+sglang/lang/ir.py,sha256=0r-mhA4aO-uuS97Dvkw99ERTcJXfzuV6jJQMmuCwHEg,16615
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
@@ -26,13 +26,13 @@ sglang/srt/hf_transformers_utils.py,sha256=Fg-3panb6lsqOhHmAYA0ivkXyBjdnvY5mqvil
 sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
 sglang/srt/model_config.py,sha256=DO7m84WiT3dzPWmyKz_UXDAHEdqEjq8Lq5wCjzjYMME,6023
 sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
-sglang/srt/server.py,sha256=…
-sglang/srt/server_args.py,sha256=…
+sglang/srt/server.py,sha256=8uDMWGAp2EZ8bywQumEa6T2G2k78-oYXgLfk6qBkv8o,16107
+sglang/srt/server_args.py,sha256=zGAbZqKKN4dkn5BDcZdjxLM-jIFsHX2ThAEfvPKUm6c,15645
 sglang/srt/utils.py,sha256=uIatocIFzqi6fWSscz2MjF3jUcIRBJlqLgYeicM_W9s,22950
 sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
-sglang/srt/constrained/base_cache.py,sha256=…
-sglang/srt/constrained/fsm_cache.py,sha256=…
-sglang/srt/constrained/jump_forward.py,sha256=…
+sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
+sglang/srt/constrained/fsm_cache.py,sha256=GoPBr_9ZdJizF2PKbYoQw2I4ckfrUYwCeMZxB9sY3TM,2639
+sglang/srt/constrained/jump_forward.py,sha256=IgZ8D0woy5FLIQvXkE8wZRYejDsfVkjU0sqUlkiv_f4,6193
 sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1bebtY-p9MBndBaoIE2VXrk,5180
 sglang/srt/layers/extend_attention.py,sha256=zuNnAdL_wF6BX0Mwn1dgDJvh3YJjYwqa5Fbzp8muOVc,12573
 sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
@@ -47,14 +47,16 @@ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk
 sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
 sglang/srt/managers/io_struct.py,sha256=Rz7Ur9Yw6prDGdy6XjsSiUmVBccS6cef-G_9TW7HA_4,7105
 sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
-sglang/srt/managers/schedule_batch.py,sha256=…
+sglang/srt/managers/schedule_batch.py,sha256=LIoVCPNivh0u1dOrrWRgFD6a4ywq3nrG_4dNgCK0kIw,37697
 sglang/srt/managers/tokenizer_manager.py,sha256=tEct3shjjw_7ickj_cmt9IxoBHfgbryQHI7DZS0m4TA,20511
-sglang/srt/managers/tp_worker.py,sha256=…
+sglang/srt/managers/tp_worker.py,sha256=JPLneFwcPlmPXZX1QxZHWgcdau8FC8wNuVqfCqsgOkU,35234
+sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
+sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=wkhjyYLbAZrl2FB5i4ODkxgMufBuDpe4N0kbXhu6ZO0,4509
-sglang/srt/mem_cache/radix_cache.py,sha256=…
+sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=OdmO6R7nHWrRJCtZOxYkt0KNdGoX7Md4knsypwPYjaQ,9365
-sglang/srt/model_executor/model_runner.py,sha256=…
+sglang/srt/model_executor/model_runner.py,sha256=fo3fbnNaHkcHz2UDkyvFjU7sGvdClhmhdelQh0n9PgA,16079
 sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
 sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
 sglang/srt/models/chatglm.py,sha256=vYWooqyPmcSFZNjxj_g5I_FgHJlDytbEiz6vyv3JBNM,13856
@@ -67,7 +69,7 @@ sglang/srt/models/gemma2.py,sha256=kTjZcsptgtYaO8BL_NlygjVSMSloq2Mc4Rf3FKvEhbs,1
 sglang/srt/models/gpt_bigcode.py,sha256=U7GmHKywSu12D-EwvuWv3RwHkx6bPawaRIjlFIpQkfs,10194
 sglang/srt/models/grok.py,sha256=NfZdsRVErDIUWFqjhtNf2pqC9G4cRdYHBFpgDq1IZ2A,27855
 sglang/srt/models/internlm2.py,sha256=Ld2GUxZeqqqJ2vd4QiX2s1y2AceJLA1nVnUYY88GMQk,12219
-sglang/srt/models/llama2.py,sha256=…
+sglang/srt/models/llama2.py,sha256=zfOk3OK1_B6s6yuXsZFmNCf07RsfytVD72GunLBt8Cc,14282
 sglang/srt/models/llama_classification.py,sha256=4r_orFZqBR3U_yC4bus1K3Z3-ADscYGSzgA82_VDN0g,4926
 sglang/srt/models/llava.py,sha256=BJphgyQGdo7uTpJcKGEfWwdpH9GTMDnyiznLSSgmvm8,18476
 sglang/srt/models/llavavid.py,sha256=-7vaVqaIfukCvMkNakEPblpwjIHC6ezrAvmpE5RzlUY,13602
@@ -80,14 +82,14 @@ sglang/srt/models/qwen2.py,sha256=mXlVd6UTCXY3VdgodFpQnlaY-NYLIbA-SknxdA9R13w,12
 sglang/srt/models/qwen2_moe.py,sha256=YYdJEezic7GyW-_bXlNIaqBa0C4IHQpz_vuRBLxms4k,18141
 sglang/srt/models/stablelm.py,sha256=b3d-ZwLQoLjZ6CupnkIq7d-z9tzGSxAyIcgSmZiZxZw,11362
 sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
-sglang/srt/openai_api/adapter.py,sha256=…
-sglang/srt/openai_api/protocol.py,sha256=…
+sglang/srt/openai_api/adapter.py,sha256=MaWz78cvkk5RdotRMCIf_K5xYAClX7TonjxH_dzUrVI,32495
+sglang/srt/openai_api/protocol.py,sha256=JXLnnQ63I-bJv93ICPfP0cBpyomQA5IYE_mkUg5X4Es,8177
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
-sglang/test/test_programs.py,sha256=…
+sglang/test/test_programs.py,sha256=0M8blaIy--eEE2dQnG4FyjIETT_wa7eEG3S9UWna6_4,13851
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.2.7.dist-info/LICENSE,sha256=…
-sglang-0.2.7.dist-info/METADATA,sha256=…
-sglang-0.2.7.dist-info/WHEEL,sha256=…
-sglang-0.2.7.dist-info/top_level.txt,sha256=…
-sglang-0.2.7.dist-info/RECORD,,
+sglang-0.2.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.8.dist-info/METADATA,sha256=FRkxB6W7NQlj9ar65-oppfES5tc1pS8LRPJXU-43hsQ,33214
+sglang-0.2.8.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+sglang-0.2.8.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.8.dist-info/RECORD,,
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/LICENSE
File without changes
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/WHEEL
File without changes
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/top_level.txt
File without changes