sglang 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/backend/runtime_endpoint.py +4 -4
- sglang/lang/interpreter.py +4 -4
- sglang/srt/constrained/fsm_cache.py +21 -1
- sglang/srt/hf_transformers_utils.py +3 -1
- sglang/srt/layers/logits_processor.py +70 -61
- sglang/srt/layers/radix_attention.py +5 -2
- sglang/srt/layers/token_attention.py +1 -1
- sglang/srt/managers/controller/cuda_graph_runner.py +26 -17
- sglang/srt/managers/controller/infer_batch.py +54 -13
- sglang/srt/managers/controller/model_runner.py +22 -7
- sglang/srt/managers/controller/tp_worker.py +47 -41
- sglang/srt/managers/io_struct.py +2 -2
- sglang/srt/managers/tokenizer_manager.py +62 -43
- sglang/srt/model_config.py +5 -0
- sglang/srt/models/deepseek_v2.py +517 -0
- sglang/srt/models/llama_classification.py +3 -3
- sglang/srt/openai_api/adapter.py +33 -33
- sglang/srt/openai_api/protocol.py +1 -1
- sglang/srt/sampling_params.py +5 -4
- sglang/srt/server.py +2 -15
- sglang/srt/server_args.py +28 -7
- sglang/test/test_programs.py +5 -1
- sglang/version.py +1 -1
- {sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/METADATA +9 -7
- {sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/RECORD +28 -27
- {sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/LICENSE +0 -0
- {sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/WHEEL +0 -0
- {sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/top_level.txt +0 -0
sglang/srt/server.py
CHANGED
@@ -65,9 +65,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 app = FastAPI()
 tokenizer_manager = None
 
-# Put some args for easily access
-global_server_args_dict = {}
-
 
 @app.get("/health")
 async def health() -> Response:
@@ -150,14 +147,6 @@ def available_models():
     return ModelList(data=model_cards)
 
 
-def _set_global_server_args(server_args: ServerArgs):
-    global global_server_args_dict
-    global_server_args_dict = {
-        "disable_flashinfer": server_args.disable_flashinfer,
-        "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
-    }
-
-
 def _set_torch_compile_config():
     # The following configurations are for torch compile optimizations
     import torch._dynamo.config
@@ -176,6 +165,8 @@ def launch_server(
     model_overide_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
+    server_args.check_server_args()
+
     """Launch an HTTP server."""
     global tokenizer_manager
 
@@ -211,8 +202,6 @@ def launch_server(
     if server_args.enable_torch_compile:
         _set_torch_compile_config()
 
-    _set_global_server_args(server_args)
-
     # Allocate ports
     server_args.port, server_args.additional_ports = allocate_init_ports(
        server_args.port,
@@ -230,8 +219,6 @@ def launch_server(
 
     # Handle multi-node tensor parallelism
     if server_args.nnodes > 1:
-        assert server_args.dp_size == 1, "Multi-node dp is not supported."
-
         if server_args.node_rank != 0:
             tp_size_local = server_args.tp_size // server_args.nnodes
             gpu_ids = [
sglang/srt/server_args.py
CHANGED
@@ -28,6 +28,7 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
+    max_num_reqs: Optional[int] = None
     schedule_heuristic: str = "lpm"
     schedule_conservativeness: float = 1.0
 
@@ -51,13 +52,14 @@ class ServerArgs:
 
     # Optimization/debug options
     disable_flashinfer: bool = False
+    disable_flashinfer_sampling: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
     enable_torch_compile: bool = False
-    attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
+    attention_reduce_in_fp32: bool = False
     efficient_weight_load: bool = False
 
     # Distributed args
@@ -203,6 +205,12 @@ class ServerArgs:
             default=ServerArgs.max_running_requests,
             help="The maximum number of running requests.",
         )
+        parser.add_argument(
+            "--max-num-reqs",
+            type=int,
+            default=None,
+            help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
+        )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
@@ -296,7 +304,12 @@ class ServerArgs:
         parser.add_argument(
             "--disable-flashinfer",
             action="store_true",
-            help="Disable flashinfer
+            help="Disable flashinfer attention kernels.",
+        )
+        parser.add_argument(
+            "--disable-flashinfer-sampling",
+            action="store_true",
+            help="Disable flashinfer sampling kernels.",
         )
         parser.add_argument(
             "--disable-radix-cache",
@@ -324,15 +337,15 @@ class ServerArgs:
             help="Optimize the model with torch.compile, experimental feature.",
         )
         parser.add_argument(
-            "--
+            "--enable-p2p-check",
             action="store_true",
-            help="
-            "This only affects Triton attention kernels",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
         parser.add_argument(
-            "--
+            "--attention-reduce-in-fp32",
             action="store_true",
-            help="
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
         )
         parser.add_argument(
             "--efficient-weight-load",
@@ -357,6 +370,14 @@ class ServerArgs:
             f"disable_disk_cache={self.disable_disk_cache}, "
         )
 
+    def check_server_args(self):
+        assert (
+            self.tp_size % self.nnodes == 0
+        ), "tp_size must be divisible by number of nodes"
+        assert not (
+            self.dp_size > 1 and self.node_rank is not None
+        ), "multi-node data parallel is not supported"
+
 
 @dataclasses.dataclass
 class PortArgs:
sglang/test/test_programs.py
CHANGED
@@ -118,7 +118,11 @@ def test_decode_json_regex():
     s += "}"
 
     ret = decode_json.run()
-    js_obj = json.loads(ret["json_output"])
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print(ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
 
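The test change wraps the JSON parse so that a malformed constrained-decoding result is printed before the exception propagates. The same print-then-re-raise pattern is easy to reuse outside the test suite; a small sketch (the `loads_or_dump` helper is hypothetical, not part of sglang):

```python
import json


def loads_or_dump(raw: str):
    """Parse raw JSON; on failure, show the offending text, then re-raise."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        print(f"could not parse model output:\n{raw}")
        raise


print(loads_or_dump('{"name": "Paris", "population": 2102650}'))
```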
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.5"
+__version__ = "0.2.6"
{sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.5
+Version: 0.2.6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -249,7 +249,7 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -404,16 +404,17 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Run Llama 3.1 405B
 
 ```bash
-
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
 # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
 # on the first node
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
 
 # on the second
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-
-# single node run 405B fp8
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 ```
 
 ### Supported Models
@@ -422,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
+- DeepSeek / DeepSeek 2
 - LLaVA 1.5 / 1.6
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -442,7 +444,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 ### Benchmark Performance
 
-- Benchmark a single static batch
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
{sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/RECORD
CHANGED
@@ -7,11 +7,11 @@ sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
-sglang/version.py,sha256=
+sglang/version.py,sha256=Oz5HbwHMyE87nmwV80AZzpkJPf-wBg7eDuJr_BXZkhU,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=
+sglang/lang/interpreter.py,sha256=fbPrKF_SDpVPsiV2WbmlMfwRA7C9T9_IyVmGnpaXa0A,29687
 sglang/lang/ir.py,sha256=5VVK2JnbspdysrhcGgkmp_JlAprd2XqqRnS_GfP_XWc,16645
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,55 +19,56 @@ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtx
 sglang/lang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
 sglang/lang/backend/litellm.py,sha256=QsaLRh0KVyuaxRZGAvLOdCCSStIMs-V0XyMX0PR6y0w,2452
 sglang/lang/backend/openai.py,sha256=-ScfI2TFALB_FTYBur9ab0gNYxK1ogHkhdLxX19t6-Y,14808
-sglang/lang/backend/runtime_endpoint.py,sha256=
+sglang/lang/backend/runtime_endpoint.py,sha256=6iW1S62KmYyQGiWsHJFhZidK01vlIE55IsYN2tP38WQ,9202
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=Il7JJuu4o42k2xdBWVfONNmstTsAM-4idX6AcEOnrXQ,15526
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=
+sglang/srt/hf_transformers_utils.py,sha256=RnyxC1_OmOf-QzdPBziqAUOIQXyRzrb4RNlqFB1ArEc,11354
 sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,3949
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
-sglang/srt/model_config.py,sha256=
-sglang/srt/sampling_params.py,sha256=
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
+sglang/srt/model_config.py,sha256=9VF7ET0CGKEY-zdiU7kGv8Cg7H_9Q1fmqtI3C0z22S0,5458
+sglang/srt/sampling_params.py,sha256=WjJ_sOhbJVMKIBH8gJWQKhzeK5Ipu9XRNV7soWnLtak,3122
+sglang/srt/server.py,sha256=IKSTgp6FJN6TE9anog47zh9GJYXoyMjEKBNXUZ89Cuk,14197
+sglang/srt/server_args.py,sha256=RfWoipSUURmv5NqT4L_YF9qJ6gOkZ8omRUFC_5fmgts,14043
 sglang/srt/utils.py,sha256=HvKkGbut8sOxMpGIzYsJ9NEZJg48LOnxyGESaGZmANs,22385
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
-sglang/srt/constrained/fsm_cache.py,sha256=
+sglang/srt/constrained/fsm_cache.py,sha256=HlzFs9TXvMFmeZhTpXmJU3UNQ_Kix4Ir-SwpqXGhX8k,2061
 sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
 sglang/srt/layers/context_flashattention_nopad.py,sha256=7ps_9W_ia9zikL9HqsSUwWHyBVotywosE-dOiPtaGY8,4615
 sglang/srt/layers/extend_attention.py,sha256=aYAAL9HZJpaSASp-ulMvbmSmyMcqdYUsgVQC-Lbm7_U,12008
 sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
 sglang/srt/layers/linear.py,sha256=qLwFkOiRAljzE7LkAkLRdcCdVMk-t7b56jEjwQAuYDM,33953
-sglang/srt/layers/logits_processor.py,sha256=
-sglang/srt/layers/radix_attention.py,sha256=
-sglang/srt/layers/token_attention.py,sha256=
+sglang/srt/layers/logits_processor.py,sha256=VjP6T582K64X0mfyPUkhcIEZxsqJNu6ziqR3V82N_jE,10118
+sglang/srt/layers/radix_attention.py,sha256=to6w0kIq6dtaOYJtqIZcqR3t1yf05qBH1LWnFlE-jEQ,6374
+sglang/srt/layers/token_attention.py,sha256=uBtk3I6KeFjBRKRuQoG5BEZtVJsX4p7UOtJoej6ILZI,7411
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
-sglang/srt/managers/io_struct.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
-sglang/srt/managers/controller/cuda_graph_runner.py,sha256=
-sglang/srt/managers/controller/infer_batch.py,sha256=
+sglang/srt/managers/io_struct.py,sha256=WmBGrWR8R6X2zh2p1FkfPZtJzuGSlNW8cmIDm0EEqMA,5528
+sglang/srt/managers/tokenizer_manager.py,sha256=2it1o4dKd7nFzfZflOw1cT03gFktqC2sVPICbBSR4c0,19594
+sglang/srt/managers/controller/cuda_graph_runner.py,sha256=KEqX4Tc1yEWW52LzzFb4THb-guYIaft2pxxH8rWchSA,8808
+sglang/srt/managers/controller/infer_batch.py,sha256=3DixMdSW0odH5I6p7h8_xtRlHx4q76ArR6YZW8Gkqzg,35888
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=
+sglang/srt/managers/controller/model_runner.py,sha256=9o4xWnfI9-FJU6-S7WfEFlGMjWA2YesAhUKpuq8urhk,14854
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
 sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
-sglang/srt/managers/controller/tp_worker.py,sha256=
+sglang/srt/managers/controller/tp_worker.py,sha256=VYhO3xcJrcDQwonGLWSWKHq4T7BvFmb6-L5LxY3-fhE,30607
 sglang/srt/model_loader/model_loader.py,sha256=VS8VQL5ITN3akZ9eU_-uHWMan1axLMNG2_O12HzGysA,10132
 sglang/srt/model_loader/utils.py,sha256=I2PS5HIH5Cg-p7xKO_Cw_foK2vQ61xVc3zQv7CbeGEw,10120
 sglang/srt/models/chatglm.py,sha256=pH8g2Dj8qQLGPYpWVTb-IONfXsdfmpWi0-IEYNdSi4s,13296
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=rRxOusGPu670ommeqXg62AllwB1apzE4yZoWc1fcr2M,14095
 sglang/srt/models/deepseek.py,sha256=YtoPmv4fKmiH_jsRMSab9Wxq3aOZga9pCPGnkCs3Vvs,15457
+sglang/srt/models/deepseek_v2.py,sha256=1FqLe6tSENFpYgcEkmMr2-M4qksgne2glU3kZhSBB0Q,19527
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
 sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
 sglang/srt/models/gpt_bigcode.py,sha256=XHO1naPdXfiKYQRQ6uZe1fN3PBDhKH3-bchsaaZvfE4,9637
 sglang/srt/models/grok.py,sha256=611zrlIchvFaVfztRdBY7z97oU3KB-anykbOZy1hK6M,27295
 sglang/srt/models/internlm2.py,sha256=8MNcwxU5Th9IxWa314HqqmbCRlPUFScnfneBDs0riIU,11659
 sglang/srt/models/llama2.py,sha256=OyAf_lun5aZEsT80WmrIYBF8QXTXRpW8sUlylr4AZIc,14204
-sglang/srt/models/llama_classification.py,sha256=
+sglang/srt/models/llama_classification.py,sha256=Z2dvZAdOwCnN-lGFZRcwU0rNreE1gKwLefeWzEH36Uw,4366
 sglang/srt/models/llava.py,sha256=vBI6EEeOG_9o23Shi9h8k58rxTOHZnSKMmPl3B3Q3uc,17924
 sglang/srt/models/llavavid.py,sha256=SrNQ-U2wekHvP_up-ZXRkCSros2NzheHpPfXHrp0YBU,13050
 sglang/srt/models/minicpm.py,sha256=9uE8D-NopAj-sfaKJ7d-0x-PuCTEevQPoHPZvZlwstA,13277
@@ -79,14 +80,14 @@ sglang/srt/models/qwen2.py,sha256=87Tt1Bti-Py3AGudcf7k5ni-OHhtDKPj_Hke44YGw4U,11
 sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI8,17581
 sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=DVZ2niAEOgE8GQdYnuvwjrGiFRkAu5YtOB-yxOlF_Eg,15868
+sglang/srt/openai_api/protocol.py,sha256=jTb22jv5caB7k7Ub2ltYEbTtDheZjwwWAAUdvjiLTR0,5741
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
-sglang/test/test_programs.py,sha256=
+sglang/test/test_programs.py,sha256=s4WGpTmYP4Yx5g8JYZpbkeF9RN5iUnlKdi8FGAZovTc,13756
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.2.5.dist-info/LICENSE,sha256=
-sglang-0.2.5.dist-info/METADATA,sha256=
-sglang-0.2.5.dist-info/WHEEL,sha256=
-sglang-0.2.5.dist-info/top_level.txt,sha256=
-sglang-0.2.5.dist-info/RECORD,,
+sglang-0.2.6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.6.dist-info/METADATA,sha256=g_G_XHbWCNSY9F6RieXV43svnNzq1wonwrArNxX0VNA,32095
+sglang-0.2.6.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.2.6.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.6.dist-info/RECORD,,
{sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/LICENSE
File without changes
{sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/WHEEL
File without changes
{sglang-0.2.5.dist-info → sglang-0.2.6.dist-info}/top_level.txt
File without changes