sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +13 -1
- sglang/bench_latency.py +10 -5
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +5 -2
- sglang/lang/ir.py +22 -4
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -2
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +64 -27
- sglang/srt/layers/radix_attention.py +41 -18
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +59 -179
- sglang/srt/managers/tokenizer_manager.py +193 -84
- sglang/srt/managers/tp_worker.py +131 -50
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +97 -28
- sglang/srt/model_executor/forward_batch_info.py +188 -82
- sglang/srt/model_executor/model_runner.py +269 -87
- sglang/srt/models/chatglm.py +6 -14
- sglang/srt/models/commandr.py +6 -2
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +7 -3
- sglang/srt/models/deepseek_v2.py +12 -7
- sglang/srt/models/gemma.py +6 -2
- sglang/srt/models/gemma2.py +22 -8
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +66 -398
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +7 -3
- sglang/srt/models/mixtral.py +61 -255
- sglang/srt/models/mixtral_quant.py +6 -5
- sglang/srt/models/qwen.py +7 -4
- sglang/srt/models/qwen2.py +15 -5
- sglang/srt/models/qwen2_moe.py +7 -16
- sglang/srt/models/stablelm.py +6 -2
- sglang/srt/openai_api/adapter.py +149 -58
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
- sglang/srt/server.py +107 -71
- sglang/srt/server_args.py +49 -15
- sglang/srt/utils.py +27 -18
- sglang/test/runners.py +38 -38
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +37 -50
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.12.dist-info/RECORD +0 -112
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
```diff
@@ -2,11 +2,10 @@
 
 import argparse
 import asyncio
-import
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional
 
@@ -18,10 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -100,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()
 
 
 def call_generate_guidance(
@@ -267,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -288,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "ginfer": 9988,
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -324,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -476,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True
 
     for filename in files:
+        global process
 
-        def
-
-
-
-
-
-
-
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode
 
         try:
-            run_with_timeout(
-
-
-
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-
+            success = False
+            break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
```
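Both the old and new versions of `run_unittest_files` call a `run_with_timeout` helper defined elsewhere in the same file and not shown in this hunk. For orientation, a minimal sketch of the contract the new code relies on, assuming a thread-based helper that returns the callable's result and raises `TimeoutError` on expiry; the actual implementation in `sglang/test/test_utils.py` may differ:

```python
import threading


def run_with_timeout(func, args=(), kwargs=None, timeout=None):
    """Illustrative stand-in for the helper used above: run func in a
    worker thread, return its result, and raise TimeoutError on timeout."""
    ret_value = []

    def _target():
        ret_value.append(func(*args, **(kwargs or {})))

    t = threading.Thread(target=_target, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        # The worker is still running; the caller handles cleanup
        # (here, kill_child_process on the spawned test process).
        raise TimeoutError()
    return ret_value[0]
```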
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.2.12"
+__version__ = "0.2.14"
```
{sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.12
+Version: 0.2.14
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence-transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -329,11 +333,63 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
+### Method 4: Using docker compose
+
+<details>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+
 ### Common Notes
--
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -387,6 +443,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```
 
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -423,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-  - `
-  -
-
-  -
-  -
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -443,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 
+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 #### Use Models From ModelScope
-
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>
 
 #### Run Llama 3.1 405B
+<details>
 
 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 
-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph
 
-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
 
+</details>
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -606,7 +682,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```
 
-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.
 
 ```python
@@ -660,7 +736,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -722,7 +798,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-
 ## Benchmark And Performance
 
 
````
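The README additions above introduce text embeddings through the OpenAI client. As a quick cross-check without that client, a sketch using plain HTTP; it assumes a server launched as in the README on port 30000 and an OpenAI-style `/v1/embeddings` route, which the updated `sglang/srt/openai_api/adapter.py` implies but this diff does not show directly:

```python
import requests

# Assumes a local sglang server started as shown in the README above.
resp = requests.post(
    "http://127.0.0.1:30000/v1/embeddings",
    json={"model": "default", "input": "How are you today"},
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(f"embedding dimension: {len(embedding)}")
```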
sglang-0.2.14.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,114 @@
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=8B_ADgLN2fjo9Ej123hInfHA4wmpUkV0yyErSiRnfAA,6408
+sglang/bench_latency.py,sha256=VEdGBX5vZSngS8AeOdJJRW65BIJsZXhKwAK5z20SZoI,16344
+sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
+sglang/version.py,sha256=3fSLgeJpZq4cUgzAH_CdFzXwJEO3NH_VVDv2pQnmwN0,23
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=-9VjAb5JqlxtBuQUDT08Cj2BW8VbLxTmJACe2cqza-s,30215
+sglang/lang/ir.py,sha256=GRcPsEjnR4k5q5Kf-Rb2YgDBseCTGQoasclhjmQtL8Y,17511
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=Ze2_dTHG6jc04ti7vuOEnoEe1ehvhxCJRpa4EYD0T_8,18494
+sglang/srt/hf_transformers_utils.py,sha256=OP5uBwnWiam6h9QvkBaG-nrDgkEUEwLXy1IWvW7rrRo,11737
+sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
+sglang/srt/server.py,sha256=KM6fq1RXbnBr0nWj8IO54T-K14o0iscgdFR4z3uU5C4,19572
+sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+sglang/srt/utils.py,sha256=x9MdBu0e8HAgaNIGuxiMVL7_nh03kl_rWuMnLas_Dgo,24327
+sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+sglang/srt/constrained/fsm_cache.py,sha256=QTrBFoZCp2FeigtIakz2MCgQLtvQFXgl2lDPQaGtu9M,2784
+sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
+sglang/srt/layers/activation.py,sha256=4RIgqvAIXPpZV4q0YVbAPVygz_YFAbpI4x47p7LcOw4,1911
+sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+sglang/srt/layers/extend_attention.py,sha256=h4O0R7PJpAVKS3Vx_583zhrFPD0vv6XqzvOcHBI3zoc,14268
+sglang/srt/layers/layernorm.py,sha256=sI_oveGW4uyFI2LOtWF2yd77wH2k5LGAvUIZuoOn2Oo,2227
+sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+sglang/srt/layers/sampler.py,sha256=YVzlrXE6uJoDwFHaZcUyxgUOUdR5a5myZvrRL6qckoA,5544
+sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/managers/controller_multi.py,sha256=R45ST6oBlIwfUwuibMw0sgTk8iqphb_rFyIdW048JA4,6472
+sglang/srt/managers/controller_single.py,sha256=tnc71OTe8KDYouMdfqgwBT4lX5nZt6Rak9t2GmKtAME,5119
+sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+sglang/srt/managers/io_struct.py,sha256=4Cs655K4n_F_usu6R3YE5_RdcE0XO9AXQNk5vl2II2c,10534
+sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+sglang/srt/managers/schedule_batch.py,sha256=yW7fkBi31vytfNEkFzs1Z3xzEzLMevXvoCyuoubut3M,25920
+sglang/srt/managers/tokenizer_manager.py,sha256=aaZV7G3-m35pba1meRapqO7bdPjM2Cmkue5lbR_Jv3M,28836
+sglang/srt/managers/tp_worker.py,sha256=DBrrd3QbjzAAvANvPs0zdYogsaFlusGx-IjpDVCP8RA,35976
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=ba4WZhBbkJyZjronzwoDJmoh7l8oz0s5oj_i_3PLzSY,12662
+sglang/srt/model_executor/forward_batch_info.py,sha256=MUcquCqmK-Jc1WNEciREmPj4iZu39tJk0axpexfyEXg,15775
+sglang/srt/model_executor/model_runner.py,sha256=9L0cvNK2ELNfE4L6Hq9-K74ltXYenkFl4UVnY9d9JkU,24205
+sglang/srt/models/chatglm.py,sha256=EaZKaRlsAbSP5rob6vUGqDuJLAY1HC2Oh-jgEUS4ZVY,13634
+sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+sglang/srt/models/deepseek_v2.py,sha256=kzqfZvidRe6uydaMJI40qh_Qg7-gI0oBVH0rdWp7ONg,27218
+sglang/srt/models/gemma.py,sha256=iC424guGOdsYC43xke5_uul9UIY0j6t7lUsDcB_uqa8,12492
+sglang/srt/models/gemma2.py,sha256=JQvM6rYvjmLqdhQIQ9mRAAO1MhnIqTb32CqdL8X0o80,16798
+sglang/srt/models/gpt_bigcode.py,sha256=jaolXlRp1PRHNEQPT-ZZ_cWAQ2us5DiNheSaNQ4Es_c,10418
+sglang/srt/models/grok.py,sha256=FF_eURzXYXe1b39AbGtEPv2yYNzWarjmBsjkgutOkek,15019
+sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+sglang/srt/models/llama2.py,sha256=JZPvaLSPiFMN-4qlOUBXZxsUsz6XtTGD-bB_fidxcfU,14516
+sglang/srt/models/llama_classification.py,sha256=2zhBJtO9uieVj4Cd94KNiA8M_IdLuILDeTv1rePVJXw,4934
+sglang/srt/models/llama_embedding.py,sha256=NQCQ3MnK3iRohL-UdY5UWxW4LlZ3RQZ7w4mlFOnpVrM,3696
+sglang/srt/models/llava.py,sha256=iuXLJVDWBiYo8zJuDPSSjt2LYqbkg2MAcOFUZO1fOX4,24353
+sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
+sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+sglang/srt/models/mixtral.py,sha256=StnGKdRhoweY46M2b2pv-vrfXaNqbhaVU4iKhEkMEfM,13837
+sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+sglang/srt/models/qwen2.py,sha256=B1qfqukSA3_02Q3tvIxqIg-6kmxdJ36Roxn0WFmnVxQ,12776
+sglang/srt/models/qwen2_moe.py,sha256=JZRd8AzvJgjVlHww1eCMPdF8rzC93X_1rgk3PEWE70M,17499
+sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
+sglang/srt/openai_api/adapter.py,sha256=KaIYqkeguuVNHhpfSBvL7M0wRPhcivRAtuG-DsyXExI,46654
+sglang/srt/openai_api/protocol.py,sha256=knf-nds0XO2LYg-hPM-Ho1f1y2XZIV_Gvg3xcCKLfgQ,9411
+sglang/srt/sampling/sampling_batch_info.py,sha256=encziVWrUDswoay0qfFVALHx_96Vra2mzD6_GHthZ3s,7771
+sglang/srt/sampling/sampling_params.py,sha256=dmjUlTY4VfuRtyc_sR59zMzhkjiTzHmljyTIogCFd0k,5411
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+sglang/test/runners.py,sha256=IOaaNJ4y3GSbUCsnbKZrbZDoBR2_us2zWKWxccfrGlk,7687
+sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=VDdoeqGvebUa-l3rDiid6cC7wZq0Phpbm5fxxD0-cpg,1910
+sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
+sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+sglang-0.2.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.14.dist-info/METADATA,sha256=V3t6L-QOiHsJYTihE9W1YeR_YyRC_ZPZwlWjw0Mymsg,37161
+sglang-0.2.14.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+sglang-0.2.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.14.dist-info/RECORD,,
```
sglang/launch_server_llavavid.py
DELETED
```diff
@@ -1,29 +0,0 @@
-"""Launch the inference server for Llava-video model."""
-
-import argparse
-
-from sglang.srt.server import ServerArgs, launch_server
-
-if __name__ == "__main__":
-    model_overide_args = {}
-
-    model_overide_args["mm_spatial_pool_stride"] = 2
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = 16
-    model_overide_args["model_type"] = "llavavid"
-    if model_overide_args["num_frames"] == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
-        model_overide_args["model_max_length"] = 4096 * 2
-
-    parser = argparse.ArgumentParser()
-    ServerArgs.add_cli_args(parser)
-    args = parser.parse_args()
-
-    if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
-
-    server_args = ServerArgs.from_cli_args(args)
-
-    launch_server(server_args, model_overide_args, None)
```