sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/api.py +13 -1
  2. sglang/bench_latency.py +10 -5
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/global_config.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +60 -49
  7. sglang/lang/chat_template.py +10 -5
  8. sglang/lang/compiler.py +4 -0
  9. sglang/lang/interpreter.py +5 -2
  10. sglang/lang/ir.py +22 -4
  11. sglang/launch_server.py +8 -1
  12. sglang/srt/constrained/jump_forward.py +13 -2
  13. sglang/srt/conversation.py +50 -1
  14. sglang/srt/hf_transformers_utils.py +22 -23
  15. sglang/srt/layers/activation.py +24 -2
  16. sglang/srt/layers/decode_attention.py +338 -50
  17. sglang/srt/layers/extend_attention.py +3 -1
  18. sglang/srt/layers/fused_moe/__init__.py +1 -0
  19. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  20. sglang/srt/layers/fused_moe/layer.py +587 -0
  21. sglang/srt/layers/layernorm.py +3 -0
  22. sglang/srt/layers/logits_processor.py +64 -27
  23. sglang/srt/layers/radix_attention.py +41 -18
  24. sglang/srt/layers/sampler.py +154 -0
  25. sglang/srt/managers/controller_multi.py +2 -8
  26. sglang/srt/managers/controller_single.py +7 -10
  27. sglang/srt/managers/detokenizer_manager.py +20 -9
  28. sglang/srt/managers/io_struct.py +44 -11
  29. sglang/srt/managers/policy_scheduler.py +5 -2
  30. sglang/srt/managers/schedule_batch.py +59 -179
  31. sglang/srt/managers/tokenizer_manager.py +193 -84
  32. sglang/srt/managers/tp_worker.py +131 -50
  33. sglang/srt/mem_cache/memory_pool.py +82 -8
  34. sglang/srt/mm_utils.py +79 -7
  35. sglang/srt/model_executor/cuda_graph_runner.py +97 -28
  36. sglang/srt/model_executor/forward_batch_info.py +188 -82
  37. sglang/srt/model_executor/model_runner.py +269 -87
  38. sglang/srt/models/chatglm.py +6 -14
  39. sglang/srt/models/commandr.py +6 -2
  40. sglang/srt/models/dbrx.py +5 -1
  41. sglang/srt/models/deepseek.py +7 -3
  42. sglang/srt/models/deepseek_v2.py +12 -7
  43. sglang/srt/models/gemma.py +6 -2
  44. sglang/srt/models/gemma2.py +22 -8
  45. sglang/srt/models/gpt_bigcode.py +5 -1
  46. sglang/srt/models/grok.py +66 -398
  47. sglang/srt/models/internlm2.py +5 -1
  48. sglang/srt/models/llama2.py +7 -3
  49. sglang/srt/models/llama_classification.py +2 -2
  50. sglang/srt/models/llama_embedding.py +4 -0
  51. sglang/srt/models/llava.py +176 -59
  52. sglang/srt/models/minicpm.py +7 -3
  53. sglang/srt/models/mixtral.py +61 -255
  54. sglang/srt/models/mixtral_quant.py +6 -5
  55. sglang/srt/models/qwen.py +7 -4
  56. sglang/srt/models/qwen2.py +15 -5
  57. sglang/srt/models/qwen2_moe.py +7 -16
  58. sglang/srt/models/stablelm.py +6 -2
  59. sglang/srt/openai_api/adapter.py +149 -58
  60. sglang/srt/sampling/sampling_batch_info.py +209 -0
  61. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
  62. sglang/srt/server.py +107 -71
  63. sglang/srt/server_args.py +49 -15
  64. sglang/srt/utils.py +27 -18
  65. sglang/test/runners.py +38 -38
  66. sglang/test/simple_eval_common.py +9 -10
  67. sglang/test/simple_eval_gpqa.py +2 -1
  68. sglang/test/simple_eval_humaneval.py +2 -2
  69. sglang/test/simple_eval_math.py +2 -1
  70. sglang/test/simple_eval_mmlu.py +2 -1
  71. sglang/test/test_activation.py +55 -0
  72. sglang/test/test_programs.py +32 -5
  73. sglang/test/test_utils.py +37 -50
  74. sglang/version.py +1 -1
  75. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
  76. sglang-0.2.14.dist-info/RECORD +114 -0
  77. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  78. sglang/launch_server_llavavid.py +0 -29
  79. sglang/srt/model_loader/model_loader.py +0 -292
  80. sglang/srt/model_loader/utils.py +0 -275
  81. sglang-0.2.12.dist-info/RECORD +0 -112
  82. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  83. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py CHANGED
@@ -2,11 +2,10 @@
 
 import argparse
 import asyncio
-import multiprocessing
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional
 
@@ -18,10 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -100,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-    import grpc
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()
 
 
 def call_generate_guidance(
@@ -267,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -288,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "lightllm": 22000,
             "lmql": 23000,
             "srt-raw": 30000,
-            "ginfer": 9988,
+            "gserver": 9988,
         }
         args.port = default_port.get(args.backend, None)
     return args
@@ -324,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -476,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True
 
     for filename in files:
+        global process
 
-        def func():
-            print(f"\n\nRun {filename}\n\n")
-            ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
-
-        p = multiprocessing.Process(target=func)
-
-        def run_one_file():
-            p.start()
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode
 
         try:
-            run_with_timeout(run_one_file, timeout=timeout_per_file)
-            if p.exitcode != 0:
-                success = False
-                break
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-            p.terminate()
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-            return False
+            success = False
+            break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
 
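The rewritten `run_unittest_files` above replaces the in-process `unittest.main` + `multiprocessing` approach with one `python3` subprocess per test file, so a crashing or hanging file can be killed without taking down the runner. A minimal sketch of that pattern, using the stdlib `Popen.wait(timeout=...)` and `Popen.kill()` as stand-ins for the module's own `run_with_timeout` and `kill_child_process` helpers:

```python
import os
import subprocess
from typing import List


def run_files(files: List[str], timeout_per_file: float) -> int:
    """Run each test file in its own python3 subprocess."""
    for filename in files:
        path = os.path.join(os.getcwd(), filename)
        proc = subprocess.Popen(["python3", path], env=os.environ)
        try:
            # Stdlib stand-in for run_with_timeout(run_one_file, ...).
            ret = proc.wait(timeout=timeout_per_file)
        except subprocess.TimeoutExpired:
            # The real code calls kill_child_process(process.pid), which also
            # reaps grandchildren; Popen.kill() only signals the direct child.
            proc.kill()
            return -1
        if ret != 0:
            return -1
    return 0
```

Process isolation also means state leaked by one test file (including GPU memory from server-launching tests like these) presumably cannot poison the next file's run.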
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.2.12"
+__version__ = "0.2.14"
{sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.12
+Version: 0.2.14
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.4; extra == "srt"
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence-transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 
 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -329,11 +333,63 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
+### Method 4: Using docker compose
+
+<details>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+
 ### Common Notes
-- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernels by adding `--disable-flashinfer --disable-flashinfer-sampling` and raising an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Backend: SGLang Runtime (SRT)
@@ -387,6 +443,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```
 
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
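A short consumption sketch for the embedding call added above, assuming the response follows the OpenAI embeddings schema and the server runs on the default port used throughout this README:

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.embeddings.create(model="default", input="How are you today")
# Assumption: OpenAI-style response shape; the vector length depends on the
# served embedding model's hidden size.
vector = response.data[0].embedding
print(len(vector), vector[:4])
```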
@@ -423,19 +486,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 
 ### Supported Models
 
+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA 1.5 / 1.6
-  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-  - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
-- LLaVA-NeXT-Video
-  - see [examples/usage/llava_video](examples/usage/llava_video)
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -443,34 +508,45 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 
+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 #### Use Models From ModelScope
-To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+<details>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>
 
 #### Run Llama 3.1 405B
+<details>
 
 ```bash
-## Run 405B (fp8) on a single node
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph
 
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```
 
+</details>
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -606,7 +682,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```
 
-#### Multi Modality
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.
 
 ```python
@@ -660,7 +736,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
 
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -722,7 +798,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-
 ## Benchmark And Performance
 ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
 ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
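The `choices` tip above can be made concrete with a toy scoring sketch (illustrative only, not the runtime's actual implementation): each choice is scored by the mean of its per-token logprobs, so longer choices are not penalized merely for having more tokens.

```python
def pick_choice(choice_token_logprobs: dict) -> str:
    """Argmax over token-length normalized logprobs (mean per-token logprob)."""

    def mean_logprob(logprobs):
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: mean_logprob(choice_token_logprobs[c]))


# Toy numbers: the raw sum would favor "yes" (-0.5 vs. -0.9), but the
# length-normalized mean favors "absolutely" (-0.3 vs. -0.5).
scores = {
    "yes": [-0.5],
    "absolutely": [-0.3, -0.3, -0.3],
}
print(pick_choice(scores))  # -> "absolutely"
```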
sglang-0.2.14.dist-info/RECORD ADDED
@@ -0,0 +1,114 @@
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=8B_ADgLN2fjo9Ej123hInfHA4wmpUkV0yyErSiRnfAA,6408
+sglang/bench_latency.py,sha256=VEdGBX5vZSngS8AeOdJJRW65BIJsZXhKwAK5z20SZoI,16344
+sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+sglang/utils.py,sha256=zFYGkC4vOUR3sTv1TmQXcsOLZDtDBR3wnjqnDp3xMIs,8352
+sglang/version.py,sha256=3fSLgeJpZq4cUgzAH_CdFzXwJEO3NH_VVDv2pQnmwN0,23
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=-9VjAb5JqlxtBuQUDT08Cj2BW8VbLxTmJACe2cqza-s,30215
+sglang/lang/ir.py,sha256=GRcPsEjnR4k5q5Kf-Rb2YgDBseCTGQoasclhjmQtL8Y,17511
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=SDlp03EuQEK1eGK4_IaFySWgxlp4wCs3EPewZ6O640E,9549
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=Ze2_dTHG6jc04ti7vuOEnoEe1ehvhxCJRpa4EYD0T_8,18494
+sglang/srt/hf_transformers_utils.py,sha256=OP5uBwnWiam6h9QvkBaG-nrDgkEUEwLXy1IWvW7rrRo,11737
+sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+sglang/srt/model_config.py,sha256=k4OfRV-szWkFaJMIC40JoJGJ75AfYQ2hf4M1dS1aQ-o,6366
+sglang/srt/server.py,sha256=KM6fq1RXbnBr0nWj8IO54T-K14o0iscgdFR4z3uU5C4,19572
+sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+sglang/srt/utils.py,sha256=x9MdBu0e8HAgaNIGuxiMVL7_nh03kl_rWuMnLas_Dgo,24327
+sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+sglang/srt/constrained/fsm_cache.py,sha256=QTrBFoZCp2FeigtIakz2MCgQLtvQFXgl2lDPQaGtu9M,2784
+sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
+sglang/srt/layers/activation.py,sha256=4RIgqvAIXPpZV4q0YVbAPVygz_YFAbpI4x47p7LcOw4,1911
+sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+sglang/srt/layers/extend_attention.py,sha256=h4O0R7PJpAVKS3Vx_583zhrFPD0vv6XqzvOcHBI3zoc,14268
+sglang/srt/layers/layernorm.py,sha256=sI_oveGW4uyFI2LOtWF2yd77wH2k5LGAvUIZuoOn2Oo,2227
+sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+sglang/srt/layers/sampler.py,sha256=YVzlrXE6uJoDwFHaZcUyxgUOUdR5a5myZvrRL6qckoA,5544
+sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/managers/controller_multi.py,sha256=R45ST6oBlIwfUwuibMw0sgTk8iqphb_rFyIdW048JA4,6472
+sglang/srt/managers/controller_single.py,sha256=tnc71OTe8KDYouMdfqgwBT4lX5nZt6Rak9t2GmKtAME,5119
+sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+sglang/srt/managers/io_struct.py,sha256=4Cs655K4n_F_usu6R3YE5_RdcE0XO9AXQNk5vl2II2c,10534
+sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+sglang/srt/managers/schedule_batch.py,sha256=yW7fkBi31vytfNEkFzs1Z3xzEzLMevXvoCyuoubut3M,25920
+sglang/srt/managers/tokenizer_manager.py,sha256=aaZV7G3-m35pba1meRapqO7bdPjM2Cmkue5lbR_Jv3M,28836
+sglang/srt/managers/tp_worker.py,sha256=DBrrd3QbjzAAvANvPs0zdYogsaFlusGx-IjpDVCP8RA,35976
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=ba4WZhBbkJyZjronzwoDJmoh7l8oz0s5oj_i_3PLzSY,12662
+sglang/srt/model_executor/forward_batch_info.py,sha256=MUcquCqmK-Jc1WNEciREmPj4iZu39tJk0axpexfyEXg,15775
+sglang/srt/model_executor/model_runner.py,sha256=9L0cvNK2ELNfE4L6Hq9-K74ltXYenkFl4UVnY9d9JkU,24205
+sglang/srt/models/chatglm.py,sha256=EaZKaRlsAbSP5rob6vUGqDuJLAY1HC2Oh-jgEUS4ZVY,13634
+sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+sglang/srt/models/deepseek_v2.py,sha256=kzqfZvidRe6uydaMJI40qh_Qg7-gI0oBVH0rdWp7ONg,27218
+sglang/srt/models/gemma.py,sha256=iC424guGOdsYC43xke5_uul9UIY0j6t7lUsDcB_uqa8,12492
+sglang/srt/models/gemma2.py,sha256=JQvM6rYvjmLqdhQIQ9mRAAO1MhnIqTb32CqdL8X0o80,16798
+sglang/srt/models/gpt_bigcode.py,sha256=jaolXlRp1PRHNEQPT-ZZ_cWAQ2us5DiNheSaNQ4Es_c,10418
+sglang/srt/models/grok.py,sha256=FF_eURzXYXe1b39AbGtEPv2yYNzWarjmBsjkgutOkek,15019
+sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+sglang/srt/models/llama2.py,sha256=JZPvaLSPiFMN-4qlOUBXZxsUsz6XtTGD-bB_fidxcfU,14516
+sglang/srt/models/llama_classification.py,sha256=2zhBJtO9uieVj4Cd94KNiA8M_IdLuILDeTv1rePVJXw,4934
+sglang/srt/models/llama_embedding.py,sha256=NQCQ3MnK3iRohL-UdY5UWxW4LlZ3RQZ7w4mlFOnpVrM,3696
+sglang/srt/models/llava.py,sha256=iuXLJVDWBiYo8zJuDPSSjt2LYqbkg2MAcOFUZO1fOX4,24353
+sglang/srt/models/llavavid.py,sha256=MX7YpqYh5J4BoOnV7vVAIfoOlBFQXYpp8Kpe7WK0ejk,13562
+sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+sglang/srt/models/mistral.py,sha256=jlrWBVNXbAUziAaIdHAjFcOJnKtn9Bl8rBd65ypJM-I,819
+sglang/srt/models/mixtral.py,sha256=StnGKdRhoweY46M2b2pv-vrfXaNqbhaVU4iKhEkMEfM,13837
+sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+sglang/srt/models/qwen2.py,sha256=B1qfqukSA3_02Q3tvIxqIg-6kmxdJ36Roxn0WFmnVxQ,12776
+sglang/srt/models/qwen2_moe.py,sha256=JZRd8AzvJgjVlHww1eCMPdF8rzC93X_1rgk3PEWE70M,17499
+sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
+sglang/srt/openai_api/adapter.py,sha256=KaIYqkeguuVNHhpfSBvL7M0wRPhcivRAtuG-DsyXExI,46654
+sglang/srt/openai_api/protocol.py,sha256=knf-nds0XO2LYg-hPM-Ho1f1y2XZIV_Gvg3xcCKLfgQ,9411
+sglang/srt/sampling/sampling_batch_info.py,sha256=encziVWrUDswoay0qfFVALHx_96Vra2mzD6_GHthZ3s,7771
+sglang/srt/sampling/sampling_params.py,sha256=dmjUlTY4VfuRtyc_sR59zMzhkjiTzHmljyTIogCFd0k,5411
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+sglang/test/runners.py,sha256=IOaaNJ4y3GSbUCsnbKZrbZDoBR2_us2zWKWxccfrGlk,7687
+sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=VDdoeqGvebUa-l3rDiid6cC7wZq0Phpbm5fxxD0-cpg,1910
+sglang/test/test_programs.py,sha256=V_-Bx3lLkw37P6gDyA7mZCqxlyNMaFLBkRrPMQQQqn4,14909
+sglang/test/test_utils.py,sha256=HD-9rcj7EFS_NX1GQFU5613ITQlZaTK2l9RmqA0F7x4,14380
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+sglang-0.2.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.14.dist-info/METADATA,sha256=V3t6L-QOiHsJYTihE9W1YeR_YyRC_ZPZwlWjw0Mymsg,37161
+sglang-0.2.14.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+sglang-0.2.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.14.dist-info/RECORD,,
{sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (72.1.0)
+Generator: setuptools (73.0.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
sglang/launch_server_llavavid.py DELETED
@@ -1,29 +0,0 @@
-"""Launch the inference server for Llava-video model."""
-
-import argparse
-
-from sglang.srt.server import ServerArgs, launch_server
-
-if __name__ == "__main__":
-    model_overide_args = {}
-
-    model_overide_args["mm_spatial_pool_stride"] = 2
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = 16
-    model_overide_args["model_type"] = "llavavid"
-    if model_overide_args["num_frames"] == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
-        model_overide_args["model_max_length"] = 4096 * 2
-
-    parser = argparse.ArgumentParser()
-    ServerArgs.add_cli_args(parser)
-    args = parser.parse_args()
-
-    if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
-
-    server_args = ServerArgs.from_cli_args(args)
-
-    launch_server(server_args, model_overide_args, None)
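For anyone who relied on the deleted entry point, a hypothetical migration sketch follows: it reproduces the script from user code by passing the same override dict to `launch_server`. The three-argument `launch_server(server_args, model_overide_args, None)` call is the 0.2.12-era signature taken from the deleted file; verify it against `sglang.srt.server` in 0.2.14 before depending on it.

```python
"""Hypothetical replacement for the removed sglang/launch_server_llavavid.py."""
import argparse

from sglang.srt.server import ServerArgs, launch_server

if __name__ == "__main__":
    # Same overrides the deleted script applied for the 16-frame LLaVA-video setup.
    model_overide_args = {
        "mm_spatial_pool_stride": 2,
        "architectures": ["LlavaVidForCausalLM"],
        "num_frames": 16,
        "model_type": "llavavid",
    }

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args()
    server_args = ServerArgs.from_cli_args(args)

    # 0.2.12-era signature; check the 0.2.14 sglang.srt.server API before use.
    launch_server(server_args, model_overide_args, None)
```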