sglang 0.1.18__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +26 -0
  3. sglang/backend/runtime_endpoint.py +18 -14
  4. sglang/bench_latency.py +34 -16
  5. sglang/global_config.py +1 -0
  6. sglang/lang/chat_template.py +41 -6
  7. sglang/lang/interpreter.py +5 -1
  8. sglang/lang/ir.py +61 -25
  9. sglang/srt/constrained/__init__.py +3 -2
  10. sglang/srt/hf_transformers_utils.py +7 -3
  11. sglang/srt/layers/extend_attention.py +2 -1
  12. sglang/srt/layers/fused_moe.py +181 -167
  13. sglang/srt/layers/logits_processor.py +55 -19
  14. sglang/srt/layers/radix_attention.py +24 -27
  15. sglang/srt/layers/token_attention.py +4 -1
  16. sglang/srt/managers/controller/infer_batch.py +2 -2
  17. sglang/srt/managers/controller/manager_single.py +1 -1
  18. sglang/srt/managers/controller/model_runner.py +27 -15
  19. sglang/srt/managers/controller/tp_worker.py +31 -14
  20. sglang/srt/managers/detokenizer_manager.py +4 -2
  21. sglang/srt/managers/io_struct.py +1 -1
  22. sglang/srt/managers/tokenizer_manager.py +14 -13
  23. sglang/srt/model_config.py +6 -0
  24. sglang/srt/models/gemma2.py +436 -0
  25. sglang/srt/models/llama2.py +3 -3
  26. sglang/srt/models/llama_classification.py +10 -7
  27. sglang/srt/models/minicpm.py +373 -0
  28. sglang/srt/models/qwen2_moe.py +454 -0
  29. sglang/srt/openai_api_adapter.py +2 -2
  30. sglang/srt/openai_protocol.py +1 -1
  31. sglang/srt/server.py +17 -8
  32. sglang/srt/server_args.py +14 -16
  33. sglang/srt/utils.py +68 -35
  34. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA +19 -13
  35. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD +38 -35
  36. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  37. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL +0 -0
  38. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -432,13 +432,12 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
432
432
  if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
433
433
  raise Exception(
434
434
  f"{pkg} is installed with version {installed_version}, which "
435
- f"is less than the minimum required version {min_version}. " +
436
- message
435
+ f"is less than the minimum required version {min_version}. " + message
437
436
  )
438
437
  except PackageNotFoundError:
439
438
  raise Exception(
440
- f"{pkg} with minimum required version {min_version} is not installed. " +
441
- message
439
+ f"{pkg} with minimum required version {min_version} is not installed. "
440
+ + message
442
441
  )
443
442
 
444
443
 
@@ -459,13 +458,9 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
459
458
  NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
460
459
  """
461
460
 
462
- # TODO: need a better check than just dev str name match
463
- # compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
464
- device_name = torch.cuda.get_device_name(gpu_id)
465
- if "RTX 40" not in device_name:
466
- import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
461
+ import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
467
462
 
468
- setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
463
+ setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
469
464
 
470
465
 
471
466
  def monkey_patch_vllm_dummy_weight_loader():
@@ -474,24 +469,40 @@ def monkey_patch_vllm_dummy_weight_loader():
474
469
  """
475
470
 
476
471
  from vllm.model_executor.model_loader.loader import (
477
- ModelConfig, DeviceConfig, LoRAConfig, VisionLanguageConfig,
478
- ParallelConfig, SchedulerConfig, CacheConfig, nn,
479
- set_default_torch_dtype, _initialize_model, initialize_dummy_weights,
480
- DummyModelLoader
472
+ CacheConfig,
473
+ DeviceConfig,
474
+ DummyModelLoader,
475
+ LoRAConfig,
476
+ ModelConfig,
477
+ ParallelConfig,
478
+ SchedulerConfig,
479
+ MultiModalConfig,
480
+ _initialize_model,
481
+ initialize_dummy_weights,
482
+ nn,
483
+ set_default_torch_dtype,
481
484
  )
482
485
 
483
- def load_model(self, *, model_config: ModelConfig,
484
- device_config: DeviceConfig,
485
- lora_config: Optional[LoRAConfig],
486
- vision_language_config: Optional[VisionLanguageConfig],
487
- parallel_config: ParallelConfig,
488
- scheduler_config: SchedulerConfig,
489
- cache_config: CacheConfig) -> nn.Module:
486
+ def load_model(
487
+ self,
488
+ *,
489
+ model_config: ModelConfig,
490
+ device_config: DeviceConfig,
491
+ lora_config: Optional[LoRAConfig],
492
+ multimodal_config: Optional[MultiModalConfig],
493
+ parallel_config: ParallelConfig,
494
+ scheduler_config: SchedulerConfig,
495
+ cache_config: CacheConfig,
496
+ ) -> nn.Module:
490
497
  with set_default_torch_dtype(model_config.dtype):
491
498
  with torch.device(device_config.device):
492
- model = _initialize_model(model_config, self.load_config,
493
- lora_config, vision_language_config,
494
- cache_config)
499
+ model = _initialize_model(
500
+ model_config,
501
+ self.load_config,
502
+ lora_config,
503
+ multimodal_config,
504
+ cache_config,
505
+ )
495
506
 
496
507
  for _, module in model.named_modules():
497
508
  quant_method = getattr(module, "quant_method", None)
@@ -541,7 +552,7 @@ def get_ip_address(ifname):
541
552
  ip_address = fcntl.ioctl(
542
553
  s.fileno(),
543
554
  0x8915, # SIOCGIFADDR
544
- struct.pack('256s', bytes(ifname[:15], 'utf-8'))
555
+ struct.pack("256s", bytes(ifname[:15], "utf-8")),
545
556
  )[20:24]
546
557
  return socket.inet_ntoa(ip_address)
547
558
 
@@ -550,44 +561,66 @@ def send_addrs_to_rank_0(model_port_args, server_args):
550
561
  assert server_args.node_rank != 0 and server_args.dp_size == 1
551
562
  import torch.distributed as dist
552
563
 
553
- ifname = os.environ.get("SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0"))
564
+ ifname = os.environ.get(
565
+ "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
566
+ )
554
567
  ip_addr = get_ip_address(ifname)
555
568
 
556
569
  num_tp_ports = server_args.tp_size // server_args.nnodes
557
570
  model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
558
571
  ip_addr = [int(x) for x in ip_addr.split(".")]
559
- addrs_tensor = torch.tensor(ip_addr + model_port_args.model_tp_ports, dtype=torch.int)
572
+ addrs_tensor = torch.tensor(
573
+ ip_addr + model_port_args.model_tp_ports, dtype=torch.int
574
+ )
560
575
 
561
576
  init_method = f"tcp://{server_args.nccl_init_addr}"
562
- dist.init_process_group(backend="gloo", init_method=init_method, rank=server_args.node_rank, world_size=server_args.nnodes)
577
+ dist.init_process_group(
578
+ backend="gloo",
579
+ init_method=init_method,
580
+ rank=server_args.node_rank,
581
+ world_size=server_args.nnodes,
582
+ )
563
583
  dist.send(addrs_tensor, dst=0)
564
- print(f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}")
584
+ print(
585
+ f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
586
+ )
565
587
 
566
588
  dist.barrier()
567
- dist.destroy_process_group()
589
+ dist.destroy_process_group()
568
590
 
569
591
 
570
592
  def receive_addrs(model_port_args, server_args):
571
593
  assert server_args.node_rank == 0 and server_args.dp_size == 1
572
594
  import torch.distributed as dist
573
595
 
574
- ifname = os.environ.get("SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0"))
596
+ ifname = os.environ.get(
597
+ "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
598
+ )
575
599
  ip_addr = get_ip_address(ifname)
576
600
 
577
601
  num_tp_ports = server_args.tp_size // server_args.nnodes
578
602
  model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
579
603
 
580
604
  init_method = f"tcp://{server_args.nccl_init_addr}"
581
- dist.init_process_group(backend="gloo", init_method=init_method, rank=server_args.node_rank, world_size=server_args.nnodes)
605
+ dist.init_process_group(
606
+ backend="gloo",
607
+ init_method=init_method,
608
+ rank=server_args.node_rank,
609
+ world_size=server_args.nnodes,
610
+ )
582
611
 
583
612
  for src_rank in range(1, server_args.nnodes):
584
613
  tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
585
614
  dist.recv(tensor, src=src_rank)
586
615
  ip = ".".join([str(x) for x in tensor[:4].tolist()])
587
616
  ports = tensor[4:].tolist()
588
- model_port_args.model_tp_ips[num_tp_ports * src_rank: num_tp_ports * (src_rank + 1)] = [ip] * num_tp_ports
589
- model_port_args.model_tp_ports[num_tp_ports * src_rank: num_tp_ports * (src_rank + 1)] = ports
617
+ model_port_args.model_tp_ips[
618
+ num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
619
+ ] = [ip] * num_tp_ports
620
+ model_port_args.model_tp_ports[
621
+ num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
622
+ ] = ports
590
623
  print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
591
624
 
592
625
  dist.barrier()
593
- dist.destroy_process_group()
626
+ dist.destroy_process_group()
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.18
3
+ Version: 0.1.19
4
4
  Summary: A structured generation language for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -241,7 +241,7 @@ Requires-Dist: torch ; extra == 'srt'
241
241
  Requires-Dist: uvicorn ; extra == 'srt'
242
242
  Requires-Dist: uvloop ; extra == 'srt'
243
243
  Requires-Dist: zmq ; extra == 'srt'
244
- Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
244
+ Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
245
245
  Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
246
246
 
247
247
  <div align="center">
@@ -257,7 +257,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing
257
257
 
258
258
  The core features include:
259
259
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
260
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
260
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
261
261
 
262
262
  ## News
263
263
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -288,15 +288,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
288
288
  git clone https://github.com/sgl-project/sglang.git
289
289
  cd sglang
290
290
 
291
- pip install --upgrade pip
292
291
  pip install -e "python[all]"
293
292
 
294
293
  # Install FlashInfer CUDA kernels
295
294
  pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
296
295
  ```
297
296
 
298
- ### Notes
299
- - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
297
+ ### Method 3: Using docker
298
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
299
+
300
+ ### Common Notes
301
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by running:
302
+ ```
303
+ pip uninstall -y triton triton-nightly
304
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
305
+ ```
300
306
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
301
307
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
302
308
 
@@ -518,8 +524,8 @@ for out in state.text_iter():
518
524
  ```
519
525
 
520
526
  ### Tips and Implementation Details
521
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
522
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
527
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
528
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
523
529
 
524
530
  ## Backend: SGLang Runtime (SRT)
525
531
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -576,7 +582,6 @@ response = client.chat.completions.create(
576
582
  print(response)
577
583
  ```
578
584
 
579
-
580
585
  By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
581
586
 
582
587
  If needed, you can also override the chat template when launching the server:
@@ -605,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
605
610
  ```
606
611
 
607
612
  ### Additional Arguments
608
- - Add `--tp 2` to enable tensor parallelism.
613
+ - Add `--tp 2` to enable tensor parallelism. If the server reports `peer access is not supported between these two devices`, add the `--enable-p2p-check` option.
609
614
  ```
610
615
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
611
616
  ```
@@ -623,9 +628,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
623
628
  - Llama
624
629
  - Mistral
625
630
  - Mixtral
626
- - Qwen / Qwen 2
627
- - Gemma
628
- - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
631
+ - Qwen / Qwen 2 / Qwen 2 MoE
632
+ - Gemma / Gemma 2
629
633
  - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
630
634
  - LLaVA
631
635
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -638,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
638
642
  - StableLM
639
643
  - Command-R
640
644
  - DBRX
645
+ - Grok
646
+ - ChatGLM
641
647
  - AWQ/GPTQ/Marlin quantization
642
648
 
643
649
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD CHANGED
@@ -1,7 +1,7 @@
1
- sglang/__init__.py,sha256=PhkN9MopSdHLXHG9_7l5JB-awRDI9CdR6Qht1vWA9C8,1116
2
- sglang/api.py,sha256=92oqUgVeKq9B9If2A8LHzEhPicZK5Rq3rKUShwPAq0E,4579
3
- sglang/bench_latency.py,sha256=MNxmVCwBM7ZWFYSFy2m-y8MmEWNWvZO2gUBbuMyWSBI,10018
4
- sglang/global_config.py,sha256=xMX7JqPgDRwtvcbULkwHJ-bfysNefEN42V3BGss9mlo,1425
1
+ sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
2
+ sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
3
+ sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
4
+ sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
5
5
  sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
6
6
  sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
7
7
  sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
@@ -10,69 +10,72 @@ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2
10
10
  sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
11
11
  sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
12
12
  sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
13
- sglang/backend/runtime_endpoint.py,sha256=8NyWgMvhzUcA5VEsPLo1AacZ_UPVSnpxpzt6vYdVQSU,8871
13
+ sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
14
14
  sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
15
15
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
16
+ sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
17
17
  sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
18
- sglang/lang/interpreter.py,sha256=MMdvli-75ySiKiULlsnoVmb8oEu5bvSkYz8GRdtZoVk,29494
19
- sglang/lang/ir.py,sha256=KZxXVva2r1UihYOVWRKcU_zILMx05oWV2yLy3SeZfnA,14603
18
+ sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
19
+ sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
20
20
  sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
21
21
  sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
22
22
  sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
23
- sglang/srt/hf_transformers_utils.py,sha256=P6eXfGwH-OeU6hDrlGYL5GACcTNPdYOimpKZ0ZBZUao,10683
23
+ sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
24
24
  sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
25
25
  sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
26
- sglang/srt/model_config.py,sha256=eGt8hTtipSTqp-AsB-Cl4wfZDb14CTcOtIz-iXgaVk8,4997
27
- sglang/srt/openai_api_adapter.py,sha256=pqGP0bON-wEZOnZyo85gzrO9MSzeIkHh5xqhpN5RkyY,15120
28
- sglang/srt/openai_protocol.py,sha256=CNJOMr3PJvoRGI2TIh9t8f_4wYTtT0EF8kzsrYsASYY,5350
26
+ sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
27
+ sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
28
+ sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
29
29
  sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
30
- sglang/srt/server.py,sha256=742w8gn0GgE7w3EwgPhq7MYabaVxcdPpMAovEE6-DaU,13112
31
- sglang/srt/server_args.py,sha256=j0-Aj8sHQ-zgumd4w0IaezRqDdjDC6MMMG5M8zzITVw,12166
32
- sglang/srt/utils.py,sha256=V2C4fb93oKS4D3lezlRgHkD7MQDNBZlIy_4ZTNzAC9E,19423
33
- sglang/srt/constrained/__init__.py,sha256=Q-XnKFChC9q6WDCnJKAKAuXzKHHg4QoFlYODge8ZKCs,1504
30
+ sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
31
+ sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
32
+ sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
33
+ sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
34
34
  sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
35
35
  sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
36
36
  sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
37
37
  sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
38
- sglang/srt/layers/extend_attention.py,sha256=JUYuYSAhfbgOXrwIK5YHJCXPq54a6IZ7vQrze-3VvMQ,12955
39
- sglang/srt/layers/fused_moe.py,sha256=M_cTHMNSoD-wdh6XjzHseuq3zsdqOmECWxNeEVJklu4,22257
40
- sglang/srt/layers/logits_processor.py,sha256=t-bZIcGj70KKf2Jcor9K7Va1NsBlDVNrQ4Ktlq0lUlU,8506
41
- sglang/srt/layers/radix_attention.py,sha256=XsHFf7myNKZwyt3qB5LEXAttTKMY9OP3M3t5CZnyu3g,6911
42
- sglang/srt/layers/token_attention.py,sha256=rVbPlFpmLoU3nx3qtK2YZdynDxfvMKtQNTPeKi0KNP0,8823
43
- sglang/srt/managers/detokenizer_manager.py,sha256=tOjURt-XQofPblnGECoJfoRSMPoWFVAH99R05hXeYNw,3353
44
- sglang/srt/managers/io_struct.py,sha256=O1cz6hDV6BjXbZ0-tk6VaDNjYFuMBUOGswbG3H_GliY,4532
45
- sglang/srt/managers/tokenizer_manager.py,sha256=TswupFKrlXAvUM5-1eT2cR6uNJoQVivp2MQkEFu4axQ,14848
38
+ sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
39
+ sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
40
+ sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
41
+ sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
42
+ sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
43
+ sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
44
+ sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
45
+ sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
46
46
  sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
47
- sglang/srt/managers/controller/infer_batch.py,sha256=-Q17Pk_Mmccobxly7UM8wCC6dYKJ4zmjplMboN1q8b0,25700
47
+ sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
48
48
  sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
49
- sglang/srt/managers/controller/manager_single.py,sha256=OIm_BjbDaEcYmpb_E_7wv0xfOlb2le0zXjPMqf1pU9U,3468
50
- sglang/srt/managers/controller/model_runner.py,sha256=HjOHp_Rtdm7OnMmhtnSwPWPmEYHDpnt5LjeKbiYb6mo,21718
49
+ sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
50
+ sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
51
51
  sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
52
52
  sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
53
- sglang/srt/managers/controller/tp_worker.py,sha256=VNVQ0oqPGllC00cZCxHB-0LqudxgS74jf-it2zDHzTA,31411
53
+ sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
54
54
  sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
55
55
  sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
56
56
  sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
57
57
  sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
58
+ sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
58
59
  sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
59
- sglang/srt/models/llama2.py,sha256=7aPPSLABRIy7_iy4YvFHV7Beqc2I1-Vc1obSbsgzNzY,12190
60
- sglang/srt/models/llama_classification.py,sha256=LrPRFB-Yd2haZADNY3uIusbajQwcZNQrOCTd92L2vS0,4304
60
+ sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
61
+ sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
61
62
  sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
62
63
  sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
64
+ sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
63
65
  sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
64
66
  sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
65
67
  sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
66
68
  sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
67
69
  sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
70
+ sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
68
71
  sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
69
72
  sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
70
73
  sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
71
74
  sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
72
75
  sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
73
76
  sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
74
- sglang-0.1.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
75
- sglang-0.1.18.dist-info/METADATA,sha256=tDdBZo2qvH8wWC4faXxfryjh7-6frEsBnH0vJ_ia1w4,29752
76
- sglang-0.1.18.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
77
- sglang-0.1.18.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
78
- sglang-0.1.18.dist-info/RECORD,,
77
+ sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
78
+ sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
79
+ sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
80
+ sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
81
+ sglang-0.1.19.dist-info/RECORD,,