sglang-0.1.18-py3-none-any.whl → sglang-0.1.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/api.py +26 -0
- sglang/backend/runtime_endpoint.py +18 -14
- sglang/bench_latency.py +34 -16
- sglang/global_config.py +1 -0
- sglang/lang/chat_template.py +41 -6
- sglang/lang/interpreter.py +5 -1
- sglang/lang/ir.py +61 -25
- sglang/srt/constrained/__init__.py +3 -2
- sglang/srt/hf_transformers_utils.py +7 -3
- sglang/srt/layers/extend_attention.py +2 -1
- sglang/srt/layers/fused_moe.py +181 -167
- sglang/srt/layers/logits_processor.py +55 -19
- sglang/srt/layers/radix_attention.py +24 -27
- sglang/srt/layers/token_attention.py +4 -1
- sglang/srt/managers/controller/infer_batch.py +2 -2
- sglang/srt/managers/controller/manager_single.py +1 -1
- sglang/srt/managers/controller/model_runner.py +27 -15
- sglang/srt/managers/controller/tp_worker.py +31 -14
- sglang/srt/managers/detokenizer_manager.py +4 -2
- sglang/srt/managers/io_struct.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +14 -13
- sglang/srt/model_config.py +6 -0
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/llama2.py +3 -3
- sglang/srt/models/llama_classification.py +10 -7
- sglang/srt/models/minicpm.py +373 -0
- sglang/srt/models/qwen2_moe.py +454 -0
- sglang/srt/openai_api_adapter.py +2 -2
- sglang/srt/openai_protocol.py +1 -1
- sglang/srt/server.py +17 -8
- sglang/srt/server_args.py +14 -16
- sglang/srt/utils.py +68 -35
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA +19 -13
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD +38 -35
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL +0 -0
- {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
```diff
@@ -432,13 +432,12 @@ def assert_pkg_version(pkg: str, min_version: str, message: str):
         if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
             raise Exception(
                 f"{pkg} is installed with version {installed_version}, which "
-                f"is less than the minimum required version {min_version}. " +
-                message
+                f"is less than the minimum required version {min_version}. " + message
             )
     except PackageNotFoundError:
         raise Exception(
-            f"{pkg} with minimum required version {min_version} is not installed. "
-            message
+            f"{pkg} with minimum required version {min_version} is not installed. "
+            + message
         )
 
 
```
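The hunk above only reflows string concatenation; the version-check pattern itself is unchanged. As a minimal standalone sketch of that pattern, using `importlib.metadata` and `packaging` the same way the surrounding file does (the helper name `check_min_version` here is illustrative, not sglang's API):

```python
# Standalone sketch of the version-check pattern in assert_pkg_version.
from importlib.metadata import PackageNotFoundError, version
from packaging import version as pkg_version


def check_min_version(pkg: str, min_version: str, message: str = "") -> None:
    try:
        installed = version(pkg)  # raises PackageNotFoundError if absent
    except PackageNotFoundError:
        raise Exception(
            f"{pkg} with minimum required version {min_version} is not installed. "
            + message
        )
    # packaging handles pre-releases and multi-digit components correctly,
    # unlike naive string comparison ("0.10.0" > "0.9.0").
    if pkg_version.parse(installed) < pkg_version.parse(min_version):
        raise Exception(
            f"{pkg} is installed with version {installed}, which "
            f"is less than the minimum required version {min_version}. " + message
        )


check_min_version("packaging", "20.0")
```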
```diff
@@ -459,13 +458,9 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
 
-
-    # compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
-    device_name = torch.cuda.get_device_name(gpu_id)
-    if "RTX 40" not in device_name:
-        import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
+    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
 
-        setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
+    setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
 def monkey_patch_vllm_dummy_weight_loader():
```
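This change drops the RTX 40-series guard and unconditionally replaces vllm's `gpu_p2p_access_check` with a stub that always returns `True`. A self-contained sketch of that setattr monkey-patch technique, applied to a toy module so it runs without vllm installed (`toy` stands in for `vllm.distributed.device_communicators.custom_all_reduce_utils`):

```python
# Toy demonstration of patching a module-level function via setattr.
import types

toy = types.ModuleType("toy")
toy.gpu_p2p_access_check = lambda src, dst: False  # original, conservative check

# Replace the function with a permissive stub, exactly the shape used above.
setattr(toy, "gpu_p2p_access_check", lambda *args, **kwargs: True)

assert toy.gpu_p2p_access_check(0, 1) is True
```

Because callers look the function up on the module at call time, every later call sees the stub without any caller-side changes.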
```diff
@@ -474,24 +469,40 @@ def monkey_patch_vllm_dummy_weight_loader():
     """
 
     from vllm.model_executor.model_loader.loader import (
-        …
-        …
-        …
-        …
+        CacheConfig,
+        DeviceConfig,
+        DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
+        MultiModalConfig,
+        _initialize_model,
+        initialize_dummy_weights,
+        nn,
+        set_default_torch_dtype,
     )
 
-    def load_model(
-        …
-        …
-        …
-        …
-        …
-        …
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
-                model = _initialize_model(
-                    …
-                    …
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    lora_config,
+                    multimodal_config,
+                    cache_config,
+                )
 
         for _, module in model.named_modules():
             quant_method = getattr(module, "quant_method", None)
```
```diff
@@ -541,7 +552,7 @@ def get_ip_address(ifname):
     ip_address = fcntl.ioctl(
         s.fileno(),
         0x8915,  # SIOCGIFADDR
-        struct.pack(…
+        struct.pack("256s", bytes(ifname[:15], "utf-8")),
     )[20:24]
     return socket.inet_ntoa(ip_address)
 
```
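For context, the function this one-line change touches asks the kernel for an interface's IPv4 address via the `SIOCGIFADDR` ioctl (request code 0x8915) and decodes bytes 20–24 of the returned `ifreq` struct. A runnable sketch, assuming a Linux host with an interface named `eth0` (the interface name is the only assumption):

```python
# Linux-only sketch of the SIOCGIFADDR lookup shown in the hunk above.
import fcntl
import socket
import struct


def get_ip_address(ifname: str) -> str:
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    packed = fcntl.ioctl(
        s.fileno(),
        0x8915,  # SIOCGIFADDR: get interface address
        struct.pack("256s", bytes(ifname[:15], "utf-8")),  # ifreq with ifname
    )[20:24]  # the IPv4 address sits at offset 20 of the ifreq struct
    return socket.inet_ntoa(packed)


if __name__ == "__main__":
    print(get_ip_address("eth0"))  # e.g. "10.0.0.5"
```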
```diff
@@ -550,44 +561,66 @@ def send_addrs_to_rank_0(model_port_args, server_args):
     assert server_args.node_rank != 0 and server_args.dp_size == 1
     import torch.distributed as dist
 
-    ifname = os.environ.get(…
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
     ip_addr = get_ip_address(ifname)
 
     num_tp_ports = server_args.tp_size // server_args.nnodes
     model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
     ip_addr = [int(x) for x in ip_addr.split(".")]
-    addrs_tensor = torch.tensor(…
+    addrs_tensor = torch.tensor(
+        ip_addr + model_port_args.model_tp_ports, dtype=torch.int
+    )
 
     init_method = f"tcp://{server_args.nccl_init_addr}"
-    dist.init_process_group(…
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
     dist.send(addrs_tensor, dst=0)
-    print(…
+    print(
+        f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
+    )
 
     dist.barrier()
-    dist.destroy_process_group()
+    dist.destroy_process_group()
 
 
 def receive_addrs(model_port_args, server_args):
     assert server_args.node_rank == 0 and server_args.dp_size == 1
     import torch.distributed as dist
 
-    ifname = os.environ.get(…
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
     ip_addr = get_ip_address(ifname)
 
     num_tp_ports = server_args.tp_size // server_args.nnodes
     model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
 
     init_method = f"tcp://{server_args.nccl_init_addr}"
-    dist.init_process_group(…
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
 
     for src_rank in range(1, server_args.nnodes):
         tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
         dist.recv(tensor, src=src_rank)
         ip = ".".join([str(x) for x in tensor[:4].tolist()])
         ports = tensor[4:].tolist()
-        model_port_args.model_tp_ips[…
-        …
+        model_port_args.model_tp_ips[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = [ip] * num_tp_ports
+        model_port_args.model_tp_ports[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = ports
         print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
 
     dist.barrier()
-    dist.destroy_process_group()
+    dist.destroy_process_group()
```
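Both functions exchange addresses by flattening them into a single int tensor that the gloo backend can ship: the four octets of the IPv4 address followed by that node's tensor-parallel ports. The packing round-trip can be checked locally without any process group; the address and port values below are made up for illustration:

```python
# Round-trip check of the IP/port packing used by send_addrs_to_rank_0 and
# receive_addrs. No torch.distributed setup is needed for the encoding itself.
import torch

ip, ports = "10.0.0.5", [30011, 30012]  # illustrative values

# Sender side: 4 address octets + TP ports in one int tensor.
addrs_tensor = torch.tensor([int(x) for x in ip.split(".")] + ports, dtype=torch.int)

# Receiver side: split the tensor back apart.
decoded_ip = ".".join(str(x) for x in addrs_tensor[:4].tolist())
decoded_ports = addrs_tensor[4:].tolist()

assert (decoded_ip, decoded_ports) == (ip, ports)
```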
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.18
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
```
```diff
@@ -241,7 +241,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
 Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
```
```diff
@@ -257,7 +257,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing
 
 The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
-- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
```
````diff
@@ -288,15 +288,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-### …
-…
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
````
````diff
@@ -518,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
````
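The updated `choices` tip points at why length normalization matters: without it, choices that tokenize into more tokens are systematically penalized. A hedged sketch of the selection rule, with made-up per-token log probabilities standing in for what the runtime would return:

```python
# Token-length normalized selection among choices; the logprob lists below
# are invented for illustration, not produced by sglang.
choice_token_logprobs = {
    "Paris": [-0.2, -0.1],             # 2 tokens, sum = -0.3
    "Saint-Denis": [-0.3, -0.4, -0.5], # 3 tokens, sum = -1.2
}


def normalized_logprob(logprobs: list[float]) -> float:
    return sum(logprobs) / len(logprobs)  # average per token


best = max(choice_token_logprobs, key=lambda c: normalized_logprob(choice_token_logprobs[c]))
print(best)  # -> "Paris" (-0.15 per token beats -0.4 per token)
```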
````diff
@@ -576,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
````
````diff
@@ -605,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
````
```diff
@@ -623,9 +628,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
```
```diff
@@ -638,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
```
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD
CHANGED
```diff
@@ -1,7 +1,7 @@
-sglang/__init__.py,sha256=…
-sglang/api.py,sha256=…
-sglang/bench_latency.py,sha256=…
-sglang/global_config.py,sha256=…
+sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
+sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
+sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
 sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
 sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
```
```diff
@@ -10,69 +10,72 @@ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2
 sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
 sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
 sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
-sglang/backend/runtime_endpoint.py,sha256=…
+sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
 sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/lang/chat_template.py,sha256=…
+sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=…
-sglang/lang/ir.py,sha256=…
+sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
 sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
 sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
 sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
-sglang/srt/hf_transformers_utils.py,sha256=…
+sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
 sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
-sglang/srt/model_config.py,sha256=…
-sglang/srt/openai_api_adapter.py,sha256=…
-sglang/srt/openai_protocol.py,sha256=…
+sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
 sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
-sglang/srt/server.py,sha256=…
-sglang/srt/server_args.py,sha256=…
-sglang/srt/utils.py,sha256=…
-sglang/srt/constrained/__init__.py,sha256=…
+sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
+sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
+sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
+sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
 sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
 sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
-sglang/srt/layers/extend_attention.py,sha256=…
-sglang/srt/layers/fused_moe.py,sha256=…
-sglang/srt/layers/logits_processor.py,sha256=…
-sglang/srt/layers/radix_attention.py,sha256=…
-sglang/srt/layers/token_attention.py,sha256=…
-sglang/srt/managers/detokenizer_manager.py,sha256=…
-sglang/srt/managers/io_struct.py,sha256=…
-sglang/srt/managers/tokenizer_manager.py,sha256=…
+sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
+sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
+sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
 sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
-sglang/srt/managers/controller/infer_batch.py,sha256=…
+sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
 sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
-sglang/srt/managers/controller/manager_single.py,sha256=…
-sglang/srt/managers/controller/model_runner.py,sha256=…
+sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
+sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
 sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
 sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
-sglang/srt/managers/controller/tp_worker.py,sha256=…
+sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
 sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
 sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
 sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
 sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
 sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
-sglang/srt/models/llama2.py,sha256=…
-sglang/srt/models/llama_classification.py,sha256=…
+sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
 sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
 sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
 sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
 sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
 sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
 sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
 sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
 sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
 sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
 sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
 sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
 sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
-sglang-0.1.18.dist-info/LICENSE,sha256=…
-sglang-0.1.18.dist-info/METADATA,sha256=…
-sglang-0.1.18.dist-info/WHEEL,sha256=…
-sglang-0.1.18.dist-info/top_level.txt,sha256=…
-sglang-0.1.18.dist-info/RECORD,,
+sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
+sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.19.dist-info/RECORD,,
```
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE
File without changes
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL
File without changes
{sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt
File without changes