PyPI - sglang - Versions diffs - 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

sglang/api.py +2 -0
sglang/bench_latency.py +39 -28
sglang/lang/backend/runtime_endpoint.py +8 -4
sglang/lang/interpreter.py +3 -0
sglang/lang/ir.py +5 -0
sglang/launch_server_llavavid.py +12 -12
sglang/srt/configs/__init__.py +5 -0
sglang/srt/configs/exaone.py +195 -0
sglang/srt/constrained/fsm_cache.py +1 -1
sglang/srt/conversation.py +24 -2
sglang/srt/hf_transformers_utils.py +12 -12
sglang/srt/layers/extend_attention.py +13 -8
sglang/srt/layers/logits_processor.py +4 -4
sglang/srt/layers/sampler.py +94 -17
sglang/srt/managers/controller_multi.py +5 -5
sglang/srt/managers/controller_single.py +5 -5
sglang/srt/managers/io_struct.py +6 -1
sglang/srt/managers/schedule_batch.py +26 -11
sglang/srt/managers/tokenizer_manager.py +9 -9
sglang/srt/managers/tp_worker.py +38 -26
sglang/srt/model_config.py +3 -3
sglang/srt/model_executor/cuda_graph_runner.py +26 -9
sglang/srt/model_executor/forward_batch_info.py +68 -23
sglang/srt/model_executor/model_runner.py +15 -22
sglang/srt/models/chatglm.py +9 -15
sglang/srt/models/commandr.py +5 -1
sglang/srt/models/dbrx.py +5 -1
sglang/srt/models/deepseek.py +5 -1
sglang/srt/models/deepseek_v2.py +57 -25
sglang/srt/models/exaone.py +368 -0
sglang/srt/models/gemma.py +5 -1
sglang/srt/models/gemma2.py +5 -1
sglang/srt/models/gpt_bigcode.py +5 -1
sglang/srt/models/grok.py +5 -1
sglang/srt/models/internlm2.py +5 -1
sglang/srt/models/{llama2.py → llama.py} +25 -45
sglang/srt/models/llama_classification.py +34 -41
sglang/srt/models/llama_embedding.py +7 -6
sglang/srt/models/llava.py +8 -11
sglang/srt/models/llavavid.py +5 -6
sglang/srt/models/minicpm.py +5 -1
sglang/srt/models/mistral.py +2 -3
sglang/srt/models/mixtral.py +6 -2
sglang/srt/models/mixtral_quant.py +5 -1
sglang/srt/models/qwen.py +5 -2
sglang/srt/models/qwen2.py +6 -2
sglang/srt/models/qwen2_moe.py +5 -14
sglang/srt/models/stablelm.py +5 -1
sglang/srt/openai_api/adapter.py +16 -1
sglang/srt/openai_api/protocol.py +5 -5
sglang/srt/sampling/sampling_batch_info.py +75 -6
sglang/srt/server.py +6 -6
sglang/srt/utils.py +0 -3
sglang/test/runners.py +1 -1
sglang/test/test_programs.py +68 -0
sglang/test/test_utils.py +4 -0
sglang/utils.py +39 -0
sglang/version.py +1 -1
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA +9 -8
sglang-0.3.0.dist-info/RECORD +118 -0
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL +1 -1
sglang-0.2.14.post2.dist-info/RECORD +0 -115
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/LICENSE +0 -0
{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/top_level.txt +0 -0

sglang/srt/sampling/sampling_batch_info.py CHANGED Viewed

@@ -21,10 +21,59 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor = None
     top_ks: torch.Tensor = None
     min_ps: torch.Tensor = None
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+    # Dispatch in CUDA graph
+    need_min_p_sampling: bool = False
+    # Bias Tensors
     logit_bias: torch.Tensor = None
     vocab_mask: torch.Tensor = None
+    # Penalizer
+    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
+    linear_penalties: torch.Tensor = None
+    scaling_penalties: torch.Tensor = None
+    def can_run_in_cuda_graph(self):
+        # Vocab bias and min_ps are not supported in CUDA graph
+        return (
+            self.logit_bias is None
+            and self.vocab_mask is None
+            and self.linear_penalties is None
+            and self.scaling_penalties is None
+            and not self.need_min_p_sampling
+        )
+    @classmethod
+    def dummy_one(cls, max_bs: int, vocab_size: int):
+        ret = cls(vocab_size=vocab_size)
+        ret.temperatures = torch.ones((max_bs, 1), dtype=torch.float, device="cuda")
+        ret.top_ps = torch.ones((max_bs,), dtype=torch.float, device="cuda")
+        ret.top_ks = torch.ones((max_bs,), dtype=torch.int, device="cuda")
+        return ret
+    def __getitem__(self, key):
+        if isinstance(key, slice):
+            # NOTE:This method is only used in CUDA graph
+            assert self.can_run_in_cuda_graph()
+            return SamplingBatchInfo(
+                vocab_size=self.vocab_size,
+                temperatures=self.temperatures[key],
+                top_ps=self.top_ps[key],
+                top_ks=self.top_ks[key],
+            )
+        else:
+            raise NotImplementedError
+    def inplace_assign(self, bs: int, other: SamplingBatchInfo):
+        # NOTE:This method is only used in CUDA graph
+        assert self.can_run_in_cuda_graph()
+        self.vocab_size = other.vocab_size
+        self.temperatures[:bs] = other.temperatures
+        self.top_ps[:bs] = other.top_ps
+        self.top_ks[:bs] = other.top_ks
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         device = "cuda"
@@ -45,6 +94,7 @@ class SamplingBatchInfo:
         ret.min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
         )
+        ret.need_min_p_sampling = any(r.sampling_params.min_p > 0 for r in reqs)
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
@@ -72,6 +122,25 @@ class SamplingBatchInfo:
         return ret
+    def prepare_penalties(self):
+        self.scaling_penalties = None
+        self.linear_penalties = None
+        for penalizer in self.penalizer_orchestrator.penalizers.values():
+            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
+                if penalizer.is_prepared():
+                    self.scaling_penalties = penalizer.cumulated_repetition_penalties
+            else:
+                if penalizer.is_prepared():
+                    if self.linear_penalties is None:
+                        bs = self.penalizer_orchestrator.batch.batch_size()
+                        self.linear_penalties = torch.zeros(
+                            (bs, self.vocab_size),
+                            dtype=torch.float32,
+                            device="cuda",
+                        )
+                    self.linear_penalties = penalizer.apply(self.linear_penalties)
     def update_regex_vocab_mask(self, batch: ScheduleBatch):
         bs, reqs = batch.batch_size(), batch.reqs
         device = "cuda"
@@ -81,15 +150,15 @@ class SamplingBatchInfo:
         self.vocab_mask = None
         if has_regex:
+            self.vocab_mask = torch.zeros(
+                bs, self.vocab_size, dtype=torch.bool, device=device
+            )
             for i, req in enumerate(reqs):
                 if req.regex_fsm is not None:
-                    if self.vocab_mask is None:
-                        self.vocab_mask = torch.zeros(
-                            bs, self.vocab_size, dtype=torch.bool, device=device
-                        )
+                    self.vocab_mask[i].fill_(1)
                     self.vocab_mask[i][
                         req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-                    ] = 1
+                    ] = 0
     def filter(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)

sglang/srt/server.py CHANGED Viewed

@@ -272,7 +272,7 @@ async def retrieve_file_content(file_id: str):
 def launch_server(
     server_args: ServerArgs,
-    model_overide_args: Optional[dict] = None,
+    model_override_args: Optional[dict] = None,
     pipe_finish_writer: Optional[mp.connection.Connection] = None,
 ):
     """Launch an HTTP server."""
@@ -317,7 +317,7 @@ def launch_server(
             tp_rank_range,
             server_args,
             ports[3],
-            model_overide_args,
+            model_override_args,
         )
         try:
@@ -328,7 +328,7 @@ def launch_server(
             return
     # Launch processes
-    tokenizer_manager = TokenizerManager(server_args, port_args, model_overide_args)
+    tokenizer_manager = TokenizerManager(server_args, port_args, model_override_args)
     if server_args.chat_template:
         load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
     pipe_controller_reader, pipe_controller_writer = mp.Pipe(duplex=False)
@@ -341,7 +341,7 @@ def launch_server(
     proc_controller = mp.Process(
         target=start_controller_process,
-        args=(server_args, port_args, pipe_controller_writer, model_overide_args),
+        args=(server_args, port_args, pipe_controller_writer, model_override_args),
     )
     proc_controller.start()
@@ -501,7 +501,7 @@ class Runtime:
     def __init__(
         self,
         log_level: str = "error",
-        model_overide_args: Optional[dict] = None,
+        model_override_args: Optional[dict] = None,
         *args,
         **kwargs,
     ):
@@ -525,7 +525,7 @@ class Runtime:
         proc = mp.Process(
             target=launch_server,
-            args=(self.server_args, model_overide_args, pipe_writer),
+            args=(self.server_args, model_override_args, pipe_writer),
         )
         proc.start()
         pipe_writer.close()

sglang/srt/utils.py CHANGED Viewed

@@ -407,7 +407,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         DummyModelLoader,
         LoRAConfig,
         ModelConfig,
-        MultiModalConfig,
         ParallelConfig,
         SchedulerConfig,
         _initialize_model,
@@ -422,7 +421,6 @@ def monkey_patch_vllm_dummy_weight_loader():
         model_config: ModelConfig,
         device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
-        multimodal_config: Optional[MultiModalConfig],
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
         cache_config: CacheConfig,
@@ -433,7 +431,6 @@ def monkey_patch_vllm_dummy_weight_loader():
                     model_config,
                     self.load_config,
                     lora_config,
-                    multimodal_config,
                     cache_config,
                 )

sglang/test/runners.py CHANGED Viewed

@@ -180,7 +180,7 @@ class SRTRunner:
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
-            mem_fraction_static=0.7,
+            mem_fraction_static=0.69,
             trust_remote_code=False,
             is_embedding=not self.is_generation,
         )

sglang/test/test_programs.py CHANGED Viewed

@@ -2,8 +2,12 @@
 import json
 import re
+import time
+import numpy as np
 import sglang as sgl
+from sglang.utils import fetch_and_cache_jsonl
 def test_few_shot_qa():
@@ -447,3 +451,67 @@ def test_chat_completion_speculative():
         )
     gen_character_spec().sync()
+def test_hellaswag_select():
+    """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    lines = fetch_and_cache_jsonl(url)
+    # Construct prompts
+    def get_one_example(lines, i, include_answer):
+        ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+        if include_answer:
+            ret += lines[i]["endings"][lines[i]["label"]]
+        return ret
+    def get_few_shot_examples(lines, k):
+        ret = ""
+        for i in range(k):
+            ret += get_one_example(lines, i, True) + "\n\n"
+        return ret
+    num_questions = 200
+    num_shots = 20
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+    import sglang as sgl
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+    # Run requests
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+    )
+    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    latency = time.time() - tic
+    # Compute accuracy
+    accuracy = np.mean(np.array(preds) == np.array(labels))
+    return accuracy, latency

sglang/test/test_utils.py CHANGED Viewed

@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157

sglang/utils.py CHANGED Viewed

@@ -4,6 +4,7 @@ import base64
 import importlib
 import json
 import logging
+import os
 import signal
 import sys
 import traceback
@@ -15,6 +16,7 @@ from typing import Union
 import numpy as np
 import requests
+from tqdm import tqdm
 logger = logging.getLogger(__name__)
@@ -260,3 +262,40 @@ class LazyImport:
     def __call__(self, *args, **kwargs):
         module = self._load()
         return module(*args, **kwargs)
+def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
+    """Read and cache a jsonl file from a url."""
+    # Check if the cache file already exists
+    if os.path.exists(cache_file):
+        print("Loading data from cache...")
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+    else:
+        print("Downloading data from URL...")
+        # Stream the response to show the progress bar
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # Check for request errors
+        # Total size of the file in bytes
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024  # Download in chunks of 1KB
+        # Use tqdm to display the progress bar
+        with open(cache_file, "wb") as f, tqdm(
+            desc=cache_file,
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as bar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
+                bar.update(len(chunk))
+        # Convert the data to a list of dictionaries
+        with open(cache_file, "r") as f:
+            data = [json.loads(line) for line in f]
+    return data

sglang/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.2.14.post2"
+__version__ = "0.3.0"

{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.14.post2
+Version: 0.3.0
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+- Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
@@ -489,13 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ### Supported Models
 **Generative Models**
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - LLaVA 1.5 / 1.6 / NeXT
@@ -509,6 +509,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Exaone 3
 **Embedding Models**
@@ -636,7 +637,7 @@ print(state["answer_1"])
 #### More Examples
 Anthropic and VertexAI (Gemini) models are also supported.
-You can find more examples at [examples/quick_start](examples/quick_start).
+You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
 ### Language Feature
 To begin with, import sglang.
@@ -649,7 +650,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, parallelism and batching for you.
-The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)
 #### Control Flow
 You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +699,7 @@ def image_qa(s, image_file, question):
     s += sgl.assistant(sgl.gen("answer", max_tokens=256)
 ```
-See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).
 #### Constrained Decoding
 Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +743,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```
-See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.

sglang-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,118 @@
+sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
+sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
+sglang/bench_latency.py,sha256=F7jMfKqMf1XFKJgkpR_yE33VJpsIhSr_SOJeRbngkb0,16758
+sglang/bench_serving.py,sha256=J_mMwnmDn0Jt07mzdGAuYOxpockHPLYJFL-kwoaqASY,36527
+sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
+sglang/global_config.py,sha256=nwOjUflwqLQySPUMvk8Hk63TIS6mknh_ODSW3CZ1rJw,1704
+sglang/launch_server.py,sha256=FODfO0DW546dh-u1qDlWtrhsmj6hxkarXXv3cIdgkj8,549
+sglang/launch_server_llavavid.py,sha256=xnpSILJxsrbvqkERav5P26bErCQnhoTFmoKeScJltUA,1034
+sglang/utils.py,sha256=zxHwQhVxW_lWf-IH0wUw_pBTRLHLPypdRiU5M4XosMM,9669
+sglang/version.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
+sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+sglang/lang/interpreter.py,sha256=AC3tNNDwYfiu87jCldBWXYpFicCv6NMPJACMFEfCXu4,30331
+sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
+sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
+sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+sglang/lang/backend/runtime_endpoint.py,sha256=hpezro0H6vG9KzLeKfYpPMwb4TaE0UanCIM0uG8Kdjw,9746
+sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+sglang/srt/conversation.py,sha256=2KDNe1suUPy6xqSkCx2xcO3pDPxTwqx5FaUxaqwCJ-M,19525
+sglang/srt/hf_transformers_utils.py,sha256=5UXJ-LdP92Sk_T843M9BHdnxRrcyiYfWH2IEg3dWgKI,6085
+sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
+sglang/srt/model_config.py,sha256=68QQ8iUWQHPv01RBeH23mvay6iJg9DWmCogC_vUgFLk,6371
+sglang/srt/server.py,sha256=yi8prs9_M0P0dOInrQLkHKiZ-oTigk_uzW8otEHImbU,19846
+sglang/srt/server_args.py,sha256=GiDyPWCvYA_98mSE9LuvUoEodo9gRnNPPIPn0nFkxUs,18259
+sglang/srt/utils.py,sha256=JJOlqRPbN_tSSNWj63syQpfz4v7hUwNvzWvOUpBh9SM,23746
+sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
+sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
+sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
+sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
+sglang/srt/constrained/fsm_cache.py,sha256=wigJs9PeTt-vYPJQEeUZwEKl6MFIfb5xy8uIg18bDbM,3132
+sglang/srt/constrained/jump_forward.py,sha256=LWRsmGPQcH6KT87wXwCRqtblU3pcAVCEzO0nWPxevs0,6636
+sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
+sglang/srt/layers/decode_attention.py,sha256=TPD_608ZX9fQ_HDImifkxG_qcEYmimbEYY8lCBIjFuM,16628
+sglang/srt/layers/extend_attention.py,sha256=XIXm3p2cvKrDg10Po4qYGaEkXJOJBtCIhTB_lTyjAFE,14390
+sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
+sglang/srt/layers/logits_processor.py,sha256=Zx4eFAkFlThPrmz_-HuCN9SqGLanARm0wdZSVDyASAc,13085
+sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
+sglang/srt/layers/prefill_attention.py,sha256=y7vdcuX8lMa9Qf_jQYNDvQO9PVCBQSs3hb5LV2DFgpU,5256
+sglang/srt/layers/radix_attention.py,sha256=o5a8r3XQ-oRwaxBlAgzJGv7p3dMbu0LrYsDc4uvpPgA,8338
+sglang/srt/layers/sampler.py,sha256=zPVa3PHc-tjDM_oP-1XFeHSRIErx844SLoe6MG8Qef0,6418
+sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
+sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
+sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/managers/controller_multi.py,sha256=z3rguY1YYlSvVqLjKuurgJW1h0dxwPgIdPCQdJsVzYs,6478
+sglang/srt/managers/controller_single.py,sha256=5brrZ8vZxjvrSJHWrm5H3qGEZShN4EROG5r1o3pSjps,5124
+sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
+sglang/srt/managers/io_struct.py,sha256=Bd91cydX9_960NNP2xngqK-lsIaDB3oMYd56QddN4_Q,10722
+sglang/srt/managers/policy_scheduler.py,sha256=7HNUxBKJE444s_bHcPpbnHCygsnH-NIXYNSC2q6mRmc,8584
+sglang/srt/managers/schedule_batch.py,sha256=i68O-e9I_gDlme96xSBDjA2xDF1p-XBKvJRiJ9CsgcY,26423
+sglang/srt/managers/tokenizer_manager.py,sha256=8aHR5h9nYZsfdZE80uBc9egDFOQgKvjxmp-30Ha4ELk,29463
+sglang/srt/managers/tp_worker.py,sha256=4UuaBLzV6NMsG4XEIcpa4xMcOKIFvTan51ynKz85HXg,36842
+sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
+sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
+sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
+sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=4vIUqVQpnHNhwWrokMVmGM4Dp5JFPHyXIvpEQsi2pNU,12862
+sglang/srt/model_executor/forward_batch_info.py,sha256=fSLhatN8vCgxn0Mft9D-r0pNi3SN0EQSTJmgaOtrqJc,16471
+sglang/srt/model_executor/model_runner.py,sha256=93YCStmZfdZlY0r-GGIVi0Xw66VwF77dEtGVmQf1VfU,23893
+sglang/srt/models/chatglm.py,sha256=PPOaeqipbkcsTUhMPbLb1HItWgW7KntefUfjEoMSxUM,13585
+sglang/srt/models/commandr.py,sha256=k86ykwWOlxLGaBbGUoMSaXngUxCbMVRbY5AoMOWpbU8,14377
+sglang/srt/models/dbrx.py,sha256=goLJ9Yt-9vxkwhCUFBidvP41H_dYTFsvrMZ4xm4FqGA,14875
+sglang/srt/models/deepseek.py,sha256=aYP6HUgxQbhcQGQEF4vX0ronBF8AirqIFG98EQn0YzY,16220
+sglang/srt/models/deepseek_v2.py,sha256=Htw_HDju9huYU5gBu2dqq6bKVao-AsifxfkGl2xRx-8,28521
+sglang/srt/models/exaone.py,sha256=ZFr0G0WITxg3dDfV_-vWqZpK_wMmiZi4r0vOT0gO9V4,13301
+sglang/srt/models/gemma.py,sha256=Ya_u2lKPKAc9iHEsW_HAEfCDgYTbxUOCzBI0LDuoOYs,12489
+sglang/srt/models/gemma2.py,sha256=MCmzzRAAafEQuQj6aGtB-TF4jH0RWrXcOPxSz6LRsXs,15137
+sglang/srt/models/gpt_bigcode.py,sha256=HEhMRO1Y37JfZtP7mDp0MexWj5h6XT9rKvxorOMKoQA,10409
+sglang/srt/models/grok.py,sha256=ZcJ4E11rKh-xo4k_j-H1XRreJWWv8yii-bMYC1lO2R8,15143
+sglang/srt/models/internlm2.py,sha256=VtWATs2eLIqbadYXTPY_vycFIstVk4zg3kxycA9H0Qw,12416
+sglang/srt/models/llama.py,sha256=MfDnlVWoJUG9DxgGYPiwhoU-0ZeRbhp6UmBR2ZAJSNk,13402
+sglang/srt/models/llama_classification.py,sha256=oSeROs633Gnak8vrbnWnCWDxfgP_zmKGO1A_43ukEQ4,4029
+sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
+sglang/srt/models/llava.py,sha256=OXmlOVIjFnMRKGwLweYB1N-xlfpZlTlZpqhsbwUCY6Y,23471
+sglang/srt/models/llavavid.py,sha256=4R2t8BZJKN85IrTLsLFb4yZuKVI2Cwp7kY8AJ-nEVoE,12012
+sglang/srt/models/minicpm.py,sha256=7RZEJ2TCqBL1JmMFVJ3J9DmZHRw0q90st49Wkh-sdL4,14039
+sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
+sglang/srt/models/mixtral.py,sha256=KIsvruhXNq3Fwrs4_YE7J6fx54ObfnMuRNxgScE3Bmo,13830
+sglang/srt/models/mixtral_quant.py,sha256=O_97UKDYZokFhIBnamWfw0HLhln9_BUk_KfQ-sQnd8s,14286
+sglang/srt/models/qwen.py,sha256=geK88AyEyPbbDvMHJNY8XMSNpsCeu8g9kxnKyiJBpK4,10168
+sglang/srt/models/qwen2.py,sha256=WGYy3wcRY3f8Drd9I8GblXfv0bbHluRKVhnnhEZf584,12654
+sglang/srt/models/qwen2_moe.py,sha256=b0gd42GBWyvDmUu8BZbD9ZJO_ExbXBLQZRvu61UuXOA,17086
+sglang/srt/models/stablelm.py,sha256=9feHoiDEXSIe0WCrt4AfWXqxliJwRvr8w4XSnk6ipSI,11573
+sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
+sglang/srt/openai_api/adapter.py,sha256=3EeqASZXogpUkOP4xj7Rg_LfOLiIMUrZ9uFdeAy_pcc,50144
+sglang/srt/openai_api/protocol.py,sha256=onhnCjXpXCysvx_dLgOEmXz5XHHYB1t772cvHcK1GlY,9538
+sglang/srt/sampling/sampling_batch_info.py,sha256=CIoD0SzHSWCe7Wc4jkJj5vIPHGnOdfbgkC6fG5KQxOw,7551
+sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
+sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
+sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
+sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
+sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
+sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
+sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
+sglang/test/runners.py,sha256=7N2g4vyqN98o6F0Lem5LUNAlW9ShEVxZxZuzSjmc0i4,7688
+sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
+sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
+sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
+sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
+sglang/test/test_programs.py,sha256=l21J8N91QTMO9TOvXPWNvPZVT0DgxYxOPHh1pOoFV_k,16927
+sglang/test/test_utils.py,sha256=3tt-BBv-lx7BT3whbVTMyRz6sh5jIbdBEbLZ08m2Ms8,15132
+sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
+sglang-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.0.dist-info/METADATA,sha256=muukBuN4kq_4mCG_r_RFY94pQliDcVh-WuXNMApXoak,37383
+sglang-0.3.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
+sglang-0.3.0.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.0.dist-info/RECORD,,

{sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.0.0)
+Generator: setuptools (74.1.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl

sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl