sglang 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/api.py +14 -0
- sglang/backend/anthropic.py +18 -12
- sglang/backend/base_backend.py +6 -0
- sglang/backend/openai.py +41 -12
- sglang/backend/runtime_endpoint.py +57 -6
- sglang/lang/chat_template.py +47 -26
- sglang/lang/interpreter.py +15 -2
- sglang/lang/ir.py +1 -1
- sglang/srt/constrained/__init__.py +23 -1
- sglang/srt/constrained/fsm_cache.py +14 -3
- sglang/srt/layers/context_flashattention_nopad.py +1 -1
- sglang/srt/layers/extend_attention.py +7 -6
- sglang/srt/layers/radix_attention.py +2 -10
- sglang/srt/layers/token_attention.py +12 -4
- sglang/srt/managers/io_struct.py +3 -1
- sglang/srt/managers/router/infer_batch.py +6 -2
- sglang/srt/managers/router/model_rpc.py +45 -32
- sglang/srt/managers/router/model_runner.py +40 -25
- sglang/srt/managers/tokenizer_manager.py +2 -0
- sglang/srt/model_config.py +12 -5
- sglang/srt/models/gemma.py +340 -0
- sglang/srt/models/llama2.py +5 -5
- sglang/srt/models/llava.py +2 -4
- sglang/srt/models/mixtral.py +5 -5
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +5 -5
- sglang/srt/models/stablelm.py +293 -0
- sglang/srt/server.py +111 -47
- sglang/srt/server_args.py +44 -9
- sglang/srt/utils.py +1 -0
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +15 -12
- {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/METADATA +16 -6
- sglang-0.1.14.dist-info/RECORD +64 -0
- {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/WHEEL +1 -1
- sglang/srt/models/gpt_neox.py +0 -274
- sglang-0.1.12.dist-info/RECORD +0 -63
- {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/LICENSE +0 -0
- {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/top_level.txt +0 -0
sglang/utils.py
CHANGED
@@ -88,26 +88,29 @@ class HttpResponse:
|
|
88
88
|
return self.resp.status
|
89
89
|
|
90
90
|
|
91
|
-
def http_request(
|
91
|
+
def http_request(
|
92
|
+
url, json=None, stream=False, auth_token=None, api_key=None, verify=None
|
93
|
+
):
|
92
94
|
"""A faster version of requests.post with low-level urllib API."""
|
95
|
+
headers = {"Content-Type": "application/json; charset=utf-8"}
|
96
|
+
|
97
|
+
# add the Authorization header if an auth token is provided
|
98
|
+
if auth_token is not None:
|
99
|
+
headers["Authorization"] = f"Bearer {auth_token}"
|
100
|
+
|
101
|
+
# add the API Key header if an API key is provided
|
102
|
+
if api_key is not None:
|
103
|
+
headers["X-API-Key"] = api_key
|
104
|
+
|
93
105
|
if stream:
|
94
|
-
if auth_token is None:
|
95
|
-
return requests.post(url, json=json, stream=True)
|
96
|
-
headers = {
|
97
|
-
"Content-Type": "application/json",
|
98
|
-
"Authentication": f"Bearer {auth_token}",
|
99
|
-
}
|
100
106
|
return requests.post(url, json=json, stream=True, headers=headers)
|
101
107
|
else:
|
102
|
-
req = urllib.request.Request(url)
|
103
|
-
req.add_header("Content-Type", "application/json; charset=utf-8")
|
104
|
-
if auth_token is not None:
|
105
|
-
req.add_header("Authentication", f"Bearer {auth_token}")
|
108
|
+
req = urllib.request.Request(url, headers=headers)
|
106
109
|
if json is None:
|
107
110
|
data = None
|
108
111
|
else:
|
109
112
|
data = bytes(dumps(json), encoding="utf-8")
|
110
|
-
resp = urllib.request.urlopen(req, data=data)
|
113
|
+
resp = urllib.request.urlopen(req, data=data, cafile=verify)
|
111
114
|
return HttpResponse(resp)
|
112
115
|
|
113
116
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.1.12
|
3
|
+
Version: 0.1.14
|
4
4
|
Summary: A structured generation language for LLMs.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -217,7 +217,7 @@ Requires-Dist: sglang[srt] ; extra == 'all'
|
|
217
217
|
Requires-Dist: sglang[openai] ; extra == 'all'
|
218
218
|
Requires-Dist: sglang[anthropic] ; extra == 'all'
|
219
219
|
Provides-Extra: anthropic
|
220
|
-
Requires-Dist: anthropic ; extra == 'anthropic'
|
220
|
+
Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
|
221
221
|
Requires-Dist: numpy ; extra == 'anthropic'
|
222
222
|
Provides-Extra: openai
|
223
223
|
Requires-Dist: openai >=1.0 ; extra == 'openai'
|
@@ -231,7 +231,7 @@ Requires-Dist: torch ; extra == 'srt'
|
|
231
231
|
Requires-Dist: uvloop ; extra == 'srt'
|
232
232
|
Requires-Dist: uvicorn ; extra == 'srt'
|
233
233
|
Requires-Dist: zmq ; extra == 'srt'
|
234
|
-
Requires-Dist: vllm >=0.
|
234
|
+
Requires-Dist: vllm >=0.3.3 ; extra == 'srt'
|
235
235
|
Requires-Dist: interegular ; extra == 'srt'
|
236
236
|
Requires-Dist: lark ; extra == 'srt'
|
237
237
|
Requires-Dist: numba ; extra == 'srt'
|
@@ -242,7 +242,12 @@ Requires-Dist: cloudpickle ; extra == 'srt'
|
|
242
242
|
Requires-Dist: pillow ; extra == 'srt'
|
243
243
|
Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
|
244
244
|
|
245
|
-
|
245
|
+
<div align="center">
|
246
|
+
<img src="assets/logo.png" alt="logo" width="400"></img>
|
247
|
+
</div>
|
248
|
+
|
249
|
+
--------------------------------------------------------------------------------
|
250
|
+
|
246
251
|
| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
|
247
252
|
|
248
253
|
SGLang is a structured generation language designed for large language models (LLMs).
|
@@ -254,7 +259,7 @@ The core features of SGLang include:
|
|
254
259
|
|
255
260
|
## News
|
256
261
|
- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
257
|
-
- [2024/01] 🔥 SGLang powers the serving of the
|
262
|
+
- [2024/01] 🔥 SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
|
258
263
|
- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
|
259
264
|
|
260
265
|
## Contents
|
@@ -496,7 +501,7 @@ def text_qa(s, question):
|
|
496
501
|
s += "Q: " + question + "\n"
|
497
502
|
s += "A:" + sgl.gen("answer", stop="\n")
|
498
503
|
|
499
|
-
|
504
|
+
state = text_qa.run(
|
500
505
|
question="What is the capital of France?",
|
501
506
|
temperature=0.1,
|
502
507
|
stream=True
|
@@ -608,8 +613,13 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
|
|
608
613
|
- Mistral
|
609
614
|
- Mixtral
|
610
615
|
- Qwen / Qwen 2
|
616
|
+
- Gemma
|
617
|
+
- Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
|
618
|
+
- `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
|
611
619
|
- LLaVA
|
612
620
|
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
|
621
|
+
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
|
622
|
+
- `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
|
613
623
|
- Yi-VL
|
614
624
|
- see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
|
615
625
|
- AWQ/GPTQ quantization
|
@@ -0,0 +1,64 @@
|
|
1
|
+
sglang/__init__.py,sha256=Nxa2M7XCh2-e6I7VrCg7OSBL6BvEW3gyRD14ZdykpRM,96
|
2
|
+
sglang/api.py,sha256=0-Eh7c41hWKjPXrzzvLFdLAUVkvmPGJGLAsrG9evDTE,4576
|
3
|
+
sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
|
4
|
+
sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
|
5
|
+
sglang/utils.py,sha256=2dUXLMPz9VhhzbIRQABmfZnVW5yz61F3UVtb6yKyevM,6237
|
6
|
+
sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
sglang/backend/anthropic.py,sha256=GJ_T1Jg0VOtajgkgczPKt5sjuVYdbAiWd2jXlJRNRmg,1677
|
8
|
+
sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
|
9
|
+
sglang/backend/openai.py,sha256=nPdA88A5GISJTH88svJdww3qHWIHZcGG2NEn0XjMkLU,9578
|
10
|
+
sglang/backend/runtime_endpoint.py,sha256=r7dTazselaudlFx8hqk-PQLYDHZhpbAKjyFF1zLuM_E,8022
|
11
|
+
sglang/backend/vertexai.py,sha256=BLfWf_tEgoHY9srCufJM5PLe3tql2j0G6ia7cPykxCM,4713
|
12
|
+
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
+
sglang/lang/chat_template.py,sha256=MaCF0fvNky0nJC9OvmAeApeHYgM6Lr03mtRhF0lS31U,8000
|
14
|
+
sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
|
15
|
+
sglang/lang/interpreter.py,sha256=ahRxuEJZ7b1Tts2Lr7wViWIqL-Z12T3anvgj0XdvMN8,26666
|
16
|
+
sglang/lang/ir.py,sha256=8Ap-uEUz6K9eNQTOKtMixePuLwRFHFKcN0Z5Yn44nKk,13320
|
17
|
+
sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
|
18
|
+
sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
|
19
|
+
sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
|
20
|
+
sglang/srt/hf_transformers_utils.py,sha256=soRyYLoCn7GxgxvonufGFkdFBA3eH5i3Izk_wi7p1l0,5285
|
21
|
+
sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,3609
|
22
|
+
sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
|
23
|
+
sglang/srt/model_config.py,sha256=ned-odjmKBKBhVPo04FEpus9gJsUWxrFLrLxahLwSaw,1328
|
24
|
+
sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
|
25
|
+
sglang/srt/server.py,sha256=WLXissKuXQI7JFb2V8D47QSF-PPHnW-JZCiQm4YW0xE,24070
|
26
|
+
sglang/srt/server_args.py,sha256=bvbi-Rb_JudqztFFfRsuXBYtUsG9hq4zMFt7X97uDhA,8954
|
27
|
+
sglang/srt/utils.py,sha256=IEqpmWx_hl4eXn_KoHM0EPXmxeN2wKkgK7H01_t0x5Q,7355
|
28
|
+
sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
|
29
|
+
sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
|
30
|
+
sglang/srt/constrained/fsm_cache.py,sha256=20mEgtDXU1Zeoicl5KBQC3arkg-RhRWiYnchJc00m1g,901
|
31
|
+
sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
|
32
|
+
sglang/srt/layers/context_flashattention_nopad.py,sha256=TVYQ6IjftWVXORmKpEROMqQxDOnF6n2g0G1Ci4LquYM,5209
|
33
|
+
sglang/srt/layers/extend_attention.py,sha256=KGqQOA5mel9qScXMAQP_3Qyhp3BNbiQ7Y_6wi38Lxcs,12622
|
34
|
+
sglang/srt/layers/logits_processor.py,sha256=MW2bpqSXyghODMojqeMSYWZhUHuAFPk_gUkyyLw9HkM,4827
|
35
|
+
sglang/srt/layers/radix_attention.py,sha256=bqrb8H8K8RbKTr1PzVmpnUxRzMj0H-OWCi1JYZKuRDw,5597
|
36
|
+
sglang/srt/layers/token_attention.py,sha256=waOjGsWZlvf6epFhYerRJlAaMwvDTy_Z3uzPaXsVQUU,8516
|
37
|
+
sglang/srt/managers/detokenizer_manager.py,sha256=1lPNh_Pe6Pr0v-TzlCBBREbvz4uFWxyw31SmnEZh0s8,3292
|
38
|
+
sglang/srt/managers/io_struct.py,sha256=nXJh3CrOvv9MdAfIFoo6SCXuNQTG3KswmRKkwF61Tek,3141
|
39
|
+
sglang/srt/managers/openai_protocol.py,sha256=cttqg9iv3de8fhtCqDI4cYoPPZ_gULedMXstV1ok6WA,4563
|
40
|
+
sglang/srt/managers/tokenizer_manager.py,sha256=hgsR9AMj6ic9S3-2WiELh7Hnp8Xnb_bzp7kpbjHwHtM,9733
|
41
|
+
sglang/srt/managers/router/infer_batch.py,sha256=U-Ckt9ad1WaOQF_dW6Eo9AMIRQoOJQ-Pm-MMXnEmPP8,18399
|
42
|
+
sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
|
43
|
+
sglang/srt/managers/router/model_rpc.py,sha256=VlwLNpHZ92bnteQl4PhVKoAXM0C8Y4_2LBBVaffeu3g,26766
|
44
|
+
sglang/srt/managers/router/model_runner.py,sha256=-wWv00EbB_UkkLpio6VKGBTagfzxLHfY-eKDDQ0rZQc,18292
|
45
|
+
sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
|
46
|
+
sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
|
47
|
+
sglang/srt/models/gemma.py,sha256=8XlfHPtVixPYYjz5F9T4DOAuoordWFStmyFFWGfny1k,11582
|
48
|
+
sglang/srt/models/llama2.py,sha256=VL4iN8R3wyTNr0bDxxKdLNnVGEvdXF6iGvA768YeakA,11611
|
49
|
+
sglang/srt/models/llava.py,sha256=42sn-AgI-6dMaTEU4aEbi4Js5epy0J3JVQoMooUOKt8,14922
|
50
|
+
sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
|
51
|
+
sglang/srt/models/mixtral.py,sha256=wqIwKfR90ih0gDiTZkFZcQD4PIYpZFD3CmzxRcuKIqw,13915
|
52
|
+
sglang/srt/models/qwen.py,sha256=CvdbcF90aI1tJPSQ-3OMUaQGMuaxCGe0y29m5nU_Yj0,9225
|
53
|
+
sglang/srt/models/qwen2.py,sha256=myPc0wvgf5ZzJyGhUGN49YjY-tMf4t8Jn_Imjg8D7Mk,11307
|
54
|
+
sglang/srt/models/stablelm.py,sha256=vMZUNgwXKPGYr5FcdYHw5g3QifVu9owKqq51_-EBOY0,10817
|
55
|
+
sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
|
56
|
+
sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
|
57
|
+
sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
|
58
|
+
sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
|
59
|
+
sglang/test/test_utils.py,sha256=6PhTRi8UnR-BRNjit6aGu0M5lO0RebNQwEcDt712hE4,4830
|
60
|
+
sglang-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
61
|
+
sglang-0.1.14.dist-info/METADATA,sha256=C5N0VOYRHixdJcsf4dExIvP-Q099kYBMKs_dA4LBXSM,28809
|
62
|
+
sglang-0.1.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
63
|
+
sglang-0.1.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
64
|
+
sglang-0.1.14.dist-info/RECORD,,
|
sglang/srt/models/gpt_neox.py
DELETED
@@ -1,274 +0,0 @@
|
|
1
|
-
# Adapted from
|
2
|
-
# https://github.com/vllm-project/vllm/blob/c81dddb45c71e630b907f9d84686ecd73b4105c7/vllm/model_executor/models/gpt_neox.py#L1
|
3
|
-
"""Inference-only GPT-NeoX model compatible with HuggingFace weights."""
|
4
|
-
from typing import List, Optional, Tuple
|
5
|
-
|
6
|
-
import torch
|
7
|
-
from torch import nn
|
8
|
-
from transformers import GPTNeoXConfig
|
9
|
-
|
10
|
-
from vllm.model_executor.layers.activation import get_act_fn
|
11
|
-
from sglang.srt.layers.logits_processor import LogitsProcessor
|
12
|
-
from sglang.srt.layers.radix_attention import RadixAttention
|
13
|
-
from sglang.srt.managers.router.model_runner import InputMetadata
|
14
|
-
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
15
|
-
LinearMethodBase,
|
16
|
-
QKVParallelLinear,
|
17
|
-
RowParallelLinear)
|
18
|
-
from vllm.model_executor.layers.rotary_embedding import get_rope
|
19
|
-
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
20
|
-
VocabParallelEmbedding, ParallelLMHead)
|
21
|
-
from vllm.model_executor.parallel_utils.parallel_state import (
|
22
|
-
get_tensor_model_parallel_world_size)
|
23
|
-
from vllm.model_executor.weight_utils import (default_weight_loader,
|
24
|
-
hf_model_weights_iterator)
|
25
|
-
|
26
|
-
class GPTNeoXAttention(nn.Module):
|
27
|
-
|
28
|
-
def __init__(
|
29
|
-
self,
|
30
|
-
config: GPTNeoXConfig,
|
31
|
-
layer_id: int = 0,
|
32
|
-
linear_method: Optional[LinearMethodBase] = None,
|
33
|
-
):
|
34
|
-
super().__init__()
|
35
|
-
self.total_num_heads = config.num_attention_heads
|
36
|
-
self.hidden_size = config.hidden_size
|
37
|
-
self.head_size = self.hidden_size // self.total_num_heads
|
38
|
-
self.bias = getattr(config, "attention_bias", True)
|
39
|
-
|
40
|
-
tensor_model_parallel_world_size = (
|
41
|
-
get_tensor_model_parallel_world_size())
|
42
|
-
assert self.total_num_heads % tensor_model_parallel_world_size == 0
|
43
|
-
self.num_heads = (self.total_num_heads //
|
44
|
-
tensor_model_parallel_world_size)
|
45
|
-
|
46
|
-
self.query_key_value = QKVParallelLinear(
|
47
|
-
config.hidden_size,
|
48
|
-
self.head_size,
|
49
|
-
self.total_num_heads,
|
50
|
-
bias=self.bias,
|
51
|
-
linear_method=linear_method,
|
52
|
-
)
|
53
|
-
self.dense = RowParallelLinear(
|
54
|
-
config.hidden_size,
|
55
|
-
config.hidden_size,
|
56
|
-
bias=self.bias,
|
57
|
-
linear_method=linear_method,
|
58
|
-
)
|
59
|
-
scaling = self.head_size**-0.5
|
60
|
-
rotary_dim = int(self.head_size * config.rotary_pct)
|
61
|
-
assert rotary_dim % 2 == 0
|
62
|
-
rope_theta = getattr(config, "rope_theta", 10000)
|
63
|
-
max_position_embeddings = getattr(config, "max_position_embeddings",
|
64
|
-
8192)
|
65
|
-
self.rotary_emb = get_rope(
|
66
|
-
self.head_size,
|
67
|
-
rotary_dim=rotary_dim,
|
68
|
-
max_position=max_position_embeddings,
|
69
|
-
base=rope_theta,
|
70
|
-
)
|
71
|
-
self.attn = RadixAttention(self.num_heads,
|
72
|
-
self.head_size,
|
73
|
-
scaling,
|
74
|
-
num_kv_heads=self.num_heads,
|
75
|
-
layer_id=layer_id)
|
76
|
-
|
77
|
-
def forward(
|
78
|
-
self,
|
79
|
-
position_ids: torch.Tensor,
|
80
|
-
hidden_states: torch.Tensor,
|
81
|
-
input_metadata: InputMetadata,
|
82
|
-
) -> torch.Tensor:
|
83
|
-
qkv, _ = self.query_key_value(hidden_states)
|
84
|
-
q, k, v = qkv.chunk(chunks=3, dim=-1)
|
85
|
-
q, k = self.rotary_emb(position_ids, q, k)
|
86
|
-
attn_output = self.attn(q, k, v, input_metadata)
|
87
|
-
output, _ = self.dense(attn_output)
|
88
|
-
return output
|
89
|
-
|
90
|
-
|
91
|
-
class GPTNeoXMLP(nn.Module):
|
92
|
-
|
93
|
-
def __init__(
|
94
|
-
self,
|
95
|
-
config: GPTNeoXConfig,
|
96
|
-
linear_method: Optional[LinearMethodBase] = None,
|
97
|
-
):
|
98
|
-
super().__init__()
|
99
|
-
self.dense_h_to_4h = ColumnParallelLinear(
|
100
|
-
config.hidden_size,
|
101
|
-
config.intermediate_size,
|
102
|
-
linear_method=linear_method,
|
103
|
-
)
|
104
|
-
self.dense_4h_to_h = RowParallelLinear(
|
105
|
-
config.intermediate_size,
|
106
|
-
config.hidden_size,
|
107
|
-
linear_method=linear_method,
|
108
|
-
)
|
109
|
-
quant_config = getattr(linear_method, "quant_config", None)
|
110
|
-
self.act = get_act_fn(config.hidden_act, quant_config,
|
111
|
-
config.intermediate_size)
|
112
|
-
|
113
|
-
def forward(self, hidden_states):
|
114
|
-
hidden_states, _ = self.dense_h_to_4h(hidden_states)
|
115
|
-
hidden_states = self.act(hidden_states)
|
116
|
-
hidden_states, _ = self.dense_4h_to_h(hidden_states)
|
117
|
-
return hidden_states
|
118
|
-
|
119
|
-
|
120
|
-
class GPTNeoXLayer(nn.Module):
|
121
|
-
|
122
|
-
def __init__(
|
123
|
-
self,
|
124
|
-
config: GPTNeoXConfig,
|
125
|
-
layer_id: int = 0,
|
126
|
-
linear_method: Optional[LinearMethodBase] = None,
|
127
|
-
):
|
128
|
-
super().__init__()
|
129
|
-
self.use_parallel_residual = config.use_parallel_residual
|
130
|
-
self.input_layernorm = nn.LayerNorm(config.hidden_size,
|
131
|
-
eps=config.layer_norm_eps)
|
132
|
-
self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
|
133
|
-
eps=config.layer_norm_eps)
|
134
|
-
self.attention = GPTNeoXAttention(config, layer_id=layer_id, linear_method=linear_method)
|
135
|
-
self.mlp = GPTNeoXMLP(config, linear_method)
|
136
|
-
|
137
|
-
def forward(
|
138
|
-
self,
|
139
|
-
position_ids: torch.Tensor,
|
140
|
-
hidden_states: torch.Tensor,
|
141
|
-
input_metadata: InputMetadata,
|
142
|
-
) -> torch.Tensor:
|
143
|
-
attn_input = self.input_layernorm(hidden_states)
|
144
|
-
attn_output = self.attention(
|
145
|
-
position_ids=position_ids,
|
146
|
-
hidden_states=attn_input,
|
147
|
-
input_metadata=input_metadata,
|
148
|
-
)
|
149
|
-
|
150
|
-
if self.use_parallel_residual:
|
151
|
-
# pseudocode:
|
152
|
-
# x = x + attn(ln1(x)) + mlp(ln2(x))
|
153
|
-
mlp_input = self.post_attention_layernorm(hidden_states)
|
154
|
-
mlp_output = self.mlp(mlp_input)
|
155
|
-
hidden_states = mlp_output + attn_output + hidden_states
|
156
|
-
else:
|
157
|
-
# pseudocode:
|
158
|
-
# x = x + attn(ln1(x))
|
159
|
-
# x = x + mlp(ln2(x))
|
160
|
-
attn_output = attn_output + hidden_states
|
161
|
-
mlp_input = self.post_attention_layernorm(attn_output)
|
162
|
-
mlp_output = self.mlp(mlp_input)
|
163
|
-
hidden_states = mlp_output + attn_output
|
164
|
-
return hidden_states
|
165
|
-
|
166
|
-
|
167
|
-
class GPTNeoXModel(nn.Module):
|
168
|
-
|
169
|
-
def __init__(
|
170
|
-
self,
|
171
|
-
config: GPTNeoXConfig,
|
172
|
-
linear_method: Optional[LinearMethodBase] = None,
|
173
|
-
):
|
174
|
-
super().__init__()
|
175
|
-
self.config = config
|
176
|
-
|
177
|
-
self.embed_in = VocabParallelEmbedding(
|
178
|
-
config.vocab_size,
|
179
|
-
config.hidden_size,
|
180
|
-
)
|
181
|
-
self.layers = nn.ModuleList([
|
182
|
-
GPTNeoXLayer(config, i, linear_method)
|
183
|
-
for i in range(config.num_hidden_layers)
|
184
|
-
])
|
185
|
-
self.final_layer_norm = nn.LayerNorm(config.hidden_size,
|
186
|
-
eps=config.layer_norm_eps)
|
187
|
-
|
188
|
-
def forward(
|
189
|
-
self,
|
190
|
-
input_ids: torch.Tensor,
|
191
|
-
position_ids: torch.Tensor,
|
192
|
-
input_metadata: InputMetadata,
|
193
|
-
skip_embed: bool = False,
|
194
|
-
) -> torch.Tensor:
|
195
|
-
if not skip_embed:
|
196
|
-
hidden_states = self.embed_in(input_ids)
|
197
|
-
else:
|
198
|
-
hidden_states = input_ids
|
199
|
-
hidden_states = self.embed_in(input_ids)
|
200
|
-
for i in range(len(self.layers)):
|
201
|
-
layer = self.layers[i]
|
202
|
-
hidden_states = layer(
|
203
|
-
position_ids,
|
204
|
-
hidden_states,
|
205
|
-
input_metadata,
|
206
|
-
)
|
207
|
-
hidden_states = self.final_layer_norm(hidden_states)
|
208
|
-
return hidden_states
|
209
|
-
|
210
|
-
|
211
|
-
class GPTNeoXForCausalLM(nn.Module):
|
212
|
-
|
213
|
-
def __init__(
|
214
|
-
self,
|
215
|
-
config,
|
216
|
-
linear_method: Optional[LinearMethodBase] = None,
|
217
|
-
):
|
218
|
-
super().__init__()
|
219
|
-
self.config = config
|
220
|
-
self.linear_method = linear_method
|
221
|
-
self.gpt_neox = GPTNeoXModel(config, linear_method)
|
222
|
-
self.embed_out = ParallelLMHead(
|
223
|
-
config.vocab_size,
|
224
|
-
config.hidden_size,
|
225
|
-
)
|
226
|
-
self.logits_processor = LogitsProcessor(config)
|
227
|
-
|
228
|
-
def forward(
|
229
|
-
self,
|
230
|
-
input_ids: torch.Tensor,
|
231
|
-
positions: torch.Tensor,
|
232
|
-
input_metadata: InputMetadata,
|
233
|
-
skip_embed: bool = False,
|
234
|
-
) -> torch.Tensor:
|
235
|
-
hidden_states = self.gpt_neox(input_ids, positions, input_metadata, skip_embed)
|
236
|
-
return self.logits_processor(
|
237
|
-
input_ids, hidden_states, self.embed_out.weight, input_metadata
|
238
|
-
)
|
239
|
-
|
240
|
-
def load_weights(self,
|
241
|
-
model_name_or_path: str,
|
242
|
-
cache_dir: Optional[str] = None,
|
243
|
-
load_format: str = "auto",
|
244
|
-
revision: Optional[str] = None):
|
245
|
-
params_dict = dict(self.named_parameters())
|
246
|
-
for name, loaded_weight in hf_model_weights_iterator(
|
247
|
-
model_name_or_path, cache_dir, load_format, revision):
|
248
|
-
if ("attention.bias" in name or "attention.masked_bias" in name
|
249
|
-
or "rotary_emb.inv_freq" in name):
|
250
|
-
continue
|
251
|
-
param = params_dict[name]
|
252
|
-
|
253
|
-
if "query_key_value" in name:
|
254
|
-
# NOTE: GPT-NeoX's fused QKV's output_dim has the shape of
|
255
|
-
# (num_heads * 3 * head_size), while the
|
256
|
-
# required shape is (3 * num_heads * head_size).
|
257
|
-
# Thus, we need weight conversion.
|
258
|
-
output_dim = getattr(param, "output_dim", None)
|
259
|
-
num_heads = self.config.num_attention_heads
|
260
|
-
if output_dim is not None:
|
261
|
-
loaded_weight_shape = loaded_weight.shape
|
262
|
-
loaded_weight = loaded_weight.view(
|
263
|
-
loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
|
264
|
-
loaded_weight_shape[output_dim + 1:])
|
265
|
-
loaded_weight = loaded_weight.transpose(
|
266
|
-
output_dim, output_dim + 1)
|
267
|
-
loaded_weight = loaded_weight.reshape(loaded_weight_shape)
|
268
|
-
|
269
|
-
weight_loader = getattr(param, "weight_loader",
|
270
|
-
default_weight_loader)
|
271
|
-
weight_loader(param, loaded_weight)
|
272
|
-
|
273
|
-
|
274
|
-
EntryClass = GPTNeoXForCausalLM
|
sglang-0.1.12.dist-info/RECORD
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
sglang/__init__.py,sha256=MsaKtUijK193Lw2Hw8ydva_X5Le0sKvWUVeKaOjdYqY,96
|
2
|
-
sglang/api.py,sha256=E2G93eTlM7wT451iGEDESZnt0NZjK03Xt0Lyx-NYCOc,4207
|
3
|
-
sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
|
4
|
-
sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
|
5
|
-
sglang/utils.py,sha256=a3RjlWZ-K2LjO8GTwD_ExYu-QvgSrcDh-_NKeqzBziM,6231
|
6
|
-
sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
sglang/backend/anthropic.py,sha256=y5TN9EDrJtOH4JEUxpXu-endloeYBy7xMUr3r7Ah3MA,1462
|
8
|
-
sglang/backend/base_backend.py,sha256=pPalZfoezxnUBs752j7lm0uMwa8tZuCWd-ijSdStMO8,1745
|
9
|
-
sglang/backend/openai.py,sha256=L49Ga3E1rgOyxpH9NyMrKw2Exm-WyDM_pCUQZetCH_Q,8555
|
10
|
-
sglang/backend/runtime_endpoint.py,sha256=hx3D-Dv3XAVbnAtbW975RrNN6Jaw2ZvR6XGMFz61h7A,6689
|
11
|
-
sglang/backend/vertexai.py,sha256=BLfWf_tEgoHY9srCufJM5PLe3tql2j0G6ia7cPykxCM,4713
|
12
|
-
sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
sglang/lang/chat_template.py,sha256=9aiR_4euCjrSdduYYiSnVjHE48GOqaHHigwX1oXu0lU,7461
|
14
|
-
sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
|
15
|
-
sglang/lang/interpreter.py,sha256=qCIssjVWltToFXpZDfNx6tiemQpXiK_NF_qUPWu_rvU,26262
|
16
|
-
sglang/lang/ir.py,sha256=QSx0vMepQ01SaQ4EQjUqbJknHSrF557CqHuosQi6otQ,13330
|
17
|
-
sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
|
18
|
-
sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
|
19
|
-
sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
|
20
|
-
sglang/srt/hf_transformers_utils.py,sha256=soRyYLoCn7GxgxvonufGFkdFBA3eH5i3Izk_wi7p1l0,5285
|
21
|
-
sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,3609
|
22
|
-
sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
|
23
|
-
sglang/srt/model_config.py,sha256=MDfjfhfZxXTPrshLsZANWyCN8RPS-pCV4RTAcA8IUG8,1124
|
24
|
-
sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
|
25
|
-
sglang/srt/server.py,sha256=21EdEjG_EmVs4BhL37wI5wNtcmir44CPRX5cZ-5bofM,21454
|
26
|
-
sglang/srt/server_args.py,sha256=WihASLqvxYDX65LAtdMzQ2kqbARxFds20jF8HFfXmRc,7567
|
27
|
-
sglang/srt/utils.py,sha256=2gCOlsErsFz969V5kaTPm0-jFwOpbIU1ufrRyWkYvYE,7287
|
28
|
-
sglang/srt/constrained/__init__.py,sha256=wcGWZNn19jK2m-KN6P4ui1BrQy2wwUt-qG4_Orv0ouY,490
|
29
|
-
sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
|
30
|
-
sglang/srt/constrained/fsm_cache.py,sha256=Q0J4St3XUOt2tKFVpj0B2KIZ6z3X6cIzTcjREVqy3pg,471
|
31
|
-
sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
|
32
|
-
sglang/srt/layers/context_flashattention_nopad.py,sha256=GkjLiTkS4px_uLcW0aDocE3_OBXtujZ-SlsN2b2U7ng,5204
|
33
|
-
sglang/srt/layers/extend_attention.py,sha256=pWVE6ySnPiVLFON__bie73eDhmXHk4tECMK8zTiJNbI,12558
|
34
|
-
sglang/srt/layers/logits_processor.py,sha256=MW2bpqSXyghODMojqeMSYWZhUHuAFPk_gUkyyLw9HkM,4827
|
35
|
-
sglang/srt/layers/radix_attention.py,sha256=Tl1zE2c87Fm5qW5Ylffkgh48QQIwd93UK6IaRMZltFI,5789
|
36
|
-
sglang/srt/layers/token_attention.py,sha256=Z3YVuFmqca3ho2NKSXjLXZNz4L67qrhaec_Pd38IA_4,8157
|
37
|
-
sglang/srt/managers/detokenizer_manager.py,sha256=1lPNh_Pe6Pr0v-TzlCBBREbvz4uFWxyw31SmnEZh0s8,3292
|
38
|
-
sglang/srt/managers/io_struct.py,sha256=E5Lt81n7-DkRR-pl7XoaJXIBaa2nT9swABNwXEsTsUw,3064
|
39
|
-
sglang/srt/managers/openai_protocol.py,sha256=cttqg9iv3de8fhtCqDI4cYoPPZ_gULedMXstV1ok6WA,4563
|
40
|
-
sglang/srt/managers/tokenizer_manager.py,sha256=B-F6diI1sV0pW1HxQgH_v8VzJwzvgkcGB1_MkOh2unE,9693
|
41
|
-
sglang/srt/managers/router/infer_batch.py,sha256=bvUY1EmIKqdF38N5ALWVsgD2rl6GNzvUHMGkluoUpv8,18126
|
42
|
-
sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
|
43
|
-
sglang/srt/managers/router/model_rpc.py,sha256=dKBRzPoERK-TCgUgnaQfFFlZtB6_xWT9eSTVwizCCiA,25938
|
44
|
-
sglang/srt/managers/router/model_runner.py,sha256=TUEqfsQedPUFGA5cpTYi8sW5whtEM-4ui3s7YoP0cBg,17604
|
45
|
-
sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
|
46
|
-
sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
|
47
|
-
sglang/srt/models/gpt_neox.py,sha256=0NwrX9hqVD9biE0bfJYFC4TMhQKhYdNowLkVYo2OG24,10271
|
48
|
-
sglang/srt/models/llama2.py,sha256=tICX536zPcPup3KmwRmRASqBSgyY6_XpThjfMpQ1evM,11582
|
49
|
-
sglang/srt/models/llava.py,sha256=OaJF9Lal4Txtg_FuDsQTL_kHR7PB1BUf3nhngCdFnfU,14963
|
50
|
-
sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
|
51
|
-
sglang/srt/models/mixtral.py,sha256=iTuuyJdT8cq6W7CCqHz5nyY6I8r_m1SLMiGfGTkPW6w,13886
|
52
|
-
sglang/srt/models/qwen.py,sha256=xPvwO4YBhht4ROSK-ef9Zysk_UvB06GxCBPYjyElgUY,9225
|
53
|
-
sglang/srt/models/qwen2.py,sha256=B_dH2QQtfuz38LmGOGcFnQwTRMRFUkHAaO32rnNB_-s,11278
|
54
|
-
sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
|
55
|
-
sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
|
56
|
-
sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
|
57
|
-
sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
|
58
|
-
sglang/test/test_utils.py,sha256=DyZAic3KIBQ0PmZeLc9uv1ckcM5jpEE5CirjHO48_sk,4829
|
59
|
-
sglang-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
60
|
-
sglang-0.1.12.dist-info/METADATA,sha256=4Q0u9J9QUQFlAbMnzioXZ0i47F-HZznyKA7qcnrv_K4,28129
|
61
|
-
sglang-0.1.12.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
62
|
-
sglang-0.1.12.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
63
|
-
sglang-0.1.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|