sglang 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (40)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +14 -0
  3. sglang/backend/anthropic.py +18 -12
  4. sglang/backend/base_backend.py +6 -0
  5. sglang/backend/openai.py +41 -12
  6. sglang/backend/runtime_endpoint.py +57 -6
  7. sglang/lang/chat_template.py +47 -26
  8. sglang/lang/interpreter.py +15 -2
  9. sglang/lang/ir.py +1 -1
  10. sglang/srt/constrained/__init__.py +23 -1
  11. sglang/srt/constrained/fsm_cache.py +14 -3
  12. sglang/srt/layers/context_flashattention_nopad.py +1 -1
  13. sglang/srt/layers/extend_attention.py +7 -6
  14. sglang/srt/layers/radix_attention.py +2 -10
  15. sglang/srt/layers/token_attention.py +12 -4
  16. sglang/srt/managers/io_struct.py +3 -1
  17. sglang/srt/managers/router/infer_batch.py +6 -2
  18. sglang/srt/managers/router/model_rpc.py +45 -32
  19. sglang/srt/managers/router/model_runner.py +40 -25
  20. sglang/srt/managers/tokenizer_manager.py +2 -0
  21. sglang/srt/model_config.py +12 -5
  22. sglang/srt/models/gemma.py +340 -0
  23. sglang/srt/models/llama2.py +5 -5
  24. sglang/srt/models/llava.py +2 -4
  25. sglang/srt/models/mixtral.py +5 -5
  26. sglang/srt/models/qwen.py +4 -4
  27. sglang/srt/models/qwen2.py +5 -5
  28. sglang/srt/models/stablelm.py +293 -0
  29. sglang/srt/server.py +111 -47
  30. sglang/srt/server_args.py +44 -9
  31. sglang/srt/utils.py +1 -0
  32. sglang/test/test_utils.py +1 -1
  33. sglang/utils.py +15 -12
  34. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/METADATA +16 -6
  35. sglang-0.1.14.dist-info/RECORD +64 -0
  36. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/WHEEL +1 -1
  37. sglang/srt/models/gpt_neox.py +0 -274
  38. sglang-0.1.12.dist-info/RECORD +0 -63
  39. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/LICENSE +0 -0
  40. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/top_level.txt +0 -0
sglang/utils.py CHANGED
@@ -88,26 +88,29 @@ class HttpResponse:
         return self.resp.status
 
 
-def http_request(url, json=None, stream=False, auth_token=None):
+def http_request(
+    url, json=None, stream=False, auth_token=None, api_key=None, verify=None
+):
     """A faster version of requests.post with low-level urllib API."""
+    headers = {"Content-Type": "application/json; charset=utf-8"}
+
+    # add the Authorization header if an auth token is provided
+    if auth_token is not None:
+        headers["Authorization"] = f"Bearer {auth_token}"
+
+    # add the API Key header if an API key is provided
+    if api_key is not None:
+        headers["X-API-Key"] = api_key
+
     if stream:
-        if auth_token is None:
-            return requests.post(url, json=json, stream=True)
-        headers = {
-            "Content-Type": "application/json",
-            "Authentication": f"Bearer {auth_token}",
-        }
         return requests.post(url, json=json, stream=True, headers=headers)
     else:
-        req = urllib.request.Request(url)
-        req.add_header("Content-Type", "application/json; charset=utf-8")
-        if auth_token is not None:
-            req.add_header("Authentication", f"Bearer {auth_token}")
+        req = urllib.request.Request(url, headers=headers)
         if json is None:
            data = None
        else:
            data = bytes(dumps(json), encoding="utf-8")
-        resp = urllib.request.urlopen(req, data=data)
+        resp = urllib.request.urlopen(req, data=data, cafile=verify)
         return HttpResponse(resp)
 
 
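Note that this rewrite also fixes a header-name bug: 0.1.12 sent a nonstandard "Authentication" header, while the new code sends the standard "Authorization" header. For reference, a minimal usage sketch of the updated helper; the endpoint URL, token, key, and CA path below are placeholders, not values taken from this diff:

    from sglang.utils import http_request

    # The new api_key argument is emitted as an "X-API-Key" header; verify is
    # forwarded to urllib.request.urlopen as the CA bundle path (cafile).
    resp = http_request(
        "https://localhost:30000/generate",  # placeholder endpoint
        json={"text": "Hello", "sampling_params": {"max_new_tokens": 16}},
        auth_token="secret-token",  # sent as "Authorization: Bearer secret-token"
        api_key="my-api-key",       # sent as "X-API-Key: my-api-key"
        verify="/etc/ssl/certs/ca-bundle.pem",  # placeholder CA bundle path
    )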
{sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.12
+Version: 0.1.14
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
@@ -217,7 +217,7 @@ Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
 Requires-Dist: sglang[anthropic] ; extra == 'all'
 Provides-Extra: anthropic
-Requires-Dist: anthropic ; extra == 'anthropic'
+Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
 Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
@@ -231,7 +231,7 @@ Requires-Dist: torch ; extra == 'srt'
 Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm >=0.2.5 ; extra == 'srt'
+Requires-Dist: vllm >=0.3.3 ; extra == 'srt'
 Requires-Dist: interegular ; extra == 'srt'
 Requires-Dist: lark ; extra == 'srt'
 Requires-Dist: numba ; extra == 'srt'
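Both dependency floors tightened here (anthropic >=0.20.0 and vllm >=0.3.3) are resolved automatically when the new wheel is installed with the matching extra, for example:

    pip install "sglang[all]==0.1.14"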
@@ -242,7 +242,12 @@ Requires-Dist: cloudpickle ; extra == 'srt'
 Requires-Dist: pillow ; extra == 'srt'
 Requires-Dist: outlines >=0.0.27 ; extra == 'srt'
 
-# SGLang
+<div align="center">
+<img src="assets/logo.png" alt="logo" width="400"></img>
+</div>
+
+--------------------------------------------------------------------------------
+
 | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
@@ -254,7 +259,7 @@ The core features of SGLang include:
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/01] 🔥 SGLang powers the serving of the offical **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
+- [2024/01] 🔥 SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 
 ## Contents
@@ -496,7 +501,7 @@ def text_qa(s, question):
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n")
 
-states = text_qa.run(
+state = text_qa.run(
     question="What is the capital of France?",
     temperature=0.1,
     stream=True
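The renamed state object is then consumed incrementally; the README's streaming example continues roughly as below (text_iter is sglang's streaming iterator over generated text):

    # print the streamed answer chunk by chunk as it is generated
    for out in state.text_iter():
        print(out, end="", flush=True)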
@@ -608,8 +613,13 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mistral
 - Mixtral
 - Qwen / Qwen 2
+- Gemma
+  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+  - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - AWQ/GPTQ quantization
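Once one of these servers is running, a frontend program can attach to it; a minimal sketch using sglang's documented backend API, assuming the default port from the launch commands above:

    import sglang as sgl

    # point the sglang frontend at the locally launched runtime server
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))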
sglang-0.1.14.dist-info/RECORD ADDED
@@ -0,0 +1,64 @@
+sglang/__init__.py,sha256=Nxa2M7XCh2-e6I7VrCg7OSBL6BvEW3gyRD14ZdykpRM,96
+sglang/api.py,sha256=0-Eh7c41hWKjPXrzzvLFdLAUVkvmPGJGLAsrG9evDTE,4576
+sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
+sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
+sglang/utils.py,sha256=2dUXLMPz9VhhzbIRQABmfZnVW5yz61F3UVtb6yKyevM,6237
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=GJ_T1Jg0VOtajgkgczPKt5sjuVYdbAiWd2jXlJRNRmg,1677
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/openai.py,sha256=nPdA88A5GISJTH88svJdww3qHWIHZcGG2NEn0XjMkLU,9578
+sglang/backend/runtime_endpoint.py,sha256=r7dTazselaudlFx8hqk-PQLYDHZhpbAKjyFF1zLuM_E,8022
+sglang/backend/vertexai.py,sha256=BLfWf_tEgoHY9srCufJM5PLe3tql2j0G6ia7cPykxCM,4713
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=MaCF0fvNky0nJC9OvmAeApeHYgM6Lr03mtRhF0lS31U,8000
+sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
+sglang/lang/interpreter.py,sha256=ahRxuEJZ7b1Tts2Lr7wViWIqL-Z12T3anvgj0XdvMN8,26666
+sglang/lang/ir.py,sha256=8Ap-uEUz6K9eNQTOKtMixePuLwRFHFKcN0Z5Yn44nKk,13320
+sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
+sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
+sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
+sglang/srt/hf_transformers_utils.py,sha256=soRyYLoCn7GxgxvonufGFkdFBA3eH5i3Izk_wi7p1l0,5285
+sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,3609
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=ned-odjmKBKBhVPo04FEpus9gJsUWxrFLrLxahLwSaw,1328
+sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
+sglang/srt/server.py,sha256=WLXissKuXQI7JFb2V8D47QSF-PPHnW-JZCiQm4YW0xE,24070
+sglang/srt/server_args.py,sha256=bvbi-Rb_JudqztFFfRsuXBYtUsG9hq4zMFt7X97uDhA,8954
+sglang/srt/utils.py,sha256=IEqpmWx_hl4eXn_KoHM0EPXmxeN2wKkgK7H01_t0x5Q,7355
+sglang/srt/constrained/__init__.py,sha256=BPRNDJnWtzYJ13X4urRS5aE6wFuwAVNBA9qeWIHF8rE,1236
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=20mEgtDXU1Zeoicl5KBQC3arkg-RhRWiYnchJc00m1g,901
+sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
+sglang/srt/layers/context_flashattention_nopad.py,sha256=TVYQ6IjftWVXORmKpEROMqQxDOnF6n2g0G1Ci4LquYM,5209
+sglang/srt/layers/extend_attention.py,sha256=KGqQOA5mel9qScXMAQP_3Qyhp3BNbiQ7Y_6wi38Lxcs,12622
+sglang/srt/layers/logits_processor.py,sha256=MW2bpqSXyghODMojqeMSYWZhUHuAFPk_gUkyyLw9HkM,4827
+sglang/srt/layers/radix_attention.py,sha256=bqrb8H8K8RbKTr1PzVmpnUxRzMj0H-OWCi1JYZKuRDw,5597
+sglang/srt/layers/token_attention.py,sha256=waOjGsWZlvf6epFhYerRJlAaMwvDTy_Z3uzPaXsVQUU,8516
+sglang/srt/managers/detokenizer_manager.py,sha256=1lPNh_Pe6Pr0v-TzlCBBREbvz4uFWxyw31SmnEZh0s8,3292
+sglang/srt/managers/io_struct.py,sha256=nXJh3CrOvv9MdAfIFoo6SCXuNQTG3KswmRKkwF61Tek,3141
+sglang/srt/managers/openai_protocol.py,sha256=cttqg9iv3de8fhtCqDI4cYoPPZ_gULedMXstV1ok6WA,4563
+sglang/srt/managers/tokenizer_manager.py,sha256=hgsR9AMj6ic9S3-2WiELh7Hnp8Xnb_bzp7kpbjHwHtM,9733
+sglang/srt/managers/router/infer_batch.py,sha256=U-Ckt9ad1WaOQF_dW6Eo9AMIRQoOJQ-Pm-MMXnEmPP8,18399
+sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
+sglang/srt/managers/router/model_rpc.py,sha256=VlwLNpHZ92bnteQl4PhVKoAXM0C8Y4_2LBBVaffeu3g,26766
+sglang/srt/managers/router/model_runner.py,sha256=-wWv00EbB_UkkLpio6VKGBTagfzxLHfY-eKDDQ0rZQc,18292
+sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
+sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
+sglang/srt/models/gemma.py,sha256=8XlfHPtVixPYYjz5F9T4DOAuoordWFStmyFFWGfny1k,11582
+sglang/srt/models/llama2.py,sha256=VL4iN8R3wyTNr0bDxxKdLNnVGEvdXF6iGvA768YeakA,11611
+sglang/srt/models/llava.py,sha256=42sn-AgI-6dMaTEU4aEbi4Js5epy0J3JVQoMooUOKt8,14922
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=wqIwKfR90ih0gDiTZkFZcQD4PIYpZFD3CmzxRcuKIqw,13915
+sglang/srt/models/qwen.py,sha256=CvdbcF90aI1tJPSQ-3OMUaQGMuaxCGe0y29m5nU_Yj0,9225
+sglang/srt/models/qwen2.py,sha256=myPc0wvgf5ZzJyGhUGN49YjY-tMf4t8Jn_Imjg8D7Mk,11307
+sglang/srt/models/stablelm.py,sha256=vMZUNgwXKPGYr5FcdYHw5g3QifVu9owKqq51_-EBOY0,10817
+sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
+sglang/test/test_utils.py,sha256=6PhTRi8UnR-BRNjit6aGu0M5lO0RebNQwEcDt712hE4,4830
+sglang-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.14.dist-info/METADATA,sha256=C5N0VOYRHixdJcsf4dExIvP-Q099kYBMKs_dA4LBXSM,28809
+sglang-0.1.14.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+sglang-0.1.14.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.14.dist-info/RECORD,,
{sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
sglang/srt/models/gpt_neox.py DELETED
@@ -1,274 +0,0 @@
-# Adapted from
-# https://github.com/vllm-project/vllm/blob/c81dddb45c71e630b907f9d84686ecd73b4105c7/vllm/model_executor/models/gpt_neox.py#L1
-"""Inference-only GPT-NeoX model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-from transformers import GPTNeoXConfig
-
-from vllm.model_executor.layers.activation import get_act_fn
-from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.router.model_runner import InputMetadata
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearMethodBase,
-                                               QKVParallelLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size)
-from vllm.model_executor.weight_utils import (default_weight_loader,
-                                              hf_model_weights_iterator)
-
-class GPTNeoXAttention(nn.Module):
-
-    def __init__(
-        self,
-        config: GPTNeoXConfig,
-        layer_id: int = 0,
-        linear_method: Optional[LinearMethodBase] = None,
-    ):
-        super().__init__()
-        self.total_num_heads = config.num_attention_heads
-        self.hidden_size = config.hidden_size
-        self.head_size = self.hidden_size // self.total_num_heads
-        self.bias = getattr(config, "attention_bias", True)
-
-        tensor_model_parallel_world_size = (
-            get_tensor_model_parallel_world_size())
-        assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = (self.total_num_heads //
-                          tensor_model_parallel_world_size)
-
-        self.query_key_value = QKVParallelLinear(
-            config.hidden_size,
-            self.head_size,
-            self.total_num_heads,
-            bias=self.bias,
-            linear_method=linear_method,
-        )
-        self.dense = RowParallelLinear(
-            config.hidden_size,
-            config.hidden_size,
-            bias=self.bias,
-            linear_method=linear_method,
-        )
-        scaling = self.head_size**-0.5
-        rotary_dim = int(self.head_size * config.rotary_pct)
-        assert rotary_dim % 2 == 0
-        rope_theta = getattr(config, "rope_theta", 10000)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
-        self.rotary_emb = get_rope(
-            self.head_size,
-            rotary_dim=rotary_dim,
-            max_position=max_position_embeddings,
-            base=rope_theta,
-        )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_size,
-                                   scaling,
-                                   num_kv_heads=self.num_heads,
-                                   layer_id=layer_id)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        qkv, _ = self.query_key_value(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
-        q, k = self.rotary_emb(position_ids, q, k)
-        attn_output = self.attn(q, k, v, input_metadata)
-        output, _ = self.dense(attn_output)
-        return output
-
-
-class GPTNeoXMLP(nn.Module):
-
-    def __init__(
-        self,
-        config: GPTNeoXConfig,
-        linear_method: Optional[LinearMethodBase] = None,
-    ):
-        super().__init__()
-        self.dense_h_to_4h = ColumnParallelLinear(
-            config.hidden_size,
-            config.intermediate_size,
-            linear_method=linear_method,
-        )
-        self.dense_4h_to_h = RowParallelLinear(
-            config.intermediate_size,
-            config.hidden_size,
-            linear_method=linear_method,
-        )
-        quant_config = getattr(linear_method, "quant_config", None)
-        self.act = get_act_fn(config.hidden_act, quant_config,
-                              config.intermediate_size)
-
-    def forward(self, hidden_states):
-        hidden_states, _ = self.dense_h_to_4h(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states, _ = self.dense_4h_to_h(hidden_states)
-        return hidden_states
-
-
-class GPTNeoXLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: GPTNeoXConfig,
-        layer_id: int = 0,
-        linear_method: Optional[LinearMethodBase] = None,
-    ):
-        super().__init__()
-        self.use_parallel_residual = config.use_parallel_residual
-        self.input_layernorm = nn.LayerNorm(config.hidden_size,
-                                            eps=config.layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
-                                                     eps=config.layer_norm_eps)
-        self.attention = GPTNeoXAttention(config, layer_id=layer_id, linear_method=linear_method)
-        self.mlp = GPTNeoXMLP(config, linear_method)
-
-    def forward(
-        self,
-        position_ids: torch.Tensor,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        attn_input = self.input_layernorm(hidden_states)
-        attn_output = self.attention(
-            position_ids=position_ids,
-            hidden_states=attn_input,
-            input_metadata=input_metadata,
-        )
-
-        if self.use_parallel_residual:
-            # pseudocode:
-            # x = x + attn(ln1(x)) + mlp(ln2(x))
-            mlp_input = self.post_attention_layernorm(hidden_states)
-            mlp_output = self.mlp(mlp_input)
-            hidden_states = mlp_output + attn_output + hidden_states
-        else:
-            # pseudocode:
-            # x = x + attn(ln1(x))
-            # x = x + mlp(ln2(x))
-            attn_output = attn_output + hidden_states
-            mlp_input = self.post_attention_layernorm(attn_output)
-            mlp_output = self.mlp(mlp_input)
-            hidden_states = mlp_output + attn_output
-        return hidden_states
-
-
-class GPTNeoXModel(nn.Module):
-
-    def __init__(
-        self,
-        config: GPTNeoXConfig,
-        linear_method: Optional[LinearMethodBase] = None,
-    ):
-        super().__init__()
-        self.config = config
-
-        self.embed_in = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-        )
-        self.layers = nn.ModuleList([
-            GPTNeoXLayer(config, i, linear_method)
-            for i in range(config.num_hidden_layers)
-        ])
-        self.final_layer_norm = nn.LayerNorm(config.hidden_size,
-                                             eps=config.layer_norm_eps)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        input_metadata: InputMetadata,
-        skip_embed: bool = False,
-    ) -> torch.Tensor:
-        if not skip_embed:
-            hidden_states = self.embed_in(input_ids)
-        else:
-            hidden_states = input_ids
-        hidden_states = self.embed_in(input_ids)
-        for i in range(len(self.layers)):
-            layer = self.layers[i]
-            hidden_states = layer(
-                position_ids,
-                hidden_states,
-                input_metadata,
-            )
-        hidden_states = self.final_layer_norm(hidden_states)
-        return hidden_states
-
-
-class GPTNeoXForCausalLM(nn.Module):
-
-    def __init__(
-        self,
-        config,
-        linear_method: Optional[LinearMethodBase] = None,
-    ):
-        super().__init__()
-        self.config = config
-        self.linear_method = linear_method
-        self.gpt_neox = GPTNeoXModel(config, linear_method)
-        self.embed_out = ParallelLMHead(
-            config.vocab_size,
-            config.hidden_size,
-        )
-        self.logits_processor = LogitsProcessor(config)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_metadata: InputMetadata,
-        skip_embed: bool = False,
-    ) -> torch.Tensor:
-        hidden_states = self.gpt_neox(input_ids, positions, input_metadata, skip_embed)
-        return self.logits_processor(
-            input_ids, hidden_states, self.embed_out.weight, input_metadata
-        )
-
-    def load_weights(self,
-                     model_name_or_path: str,
-                     cache_dir: Optional[str] = None,
-                     load_format: str = "auto",
-                     revision: Optional[str] = None):
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, load_format, revision):
-            if ("attention.bias" in name or "attention.masked_bias" in name
-                    or "rotary_emb.inv_freq" in name):
-                continue
-            param = params_dict[name]
-
-            if "query_key_value" in name:
-                # NOTE: GPT-NeoX's fused QKV's output_dim has the shape of
-                # (num_heads * 3 * head_size), while the
-                # required shape is (3 * num_heads * head_size).
-                # Thus, we need weight conversion.
-                output_dim = getattr(param, "output_dim", None)
-                num_heads = self.config.num_attention_heads
-                if output_dim is not None:
-                    loaded_weight_shape = loaded_weight.shape
-                    loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
-                        loaded_weight_shape[output_dim + 1:])
-                    loaded_weight = loaded_weight.transpose(
-                        output_dim, output_dim + 1)
-                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
-
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-
-
-EntryClass = GPTNeoXForCausalLM
sglang-0.1.12.dist-info/RECORD DELETED
@@ -1,63 +0,0 @@
-sglang/__init__.py,sha256=MsaKtUijK193Lw2Hw8ydva_X5Le0sKvWUVeKaOjdYqY,96
-sglang/api.py,sha256=E2G93eTlM7wT451iGEDESZnt0NZjK03Xt0Lyx-NYCOc,4207
-sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
-sglang/launch_server.py,sha256=jKPZRDN5bUe8Wgz5eoDkqeePhmKa8DLD4DpXQLT5auo,294
-sglang/utils.py,sha256=a3RjlWZ-K2LjO8GTwD_ExYu-QvgSrcDh-_NKeqzBziM,6231
-sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/backend/anthropic.py,sha256=y5TN9EDrJtOH4JEUxpXu-endloeYBy7xMUr3r7Ah3MA,1462
-sglang/backend/base_backend.py,sha256=pPalZfoezxnUBs752j7lm0uMwa8tZuCWd-ijSdStMO8,1745
-sglang/backend/openai.py,sha256=L49Ga3E1rgOyxpH9NyMrKw2Exm-WyDM_pCUQZetCH_Q,8555
-sglang/backend/runtime_endpoint.py,sha256=hx3D-Dv3XAVbnAtbW975RrNN6Jaw2ZvR6XGMFz61h7A,6689
-sglang/backend/vertexai.py,sha256=BLfWf_tEgoHY9srCufJM5PLe3tql2j0G6ia7cPykxCM,4713
-sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/lang/chat_template.py,sha256=9aiR_4euCjrSdduYYiSnVjHE48GOqaHHigwX1oXu0lU,7461
-sglang/lang/compiler.py,sha256=wNn_UqV6Sxl22mv-PpzFUtRgiFFV-Y4OYpO4LshEoRM,7527
-sglang/lang/interpreter.py,sha256=qCIssjVWltToFXpZDfNx6tiemQpXiK_NF_qUPWu_rvU,26262
-sglang/lang/ir.py,sha256=QSx0vMepQ01SaQ4EQjUqbJknHSrF557CqHuosQi6otQ,13330
-sglang/lang/tracer.py,sha256=pFiSNzPSg0l7ZZIlGqJDLCmQALR-wyo2dFgJP73J4_Y,8260
-sglang/srt/backend_config.py,sha256=UIV6kIU2j-Xh0eoezn1aXcYIy0miftHsWFeAZwqpbGE,227
-sglang/srt/conversation.py,sha256=mTstD-SsXG5p_YhWQUPEWU-vzzDMF4RgQ7KmLkOOC7U,15496
-sglang/srt/hf_transformers_utils.py,sha256=soRyYLoCn7GxgxvonufGFkdFBA3eH5i3Izk_wi7p1l0,5285
-sglang/srt/memory_pool.py,sha256=BMoX2wvicj214mV-xvcr_Iv_Je0qs3zTuzXfQVpV8u4,3609
-sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
-sglang/srt/model_config.py,sha256=MDfjfhfZxXTPrshLsZANWyCN8RPS-pCV4RTAcA8IUG8,1124
-sglang/srt/sampling_params.py,sha256=83Fp-4HWThC20TEh139XcIb_erBqfI7KZg5txdRBq7c,2896
-sglang/srt/server.py,sha256=21EdEjG_EmVs4BhL37wI5wNtcmir44CPRX5cZ-5bofM,21454
-sglang/srt/server_args.py,sha256=WihASLqvxYDX65LAtdMzQ2kqbARxFds20jF8HFfXmRc,7567
-sglang/srt/utils.py,sha256=2gCOlsErsFz969V5kaTPm0-jFwOpbIU1ufrRyWkYvYE,7287
-sglang/srt/constrained/__init__.py,sha256=wcGWZNn19jK2m-KN6P4ui1BrQy2wwUt-qG4_Orv0ouY,490
-sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
-sglang/srt/constrained/fsm_cache.py,sha256=Q0J4St3XUOt2tKFVpj0B2KIZ6z3X6cIzTcjREVqy3pg,471
-sglang/srt/constrained/jump_forward.py,sha256=Z-pz2Jnvk1CxSEZA65OVq0GryqdiKuOkhhc13v5T6Lo,2482
-sglang/srt/layers/context_flashattention_nopad.py,sha256=GkjLiTkS4px_uLcW0aDocE3_OBXtujZ-SlsN2b2U7ng,5204
-sglang/srt/layers/extend_attention.py,sha256=pWVE6ySnPiVLFON__bie73eDhmXHk4tECMK8zTiJNbI,12558
-sglang/srt/layers/logits_processor.py,sha256=MW2bpqSXyghODMojqeMSYWZhUHuAFPk_gUkyyLw9HkM,4827
-sglang/srt/layers/radix_attention.py,sha256=Tl1zE2c87Fm5qW5Ylffkgh48QQIwd93UK6IaRMZltFI,5789
-sglang/srt/layers/token_attention.py,sha256=Z3YVuFmqca3ho2NKSXjLXZNz4L67qrhaec_Pd38IA_4,8157
-sglang/srt/managers/detokenizer_manager.py,sha256=1lPNh_Pe6Pr0v-TzlCBBREbvz4uFWxyw31SmnEZh0s8,3292
-sglang/srt/managers/io_struct.py,sha256=E5Lt81n7-DkRR-pl7XoaJXIBaa2nT9swABNwXEsTsUw,3064
-sglang/srt/managers/openai_protocol.py,sha256=cttqg9iv3de8fhtCqDI4cYoPPZ_gULedMXstV1ok6WA,4563
-sglang/srt/managers/tokenizer_manager.py,sha256=B-F6diI1sV0pW1HxQgH_v8VzJwzvgkcGB1_MkOh2unE,9693
-sglang/srt/managers/router/infer_batch.py,sha256=bvUY1EmIKqdF38N5ALWVsgD2rl6GNzvUHMGkluoUpv8,18126
-sglang/srt/managers/router/manager.py,sha256=TNYs0IrkZGkPvZJViwL7BMUg0VlvzeyTjDMjuvRoMDI,2529
-sglang/srt/managers/router/model_rpc.py,sha256=dKBRzPoERK-TCgUgnaQfFFlZtB6_xWT9eSTVwizCCiA,25938
-sglang/srt/managers/router/model_runner.py,sha256=TUEqfsQedPUFGA5cpTYi8sW5whtEM-4ui3s7YoP0cBg,17604
-sglang/srt/managers/router/radix_cache.py,sha256=XGUF5mxQTSCzD7GW_ltNP2p5aelEKrMXzdezufJ7NCQ,6484
-sglang/srt/managers/router/scheduler.py,sha256=V-LAnVSzgD2ddy2eXW3jWURCeq9Lv7YxCGk4kHyytfM,2818
-sglang/srt/models/gpt_neox.py,sha256=0NwrX9hqVD9biE0bfJYFC4TMhQKhYdNowLkVYo2OG24,10271
-sglang/srt/models/llama2.py,sha256=tICX536zPcPup3KmwRmRASqBSgyY6_XpThjfMpQ1evM,11582
-sglang/srt/models/llava.py,sha256=OaJF9Lal4Txtg_FuDsQTL_kHR7PB1BUf3nhngCdFnfU,14963
-sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
-sglang/srt/models/mixtral.py,sha256=iTuuyJdT8cq6W7CCqHz5nyY6I8r_m1SLMiGfGTkPW6w,13886
-sglang/srt/models/qwen.py,sha256=xPvwO4YBhht4ROSK-ef9Zysk_UvB06GxCBPYjyElgUY,9225
-sglang/srt/models/qwen2.py,sha256=B_dH2QQtfuz38LmGOGcFnQwTRMRFUkHAaO32rnNB_-s,11278
-sglang/srt/models/yivl.py,sha256=Qvp-zQ93cOZGg3zVyaiQLhRsfXiLrQhxu9TyQP2FMm4,4414
-sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
-sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
-sglang/test/test_programs.py,sha256=mrLhGuprwvx8ZJ-0Qe28E-iCw5Qv-9T0SAv1Jgo1AJw,11421
-sglang/test/test_utils.py,sha256=DyZAic3KIBQ0PmZeLc9uv1ckcM5jpEE5CirjHO48_sk,4829
-sglang-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.12.dist-info/METADATA,sha256=4Q0u9J9QUQFlAbMnzioXZ0i47F-HZznyKA7qcnrv_K4,28129
-sglang-0.1.12.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-sglang-0.1.12.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.12.dist-info/RECORD,,