ipex-llm 2.2.0b20250121__py3-none-win_amd64.whl → 2.2.0b20250123__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipex_llm/libs/bloom-api.dll +0 -0
- ipex_llm/libs/bloom.dll +0 -0
- ipex_llm/libs/gptneox-api.dll +0 -0
- ipex_llm/libs/gptneox.dll +0 -0
- ipex_llm/libs/libbloom_avx.dll +0 -0
- ipex_llm/libs/libbloom_vnni.dll +0 -0
- ipex_llm/libs/libgptneox_avx.dll +0 -0
- ipex_llm/libs/libgptneox_vnni.dll +0 -0
- ipex_llm/libs/libllama_avx.dll +0 -0
- ipex_llm/libs/libllama_vnni.dll +0 -0
- ipex_llm/libs/libstarcoder_avx.dll +0 -0
- ipex_llm/libs/libstarcoder_vnni.dll +0 -0
- ipex_llm/libs/llama-api.dll +0 -0
- ipex_llm/libs/llama.dll +0 -0
- ipex_llm/libs/main-bloom.exe +0 -0
- ipex_llm/libs/main-gptneox.exe +0 -0
- ipex_llm/libs/main-llama.exe +0 -0
- ipex_llm/libs/main-starcoder.exe +0 -0
- ipex_llm/libs/pipeline.dll +0 -0
- ipex_llm/libs/quantize-bloom.exe +0 -0
- ipex_llm/libs/quantize-bloom_vnni.exe +0 -0
- ipex_llm/libs/quantize-gptneox.exe +0 -0
- ipex_llm/libs/quantize-gptneox_vnni.exe +0 -0
- ipex_llm/libs/quantize-llama.exe +0 -0
- ipex_llm/libs/quantize-llama_vnni.exe +0 -0
- ipex_llm/libs/quantize-starcoder.exe +0 -0
- ipex_llm/libs/quantize-starcoder_vnni.exe +0 -0
- ipex_llm/libs/starcoder-api.dll +0 -0
- ipex_llm/libs/starcoder.dll +0 -0
- ipex_llm/transformers/convert.py +0 -1
- ipex_llm/transformers/low_bit_linear.py +1 -1
- ipex_llm/transformers/model.py +1 -3
- ipex_llm/transformers/npu_models/mp_models_base.py +3 -1
- ipex_llm/transformers/patches.py +0 -11
- ipex_llm/vllm/cpu/engine/__init__.py +2 -1
- ipex_llm/vllm/cpu/engine/engine.py +159 -75
- ipex_llm/vllm/cpu/entrypoints/api_server.py +787 -0
- ipex_llm/vllm/cpu/entrypoints/openai/api_server.py +680 -95
- ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py +277 -0
- ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py +23 -0
- ipex_llm/vllm/cpu/ipex_llm_wrapper.py +24 -0
- ipex_llm/vllm/cpu/model_convert.py +126 -233
- {ipex_llm-2.2.0b20250121.dist-info → ipex_llm-2.2.0b20250123.dist-info}/METADATA +20 -20
- {ipex_llm-2.2.0b20250121.dist-info → ipex_llm-2.2.0b20250123.dist-info}/RECORD +50 -46
- {ipex_llm-2.2.0b20250121.data → ipex_llm-2.2.0b20250123.data}/scripts/ipex-llm-init.bat +0 -0
- {ipex_llm-2.2.0b20250121.data → ipex_llm-2.2.0b20250123.data}/scripts/llm-chat.ps1 +0 -0
- {ipex_llm-2.2.0b20250121.data → ipex_llm-2.2.0b20250123.data}/scripts/llm-cli.ps1 +0 -0
- {ipex_llm-2.2.0b20250121.dist-info → ipex_llm-2.2.0b20250123.dist-info}/WHEEL +0 -0
- {ipex_llm-2.2.0b20250121.dist-info → ipex_llm-2.2.0b20250123.dist-info}/entry_points.txt +0 -0
- {ipex_llm-2.2.0b20250121.dist-info → ipex_llm-2.2.0b20250123.dist-info}/top_level.txt +0 -0
ipex_llm/vllm/cpu/entrypoints/openai/cli_args.py
@@ -0,0 +1,277 @@
+"""
+This file contains the command line arguments for vLLM's
+OpenAI-compatible server. It is kept in a separate file for documentation
+purposes.
+"""
+
+import argparse
+import json
+import ssl
+from typing import List, Optional, Sequence, Union, get_args
+
+from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
+from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
+                                         validate_chat_template)
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    PromptAdapterPath)
+from vllm.entrypoints.openai.tool_parsers import ToolParserManager
+from vllm.utils import FlexibleArgumentParser
+
+
+class LoRAParserAction(argparse.Action):
+
+    def __call__(
+        self,
+        parser: argparse.ArgumentParser,
+        namespace: argparse.Namespace,
+        values: Optional[Union[str, Sequence[str]]],
+        option_string: Optional[str] = None,
+    ):
+        if values is None:
+            values = []
+        if isinstance(values, str):
+            raise TypeError("Expected values to be a list")  # noqa
+
+        lora_list: List[LoRAModulePath] = []
+        for item in values:
+            if item in [None, '']:  # Skip if item is None or empty string
+                continue
+            if '=' in item and ',' not in item:  # Old format: name=path
+                name, path = item.split('=')
+                lora_list.append(LoRAModulePath(name, path))
+            else:  # Assume JSON format
+                try:
+                    lora_dict = json.loads(item)
+                    lora = LoRAModulePath(**lora_dict)
+                    lora_list.append(lora)
+                except json.JSONDecodeError:
+                    parser.error(
+                        f"Invalid JSON format for --lora-modules: {item}")
+                except TypeError as e:
+                    parser.error(
+                        f"Invalid fields for --lora-modules: {item} - {str(e)}"
+                    )
+        setattr(namespace, self.dest, lora_list)
+
+
+class PromptAdapterParserAction(argparse.Action):
+
+    def __call__(
+        self,
+        parser: argparse.ArgumentParser,
+        namespace: argparse.Namespace,
+        values: Optional[Union[str, Sequence[str]]],
+        option_string: Optional[str] = None,
+    ):
+        if values is None:
+            values = []
+        if isinstance(values, str):
+            raise TypeError("Expected values to be a list")  # noqa
+
+        adapter_list: List[PromptAdapterPath] = []
+        for item in values:
+            name, path = item.split('=')
+            adapter_list.append(PromptAdapterPath(name, path))
+        setattr(namespace, self.dest, adapter_list)
+
+
+def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+    parser.add_argument("--host",
+                        type=nullable_str,
+                        default=None,
+                        help="host name")
+    parser.add_argument("--port", type=int, default=8000, help="port number")
+    parser.add_argument(
+        "--uvicorn-log-level",
+        type=str,
+        default="info",
+        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
+        help="log level for uvicorn")
+    parser.add_argument("--allow-credentials",
+                        action="store_true",
+                        help="allow credentials")
+    parser.add_argument("--allowed-origins",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed origins")
+    parser.add_argument("--allowed-methods",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed methods")
+    parser.add_argument("--allowed-headers",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed headers")
+    parser.add_argument("--api-key",
+                        type=nullable_str,
+                        default=None,
+                        help="If provided, the server will require this key "
+                        "to be presented in the header.")
+    parser.add_argument(
+        "--lora-modules",
+        type=nullable_str,
+        default=None,
+        nargs='+',
+        action=LoRAParserAction,
+        help="LoRA module configurations in either 'name=path' format "
+        "or JSON format. "
+        "Example (old format): 'name=path' "
+        "Example (new format): "
+        "'{\"name\": \"name\", \"local_path\": \"path\", "
+        "\"base_model_name\": \"id\"}'")
+    parser.add_argument(
+        "--prompt-adapters",
+        type=nullable_str,
+        default=None,
+        nargs='+',
+        action=PromptAdapterParserAction,
+        help="Prompt adapter configurations in the format name=path. "
+        "Multiple adapters can be specified.")
+    parser.add_argument("--chat-template",
+                        type=nullable_str,
+                        default=None,
+                        help="The file path to the chat template, "
+                        "or the template in single-line form "
+                        "for the specified model")
+    parser.add_argument(
+        '--chat-template-content-format',
+        type=str,
+        default="auto",
+        choices=get_args(ChatTemplateContentFormatOption),
+        help='The format to render message content within a chat template.'
+        '\n\n'
+        '* "string" will render the content as a string. '
+        'Example: "Hello World"\n'
+        '* "openai" will render the content as a list of dictionaries, '
+        'similar to OpenAI schema. '
+        'Example: [{"type": "text", "text": "Hello world!"}]')
+    parser.add_argument("--response-role",
+                        type=nullable_str,
+                        default="assistant",
+                        help="The role name to return if "
+                        "`request.add_generation_prompt=true`.")
+    parser.add_argument("--ssl-keyfile",
+                        type=nullable_str,
+                        default=None,
+                        help="The file path to the SSL key file")
+    parser.add_argument("--ssl-certfile",
+                        type=nullable_str,
+                        default=None,
+                        help="The file path to the SSL cert file")
+    parser.add_argument("--ssl-ca-certs",
+                        type=nullable_str,
+                        default=None,
+                        help="The CA certificates file")
+    parser.add_argument(
+        "--ssl-cert-reqs",
+        type=int,
+        default=int(ssl.CERT_NONE),
+        help="Whether client certificate is required (see stdlib ssl module's)"
+    )
+    parser.add_argument(
+        "--root-path",
+        type=nullable_str,
+        default=None,
+        help="FastAPI root_path when app is behind a path based routing proxy")
+    parser.add_argument(
+        "--middleware",
+        type=nullable_str,
+        action="append",
+        default=[],
+        help="Additional ASGI middleware to apply to the app. "
+        "We accept multiple --middleware arguments. "
+        "The value should be an import path. "
+        "If a function is provided, vLLM will add it to the server "
+        "using @app.middleware('http'). "
+        "If a class is provided, vLLM will add it to the server "
+        "using app.add_middleware(). ")
+    parser.add_argument(
+        "--return-tokens-as-token-ids",
+        action="store_true",
+        help="When --max-logprobs is specified, represents single tokens as "
+        "strings of the form 'token_id:{token_id}' so that tokens that "
+        "are not JSON-encodable can be identified.")
+    parser.add_argument(
+        "--disable-frontend-multiprocessing",
+        action="store_true",
+        help="If specified, will run the OpenAI frontend server in the same "
+        "process as the model serving engine.")
+    parser.add_argument(
+        "--enable-request-id-headers",
+        action="store_true",
+        help="If specified, API server will add X-Request-Id header to "
+        "responses. Caution: this hurts performance at high QPS.")
+    parser.add_argument(
+        "--enable-auto-tool-choice",
+        action="store_true",
+        default=False,
+        help="Enable auto tool choice for supported models. Use --tool-call-parser"
+        " to specify which parser to use")
+
+    valid_tool_parsers = ToolParserManager.tool_parsers.keys()
+    parser.add_argument(
+        "--tool-call-parser",
+        type=str,
+        metavar="{" + ",".join(valid_tool_parsers) + "} or name registered in "
+        "--tool-parser-plugin",
+        default=None,
+        help="Select the tool call parser depending on the model that you're using."
+        " This is used to parse the model-generated tool call into OpenAI API "
+        "format. Required for --enable-auto-tool-choice.")
+
+    parser.add_argument(
+        "--tool-parser-plugin",
+        type=str,
+        default="",
+        help="Specify the tool parser plugin used to parse model-generated "
+        "tool calls into OpenAI API format; names registered in this plugin "
+        "can be used in --tool-call-parser.")
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    parser.add_argument('--max-log-len',
+                        type=int,
+                        default=None,
+                        help='Max number of prompt characters or prompt '
+                        'ID numbers being printed in log.'
+                        '\n\nDefault: Unlimited')
+
+    parser.add_argument(
+        "--disable-fastapi-docs",
+        action='store_true',
+        default=False,
+        help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
+    )
+    parser.add_argument(
+        "--enable-prompt-tokens-details",
+        action='store_true',
+        default=False,
+        help="If set to True, enable prompt_tokens_details in usage.")
+
+    parser.add_argument(
+        "--load-in-low-bit",
+        type=str,
+        default="sym_int4",
+        help="Low-bit quantization for IPEX-LLM models")
+
+    return parser
+
+
+def validate_parsed_serve_args(args: argparse.Namespace):
+    """Quick checks for model serve args that raise prior to loading."""  # noqa
+    if hasattr(args, "subparser") and args.subparser != "serve":
+        return
+
+    # Ensure that the chat template is valid; raises if it likely isn't
+    validate_chat_template(args.chat_template)
+
+    # Enable auto tool needs a tool call parser to be valid
+    if args.enable_auto_tool_choice and not args.tool_call_parser:
+        raise TypeError("Error: --enable-auto-tool-choice requires "  # noqa
+                        "--tool-call-parser")
+
+
+def create_parser_for_docs() -> FlexibleArgumentParser:
+    parser_for_docs = FlexibleArgumentParser(
+        prog="-m vllm.entrypoints.openai.api_server")
+    return make_arg_parser(parser_for_docs)
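
Of the flags above, only --load-in-low-bit is IPEX-LLM-specific; the rest mirror vLLM's stock OpenAI server arguments. A minimal usage sketch follows (the model id is a placeholder, and --model is assumed to be supplied by AsyncEngineArgs.add_cli_args):

    # Sketch: build, parse, and validate server args as the entrypoint would.
    from vllm.utils import FlexibleArgumentParser
    from ipex_llm.vllm.cpu.entrypoints.openai.cli_args import (
        make_arg_parser, validate_parsed_serve_args)

    parser = make_arg_parser(FlexibleArgumentParser(
        description="IPEX-LLM vLLM OpenAI-compatible server"))
    args = parser.parse_args([
        "--model", "some/model-id",       # placeholder model id
        "--port", "8000",
        "--load-in-low-bit", "sym_int4",  # IPEX-LLM low-bit quantization flag
    ])
    validate_parsed_serve_args(args)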
ipex_llm/vllm/cpu/ipex_llm_v1_wrapper.py
@@ -0,0 +1,23 @@
+from vllm.logger import init_logger
+from vllm.v1.executor.ray_utils import RayWorkerWrapper
+
+
+logger = init_logger(__name__)
+
+
+class IPEXLLMV1Wrapper(RayWorkerWrapper):
+    def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
+        self.compiled_dag_cuda_device_set = False
+
+
+def get_ipex_llm_v1_wrapper(load_in_low_bit):
+    # We avoid functools.partial here because Ray does not seem to
+    # work well with it.
+    class WrapperWithLoadBit(IPEXLLMV1Wrapper):
+        def __init__(self, *args, **kwargs) -> None:
+            super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
+
+    return WrapperWithLoadBit
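
The factory exists so the low-bit setting can be baked into a real class before it is handed to Ray; per the comment in the source, functools.partial did not cooperate. A short usage sketch (module path taken from the file list above):

    from ipex_llm.vllm.cpu.ipex_llm_v1_wrapper import get_ipex_llm_v1_wrapper

    WrapperCls = get_ipex_llm_v1_wrapper("sym_int4")  # subclass of IPEXLLMV1Wrapper
    # Ray can instantiate WrapperCls wherever it would normally use
    # vllm.v1.executor.ray_utils.RayWorkerWrapper; each worker then runs
    # _ipex_llm_convert(load_in_low_bit="sym_int4") during construction.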
ipex_llm/vllm/cpu/ipex_llm_wrapper.py
@@ -0,0 +1,24 @@
+from vllm.logger import init_logger
+from vllm.executor.ray_utils import RayWorkerWrapper
+
+
+logger = init_logger(__name__)
+
+
+class IPEXLLMWrapper(RayWorkerWrapper):
+    def __init__(self, load_in_low_bit="sym_int4", *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        from ipex_llm.vllm.cpu.model_convert import _ipex_llm_convert
+        _ipex_llm_convert(load_in_low_bit=load_in_low_bit)
+        self.compiled_dag_cuda_device_set = False
+
+
+def get_ipex_llm_wrapper(load_in_low_bit):
+    # We avoid functools.partial here because Ray does not seem to
+    # work well with it.
+    class WrapperWithLoadBit(IPEXLLMWrapper):
+        def __init__(self, *args, **kwargs) -> None:
+            super().__init__(load_in_low_bit=load_in_low_bit, *args, **kwargs)
+
+    # a = functools.partial(IPEXLLMWrapper, load_in_low_bit=load_in_low_bit)
+    return WrapperWithLoadBit
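
Both wrappers (v0 and v1) use the same subclass-factory pattern: a functools.partial is not a class, and the source comments note Ray did not work well with it, so a subclass with the keyword bound in is returned instead. The pattern generalizes; a hypothetical helper illustrating it:

    def bind_init_kwargs(base_cls, **bound):
        # Return a subclass of base_cls whose __init__ always receives `bound`.
        class Bound(base_cls):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **bound, **kwargs)
        return Bound

    # get_ipex_llm_wrapper("sym_int4") is then equivalent to
    # bind_init_kwargs(IPEXLLMWrapper, load_in_low_bit="sym_int4").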