xinference 0.16.1__py3-none-any.whl → 0.16.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/conftest.py +0 -8
- xinference/constants.py +2 -0
- xinference/core/model.py +34 -2
- xinference/core/supervisor.py +5 -5
- xinference/core/utils.py +9 -10
- xinference/core/worker.py +8 -5
- xinference/deploy/cmdline.py +5 -0
- xinference/deploy/utils.py +7 -4
- xinference/model/audio/core.py +6 -2
- xinference/model/audio/model_spec.json +1 -1
- xinference/model/core.py +3 -1
- xinference/model/embedding/core.py +6 -2
- xinference/model/image/core.py +6 -2
- xinference/model/image/ocr/got_ocr2.py +3 -0
- xinference/model/llm/__init__.py +33 -0
- xinference/model/llm/core.py +4 -4
- xinference/model/llm/llm_family.json +87 -0
- xinference/model/llm/llm_family.py +68 -2
- xinference/model/llm/llm_family_modelscope.json +91 -0
- xinference/model/llm/llm_family_openmind_hub.json +1359 -0
- xinference/model/llm/vllm/core.py +2 -1
- xinference/model/rerank/core.py +9 -1
- xinference/model/utils.py +7 -0
- xinference/model/video/core.py +6 -2
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.b76aeeb7.js → main.2f269bb3.js} +3 -3
- xinference/web/ui/build/static/js/main.2f269bb3.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1f269fb2a368363c1cb2237825f1dba093b6bdd8c44cc05954fd19ec2c1fff03.json +1 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/METADATA +5 -4
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/RECORD +37 -36
- xinference/web/ui/build/static/js/main.b76aeeb7.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/32ea2c04cf0bba2761b4883d2c40cc259952c94d2d6bb774e510963ca37aac0a.json +0 -1
- /xinference/web/ui/build/static/js/{main.b76aeeb7.js.LICENSE.txt → main.2f269bb3.js.LICENSE.txt} +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/LICENSE +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/WHEEL +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.1.dist-info → xinference-0.16.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2024-
+ "date": "2024-11-07T16:55:36+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.16.1"
+ "full-revisionid": "85ab86bf1c0967e45fbec995534cd5a0c9a9c439",
+ "version": "0.16.3"
 }
 ''' # END VERSION_JSON

xinference/conftest.py
CHANGED
@@ -58,10 +58,6 @@ TEST_LOGGING_CONF = {
             "propagate": False,
         }
     },
-    "root": {
-        "level": "WARN",
-        "handlers": ["stream_handler"],
-    },
 }

 TEST_LOG_FILE_PATH = get_log_file(f"test_{get_timestamp_ms()}")
@@ -102,10 +98,6 @@ TEST_FILE_LOGGING_CONF = {
             "propagate": False,
         }
     },
-    "root": {
-        "level": "WARN",
-        "handlers": ["stream_handler", "file_handler"],
-    },
 }

xinference/constants.py
CHANGED
@@ -39,6 +39,7 @@ def get_xinference_home() -> str:
         # if user has already set `XINFERENCE_HOME` env, change huggingface and modelscope default download path
         os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface")
         os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope")
+        os.environ["XDG_CACHE_HOME"] = os.path.join(home_path, "openmind_hub")
         # In multi-tenant mode,
         # gradio's temporary files are stored in their respective home directories,
         # to prevent insufficient permissions
@@ -86,3 +87,4 @@ XINFERENCE_DOWNLOAD_MAX_ATTEMPTS = int(
 XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE = os.environ.get(
     XINFERENCE_ENV_TEXT_TO_IMAGE_BATCHING_SIZE, None
 )
+XINFERENCE_LAUNCH_MODEL_RETRY = 3
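
The new `XDG_CACHE_HOME` line follows the pattern of the Hugging Face and ModelScope lines above it: point each hub client's default cache inside the Xinference home. A minimal sketch of the XDG fallback this presumably relies on (the helper and paths below are illustrative, not from the codebase; the assumption is that openmind_hub derives its cache directory from `XDG_CACHE_HOME` like other XDG-aware clients):

    import os

    def resolve_cache_dir(app_name: str) -> str:
        # XDG convention: honor XDG_CACHE_HOME if set, else fall back to ~/.cache
        base = os.environ.get(
            "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")
        )
        return os.path.join(base, app_name)

    # After constants.py runs with XINFERENCE_HOME set, caches resolve under it:
    os.environ["XDG_CACHE_HOME"] = "/data/xinference/openmind_hub"  # hypothetical home
    print(resolve_cache_dir("openmind"))  # /data/xinference/openmind_hub/openmind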
xinference/core/model.py
CHANGED
@@ -40,7 +40,10 @@ from typing import (
 import sse_starlette.sse
 import xoscar as xo

-from ..constants import XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE
+from ..constants import (
+    XINFERENCE_LAUNCH_MODEL_RETRY,
+    XINFERENCE_TEXT_TO_IMAGE_BATCHING_SIZE,
+)

 if TYPE_CHECKING:
     from .progress_tracker import ProgressTrackerActor
@@ -134,6 +137,8 @@ def oom_check(fn):


 class ModelActor(xo.StatelessActor):
+    _replica_model_uid: Optional[str]
+
     @classmethod
     def gen_uid(cls, model: "LLM"):
         return f"{model.__class__}-model-actor"
@@ -192,6 +197,7 @@ class ModelActor(xo.StatelessActor):
         supervisor_address: str,
         worker_address: str,
         model: "LLM",
+        replica_model_uid: str,
         model_description: Optional["ModelDescription"] = None,
         request_limits: Optional[int] = None,
     ):
@@ -203,6 +209,7 @@ class ModelActor(xo.StatelessActor):

         self._supervisor_address = supervisor_address
         self._worker_address = worker_address
+        self._replica_model_uid = replica_model_uid
         self._model = model
         self._model_description = (
             model_description.to_dict() if model_description else {}
@@ -257,6 +264,9 @@ class ModelActor(xo.StatelessActor):
                 uid=FluxBatchSchedulerActor.gen_uid(self.model_uid()),
             )

+    def __repr__(self) -> str:
+        return f"ModelActor({self._replica_model_uid})"
+
     async def _record_completion_metrics(
         self, duration, completion_tokens, prompt_tokens
     ):
@@ -374,7 +384,28 @@ class ModelActor(xo.StatelessActor):
         return condition

     async def load(self):
-        self._model.load()
+        try:
+            # Change process title for model
+            import setproctitle
+
+            setproctitle.setproctitle(f"Model: {self._replica_model_uid}")
+        except ImportError:
+            pass
+        i = 0
+        while True:
+            i += 1
+            try:
+                self._model.load()
+                break
+            except Exception as e:
+                if (
+                    i < XINFERENCE_LAUNCH_MODEL_RETRY
+                    and str(e).find("busy or unavailable") >= 0
+                ):
+                    await asyncio.sleep(5)
+                    logger.warning("Retry to load model {model_uid}: %d times", i)
+                    continue
+                raise
         if self.allow_batching():
             await self._scheduler_ref.set_model(self._model)
             logger.debug(
@@ -385,6 +416,7 @@ class ModelActor(xo.StatelessActor):
         logger.debug(
             f"Batching enabled for model: {self.model_uid()}, max_num_images: {self._model.get_max_num_images_for_batching()}"
         )
+        logger.info(f"{self} loaded")

     def model_uid(self):
         return (
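
The rewritten `load()` now retries transient failures: up to `XINFERENCE_LAUNCH_MODEL_RETRY` (3) attempts with a five-second sleep in between, and only when the error message contains "busy or unavailable"; anything else still raises immediately. Note the shipped warning logs the literal text `{model_uid}` because the string is not an f-string. A standalone sketch of the same pattern, not the shipped method (`model` here is any object with a blocking `.load()`, and the placeholder is filled in):

    import asyncio
    import logging

    logger = logging.getLogger(__name__)
    LAUNCH_MODEL_RETRY = 3  # mirrors XINFERENCE_LAUNCH_MODEL_RETRY

    async def load_with_retry(model, replica_model_uid: str) -> None:
        attempt = 0
        while True:
            attempt += 1
            try:
                model.load()
                break
            except Exception as e:
                # Retry only errors that look transient, a bounded number of times;
                # everything else propagates to the caller on the first attempt.
                if attempt < LAUNCH_MODEL_RETRY and "busy or unavailable" in str(e):
                    await asyncio.sleep(5)
                    logger.warning(
                        "Retry to load model %s: %d times", replica_model_uid, attempt
                    )
                    continue
                raise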
xinference/core/supervisor.py
CHANGED
@@ -970,7 +970,7 @@ class SupervisorActor(xo.StatelessActor):
                 raise ValueError(
                     f"Model is already in the model list, uid: {_replica_model_uid}"
                 )
-            replica_gpu_idx = assign_replica_gpu(_replica_model_uid, gpu_idx)
+            replica_gpu_idx = assign_replica_gpu(_replica_model_uid, replica, gpu_idx)
             nonlocal model_type

             worker_ref = (
@@ -1084,7 +1084,7 @@ class SupervisorActor(xo.StatelessActor):
                 dead_models,
             )
             for replica_model_uid in dead_models:
-                model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+                model_uid, _ = parse_replica_model_uid(replica_model_uid)
                 self._model_uid_to_replica_info.pop(model_uid, None)
                 self._replica_model_uid_to_worker.pop(
                     replica_model_uid, None
@@ -1137,7 +1137,7 @@ class SupervisorActor(xo.StatelessActor):
            raise ValueError(f"Model not found in the model list, uid: {model_uid}")

         replica_model_uid = build_replica_model_uid(
-            model_uid, replica_info.replica, next(replica_info.scheduler)
+            model_uid, next(replica_info.scheduler)
         )

         worker_ref = self._replica_model_uid_to_worker.get(replica_model_uid, None)
@@ -1154,7 +1154,7 @@ class SupervisorActor(xo.StatelessActor):
         raise ValueError(f"Model not found in the model list, uid: {model_uid}")
         # Use rep id 0 to instead of next(replica_info.scheduler) to avoid
         # consuming the generator.
-        replica_model_uid = build_replica_model_uid(model_uid, replica_info.replica, 0)
+        replica_model_uid = build_replica_model_uid(model_uid, 0)
         worker_ref = self._replica_model_uid_to_worker.get(replica_model_uid, None)
         if worker_ref is None:
             raise ValueError(
@@ -1260,7 +1260,7 @@ class SupervisorActor(xo.StatelessActor):
             uids_to_remove.append(model_uid)

         for replica_model_uid in uids_to_remove:
-            model_uid, _, _ = parse_replica_model_uid(replica_model_uid)
+            model_uid, _ = parse_replica_model_uid(replica_model_uid)
             self._model_uid_to_replica_info.pop(model_uid, None)
             self._replica_model_uid_to_worker.pop(replica_model_uid, None)

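
`next(replica_info.scheduler)` picks the replica for each `get_model` call round-robin, which is also why the second hunk pins rep id 0 rather than peeking: reading the scheduler would consume a slot. A sketch of the behavior (assumption: the real `ReplicaInfo` wraps something like `itertools.cycle` over rep ids; this mirrors the observable behavior, not the actual class):

    import itertools
    from dataclasses import dataclass, field
    from typing import Iterator

    @dataclass
    class ReplicaInfo:
        replica: int
        scheduler: Iterator[int] = field(init=False)

        def __post_init__(self):
            # round-robin over rep ids 0..replica-1
            self.scheduler = itertools.cycle(range(self.replica))

    info = ReplicaInfo(replica=3)
    print([next(info.scheduler) for _ in range(5)])  # [0, 1, 2, 0, 1]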
xinference/core/utils.py
CHANGED
@@ -146,27 +146,26 @@ def iter_replica_model_uid(model_uid: str, replica: int) -> Generator[str, None, None]:
     """
     replica = int(replica)
     for rep_id in range(replica):
-        yield f"{model_uid}-{replica}-{rep_id}"
+        yield f"{model_uid}-{rep_id}"


-def build_replica_model_uid(model_uid: str, replica: int, rep_id: int) -> str:
+def build_replica_model_uid(model_uid: str, rep_id: int) -> str:
     """
     Build a replica model uid.
     """
-    return f"{model_uid}-{replica}-{rep_id}"
+    return f"{model_uid}-{rep_id}"


-def parse_replica_model_uid(replica_model_uid: str) -> Tuple[str, int, int]:
+def parse_replica_model_uid(replica_model_uid: str) -> Tuple[str, int]:
     """
-    Parse replica model uid to model uid, replica and rep id.
+    Parse replica model uid to model uid and rep id.
     """
     parts = replica_model_uid.split("-")
     if len(parts) == 1:
-        return replica_model_uid, -1, -1
+        return replica_model_uid, -1
     rep_id = int(parts.pop())
-    replica = int(parts.pop())
     model_uid = "-".join(parts)
-    return model_uid, replica, rep_id
+    return model_uid, rep_id


 def is_valid_model_uid(model_uid: str) -> bool:
@@ -261,9 +260,9 @@ def get_nvidia_gpu_info() -> Dict:


 def assign_replica_gpu(
-    _replica_model_uid: str, gpu_idx: Union[int, List[int]]
+    _replica_model_uid: str, replica: int, gpu_idx: Union[int, List[int]]
 ) -> List[int]:
-    model_uid, replica, rep_id = parse_replica_model_uid(_replica_model_uid)
+    model_uid, rep_id = parse_replica_model_uid(_replica_model_uid)
     rep_id, replica = int(rep_id), int(replica)
     if isinstance(gpu_idx, int):
         gpu_idx = [gpu_idx]
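
Taken together, these hunks drop the middle `replica` segment from replica uids: `uid-replica-repid` becomes `uid-repid`, and `build_replica_model_uid`, `parse_replica_model_uid`, and `assign_replica_gpu` lose or gain a parameter to match. A round-trip using the functions as they appear in this diff:

    from xinference.core.utils import build_replica_model_uid, parse_replica_model_uid

    rep_uid = build_replica_model_uid("qwen2-instruct", 1)
    print(rep_uid)                           # qwen2-instruct-1
    print(parse_replica_model_uid(rep_uid))  # ('qwen2-instruct', 1)

    # Dashes inside the model uid survive: parsing only pops the trailing integer.
    print(parse_replica_model_uid("my-model-0"))  # ('my-model', 0)

One caveat visible in the code: a bare uid whose last dash-separated part is numeric (say `gpt-4`) would parse as `('gpt', 4)`, since the `-1` fallback only triggers when the uid contains no dash at all.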
xinference/core/worker.py
CHANGED
@@ -157,7 +157,7 @@ class WorkerActor(xo.StatelessActor):
                 model_uid,
                 recover_count - 1,
             )
-            event_model_uid, _, _ = parse_replica_model_uid(model_uid)
+            event_model_uid, _ = parse_replica_model_uid(model_uid)
             try:
                 if self._event_collector_ref is not None:
                     await self._event_collector_ref.report_event(
@@ -377,7 +377,7 @@ class WorkerActor(xo.StatelessActor):
         return len(self._model_uid_to_model)

     async def is_model_vllm_backend(self, model_uid: str) -> bool:
-        _model_uid, _, _ = parse_replica_model_uid(model_uid)
+        _model_uid, _ = parse_replica_model_uid(model_uid)
         supervisor_ref = await self.get_supervisor_ref()
         model_ref = await supervisor_ref.get_model(_model_uid)
         return await model_ref.is_vllm_backend()
@@ -785,7 +785,9 @@ class WorkerActor(xo.StatelessActor):
         peft_model_config: Optional[PeftModelConfig] = None,
         request_limits: Optional[int] = None,
         gpu_idx: Optional[Union[int, List[int]]] = None,
-        download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+        download_hub: Optional[
+            Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+        ] = None,
         model_path: Optional[str] = None,
         **kwargs,
     ):
@@ -798,7 +800,7 @@ class WorkerActor(xo.StatelessActor):
         launch_args.update(kwargs)

         try:
-            origin_uid, _, _ = parse_replica_model_uid(model_uid)
+            origin_uid, _ = parse_replica_model_uid(model_uid)
         except Exception as e:
             logger.exception(e)
             raise
@@ -887,6 +889,7 @@ class WorkerActor(xo.StatelessActor):
                 uid=model_uid,
                 supervisor_address=self._supervisor_address,
                 worker_address=self.address,
+                replica_model_uid=model_uid,
                 model=model,
                 model_description=model_description,
                 request_limits=request_limits,
@@ -924,7 +927,7 @@ class WorkerActor(xo.StatelessActor):
         # Terminate model while its launching is not allow
         if model_uid in self._model_uid_launching_guard:
             raise ValueError(f"{model_uid} is launching")
-        origin_uid, _, _ = parse_replica_model_uid(model_uid)
+        origin_uid, _ = parse_replica_model_uid(model_uid)
         try:
             _ = await self.get_supervisor_ref()
             if self._event_collector_ref is not None:
xinference/deploy/cmdline.py
CHANGED
@@ -43,6 +43,7 @@ from .utils import (
     get_log_file,
     get_timestamp_ms,
     handle_click_args_type,
+    set_envs,
 )

 try:
@@ -106,6 +107,8 @@ def start_local_cluster(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    # refer to https://huggingface.co/docs/transformers/main_classes/logging
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     main(
         host=host,
@@ -280,6 +283,7 @@ def supervisor(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     main(
         host=host,
@@ -342,6 +346,7 @@ def worker(
         XINFERENCE_LOG_MAX_BYTES,
     )
     logging.config.dictConfig(dict_config)  # type: ignore
+    set_envs("TRANSFORMERS_VERBOSITY", log_level.lower())

     endpoint = get_endpoint(endpoint)

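
All three entry points (local cluster, supervisor, worker) now propagate the CLI `--log-level` to transformers via the `TRANSFORMERS_VERBOSITY` environment variable, which transformers reads when its logging module is first used. A quick check of the effect (assuming transformers is installed; `10` is `logging.DEBUG`):

    import os

    # what the CLI does before spawning subprocesses, for --log-level debug
    os.environ["TRANSFORMERS_VERBOSITY"] = "debug"

    import transformers

    print(transformers.logging.get_verbosity())  # 10 == logging.DEBUG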
xinference/deploy/utils.py
CHANGED
@@ -134,10 +134,6 @@ def get_config_dict(
                 "propagate": False,
             },
         },
-        "root": {
-            "level": "WARN",
-            "handlers": ["stream_handler", "file_handler"],
-        },
     }
     return config_dict

@@ -220,3 +216,10 @@ def handle_click_args_type(arg: str) -> Any:
         pass

     return arg
+
+
+def set_envs(key: str, value: str):
+    """
+    Environment variables are set by the parent process and inherited by child processes
+    """
+    os.environ[key] = value
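
`set_envs` is a thin wrapper over `os.environ`; its point is the docstring's: setting the variable in the parent CLI process means every child it spawns (supervisor, worker, model subprocesses) inherits it. Usage, as the cmdline changes above employ it:

    from xinference.deploy.utils import set_envs

    # refer to https://huggingface.co/docs/transformers/main_classes/logging
    set_envs("TRANSFORMERS_VERBOSITY", "debug")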
xinference/model/audio/core.py
CHANGED
@@ -100,7 +100,9 @@ def generate_audio_description(

 def match_audio(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> AudioModelFamilyV1:
     from ..utils import download_from_modelscope
     from . import BUILTIN_AUDIO_MODELS, MODELSCOPE_AUDIO_MODELS
@@ -152,7 +154,9 @@ def create_audio_model_instance(
     devices: List[str],
     model_uid: str,
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
xinference/model/audio/model_spec.json
CHANGED
@@ -127,7 +127,7 @@
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
-    "model_revision": "
+    "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": "text-to-audio",
     "multilingual": true
   },
xinference/model/core.py
CHANGED
@@ -55,7 +55,9 @@ def create_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[Any, ModelDescription]:
xinference/model/embedding/core.py
CHANGED
@@ -433,7 +433,9 @@ class EmbeddingModel:

 def match_embedding(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> EmbeddingModelSpec:
     from ..utils import download_from_modelscope
     from . import BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS
@@ -469,7 +471,9 @@ def create_embedding_model_instance(
     devices: List[str],
     model_uid: str,
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[EmbeddingModel, EmbeddingModelDescription]:
xinference/model/image/core.py
CHANGED
@@ -125,7 +125,9 @@ def generate_image_description(

 def match_diffusion(
     model_name: str,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
 ) -> ImageModelFamilyV1:
     from ..utils import download_from_modelscope
     from . import BUILTIN_IMAGE_MODELS, MODELSCOPE_IMAGE_MODELS
@@ -213,7 +215,9 @@ def create_image_model_instance(
     model_uid: str,
     model_name: str,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[
xinference/model/image/ocr/got_ocr2.py
CHANGED
@@ -71,6 +71,9 @@ class GotOCR2Model:
         logger.info("Got OCR 2.0 kwargs: %s", kwargs)
         if "ocr_type" not in kwargs:
             kwargs["ocr_type"] = "ocr"
+        if image.mode == "RGBA" or image.mode == "CMYK":
+            # convert to RGB
+            image = image.convert("RGB")
         assert self._model is not None
         # This chat API limits the max new tokens inside.
         return self._model.chat(self._tokenizer, image, gradio_input=True, **kwargs)
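
The GOT-OCR2 fix normalizes 4-channel (RGBA) and CMYK inputs to 3-channel RGB before inference, since the model expects RGB input. The same normalization standalone with Pillow (`scan.png` is a hypothetical input; note that `convert("RGB")` simply drops the alpha channel rather than compositing it):

    from PIL import Image

    image = Image.open("scan.png")  # hypothetical input file
    if image.mode in ("RGBA", "CMYK"):
        image = image.convert("RGB")
    print(image.mode)  # RGB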
xinference/model/llm/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from .llm_family import (
     BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES,
     BUILTIN_LLM_PROMPT_STYLE,
     BUILTIN_MODELSCOPE_LLM_FAMILIES,
+    BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
     LLAMA_CLASSES,
     LLM_ENGINES,
     LMDEPLOY_CLASSES,
@@ -258,6 +259,36 @@ def _install():
         if "tools" in model_spec.model_ability:
             BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)

+    openmind_hub_json_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "llm_family_openmind_hub.json"
+    )
+    for json_obj in json.load(
+        codecs.open(openmind_hub_json_path, "r", encoding="utf-8")
+    ):
+        model_spec = LLMFamilyV1.parse_obj(json_obj)
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES.append(model_spec)
+
+        # register prompt style, in case that we have something missed
+        # if duplicated with huggingface json, keep it as the huggingface style
+
+        if (
+            "chat" in model_spec.model_ability
+            and isinstance(model_spec.chat_template, str)
+            and model_spec.model_name not in BUILTIN_LLM_PROMPT_STYLE
+        ):
+            BUILTIN_LLM_PROMPT_STYLE[model_spec.model_name] = {
+                "chat_template": model_spec.chat_template,
+                "stop_token_ids": model_spec.stop_token_ids,
+                "stop": model_spec.stop,
+            }
+        # register model family
+        if "chat" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_CHAT_FAMILIES.add(model_spec.model_name)
+        else:
+            BUILTIN_LLM_MODEL_GENERATE_FAMILIES.add(model_spec.model_name)
+        if "tools" in model_spec.model_ability:
+            BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES.add(model_spec.model_name)
+
     csghub_json_path = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "llm_family_csghub.json"
     )
@@ -288,6 +319,7 @@ def _install():
     for llm_specs in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for llm_spec in llm_specs:
@@ -298,6 +330,7 @@ def _install():
     for families in [
         BUILTIN_LLM_FAMILIES,
         BUILTIN_MODELSCOPE_LLM_FAMILIES,
+        BUILTIN_OPENMIND_HUB_LLM_FAMILIES,
         BUILTIN_CSGHUB_LLM_FAMILIES,
     ]:
         for family in families:
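
The comment "if duplicated with huggingface json, keep it as the huggingface style" is implemented by the `not in BUILTIN_LLM_PROMPT_STYLE` guard: registration is first-writer-wins, and the Hugging Face JSON is loaded before the openMind Hub one. A minimal sketch of that guard in isolation (the dict here is a local stand-in for the module-level registry, and the template strings are made up):

    BUILTIN_LLM_PROMPT_STYLE = {}

    def register_prompt_style(name: str, style: dict) -> None:
        if name not in BUILTIN_LLM_PROMPT_STYLE:  # keep the earlier (HF) entry
            BUILTIN_LLM_PROMPT_STYLE[name] = style

    register_prompt_style("qwen2-instruct", {"chat_template": "hf-template"})
    register_prompt_style("qwen2-instruct", {"chat_template": "openmind-template"})
    print(BUILTIN_LLM_PROMPT_STYLE["qwen2-instruct"]["chat_template"])  # hf-template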
xinference/model/llm/core.py
CHANGED
@@ -52,9 +52,7 @@ class LLM(abc.ABC):
         *args,
         **kwargs,
     ):
-        self.model_uid, self.replica, self.rep_id = parse_replica_model_uid(
-            replica_model_uid
-        )
+        self.model_uid, self.rep_id = parse_replica_model_uid(replica_model_uid)
         self.model_family = model_family
         self.model_spec = model_spec
         self.quantization = quantization
@@ -193,7 +191,9 @@ def create_llm_model_instance(
     model_size_in_billions: Optional[Union[int, str]] = None,
     quantization: Optional[str] = None,
     peft_model_config: Optional[PeftModelConfig] = None,
-    download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+    download_hub: Optional[
+        Literal["huggingface", "modelscope", "openmind_hub", "csghub"]
+    ] = None,
     model_path: Optional[str] = None,
     **kwargs,
 ) -> Tuple[LLM, LLMDescription]:
xinference/model/llm/llm_family.json
CHANGED
@@ -1312,6 +1312,93 @@
       "<|eom_id|>"
     ]
   },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "llama-3.2-vision-instruct",
+    "model_lang": [
+      "en",
+      "de",
+      "fr",
+      "it",
+      "pt",
+      "hi",
+      "es",
+      "th"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_description": "Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 11,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Llama-3.2-11B-Vision-Instruct"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 90,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Llama-3.2-90B-Vision-Instruct"
+      }
+    ],
+    "chat_template": "{% for message in messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<|image|>' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{ '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+    "stop_token_ids": [
+      128001,
+      128008,
+      128009
+    ],
+    "stop": [
+      "<|end_of_text|>",
+      "<|eot_id|>",
+      "<|eom_id|>"
+    ]
+  },
+  {
+    "version": 1,
+    "context_length": 131072,
+    "model_name": "llama-3.2-vision",
+    "model_lang": [
+      "en",
+      "de",
+      "fr",
+      "it",
+      "pt",
+      "hi",
+      "es",
+      "th"
+    ],
+    "model_ability": [
+      "generate",
+      "vision"
+    ],
+    "model_description": "The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image...",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 11,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3.2-11B-Vision"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 90,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "meta-llama/Meta-Llama-3.2-90B-Vision"
+      }
+    ]
+  },
   {
     "version": 1,
     "context_length": 2048,