xinference 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (87)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +5 -5
  3. xinference/core/model.py +6 -1
  4. xinference/deploy/cmdline.py +3 -1
  5. xinference/deploy/test/test_cmdline.py +56 -0
  6. xinference/isolation.py +24 -0
  7. xinference/model/audio/core.py +5 -0
  8. xinference/model/audio/f5tts.py +195 -0
  9. xinference/model/audio/fish_speech.py +2 -1
  10. xinference/model/audio/model_spec.json +8 -0
  11. xinference/model/audio/model_spec_modelscope.json +9 -0
  12. xinference/model/embedding/core.py +203 -142
  13. xinference/model/embedding/model_spec.json +7 -0
  14. xinference/model/embedding/model_spec_modelscope.json +8 -0
  15. xinference/model/llm/__init__.py +2 -2
  16. xinference/model/llm/llm_family.json +172 -53
  17. xinference/model/llm/llm_family_modelscope.json +118 -20
  18. xinference/model/llm/mlx/core.py +230 -49
  19. xinference/model/llm/sglang/core.py +1 -0
  20. xinference/model/llm/transformers/chatglm.py +9 -5
  21. xinference/model/llm/transformers/utils.py +16 -8
  22. xinference/model/llm/utils.py +4 -1
  23. xinference/model/llm/vllm/core.py +5 -0
  24. xinference/thirdparty/f5_tts/__init__.py +0 -0
  25. xinference/thirdparty/f5_tts/api.py +166 -0
  26. xinference/thirdparty/f5_tts/configs/E2TTS_Base_train.yaml +44 -0
  27. xinference/thirdparty/f5_tts/configs/E2TTS_Small_train.yaml +44 -0
  28. xinference/thirdparty/f5_tts/configs/F5TTS_Base_train.yaml +46 -0
  29. xinference/thirdparty/f5_tts/configs/F5TTS_Small_train.yaml +46 -0
  30. xinference/thirdparty/f5_tts/eval/README.md +49 -0
  31. xinference/thirdparty/f5_tts/eval/ecapa_tdnn.py +330 -0
  32. xinference/thirdparty/f5_tts/eval/eval_infer_batch.py +207 -0
  33. xinference/thirdparty/f5_tts/eval/eval_infer_batch.sh +13 -0
  34. xinference/thirdparty/f5_tts/eval/eval_librispeech_test_clean.py +84 -0
  35. xinference/thirdparty/f5_tts/eval/eval_seedtts_testset.py +84 -0
  36. xinference/thirdparty/f5_tts/eval/utils_eval.py +405 -0
  37. xinference/thirdparty/f5_tts/infer/README.md +191 -0
  38. xinference/thirdparty/f5_tts/infer/SHARED.md +74 -0
  39. xinference/thirdparty/f5_tts/infer/examples/basic/basic.toml +11 -0
  40. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  41. xinference/thirdparty/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  42. xinference/thirdparty/f5_tts/infer/examples/multi/country.flac +0 -0
  43. xinference/thirdparty/f5_tts/infer/examples/multi/main.flac +0 -0
  44. xinference/thirdparty/f5_tts/infer/examples/multi/story.toml +19 -0
  45. xinference/thirdparty/f5_tts/infer/examples/multi/story.txt +1 -0
  46. xinference/thirdparty/f5_tts/infer/examples/multi/town.flac +0 -0
  47. xinference/thirdparty/f5_tts/infer/examples/vocab.txt +2545 -0
  48. xinference/thirdparty/f5_tts/infer/infer_cli.py +226 -0
  49. xinference/thirdparty/f5_tts/infer/infer_gradio.py +851 -0
  50. xinference/thirdparty/f5_tts/infer/speech_edit.py +193 -0
  51. xinference/thirdparty/f5_tts/infer/utils_infer.py +538 -0
  52. xinference/thirdparty/f5_tts/model/__init__.py +10 -0
  53. xinference/thirdparty/f5_tts/model/backbones/README.md +20 -0
  54. xinference/thirdparty/f5_tts/model/backbones/dit.py +163 -0
  55. xinference/thirdparty/f5_tts/model/backbones/mmdit.py +146 -0
  56. xinference/thirdparty/f5_tts/model/backbones/unett.py +219 -0
  57. xinference/thirdparty/f5_tts/model/cfm.py +285 -0
  58. xinference/thirdparty/f5_tts/model/dataset.py +319 -0
  59. xinference/thirdparty/f5_tts/model/modules.py +658 -0
  60. xinference/thirdparty/f5_tts/model/trainer.py +366 -0
  61. xinference/thirdparty/f5_tts/model/utils.py +185 -0
  62. xinference/thirdparty/f5_tts/scripts/count_max_epoch.py +33 -0
  63. xinference/thirdparty/f5_tts/scripts/count_params_gflops.py +39 -0
  64. xinference/thirdparty/f5_tts/socket_server.py +159 -0
  65. xinference/thirdparty/f5_tts/train/README.md +77 -0
  66. xinference/thirdparty/f5_tts/train/datasets/prepare_csv_wavs.py +139 -0
  67. xinference/thirdparty/f5_tts/train/datasets/prepare_emilia.py +230 -0
  68. xinference/thirdparty/f5_tts/train/datasets/prepare_libritts.py +92 -0
  69. xinference/thirdparty/f5_tts/train/datasets/prepare_ljspeech.py +65 -0
  70. xinference/thirdparty/f5_tts/train/datasets/prepare_wenetspeech4tts.py +125 -0
  71. xinference/thirdparty/f5_tts/train/finetune_cli.py +174 -0
  72. xinference/thirdparty/f5_tts/train/finetune_gradio.py +1846 -0
  73. xinference/thirdparty/f5_tts/train/train.py +75 -0
  74. xinference/web/ui/build/asset-manifest.json +3 -3
  75. xinference/web/ui/build/index.html +1 -1
  76. xinference/web/ui/build/static/js/{main.2f269bb3.js → main.4eb4ee80.js} +3 -3
  77. xinference/web/ui/build/static/js/main.4eb4ee80.js.map +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8c5eeb02f772d02cbe8b89c05428d0dd41a97866f75f7dc1c2164a67f5a1cf98.json +1 -0
  79. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/METADATA +33 -14
  80. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/RECORD +85 -34
  81. xinference/web/ui/build/static/js/main.2f269bb3.js.map +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +0 -1
  83. /xinference/web/ui/build/static/js/{main.2f269bb3.js.LICENSE.txt → main.4eb4ee80.js.LICENSE.txt} +0 -0
  84. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/LICENSE +0 -0
  85. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/WHEEL +0 -0
  86. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/entry_points.txt +0 -0
  87. {xinference-1.0.1.dist-info → xinference-1.1.0.dist-info}/top_level.txt +0 -0
xinference/model/embedding/core.py

@@ -208,12 +208,14 @@ class EmbeddingModel:
                 ]
                 raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
 
-            model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
+            if torch_dtype and torch_dtype == torch.float16:
+                model_kwargs = {"use_fp16": True}
+            else:
+                model_kwargs = {}
             self._model = BGEM3FlagModel(
                 self._model_path,
                 device=self._device,
-                model_kwargs=model_kwargs,
-                trust_remote_code=True,
+                **model_kwargs,
             )
         else:
             model_kwargs = {"torch_dtype": torch_dtype} if torch_dtype else None
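
For context: the change above stops forwarding the sentence-transformers style model_kwargs / trust_remote_code arguments to BGEM3FlagModel and instead maps a float16 request onto FlagEmbedding's own use_fp16 flag. A minimal sketch of the resulting constructor call (the checkpoint path is hypothetical):

    from FlagEmbedding import BGEM3FlagModel

    # Hypothetical local checkpoint path; use_fp16 mirrors the torch.float16 branch above.
    model = BGEM3FlagModel("/models/bge-m3", device="cuda", use_fp16=True)
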
@@ -224,7 +226,9 @@ class EmbeddingModel:
                 trust_remote_code=True,
             )
 
-    def _fix_langchain_openai_inputs(self, sentences: Union[str, List[str]]):
+    def _fix_langchain_openai_inputs(
+        self, sentences: Union[str, List[str], Dict[str, str], List[Dict[str, str]]]
+    ):
         # Check if sentences is a two-dimensional list of integers
         if (
             isinstance(sentences, list)
@@ -258,157 +262,172 @@ class EmbeddingModel:
             sentences = lines_decoded
         return sentences
 
-    def create_embedding(self, sentences: Union[str, List[str]], **kwargs):
+    def create_embedding(
+        self,
+        sentences: Union[str, List[str]],
+        **kwargs,
+    ):
         sentences = self._fix_langchain_openai_inputs(sentences)
 
-        from FlagEmbedding import BGEM3FlagModel
         from sentence_transformers import SentenceTransformer
 
         kwargs.setdefault("normalize_embeddings", True)
 
-        @no_type_check
-        def _encode_bgem3(
-            model: Union[SentenceTransformer, BGEM3FlagModel],
-            sentences: Union[str, List[str]],
-            batch_size: int = 32,
-            show_progress_bar: bool = None,
-            output_value: str = "sparse_embedding",
-            convert_to_numpy: bool = True,
-            convert_to_tensor: bool = False,
-            device: str = None,
-            normalize_embeddings: bool = False,
-            **kwargs,
-        ):
-            """
-            Computes sentence embeddings with bge-m3 model
-            Nothing special here, just replace sentence-transformer with FlagEmbedding
-            TODO: think about how to solve the redundant code of encode method in the future
-
-            :param sentences: the sentences to embed
-            :param batch_size: the batch size used for the computation
-            :param show_progress_bar: Output a progress bar when encode sentences
-            :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
-            :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
-            :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
-            :param device: Which torch.device to use for the computation
-            :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+        try:
+            from FlagEmbedding import BGEM3FlagModel
+
+            @no_type_check
+            def _encode_bgem3(
+                model: Union[SentenceTransformer, BGEM3FlagModel],
+                sentences: Union[str, List[str]],
+                batch_size: int = 32,
+                show_progress_bar: bool = None,
+                output_value: str = "sparse_embedding",
+                convert_to_numpy: bool = True,
+                convert_to_tensor: bool = False,
+                device: str = None,
+                normalize_embeddings: bool = False,
+                **kwargs,
+            ):
+                """
+                Computes sentence embeddings with bge-m3 model
+                Nothing special here, just replace sentence-transformer with FlagEmbedding
+                TODO: think about how to solve the redundant code of encode method in the future
+
+                :param sentences: the sentences to embed
+                :param batch_size: the batch size used for the computation
+                :param show_progress_bar: Output a progress bar when encode sentences
+                :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
+                :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
+                :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
+                :param device: Which torch.device to use for the computation
+                :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.
+
+                :return:
+                    By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
+                """
+                import torch
+                from tqdm.autonotebook import trange
+
+                if show_progress_bar is None:
+                    show_progress_bar = (
+                        logger.getEffectiveLevel() == logging.INFO
+                        or logger.getEffectiveLevel() == logging.DEBUG
+                    )
 
-            :return:
-                By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
-            """
-            import torch
-            from tqdm.autonotebook import trange
+                if convert_to_tensor:
+                    convert_to_numpy = False
+
+                if output_value != "sparse_embedding":
+                    convert_to_tensor = False
+                    convert_to_numpy = False
+
+                input_was_string = False
+                if isinstance(sentences, str) or not hasattr(
+                    sentences, "__len__"
+                ):  # Cast an individual sentence to a list with length 1
+                    sentences = [sentences]
+                    input_was_string = True
+
+                if device is None:
+                    # Same as SentenceTransformer.py
+                    from sentence_transformers.util import get_device_name
+
+                    device = get_device_name()
+                    logger.info(f"Use pytorch device_name: {device}")
+
+                all_embeddings = []
+                all_token_nums = 0
+
+                # The original code does not support other inference engines
+                def _text_length(text):
+                    if isinstance(text, dict):  # {key: value} case
+                        return len(next(iter(text.values())))
+                    elif not hasattr(text, "__len__"):  # Object has no len() method
+                        return 1
+                    elif len(text) == 0 or isinstance(
+                        text[0], int
+                    ):  # Empty string or list of ints
+                        return len(text)
+                    else:
+                        return sum(
+                            [len(t) for t in text]
+                        )  # Sum of length of individual strings
 
-            if show_progress_bar is None:
-                show_progress_bar = (
-                    logger.getEffectiveLevel() == logging.INFO
-                    or logger.getEffectiveLevel() == logging.DEBUG
+                length_sorted_idx = np.argsort(
+                    [-_text_length(sen) for sen in sentences]
                 )
+                sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+
+                for start_index in trange(
+                    0,
+                    len(sentences),
+                    batch_size,
+                    desc="Batches",
+                    disable=not show_progress_bar,
+                ):
+                    sentences_batch = sentences_sorted[
+                        start_index : start_index + batch_size
+                    ]
+
+                    with torch.no_grad():
+                        out_features = model.encode(sentences_batch, **kwargs)
+
+                    if output_value == "token_embeddings":
+                        embeddings = []
+                        for token_emb, attention in zip(
+                            out_features[output_value],
+                            out_features["attention_mask"],
+                        ):
+                            last_mask_id = len(attention) - 1
+                            while (
+                                last_mask_id > 0
+                                and attention[last_mask_id].item() == 0
+                            ):
+                                last_mask_id -= 1
+
+                            embeddings.append(token_emb[0 : last_mask_id + 1])
+                    elif output_value is None:  # Return all outputs
+                        embeddings = []
+                        for sent_idx in range(
+                            len(out_features["sentence_embedding"])
+                        ):
+                            row = {
+                                name: out_features[name][sent_idx]
+                                for name in out_features
+                            }
+                            embeddings.append(row)
+                    # for sparse embedding
+                    else:
+                        if kwargs.get("return_sparse"):
+                            embeddings = out_features["lexical_weights"]
+                        else:
+                            embeddings = out_features["dense_vecs"]
 
-            if convert_to_tensor:
-                convert_to_numpy = False
-
-            if output_value != "sparse_embedding":
-                convert_to_tensor = False
-                convert_to_numpy = False
-
-            input_was_string = False
-            if isinstance(sentences, str) or not hasattr(
-                sentences, "__len__"
-            ):  # Cast an individual sentence to a list with length 1
-                sentences = [sentences]
-                input_was_string = True
-
-            if device is None:
-                # Same as SentenceTransformer.py
-                from sentence_transformers.util import get_device_name
-
-                device = get_device_name()
-                logger.info(f"Use pytorch device_name: {device}")
-
-            all_embeddings = []
-            all_token_nums = 0
-
-            # The original code does not support other inference engines
-            def _text_length(text):
-                if isinstance(text, dict):  # {key: value} case
-                    return len(next(iter(text.values())))
-                elif not hasattr(text, "__len__"):  # Object has no len() method
-                    return 1
-                elif len(text) == 0 or isinstance(
-                    text[0], int
-                ):  # Empty string or list of ints
-                    return len(text)
-                else:
-                    return sum(
-                        [len(t) for t in text]
-                    )  # Sum of length of individual strings
+                    if convert_to_numpy:
+                        embeddings = embeddings.cpu()
 
-            length_sorted_idx = np.argsort([-_text_length(sen) for sen in sentences])
-            sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
+                    all_embeddings.extend(embeddings)
 
-            for start_index in trange(
-                0,
-                len(sentences),
-                batch_size,
-                desc="Batches",
-                disable=not show_progress_bar,
-            ):
-                sentences_batch = sentences_sorted[
-                    start_index : start_index + batch_size
+                all_embeddings = [
+                    all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
                 ]
 
-                with torch.no_grad():
-                    out_features = model.encode(sentences_batch, **kwargs)
-
-                if output_value == "token_embeddings":
-                    embeddings = []
-                    for token_emb, attention in zip(
-                        out_features[output_value], out_features["attention_mask"]
-                    ):
-                        last_mask_id = len(attention) - 1
-                        while (
-                            last_mask_id > 0 and attention[last_mask_id].item() == 0
-                        ):
-                            last_mask_id -= 1
-
-                        embeddings.append(token_emb[0 : last_mask_id + 1])
-                elif output_value is None:  # Return all outputs
-                    embeddings = []
-                    for sent_idx in range(len(out_features["sentence_embedding"])):
-                        row = {
-                            name: out_features[name][sent_idx]
-                            for name in out_features
-                        }
-                        embeddings.append(row)
-                # for sparse embedding
+                if convert_to_tensor:
+                    if len(all_embeddings):
+                        all_embeddings = torch.stack(all_embeddings)
                     else:
-                    if kwargs.get("return_sparse"):
-                        embeddings = out_features["lexical_weights"]
-                    else:
-                        embeddings = out_features["dense_vecs"]
-
-                if convert_to_numpy:
-                    embeddings = embeddings.cpu()
-
-                all_embeddings.extend(embeddings)
-
-            all_embeddings = [
-                all_embeddings[idx] for idx in np.argsort(length_sorted_idx)
-            ]
+                        all_embeddings = torch.Tensor()
+                elif convert_to_numpy:
+                    all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
 
-            if convert_to_tensor:
-                if len(all_embeddings):
-                    all_embeddings = torch.stack(all_embeddings)
-                else:
-                    all_embeddings = torch.Tensor()
-            elif convert_to_numpy:
-                all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+                if input_was_string:
+                    all_embeddings = all_embeddings[0]
 
-            if input_was_string:
-                all_embeddings = all_embeddings[0]
+                return all_embeddings, all_token_nums
 
-            return all_embeddings, all_token_nums
+        except ImportError:
+            _encode_bgem3 = None
 
         # copied from sentence-transformers, and modify it to return tokens num
         @no_type_check
@@ -526,7 +545,11 @@ class EmbeddingModel:
                 features.update(extra_features)
                 # when batching, the attention mask 1 means there is a token
                 # thus we just sum up it to get the total number of tokens
-                all_token_nums += features["attention_mask"].sum().item()
+                if "clip" in self._model_spec.model_name.lower():
+                    all_token_nums += features["input_ids"].numel()
+                    all_token_nums += features["pixel_values"].numel()
+                else:
+                    all_token_nums += features["attention_mask"].sum().item()
 
                 with torch.no_grad():
                     out_features = model.forward(features, **kwargs)
@@ -582,6 +605,10 @@ class EmbeddingModel:
 
             return all_embeddings, all_token_nums
 
+        is_bge_m3_flag_model = (
+            self._kwargs.get("hybrid_mode")
+            and "m3" in self._model_spec.model_name.lower()
+        )
         if (
             "gte" in self._model_spec.model_name.lower()
             and "qwen2" in self._model_spec.model_name.lower()
@@ -593,10 +620,45 @@ class EmbeddingModel:
                 convert_to_numpy=False,
                 **kwargs,
             )
-        elif isinstance(self._model, BGEM3FlagModel):
+        elif is_bge_m3_flag_model:
+            assert _encode_bgem3 is not None
             all_embeddings, all_token_nums = _encode_bgem3(
                 self._model, sentences, convert_to_numpy=False, **kwargs
             )
+        elif "clip" in self._model_spec.model_name.lower():
+            import base64
+            import re
+            from io import BytesIO
+
+            from PIL import Image
+
+            def base64_to_image(base64_str: str) -> Image.Image:
+                # base64_data = re.sub("^data:image/.+;base64,", "", base64_str)
+                base64_data = base64_str.split(",", 1)[1]
+                byte_data = base64.b64decode(base64_data)
+                image_data = BytesIO(byte_data)
+                img = Image.open(image_data)
+                return img
+
+            objs: list[dict[str, str]] = []
+            for item in sentences:
+                if isinstance(item, dict):
+                    if item.get("text") is not None:
+                        objs.append(item["text"])
+                    elif item.get("image") is not None:
+                        if re.match(r"^data:image/.+;base64,", item["image"]):
+                            image = base64_to_image(item["image"])
+                            objs.append(image)
+                        else:
+                            objs.append(item["image"])
+                else:
+                    logger.error("Please check the input data.")
+            all_embeddings, all_token_nums = encode(
+                self._model,
+                objs,
+                convert_to_numpy=False,
+                **self._kwargs,
+            )
         else:
             all_embeddings, all_token_nums = encode(
                 self._model,
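
The clip branch above expects each element of sentences to be a dict carrying either a "text" field or an "image" field; an image given as a data URI is base64-decoded into a PIL image, while any other string (for example a URL or local path) is passed through unchanged. A hedged sketch of the input shape this branch is written against (model is assumed to be an embedding model handle obtained from the xinference client):

    # Mixed text/image inputs for a clip-style embedding model such as jina-clip-v2.
    inputs = [
        {"text": "a photo of a corgi on the beach"},
        {"image": "https://example.com/corgi.png"},         # passed through as-is
        {"image": "data:image/png;base64,iVBORw0KGgo..."},  # decoded via base64_to_image
    ]
    resp = model.create_embedding(inputs)
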
@@ -608,7 +670,7 @@ class EmbeddingModel:
             all_embeddings = [all_embeddings]
         embedding_list = []
         for index, data in enumerate(all_embeddings):
-            if kwargs.get("return_sparse") and isinstance(self._model, BGEM3FlagModel):
+            if kwargs.get("return_sparse") and is_bge_m3_flag_model:
                 embedding_list.append(
                     EmbeddingData(
                         index=index,
@@ -628,8 +690,7 @@ class EmbeddingModel:
         result = Embedding(
             object=(
                 "list"  # type: ignore
-                if not isinstance(self._model, BGEM3FlagModel)
-                and not kwargs.get("return_sparse")
+                if not is_bge_m3_flag_model and not kwargs.get("return_sparse")
                 else "dict"
             ),
             model=self._model_uid,
xinference/model/embedding/model_spec.json

@@ -245,5 +245,12 @@
     "max_tokens": 8192,
     "language": ["zh", "en"],
     "model_id": "jinaai/jina-embeddings-v3"
+  },
+  {
+    "model_name": "jina-clip-v2",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["89 languages supported"],
+    "model_id": "jinaai/jina-clip-v2"
   }
 ]
xinference/model/embedding/model_spec_modelscope.json

@@ -248,5 +248,13 @@
     "language": ["zh", "en"],
     "model_id": "jinaai/jina-embeddings-v3",
     "model_hub": "modelscope"
+  },
+  {
+    "model_name": "jina-clip-v2",
+    "dimensions": 1024,
+    "max_tokens": 8192,
+    "language": ["89 languages supported"],
+    "model_id": "jinaai/jina-clip-v2",
+    "model_hub": "modelscope"
   }
 ]
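
Both spec files register jina-clip-v2 as a 1024-dimensional multimodal embedding model (HuggingFace and ModelScope hubs respectively). A hedged launch sketch with the Python client, assuming a running endpoint; inputs can then mix text and images as shown after the clip branch above:

    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")  # assumed local endpoint
    uid = client.launch_model(model_name="jina-clip-v2", model_type="embedding")
    model = client.get_model(uid)
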
xinference/model/llm/__init__.py

@@ -131,7 +131,7 @@ def register_custom_model():
 def _install():
     from .llama_cpp.core import LlamaCppChatModel, LlamaCppModel
     from .lmdeploy.core import LMDeployChatModel, LMDeployModel
-    from .mlx.core import MLXChatModel, MLXModel
+    from .mlx.core import MLXChatModel, MLXModel, MLXVisionModel
     from .sglang.core import SGLANGChatModel, SGLANGModel
     from .transformers.chatglm import ChatglmPytorchChatModel
     from .transformers.cogvlm2 import CogVLM2Model
@@ -172,7 +172,7 @@ def _install():
     )
     SGLANG_CLASSES.extend([SGLANGModel, SGLANGChatModel])
     VLLM_CLASSES.extend([VLLMModel, VLLMChatModel, VLLMVisionModel])
-    MLX_CLASSES.extend([MLXModel, MLXChatModel])
+    MLX_CLASSES.extend([MLXModel, MLXChatModel, MLXVisionModel])
     LMDEPLOY_CLASSES.extend([LMDeployModel, LMDeployChatModel])
     TRANSFORMERS_CLASSES.extend(
         [