xinference 0.13.0__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (66)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +50 -2
  3. xinference/client/restful/restful_client.py +49 -2
  4. xinference/core/model.py +15 -0
  5. xinference/core/supervisor.py +132 -15
  6. xinference/core/worker.py +165 -8
  7. xinference/deploy/cmdline.py +5 -0
  8. xinference/model/audio/chattts.py +6 -6
  9. xinference/model/audio/core.py +23 -15
  10. xinference/model/core.py +12 -3
  11. xinference/model/embedding/core.py +25 -16
  12. xinference/model/flexible/__init__.py +40 -0
  13. xinference/model/flexible/core.py +228 -0
  14. xinference/model/flexible/launchers/__init__.py +15 -0
  15. xinference/model/flexible/launchers/transformers_launcher.py +63 -0
  16. xinference/model/flexible/utils.py +33 -0
  17. xinference/model/image/core.py +18 -14
  18. xinference/model/image/custom.py +1 -1
  19. xinference/model/llm/__init__.py +0 -2
  20. xinference/model/llm/core.py +3 -2
  21. xinference/model/llm/ggml/llamacpp.py +1 -10
  22. xinference/model/llm/llm_family.json +52 -35
  23. xinference/model/llm/llm_family.py +71 -46
  24. xinference/model/llm/llm_family_modelscope.json +55 -27
  25. xinference/model/llm/pytorch/core.py +0 -80
  26. xinference/model/llm/utils.py +4 -2
  27. xinference/model/rerank/core.py +24 -25
  28. xinference/types.py +0 -1
  29. xinference/web/ui/build/asset-manifest.json +3 -3
  30. xinference/web/ui/build/index.html +1 -1
  31. xinference/web/ui/build/static/js/{main.0fb6f3ab.js → main.95c1d652.js} +3 -3
  32. xinference/web/ui/build/static/js/main.95c1d652.js.map +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/07ce9e632e6aff24d7aa3ad8e48224433bbfeb0d633fca723453f1fcae0c9f1c.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/40f17338fc75ae095de7d2b4d8eae0d5ca0193a7e2bcece4ee745b22a7a2f4b7.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/70fa8c07463a5fe57c68bf92502910105a8f647371836fe8c3a7408246ca7ba0.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +1 -0
  39. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/METADATA +7 -11
  40. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/RECORD +45 -54
  41. xinference/model/llm/ggml/chatglm.py +0 -457
  42. xinference/thirdparty/ChatTTS/__init__.py +0 -1
  43. xinference/thirdparty/ChatTTS/core.py +0 -200
  44. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  45. xinference/thirdparty/ChatTTS/experimental/llm.py +0 -40
  46. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  47. xinference/thirdparty/ChatTTS/infer/api.py +0 -125
  48. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  49. xinference/thirdparty/ChatTTS/model/dvae.py +0 -155
  50. xinference/thirdparty/ChatTTS/model/gpt.py +0 -265
  51. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  52. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +0 -23
  53. xinference/thirdparty/ChatTTS/utils/infer_utils.py +0 -141
  54. xinference/thirdparty/ChatTTS/utils/io_utils.py +0 -14
  55. xinference/web/ui/build/static/js/main.0fb6f3ab.js.map +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/0f6b391abec76271137faad13a3793fe7acc1024e8cd2269c147b653ecd3a73b.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/30a0c79d8025d6441eb75b2df5bc2750a14f30119c869ef02570d294dff65c2f.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/40486e655c3c5801f087e2cf206c0b5511aaa0dfdba78046b7181bf9c17e54c5.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/b5507cd57f16a3a230aa0128e39fe103e928de139ea29e2679e4c64dcbba3b3a.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/d779b915f83f9c7b5a72515b6932fdd114f1822cef90ae01cc0d12bca59abc2d.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/d87824cb266194447a9c0c69ebab2d507bfc3e3148976173760d18c035e9dd26.json +0 -1
  62. /xinference/web/ui/build/static/js/{main.0fb6f3ab.js.LICENSE.txt → main.95c1d652.js.LICENSE.txt} +0 -0
  63. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/LICENSE +0 -0
  64. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/WHEEL +0 -0
  65. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/entry_points.txt +0 -0
  66. {xinference-0.13.0.dist-info → xinference-0.13.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py CHANGED
@@ -14,7 +14,6 @@
 
  import logging
  import os
- import platform
  import shutil
  from threading import Lock
  from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -541,15 +540,20 @@ def _get_cache_dir_for_model_mem(
  def _get_cache_dir(
      llm_family: LLMFamilyV1,
      llm_spec: "LLMSpecV1",
+     quantization: Optional[str] = None,
      create_if_not_exist=True,
  ):
      # If the model id contains quantization, then we should give each
      # quantization a dedicated cache dir.
      quant_suffix = ""
-     for q in llm_spec.quantizations:
-         if llm_spec.model_id and q in llm_spec.model_id:
-             quant_suffix = q
-             break
+     if llm_spec.model_id and "{" in llm_spec.model_id and quantization is not None:
+         quant_suffix = quantization
+     else:
+         for q in llm_spec.quantizations:
+             if llm_spec.model_id and q in llm_spec.model_id:
+                 quant_suffix = q
+                 break
+
      cache_dir_name = (
          f"{llm_family.model_name}-{llm_spec.model_format}"
          f"-{llm_spec.model_size_in_billions}b"
@@ -900,6 +904,7 @@ def _check_revision(
      llm_spec: "LLMSpecV1",
      builtin: list,
      meta_path: str,
+     quantization: Optional[str] = None,
  ) -> bool:
      for family in builtin:
          if llm_family.model_name == family.model_name:
@@ -908,59 +913,63 @@ def _check_revision(
              if (
                  spec.model_format == "pytorch"
                  and spec.model_size_in_billions == llm_spec.model_size_in_billions
+                 and (quantization is None or quantization in spec.quantizations)
              ):
                  return valid_model_revision(meta_path, spec.model_revision)
      return False
 
 
  def get_cache_status(
-     llm_family: LLMFamilyV1,
-     llm_spec: "LLMSpecV1",
+     llm_family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: Optional[str] = None
  ) -> Union[bool, List[bool]]:
      """
-     When calling this function from above, `llm_family` is constructed only from BUILTIN_LLM_FAMILIES,
-     so we should check both huggingface and modelscope cache files.
+     Checks if a model's cache status is available based on the model format and quantization.
+     Supports different directories and model formats.
      """
-     cache_dir = _get_cache_dir(llm_family, llm_spec, create_if_not_exist=False)
-     # check revision for pytorch model
-     if llm_spec.model_format == "pytorch":
-         hf_meta_path = _get_meta_path(cache_dir, "pytorch", "huggingface", "none")
-         ms_meta_path = _get_meta_path(cache_dir, "pytorch", "modelscope", "none")
-         revisions = [
-             _check_revision(llm_family, llm_spec, BUILTIN_LLM_FAMILIES, hf_meta_path),
-             _check_revision(
-                 llm_family, llm_spec, BUILTIN_MODELSCOPE_LLM_FAMILIES, ms_meta_path
-             ),
-         ]
-         return any(revisions)
-     # just check meta file for ggml and gptq model
-     elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq", "mlx"]:
-         ret = []
-         for q in llm_spec.quantizations:
-             assert q is not None
-             hf_meta_path = _get_meta_path(
-                 cache_dir, llm_spec.model_format, "huggingface", q
-             )
-             ms_meta_path = _get_meta_path(
-                 cache_dir, llm_spec.model_format, "modelscope", q
-             )
-             results = [os.path.exists(hf_meta_path), os.path.exists(ms_meta_path)]
-             ret.append(any(results))
-         return ret
-     else:
-         raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
-
 
- def _is_linux():
-     return platform.system() == "Linux"
+     def check_file_status(meta_path: str) -> bool:
+         return os.path.exists(meta_path)
 
+     def check_revision_status(
+         meta_path: str, families: list, quantization: Optional[str] = None
+     ) -> bool:
+         return _check_revision(llm_family, llm_spec, families, meta_path, quantization)
 
- def _has_cuda_device():
-     # `cuda_count` method already contains the logic for the
-     # number of GPUs specified by `CUDA_VISIBLE_DEVICES`.
-     from ...utils import cuda_count
+     def handle_quantization(q: Union[str, None]) -> bool:
+         specific_cache_dir = _get_cache_dir(
+             llm_family, llm_spec, q, create_if_not_exist=False
+         )
+         meta_paths = {
+             "huggingface": _get_meta_path(
+                 specific_cache_dir, llm_spec.model_format, "huggingface", q
+             ),
+             "modelscope": _get_meta_path(
+                 specific_cache_dir, llm_spec.model_format, "modelscope", q
+             ),
+         }
+         if llm_spec.model_format == "pytorch":
+             return check_revision_status(
+                 meta_paths["huggingface"], BUILTIN_LLM_FAMILIES, q
+             ) or check_revision_status(
+                 meta_paths["modelscope"], BUILTIN_MODELSCOPE_LLM_FAMILIES, q
+             )
+         else:
+             return check_file_status(meta_paths["huggingface"]) or check_file_status(
+                 meta_paths["modelscope"]
+             )
 
-     return cuda_count() > 0
+     if llm_spec.model_id and "{" in llm_spec.model_id:
+         return (
+             [handle_quantization(q) for q in llm_spec.quantizations]
+             if quantization is None
+             else handle_quantization(quantization)
+         )
+     else:
+         return (
+             [handle_quantization(q) for q in llm_spec.quantizations]
+             if llm_spec.model_format != "pytorch"
+             else handle_quantization(None)
+         )
 
 
  def get_user_defined_llm_families():
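The rewritten `get_cache_status` above now returns either a single bool or one bool per quantization, depending on the spec. A simplified sketch of just that dispatch, not the actual xinference code (the real meta-file and revision checks are stubbed out with an `is_cached` callback; names are illustrative):

    from typing import Callable, List, Optional, Union

    def cache_status_shape(
        model_format: str,
        model_id: str,
        quantizations: List[str],
        quantization: Optional[str],
        is_cached: Callable[[Optional[str]], bool],
    ) -> Union[bool, List[bool]]:
        # Templated ids: per-quantization list unless one quantization was requested.
        if model_id and "{" in model_id:
            if quantization is None:
                return [is_cached(q) for q in quantizations]
            return is_cached(quantization)
        # Non-templated ids: pytorch gets a single bool, other formats a list.
        if model_format != "pytorch":
            return [is_cached(q) for q in quantizations]
        return is_cached(None)

    print(cache_status_shape("ggufv2", "org/some-model-GGUF", ["Q4_K_M", "Q8_0"], None, lambda q: False))  # [False, False]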
@@ -1006,6 +1015,7 @@ def match_llm(
      model_format: Optional[str] = None,
      model_size_in_billions: Optional[Union[int, str]] = None,
      quantization: Optional[str] = None,
+     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
  ) -> Optional[Tuple[LLMFamilyV1, LLMSpecV1, str]]:
      """
      Find an LLM family, spec, and quantization that satisfy given criteria.
@@ -1029,7 +1039,22 @@ def match_llm(
              spec.model_id = spec.model_id.format(quantization=q)
          return spec
 
-     if download_from_modelscope():
+     # priority: download_hub > download_from_modelscope() and download_from_csghub()
+     if download_hub == "modelscope":
+         all_families = (
+             BUILTIN_MODELSCOPE_LLM_FAMILIES
+             + BUILTIN_LLM_FAMILIES
+             + user_defined_llm_families
+         )
+     elif download_hub == "csghub":
+         all_families = (
+             BUILTIN_CSGHUB_LLM_FAMILIES
+             + BUILTIN_LLM_FAMILIES
+             + user_defined_llm_families
+         )
+     elif download_hub == "huggingface":
+         all_families = BUILTIN_LLM_FAMILIES + user_defined_llm_families
+     elif download_from_modelscope():
          all_families = (
              BUILTIN_MODELSCOPE_LLM_FAMILIES
              + BUILTIN_LLM_FAMILIES
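The new `download_hub` argument takes precedence over the environment-driven defaults when `match_llm` decides which registries to search. A minimal sketch of the ordering for the branches visible in this hunk, not the actual implementation (`prefer_modelscope` stands in for `download_from_modelscope()`):

    from typing import List, Optional

    def hub_search_order(
        download_hub: Optional[str] = None,
        prefer_modelscope: bool = False,
    ) -> List[str]:
        # Explicit hub choice wins; otherwise fall back to the previous behaviour.
        if download_hub == "modelscope":
            return ["modelscope", "huggingface", "user-defined"]
        if download_hub == "csghub":
            return ["csghub", "huggingface", "user-defined"]
        if download_hub == "huggingface":
            return ["huggingface", "user-defined"]
        if prefer_modelscope:
            return ["modelscope", "huggingface", "user-defined"]
        return ["huggingface", "user-defined"]

    print(hub_search_order("csghub"))  # ['csghub', 'huggingface', 'user-defined']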
xinference/model/llm/llm_family_modelscope.json CHANGED
@@ -304,21 +304,6 @@
      ],
      "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
      "model_specs": [
-       {
-         "model_format": "ggmlv3",
-         "model_size_in_billions": 6,
-         "quantizations": [
-           "q4_0",
-           "q4_1",
-           "q5_0",
-           "q5_1",
-           "q8_0"
-         ],
-         "model_hub": "modelscope",
-         "model_id": "Xorbits/chatglm2-6B-GGML",
-         "model_revision": "v1.0.0",
-         "model_file_name_template": "chatglm2-ggml-{quantization}.bin"
-       },
        {
          "model_format": "pytorch",
          "model_size_in_billions": 6,
@@ -392,17 +377,6 @@
      ],
      "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
      "model_specs": [
-       {
-         "model_format": "ggmlv3",
-         "model_size_in_billions": 6,
-         "quantizations": [
-           "q4_0"
-         ],
-         "model_hub": "modelscope",
-         "model_id": "Xorbits/chatglm3-ggml",
-         "model_revision": "v1.0.0",
-         "model_file_name_template": "chatglm3-ggml-{quantization}.bin"
-       },
        {
          "model_format": "pytorch",
          "model_size_in_billions": 6,
@@ -547,6 +521,33 @@
          "model_hub": "modelscope",
          "model_id": "ZhipuAI/glm-4-9b-chat",
          "model_revision": "master"
+       },
+       {
+         "model_format": "ggufv2",
+         "model_size_in_billions": 9,
+         "quantizations": [
+           "Q2_K",
+           "IQ3_XS",
+           "IQ3_S",
+           "IQ3_M",
+           "Q3_K_S",
+           "Q3_K_L",
+           "Q3_K",
+           "IQ4_XS",
+           "IQ4_NL",
+           "Q4_K_S",
+           "Q4_K",
+           "Q5_K_S",
+           "Q5_K",
+           "Q6_K",
+           "Q8_0",
+           "BF16",
+           "FP16"
+         ],
+         "model_file_name_template": "glm-4-9b-chat.{quantization}.gguf",
+         "model_hub": "modelscope",
+         "model_id": "LLM-Research/glm-4-9b-chat-GGUF",
+         "model_revision": "master"
        }
      ],
      "prompt_style": {
@@ -593,6 +594,33 @@
          "model_hub": "modelscope",
          "model_id": "ZhipuAI/glm-4-9b-chat-1m",
          "model_revision": "master"
+       },
+       {
+         "model_format": "ggufv2",
+         "model_size_in_billions": 9,
+         "quantizations": [
+           "Q2_K",
+           "IQ3_XS",
+           "IQ3_S",
+           "IQ3_M",
+           "Q3_K_S",
+           "Q3_K_L",
+           "Q3_K",
+           "IQ4_XS",
+           "IQ4_NL",
+           "Q4_K_S",
+           "Q4_K",
+           "Q5_K_S",
+           "Q5_K",
+           "Q6_K",
+           "Q8_0",
+           "BF16",
+           "FP16"
+         ],
+         "model_file_name_template": "glm-4-9b-chat-1m.{quantization}.gguf",
+         "model_hub": "modelscope",
+         "model_id": "LLM-Research/glm-4-9b-chat-1m-GGUF",
+         "model_revision": "master"
        }
      ],
      "prompt_style": {
@@ -4115,7 +4143,7 @@
        "zh"
      ],
      "model_ability": [
-       "generate"
+       "chat"
      ],
      "model_description": "Aquila2-chat series models are the chat models",
      "model_specs": [
xinference/model/llm/pytorch/core.py CHANGED
@@ -34,9 +34,6 @@ from ....types import (
      CompletionChoice,
      CompletionChunk,
      CreateCompletionTorch,
-     Embedding,
-     EmbeddingData,
-     EmbeddingUsage,
      LoRA,
      PytorchGenerateConfig,
      PytorchModelConfig,
@@ -673,83 +670,6 @@ class PytorchModel(LLM):
      )
      self.handle_batch_inference_results(req_list)
 
-     def create_embedding(self, input: Union[str, List[str]]) -> Embedding:
-         try:
-             import torch
-             import torch.nn.functional as F
-         except ImportError as e:
-             raise ImportError(
-                 "Could not import torch. Please install it with `pip install torch`."
-             ) from e
-
-         if isinstance(input, str):
-             inputs = [input]
-         else:
-             inputs = input
-
-         tokenizer = self._tokenizer
-         tokenizer.pad_token = tokenizer.eos_token
-         is_llama = "llama" in str(type(self._model))  # llama supports batch inference
-         is_chatglm = "chatglm" in str(type(self._model))
-         if is_llama:
-             encoding = tokenizer.batch_encode_plus(
-                 inputs, padding=True, return_tensors="pt"
-             )
-             input_ids = encoding["input_ids"].to(self._device)
-             attention_mask = encoding["attention_mask"].to(self._device)
-             model_output = self._model(
-                 input_ids, attention_mask, output_hidden_states=True
-             )
-             data = model_output.hidden_states[-1]
-             mask = attention_mask.unsqueeze(-1).expand(data.size()).float()
-             masked_embeddings = data * mask
-             sum_embeddings = torch.sum(masked_embeddings, dim=1)
-             seq_length = torch.sum(mask, dim=1)
-             embedding = sum_embeddings / seq_length
-             normalized_embeddings = F.normalize(embedding, p=2, dim=1)
-             normalized_embeddings = normalized_embeddings.tolist()
-             token_num = torch.sum(attention_mask).item()
-
-             embedding_list = []
-             for index, data in enumerate(normalized_embeddings):
-                 embedding_list.append(
-                     EmbeddingData(index=index, object="embedding", embedding=data)
-                 )
-
-             usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-
-             ret = Embedding(
-                 object="list",
-                 model=self.model_uid,
-                 data=embedding_list,
-                 usage=usage,
-             )
-
-         else:
-             embedding = []
-             token_num = 0
-             for index, text in enumerate(inputs):
-                 input_ids = tokenizer.encode(text, return_tensors="pt").to(self._device)
-                 model_output = self._model(input_ids, output_hidden_states=True)
-                 if is_chatglm:
-                     data = (model_output.hidden_states[-1].transpose(0, 1))[0]
-                 else:
-                     data = model_output.hidden_states[-1][0]
-                 data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
-                 data = data.tolist()
-
-                 embedding.append(
-                     EmbeddingData(index=index, object="embedding", embedding=data)
-                 )
-                 token_num += len(input_ids[0])
-
-             usage = EmbeddingUsage(prompt_tokens=token_num, total_tokens=token_num)
-             ret = Embedding(
-                 object="list", model=self.model_uid, data=embedding, usage=usage
-             )
-
-         return ret
-
 
  class PytorchChatModel(PytorchModel, ChatModelMixin):
      def __init__(
xinference/model/llm/utils.py CHANGED
@@ -779,8 +779,10 @@ Begin!"""
  def get_file_location(
      llm_family: LLMFamilyV1, spec: LLMSpecV1, quantization: str
  ) -> Tuple[str, bool]:
-     cache_dir = _get_cache_dir(llm_family, spec, create_if_not_exist=False)
-     cache_status = get_cache_status(llm_family, spec)
+     cache_dir = _get_cache_dir(
+         llm_family, spec, quantization, create_if_not_exist=False
+     )
+     cache_status = get_cache_status(llm_family, spec, quantization)
      if isinstance(cache_status, list):
          is_cached = None
          for q, cs in zip(spec.quantizations, cache_status):
xinference/model/rerank/core.py CHANGED
@@ -18,7 +18,7 @@ import os
  import uuid
  from collections import defaultdict
  from collections.abc import Sequence
- from typing import Dict, List, Optional, Tuple
+ from typing import Dict, List, Literal, Optional, Tuple
 
  import numpy as np
  import torch
@@ -285,7 +285,12 @@ def cache(model_spec: RerankModelSpec):
 
 
  def create_rerank_model_instance(
-     subpool_addr: str, devices: List[str], model_uid: str, model_name: str, **kwargs
+     subpool_addr: str,
+     devices: List[str],
+     model_uid: str,
+     model_name: str,
+     download_hub: Optional[Literal["huggingface", "modelscope", "csghub"]] = None,
+     **kwargs,
  ) -> Tuple[RerankModel, RerankModelDescription]:
      from ..utils import download_from_modelscope
      from . import BUILTIN_RERANK_MODELS, MODELSCOPE_RERANK_MODELS
@@ -298,30 +303,24 @@ def create_rerank_model_instance(
              break
 
      if model_spec is None:
-         if download_from_modelscope():
-             if model_name in MODELSCOPE_RERANK_MODELS:
-                 logger.debug(f"Rerank model {model_name} found in ModelScope.")
-                 model_spec = MODELSCOPE_RERANK_MODELS[model_name]
-             else:
-                 logger.debug(
-                     f"Rerank model {model_name} not found in ModelScope, "
-                     f"now try to download from huggingface."
-                 )
-                 if model_name in BUILTIN_RERANK_MODELS:
-                     model_spec = BUILTIN_RERANK_MODELS[model_name]
-                 else:
-                     raise ValueError(
-                         f"Rerank model {model_name} not found, available"
-                         f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-                     )
+         if download_hub == "huggingface" and model_name in BUILTIN_RERANK_MODELS:
+             logger.debug(f"Rerank model {model_name} found in Huggingface.")
+             model_spec = BUILTIN_RERANK_MODELS[model_name]
+         elif download_hub == "modelscope" and model_name in MODELSCOPE_RERANK_MODELS:
+             logger.debug(f"Rerank model {model_name} found in ModelScope.")
+             model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+         elif download_from_modelscope() and model_name in MODELSCOPE_RERANK_MODELS:
+             logger.debug(f"Rerank model {model_name} found in ModelScope.")
+             model_spec = MODELSCOPE_RERANK_MODELS[model_name]
+         elif model_name in BUILTIN_RERANK_MODELS:
+             logger.debug(f"Rerank model {model_name} found in Huggingface.")
+             model_spec = BUILTIN_RERANK_MODELS[model_name]
          else:
-             if model_name in BUILTIN_RERANK_MODELS:
-                 model_spec = BUILTIN_RERANK_MODELS[model_name]
-             else:
-                 raise ValueError(
-                     f"Rerank model {model_name} not found, available"
-                     f"model list: {BUILTIN_RERANK_MODELS.keys()}"
-                 )
+             raise ValueError(
+                 f"Rerank model {model_name} not found, available"
+                 f"Huggingface: {BUILTIN_RERANK_MODELS.keys()}"
+                 f"ModelScope: {MODELSCOPE_RERANK_MODELS.keys()}"
+             )
 
      model_path = cache(model_spec)
      use_fp16 = kwargs.pop("use_fp16", False)
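The rerank lookup above follows the same idea as the LLM change: an explicit `download_hub` is honoured first, then the ModelScope preference, then the Hugging Face builtins. A standalone sketch of that order, not the actual xinference code, with plain dicts standing in for the real spec registries (`prefer_modelscope` stands in for `download_from_modelscope()`):

    from typing import Dict, Optional

    def resolve_rerank_spec(
        model_name: str,
        builtin: Dict[str, dict],
        modelscope: Dict[str, dict],
        download_hub: Optional[str] = None,
        prefer_modelscope: bool = False,
    ) -> dict:
        # Mirrors the branch order in the hunk above.
        if download_hub == "huggingface" and model_name in builtin:
            return builtin[model_name]
        if download_hub == "modelscope" and model_name in modelscope:
            return modelscope[model_name]
        if prefer_modelscope and model_name in modelscope:
            return modelscope[model_name]
        if model_name in builtin:
            return builtin[model_name]
        raise ValueError(f"Rerank model {model_name} not found")

    spec = resolve_rerank_spec("bge-reranker-base", {"bge-reranker-base": {}}, {}, download_hub="huggingface")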
xinference/types.py CHANGED
@@ -285,7 +285,6 @@ class LlamaCppModelConfig(TypedDict, total=False):
      vocab_only: bool
      use_mmap: bool
      use_mlock: bool
-     embedding: bool
      n_threads: Optional[int]
      n_batch: int
      last_n_tokens_size: int
xinference/web/ui/build/asset-manifest.json CHANGED
@@ -1,14 +1,14 @@
  {
    "files": {
      "main.css": "./static/css/main.4bafd904.css",
-     "main.js": "./static/js/main.0fb6f3ab.js",
+     "main.js": "./static/js/main.95c1d652.js",
      "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
      "index.html": "./index.html",
      "main.4bafd904.css.map": "./static/css/main.4bafd904.css.map",
-     "main.0fb6f3ab.js.map": "./static/js/main.0fb6f3ab.js.map"
+     "main.95c1d652.js.map": "./static/js/main.95c1d652.js.map"
    },
    "entrypoints": [
      "static/css/main.4bafd904.css",
-     "static/js/main.0fb6f3ab.js"
+     "static/js/main.95c1d652.js"
    ]
  }
xinference/web/ui/build/index.html CHANGED
@@ -1 +1 @@
- <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.0fb6f3ab.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+ <!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.95c1d652.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>