xinference 0.11.1__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/supervisor.py +30 -2
  6. xinference/core/utils.py +12 -0
  7. xinference/core/worker.py +4 -1
  8. xinference/deploy/cmdline.py +126 -0
  9. xinference/deploy/test/test_cmdline.py +24 -0
  10. xinference/model/llm/__init__.py +2 -0
  11. xinference/model/llm/llm_family.json +501 -6
  12. xinference/model/llm/llm_family.py +84 -10
  13. xinference/model/llm/llm_family_modelscope.json +198 -7
  14. xinference/model/llm/memory.py +332 -0
  15. xinference/model/llm/pytorch/core.py +2 -0
  16. xinference/model/llm/pytorch/intern_vl.py +387 -0
  17. xinference/model/llm/utils.py +13 -0
  18. xinference/model/llm/vllm/core.py +5 -2
  19. xinference/model/rerank/core.py +23 -1
  20. xinference/model/utils.py +17 -7
  21. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  22. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  23. xinference/thirdparty/llava/mm_utils.py +3 -2
  24. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  25. xinference/thirdparty/omnilmm/chat.py +6 -5
  26. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/METADATA +8 -7
  27. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/RECORD +31 -29
  28. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  29. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  30. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  31. {xinference-0.11.1.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
@@ -34,6 +34,8 @@ from ..._compat import (
 )
 from ...constants import XINFERENCE_CACHE_DIR, XINFERENCE_MODEL_DIR
 from ..utils import (
+    IS_NEW_HUGGINGFACE_HUB,
+    create_symlink,
     download_from_modelscope,
     is_valid_model_uri,
     parse_uri,
@@ -447,6 +449,61 @@ def cache_from_uri(
         raise ValueError(f"Unsupported URL scheme: {src_scheme}")
 
 
+def cache_model_config(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+):
+    """Download model config.json into cache_dir,
+    returns local filepath
+    """
+    cache_dir = _get_cache_dir_for_model_mem(llm_family, llm_spec)
+    config_file = os.path.join(cache_dir, "config.json")
+    if not os.path.islink(config_file) and not os.path.exists(config_file):
+        os.makedirs(cache_dir, exist_ok=True)
+        if llm_spec.model_hub == "huggingface":
+            from huggingface_hub import hf_hub_download
+
+            hf_hub_download(
+                repo_id=llm_spec.model_id, filename="config.json", local_dir=cache_dir
+            )
+        else:
+            from modelscope.hub.file_download import model_file_download
+
+            download_path = model_file_download(
+                model_id=llm_spec.model_id, file_path="config.json"
+            )
+            os.symlink(download_path, config_file)
+    return config_file
+
+
+def _get_cache_dir_for_model_mem(
+    llm_family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    create_if_not_exist=True,
+):
+    """
+    For cal-model-mem only. (might called from supervisor / cli)
+    Temporary use separate dir from worker's cache_dir, due to issue of different style of symlink.
+    """
+    quant_suffix = ""
+    for q in llm_spec.quantizations:
+        if llm_spec.model_id and q in llm_spec.model_id:
+            quant_suffix = q
+            break
+    cache_dir_name = (
+        f"{llm_family.model_name}-{llm_spec.model_format}"
+        f"-{llm_spec.model_size_in_billions}b"
+    )
+    if quant_suffix:
+        cache_dir_name += f"-{quant_suffix}"
+    cache_dir = os.path.realpath(
+        os.path.join(XINFERENCE_CACHE_DIR, "model_mem", cache_dir_name)
+    )
+    if create_if_not_exist and not os.path.exists(cache_dir):
+        os.makedirs(cache_dir, exist_ok=True)
+    return cache_dir
+
+
 def _get_cache_dir(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
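cache_model_config and _get_cache_dir_for_model_mem back the new model-memory estimation feature in this release (the "cal-model-mem" flow mentioned in the docstring, implemented in xinference/model/llm/memory.py and wired into cmdline.py): only config.json is fetched, into a dedicated model_mem cache, so memory can be estimated without downloading weights. A hedged sketch of how a caller might drive these helpers — the match_llm lookup and its (family, spec, quantization) return shape follow the existing API, and the model name here is only an example:

    from xinference.model.llm.llm_family import cache_model_config, match_llm

    # Illustrative only: look up a registered family/spec, then fetch just its
    # config.json so memory usage can be estimated without pulling the weights.
    match_result = match_llm("qwen1.5-chat", model_format="pytorch")
    if match_result is not None:
        llm_family, llm_spec, _quantization = match_result
        config_path = cache_model_config(llm_family, llm_spec)
        # e.g. <XINFERENCE_CACHE_DIR>/model_mem/qwen1.5-chat-pytorch-7b/config.json
        print(config_path)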
@@ -625,10 +682,7 @@ def cache_from_modelscope(
             llm_spec.model_id,
             revision=llm_spec.model_revision,
         )
-        for subdir, dirs, files in os.walk(download_dir):
-            for file in files:
-                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
-                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
+        create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         file_names, final_file_name, need_merge = _generate_model_file_names(
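create_symlink replaces the inline os.walk loop removed above and is imported from ..utils (see the import hunk at the top of this diff). Judging from the deleted code, it is presumably a straight extraction of that loop into xinference/model/utils.py; the sketch below is an assumption based on the removed lines, not the actual implementation, and the symlink_local_file helper shown here is likewise an approximation:

    import os

    def symlink_local_file(path: str, cache_dir: str, relpath: str) -> str:
        # Assumed helper: link cache_dir/relpath -> path, creating parent dirs first.
        full_path = os.path.join(cache_dir, relpath)
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        if os.path.lexists(full_path):
            os.remove(full_path)
        os.symlink(os.path.abspath(path), full_path)
        return full_path

    def create_symlink(download_dir: str, cache_dir: str):
        # Mirror every file under download_dir into cache_dir as symlinks,
        # preserving the relative layout -- the same behaviour as the loop
        # this release removed from cache_from_modelscope.
        for subdir, _dirs, files in os.walk(download_dir):
            for file in files:
                relpath = os.path.relpath(os.path.join(subdir, file), download_dir)
                symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)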
@@ -682,9 +736,13 @@ def cache_from_huggingface(
     ):
         return cache_dir
 
+    use_symlinks = {}
+    if not IS_NEW_HUGGINGFACE_HUB:
+        use_symlinks = {"local_dir_use_symlinks": True, "local_dir": cache_dir}
+
     if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
-        retry_download(
+        download_dir = retry_download(
            huggingface_hub.snapshot_download,
            llm_family.model_name,
            {
@@ -693,9 +751,10 @@
            },
            llm_spec.model_id,
            revision=llm_spec.model_revision,
-           local_dir=cache_dir,
-           local_dir_use_symlinks=True,
+           **use_symlinks,
        )
+        if IS_NEW_HUGGINGFACE_HUB:
+            create_symlink(download_dir, cache_dir)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
@@ -704,7 +763,7 @@
         )
 
         for file_name in file_names:
-            retry_download(
+            download_file_path = retry_download(
                huggingface_hub.hf_hub_download,
                llm_family.model_name,
                {
@@ -714,9 +773,10 @@
                llm_spec.model_id,
                revision=llm_spec.model_revision,
                filename=file_name,
-               local_dir=cache_dir,
-               local_dir_use_symlinks=True,
+               **use_symlinks,
            )
+            if IS_NEW_HUGGINGFACE_HUB:
+                symlink_local_file(download_file_path, cache_dir, file_name)
 
     if need_merge:
         _merge_cached_files(cache_dir, file_names, final_file_name)
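The use_symlinks indirection above exists because recent huggingface_hub releases reworked local_dir downloads: local_dir_use_symlinks is deprecated there and the download call returns a real path instead, so on new versions xinference downloads into the hub cache and symlinks into its own cache afterwards via create_symlink / symlink_local_file. This hunk does not show how IS_NEW_HUGGINGFACE_HUB is computed (it comes from the xinference/model/utils.py changes in this release); one plausible shape, offered purely as an assumption, is a simple version gate:

    # Assumption: gate on the huggingface_hub version that changed local_dir
    # handling. The exact threshold xinference uses may differ; see
    # xinference/model/utils.py in this release.
    import huggingface_hub
    from packaging import version

    IS_NEW_HUGGINGFACE_HUB = version.parse(huggingface_hub.__version__) >= version.parse(
        "0.23.0"
    )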
@@ -823,6 +883,20 @@ def match_model_size(
     return False
 
 
+def convert_model_size_to_float(
+    model_size_in_billions: Union[float, int, str]
+) -> float:
+    if isinstance(model_size_in_billions, str):
+        if "_" in model_size_in_billions:
+            ms = model_size_in_billions.replace("_", ".")
+            return float(ms)
+        elif "." in model_size_in_billions:
+            return float(model_size_in_billions)
+        else:
+            return int(model_size_in_billions)
+    return model_size_in_billions
+
+
 def match_llm(
     model_name: str,
     model_format: Optional[str] = None,
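convert_model_size_to_float normalizes the model_size_in_billions field, which the family JSON stores either as a number or as an underscore-separated string (for example the "1_3" and "6_7" deepseek-coder sizes added further down), so the memory estimator can work with a plain numeric value. For instance:

    convert_model_size_to_float("1_3")  # -> 1.3
    convert_model_size_to_float("6_7")  # -> 6.7
    convert_model_size_to_float("7")    # -> 7
    convert_model_size_to_float(67)     # -> 67 (non-strings pass through unchanged)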
@@ -2430,6 +2430,32 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 65536,
+        "model_name": "codeqwen1.5",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "qwen/CodeQwen1.5-7B",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 65536,
@@ -2548,6 +2574,43 @@
             ]
         }
     },
+    {
+        "version": 1,
+        "context_length": 4096,
+        "model_name": "deepseek",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "DDeepSeek LLM, trained from scratch on a vast dataset of 2 trillion tokens in both English and Chinese. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 67,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-llm-67b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
     {
         "version": 1,
         "context_length": 4096,
@@ -2600,7 +2663,55 @@
     },
     {
         "version": 1,
-        "context_length": 4096,
+        "context_length": 16384,
+        "model_name": "deepseek-coder",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "generate"
+        ],
+        "model_description": "Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "1_3",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-1.3b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "6_7",
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-6.7b-base",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 33,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_id": "deepseek-ai/deepseek-coder-33b-base",
+                "model_hub": "modelscope"
+            }
+        ]
+    },
+    {
+        "version": 1,
+        "context_length": 16384,
         "model_name": "deepseek-coder-instruct",
         "model_lang": [
             "en",
@@ -3389,7 +3500,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
         ],
         "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
         "model_specs": [
@@ -3408,11 +3519,12 @@
                 "model_size_in_billions": 35,
                 "quantizations": [
                     "Q2_K",
+                    "Q3_K_M",
                     "Q4_K_M",
                     "Q5_K_M"
                 ],
                 "model_id": "mirror013/C4AI-Command-R-v01-GGUF",
-                "model_file_name_template": "c4ai-command-r-v01.{quantization}.gguf",
+                "model_file_name_template": "c4ai-command-r-v01-{quantization}.gguf",
                 "model_hub": "modelscope",
                 "model_revision": "master"
             },
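The model_file_name_template change above matters because the GGUF file name is produced by substituting the chosen quantization into the template, so the old dot-separated form presumably pointed at file names that are not present in the mirror013/C4AI-Command-R-v01-GGUF repository. With the corrected template:

    template = "c4ai-command-r-v01-{quantization}.gguf"
    template.format(quantization="Q4_K_M")  # -> "c4ai-command-r-v01-Q4_K_M.gguf"
    template.format(quantization="Q3_K_M")  # -> "c4ai-command-r-v01-Q3_K_M.gguf" (newly added quantization)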
@@ -3426,7 +3538,21 @@
                 "model_id": "AI-ModelScope/c4ai-command-r-plus",
                 "model_revision": "master"
             }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
@@ -3445,7 +3571,7 @@
             "ar"
         ],
         "model_ability": [
-            "generate"
+            "chat"
         ],
         "model_description": "This model is 4bit quantized version of C4AI Command-R using bitsandbytes.",
         "model_specs": [
@@ -3459,7 +3585,21 @@
                 "model_id": "mirror013/c4ai-command-r-v01-4bit",
                 "model_revision": "master"
             }
-        ]
+        ],
+        "prompt_style": {
+            "style_name": "c4ai-command-r",
+            "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
+            "roles": [
+                "<|USER_TOKEN|>",
+                "<|CHATBOT_TOKEN|>"
+            ],
+            "intra_message_sep": "",
+            "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
+            "stop_token_ids": [
+                6,
+                255001
+            ]
+        }
     },
     {
         "version": 1,
@@ -3548,5 +3688,56 @@
                 "<|end|>"
             ]
         }
-    }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "internvl-chat",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "Int8"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "AI-ModelScope/InternVL-Chat-V1-5-{quantization}",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "INTERNLM2",
+            "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+            "roles": [
+                "<|im_start|>user",
+                "<|im_start|>assistant"
+            ],
+            "intra_message_sep": "<|im_end|>",
+            "stop_token_ids": [
+                92542
+            ],
+            "stop": [
+                "<|im_end|>"
+            ]
+        }
+    }
 ]
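With the internvl-chat entry registered (and the matching intern_vl.py implementation added under xinference/model/llm/pytorch/ in this release), the model should be launchable like any other built-in model. A hedged example using the existing RESTful client API — the endpoint and keyword names are assumed unchanged from prior 0.11.x releases:

    from xinference.client import Client

    # Assumes a locally running xinference supervisor on the default port.
    client = Client("http://localhost:9997")
    model_uid = client.launch_model(
        model_name="internvl-chat",
        model_format="pytorch",
        model_size_in_billions=26,
        quantization="none",
    )
    model = client.get_model(model_uid)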