xinference 0.11.1__py3-none-any.whl → 0.11.2.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (31):
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/supervisor.py +30 -2
  6. xinference/core/utils.py +12 -0
  7. xinference/core/worker.py +4 -1
  8. xinference/deploy/cmdline.py +126 -0
  9. xinference/deploy/test/test_cmdline.py +24 -0
  10. xinference/model/llm/__init__.py +2 -0
  11. xinference/model/llm/llm_family.json +501 -6
  12. xinference/model/llm/llm_family.py +84 -10
  13. xinference/model/llm/llm_family_modelscope.json +198 -7
  14. xinference/model/llm/memory.py +332 -0
  15. xinference/model/llm/pytorch/core.py +2 -0
  16. xinference/model/llm/pytorch/intern_vl.py +347 -0
  17. xinference/model/llm/utils.py +13 -0
  18. xinference/model/llm/vllm/core.py +5 -2
  19. xinference/model/rerank/core.py +23 -1
  20. xinference/model/utils.py +17 -7
  21. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  22. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  23. xinference/thirdparty/llava/mm_utils.py +3 -2
  24. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  25. xinference/thirdparty/omnilmm/chat.py +6 -5
  26. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/METADATA +8 -7
  27. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/RECORD +31 -29
  28. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/LICENSE +0 -0
  29. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/WHEEL +0 -0
  30. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/entry_points.txt +0 -0
  31. {xinference-0.11.1.dist-info → xinference-0.11.2.post1.dist-info}/top_level.txt +0 -0
@@ -2198,6 +2198,31 @@
2198
2198
  ]
2199
2199
  }
2200
2200
  },
2201
+ {
2202
+ "version": 1,
2203
+ "context_length": 65536,
2204
+ "model_name": "codeqwen1.5",
2205
+ "model_lang": [
2206
+ "en",
2207
+ "zh"
2208
+ ],
2209
+ "model_ability": [
2210
+ "generate"
2211
+ ],
2212
+ "model_description": "CodeQwen1.5 is the Code-Specific version of Qwen1.5. It is a transformer-based decoder-only language model pretrained on a large amount of data of codes.",
2213
+ "model_specs": [
2214
+ {
2215
+ "model_format": "pytorch",
2216
+ "model_size_in_billions": 7,
2217
+ "quantizations": [
2218
+ "4-bit",
2219
+ "8-bit",
2220
+ "none"
2221
+ ],
2222
+ "model_id": "Qwen/CodeQwen1.5-7B"
2223
+ }
2224
+ ]
2225
+ },
2201
2226
  {
2202
2227
  "version": 1,
2203
2228
  "context_length": 65536,
@@ -4335,6 +4360,83 @@
4335
4360
  ]
4336
4361
  }
4337
4362
  },
4363
+ {
4364
+ "version": 1,
4365
+ "context_length": 4096,
4366
+ "model_name": "deepseek",
4367
+ "model_lang": [
4368
+ "en",
4369
+ "zh"
4370
+ ],
4371
+ "model_ability": [
4372
+ "generate"
4373
+ ],
4374
+ "model_description": "DeepSeek LLM, trained from scratch on a vast dataset of 2 trillion tokens in both English and Chinese. ",
4375
+ "model_specs": [
4376
+ {
4377
+ "model_format": "pytorch",
4378
+ "model_size_in_billions": 7,
4379
+ "quantizations": [
4380
+ "4-bit",
4381
+ "8-bit",
4382
+ "none"
4383
+ ],
4384
+ "model_id": "deepseek-ai/deepseek-llm-7b-base",
4385
+ "model_revision": "7683fea62db869066ddaff6a41d032262c490d4f"
4386
+ },
4387
+ {
4388
+ "model_format": "pytorch",
4389
+ "model_size_in_billions": 67,
4390
+ "quantizations": [
4391
+ "4-bit",
4392
+ "8-bit",
4393
+ "none"
4394
+ ],
4395
+ "model_id": "deepseek-ai/deepseek-llm-67b-base",
4396
+ "model_revision": "c3f813a1121c95488a20132d3a4da89f4a46452f"
4397
+ },
4398
+ {
4399
+ "model_format": "ggufv2",
4400
+ "model_size_in_billions": 7,
4401
+ "quantizations": [
4402
+ "Q2_K",
4403
+ "Q3_K_L",
4404
+ "Q3_K_M",
4405
+ "Q3_K_S",
4406
+ "Q4_0",
4407
+ "Q4_K_M",
4408
+ "Q4_K_S",
4409
+ "Q5_0",
4410
+ "Q5_K_M",
4411
+ "Q5_K_S",
4412
+ "Q6_K",
4413
+ "Q8_0"
4414
+ ],
4415
+ "model_id": "TheBloke/deepseek-llm-7B-chat-GGUF",
4416
+ "model_file_name_template": "deepseek-llm-7b-chat.{quantization}.gguf"
4417
+ },
4418
+ {
4419
+ "model_format": "ggufv2",
4420
+ "model_size_in_billions": 67,
4421
+ "quantizations": [
4422
+ "Q2_K",
4423
+ "Q3_K_L",
4424
+ "Q3_K_M",
4425
+ "Q3_K_S",
4426
+ "Q4_0",
4427
+ "Q4_K_M",
4428
+ "Q4_K_S",
4429
+ "Q5_0",
4430
+ "Q5_K_M",
4431
+ "Q5_K_S",
4432
+ "Q6_K",
4433
+ "Q8_0"
4434
+ ],
4435
+ "model_id": "TheBloke/deepseek-llm-67b-chat-GGUF",
4436
+ "model_file_name_template": "deepseek-llm-67b-chat.{quantization}.gguf"
4437
+ }
4438
+ ]
4439
+ },
4338
4440
  {
4339
4441
  "version": 1,
4340
4442
  "context_length": 4096,
@@ -4427,7 +4529,199 @@
4427
4529
  },
4428
4530
  {
4429
4531
  "version": 1,
4430
- "context_length": 4096,
4532
+ "context_length": 16384,
4533
+ "model_name": "deepseek-coder",
4534
+ "model_lang": [
4535
+ "en",
4536
+ "zh"
4537
+ ],
4538
+ "model_ability": [
4539
+ "generate"
4540
+ ],
4541
+ "model_description": "Deepseek Coder is composed of a series of code language models, each trained from scratch on 2T tokens, with a composition of 87% code and 13% natural language in both English and Chinese. ",
4542
+ "model_specs": [
4543
+ {
4544
+ "model_format": "pytorch",
4545
+ "model_size_in_billions": "1_3",
4546
+ "quantizations": [
4547
+ "4-bit",
4548
+ "8-bit",
4549
+ "none"
4550
+ ],
4551
+ "model_id": "deepseek-ai/deepseek-coder-1.3b-base",
4552
+ "model_revision": "c919139c3a9b4070729c8b2cca4847ab29ca8d94"
4553
+ },
4554
+ {
4555
+ "model_format": "pytorch",
4556
+ "model_size_in_billions": "6_7",
4557
+ "quantizations": [
4558
+ "4-bit",
4559
+ "8-bit",
4560
+ "none"
4561
+ ],
4562
+ "model_id": "deepseek-ai/deepseek-coder-6.7b-base",
4563
+ "model_revision": "ce2207a8bfef3ee92bd7dd4cc31c52cfa0046912"
4564
+ },
4565
+ {
4566
+ "model_format": "pytorch",
4567
+ "model_size_in_billions": 7,
4568
+ "quantizations": [
4569
+ "4-bit",
4570
+ "8-bit",
4571
+ "none"
4572
+ ],
4573
+ "model_id": "deepseek-ai/deepseek-coder-7b-base-v1.5",
4574
+ "model_revision": "98f0904cee2237e235f10408ae12292037b21dac"
4575
+ },
4576
+ {
4577
+ "model_format": "pytorch",
4578
+ "model_size_in_billions": 33,
4579
+ "quantizations": [
4580
+ "4-bit",
4581
+ "8-bit",
4582
+ "none"
4583
+ ],
4584
+ "model_id": "deepseek-ai/deepseek-coder-33b-base",
4585
+ "model_revision": "45c85cadf3720ef3e85a492e24fd4b8c5d21d8ac"
4586
+ },
4587
+ {
4588
+ "model_format": "ggufv2",
4589
+ "model_size_in_billions": "1_3",
4590
+ "quantizations": [
4591
+ "Q2_K",
4592
+ "Q3_K_L",
4593
+ "Q3_K_M",
4594
+ "Q3_K_S",
4595
+ "Q4_0",
4596
+ "Q4_K_M",
4597
+ "Q4_K_S",
4598
+ "Q5_0",
4599
+ "Q5_K_M",
4600
+ "Q5_K_S",
4601
+ "Q6_K",
4602
+ "Q8_0"
4603
+ ],
4604
+ "model_id": "TheBloke/deepseek-coder-1.3b-base-GGUF",
4605
+ "model_file_name_template": "deepseek-coder-1.3b-base.{quantization}.gguf"
4606
+ },
4607
+ {
4608
+ "model_format": "ggufv2",
4609
+ "model_size_in_billions": "6_7",
4610
+ "quantizations": [
4611
+ "Q2_K",
4612
+ "Q3_K_L",
4613
+ "Q3_K_M",
4614
+ "Q3_K_S",
4615
+ "Q4_0",
4616
+ "Q4_K_M",
4617
+ "Q4_K_S",
4618
+ "Q5_0",
4619
+ "Q5_K_M",
4620
+ "Q5_K_S",
4621
+ "Q6_K",
4622
+ "Q8_0"
4623
+ ],
4624
+ "model_id": "TheBloke/deepseek-coder-6.7B-base-GGUF",
4625
+ "model_file_name_template": "deepseek-coder-6.7b-base.{quantization}.gguf"
4626
+ },
4627
+ {
4628
+ "model_format": "ggufv2",
4629
+ "model_size_in_billions": 7,
4630
+ "quantizations": [
4631
+ "Q2_K",
4632
+ "Q3_K_L",
4633
+ "Q3_K_M",
4634
+ "Q3_K_S",
4635
+ "Q4_K_M",
4636
+ "Q4_K_S",
4637
+ "Q5_0",
4638
+ "Q5_K_M",
4639
+ "Q5_K_S",
4640
+ "Q6_K",
4641
+ "Q8_0"
4642
+ ],
4643
+ "model_id": "dagbs/deepseek-coder-7b-base-v1.5-GGUF",
4644
+ "model_file_name_template": "deepseek-coder-7b-base-v1.5.{quantization}.gguf"
4645
+ },
4646
+ {
4647
+ "model_format": "ggufv2",
4648
+ "model_size_in_billions": 33,
4649
+ "quantizations": [
4650
+ "Q2_K",
4651
+ "Q3_K_L",
4652
+ "Q3_K_M",
4653
+ "Q3_K_S",
4654
+ "Q4_0",
4655
+ "Q4_K_M",
4656
+ "Q4_K_S",
4657
+ "Q5_0",
4658
+ "Q5_K_M",
4659
+ "Q5_K_S",
4660
+ "Q6_K",
4661
+ "Q8_0"
4662
+ ],
4663
+ "model_id": "TheBloke/deepseek-coder-33B-base-GGUF",
4664
+ "model_file_name_template": "deepseek-coder-33b-base.{quantization}.gguf"
4665
+ },
4666
+ {
4667
+ "model_format": "gptq",
4668
+ "model_size_in_billions": "1_3",
4669
+ "quantizations": [
4670
+ "Int4"
4671
+ ],
4672
+ "model_id": "TheBloke/deepseek-coder-1.3b-base-GPTQ",
4673
+ "model_revision": "a5bf3b76d70cda53327311a631b1003024d5de29"
4674
+ },
4675
+ {
4676
+ "model_format": "gptq",
4677
+ "model_size_in_billions": "6_7",
4678
+ "quantizations": [
4679
+ "Int4"
4680
+ ],
4681
+ "model_id": "TheBloke/deepseek-coder-6.7B-base-GPTQ",
4682
+ "model_revision": "6476ea3d6e623a1313d363dbc6e172773e031bb1"
4683
+ },
4684
+ {
4685
+ "model_format": "gptq",
4686
+ "model_size_in_billions": 33,
4687
+ "quantizations": [
4688
+ "Int4"
4689
+ ],
4690
+ "model_id": "TheBloke/deepseek-coder-33B-base-GPTQ",
4691
+ "model_revision": "f527d7325e463a5cb091d044e4f2b15902674a70"
4692
+ },
4693
+ {
4694
+ "model_format": "awq",
4695
+ "model_size_in_billions": "1_3",
4696
+ "quantizations": [
4697
+ "Int4"
4698
+ ],
4699
+ "model_id": "TheBloke/deepseek-coder-1.3b-base-AWQ",
4700
+ "model_revision": "ffb66f1a2a194401b4f29025edcd261d7f0a08a7"
4701
+ },
4702
+ {
4703
+ "model_format": "awq",
4704
+ "model_size_in_billions": "6_7",
4705
+ "quantizations": [
4706
+ "Int4"
4707
+ ],
4708
+ "model_id": "TheBloke/deepseek-coder-6.7B-base-AWQ",
4709
+ "model_revision": "e3d4bdf39712665f5e9d5c05c9df6f20fe1e2d5a"
4710
+ },
4711
+ {
4712
+ "model_format": "awq",
4713
+ "model_size_in_billions": 33,
4714
+ "quantizations": [
4715
+ "Int4"
4716
+ ],
4717
+ "model_id": "TheBloke/deepseek-coder-33B-base-AWQ",
4718
+ "model_revision": "c7edb2d5868d61a5dcf2591933a8992c8cbe3ef4"
4719
+ }
4720
+ ]
4721
+ },
4722
+ {
4723
+ "version": 1,
4724
+ "context_length": 16384,
4431
4725
  "model_name": "deepseek-coder-instruct",
4432
4726
  "model_lang": [
4433
4727
  "en",
@@ -4460,6 +4754,17 @@
4460
4754
  "model_id": "deepseek-ai/deepseek-coder-6.7b-instruct",
4461
4755
  "model_revision": "cbb77d7448ea3168d884758817e7f895e3828d1c"
4462
4756
  },
4757
+ {
4758
+ "model_format": "pytorch",
4759
+ "model_size_in_billions": 7,
4760
+ "quantizations": [
4761
+ "4-bit",
4762
+ "8-bit",
4763
+ "none"
4764
+ ],
4765
+ "model_id": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
4766
+ "model_revision": "2a050a4c59d687a85324d32e147517992117ed30"
4767
+ },
4463
4768
  {
4464
4769
  "model_format": "pytorch",
4465
4770
  "model_size_in_billions": 33,
@@ -4511,6 +4816,25 @@
4511
4816
  "model_id": "TheBloke/deepseek-coder-6.7B-instruct-GGUF",
4512
4817
  "model_file_name_template": "deepseek-coder-6.7b-instruct.{quantization}.gguf"
4513
4818
  },
4819
+ {
4820
+ "model_format": "ggufv2",
4821
+ "model_size_in_billions": 7,
4822
+ "quantizations": [
4823
+ "Q3_K_L",
4824
+ "Q3_K_M",
4825
+ "Q3_K_S",
4826
+ "Q4_0",
4827
+ "Q4_K_M",
4828
+ "Q4_K_S",
4829
+ "Q5_0",
4830
+ "Q5_K_M",
4831
+ "Q5_K_S",
4832
+ "Q6_K",
4833
+ "Q8_0"
4834
+ ],
4835
+ "model_id": "LoneStriker/deepseek-coder-7b-instruct-v1.5-GGUF",
4836
+ "model_file_name_template": "deepseek-coder-7b-instruct-v1.5-{quantization}.gguf"
4837
+ },
4514
4838
  {
4515
4839
  "model_format": "ggufv2",
4516
4840
  "model_size_in_billions": 33,
@@ -4530,6 +4854,60 @@
4530
4854
  ],
4531
4855
  "model_id": "TheBloke/deepseek-coder-33B-instruct-GGUF",
4532
4856
  "model_file_name_template": "deepseek-coder-33b-instruct.{quantization}.gguf"
4857
+ },
4858
+ {
4859
+ "model_format": "gptq",
4860
+ "model_size_in_billions": "1_3",
4861
+ "quantizations": [
4862
+ "Int4"
4863
+ ],
4864
+ "model_id": "TheBloke/deepseek-coder-1.3b-instruct-GPTQ",
4865
+ "model_revision": "9c002e9af6cbdf3bd9244e2d7264b6a35d1dcacf"
4866
+ },
4867
+ {
4868
+ "model_format": "gptq",
4869
+ "model_size_in_billions": "6_7",
4870
+ "quantizations": [
4871
+ "Int4"
4872
+ ],
4873
+ "model_id": "TheBloke/deepseek-coder-6.7B-instruct-GPTQ",
4874
+ "model_revision": "13ccea6e3a43dcfdcb655d92097610018b431a17"
4875
+ },
4876
+ {
4877
+ "model_format": "gptq",
4878
+ "model_size_in_billions": 33,
4879
+ "quantizations": [
4880
+ "Int4"
4881
+ ],
4882
+ "model_id": "TheBloke/deepseek-coder-33B-instruct-GPTQ",
4883
+ "model_revision": "08372729d98dfc248f9531a412fe69e14e607027"
4884
+ },
4885
+ {
4886
+ "model_format": "awq",
4887
+ "model_size_in_billions": "1_3",
4888
+ "quantizations": [
4889
+ "Int4"
4890
+ ],
4891
+ "model_id": "TheBloke/deepseek-coder-1.3b-instruct-AWQ",
4892
+ "model_revision": "a2a484da6e4146d055316a9a63cf5b13955715a4"
4893
+ },
4894
+ {
4895
+ "model_format": "awq",
4896
+ "model_size_in_billions": "6_7",
4897
+ "quantizations": [
4898
+ "Int4"
4899
+ ],
4900
+ "model_id": "TheBloke/deepseek-coder-6.7B-instruct-AWQ",
4901
+ "model_revision": "502ae3e19e57ae78dc30a791ba33c565da72dc62"
4902
+ },
4903
+ {
4904
+ "model_format": "awq",
4905
+ "model_size_in_billions": 33,
4906
+ "quantizations": [
4907
+ "Int4"
4908
+ ],
4909
+ "model_id": "TheBloke/deepseek-coder-33B-instruct-AWQ",
4910
+ "model_revision": "c40b499bac2712cd3c445cf1b05d2c6558ab0d29"
4533
4911
  }
4534
4912
  ],
4535
4913
  "prompt_style": {
@@ -5455,9 +5833,9 @@
5455
5833
  "ar"
5456
5834
  ],
5457
5835
  "model_ability": [
5458
- "generate"
5836
+ "chat"
5459
5837
  ],
5460
- "model_description": "C4AI Command-R is a research release of a 35 billion parameter highly performant generative model.",
5838
+ "model_description": "C4AI Command-R(+) is a research release of a 35 and 104 billion parameter highly performant generative model.",
5461
5839
  "model_specs": [
5462
5840
  {
5463
5841
  "model_format": "pytorch",
@@ -5506,7 +5884,21 @@
5506
5884
  "model_id": "alpindale/c4ai-command-r-plus-GPTQ",
5507
5885
  "model_revision": "35febfc08f723ac0df32480eb4af349a7d08656e"
5508
5886
  }
5509
- ]
5887
+ ],
5888
+ "prompt_style": {
5889
+ "style_name": "c4ai-command-r",
5890
+ "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
5891
+ "roles": [
5892
+ "<|USER_TOKEN|>",
5893
+ "<|CHATBOT_TOKEN|>"
5894
+ ],
5895
+ "intra_message_sep": "",
5896
+ "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
5897
+ "stop_token_ids": [
5898
+ 6,
5899
+ 255001
5900
+ ]
5901
+ }
5510
5902
  },
5511
5903
  {
5512
5904
  "version": 1,
@@ -5547,7 +5939,21 @@
5547
5939
  "model_id": "CohereForAI/c4ai-command-r-plus-4bit",
5548
5940
  "model_revision": "bb63b5b7005ecedb30b0cfd0d5953b02a5817f7b"
5549
5941
  }
5550
- ]
5942
+ ],
5943
+ "prompt_style": {
5944
+ "style_name": "c4ai-command-r",
5945
+ "system_prompt": "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.",
5946
+ "roles": [
5947
+ "<|USER_TOKEN|>",
5948
+ "<|CHATBOT_TOKEN|>"
5949
+ ],
5950
+ "intra_message_sep": "",
5951
+ "inter_message_sep": "<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|>",
5952
+ "stop_token_ids": [
5953
+ 6,
5954
+ 255001
5955
+ ]
5956
+ }
5551
5957
  },
5552
5958
  {
5553
5959
  "version": 1,
@@ -5588,5 +5994,94 @@
5588
5994
  32000
5589
5995
  ]
5590
5996
  }
5591
- }
5997
+ },
5998
+ {
5999
+ "version": 1,
6000
+ "context_length": 32768,
6001
+ "model_name": "internvl-chat",
6002
+ "model_lang": [
6003
+ "en",
6004
+ "zh"
6005
+ ],
6006
+ "model_ability": [
6007
+ "chat",
6008
+ "vision"
6009
+ ],
6010
+ "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
6011
+ "model_specs": [
6012
+ {
6013
+ "model_format": "pytorch",
6014
+ "model_size_in_billions": 26,
6015
+ "quantizations": [
6016
+ "none"
6017
+ ],
6018
+ "model_id": "OpenGVLab/InternVL-Chat-V1-5",
6019
+ "model_revision": "e822119e5806946ce128043023a73d715ecabf8d"
6020
+ },
6021
+ {
6022
+ "model_format": "pytorch",
6023
+ "model_size_in_billions": 26,
6024
+ "quantizations": [
6025
+ "Int8"
6026
+ ],
6027
+ "model_id": "OpenGVLab/InternVL-Chat-V1-5-{quantization}",
6028
+ "model_revision": "acaaed06937c603ab04f084216ecb0268160f538"
6029
+ }
6030
+ ],
6031
+ "prompt_style": {
6032
+ "style_name": "INTERNLM2",
6033
+ "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
6034
+ "roles": [
6035
+ "<|im_start|>user",
6036
+ "<|im_start|>assistant"
6037
+ ],
6038
+ "intra_message_sep": "<|im_end|>",
6039
+ "stop_token_ids": [
6040
+ 92542
6041
+ ],
6042
+ "stop": [
6043
+ "<|im_end|>"
6044
+ ]
6045
+ }
6046
+ },
6047
+ {
6048
+ "version": 1,
6049
+ "context_length": 32768,
6050
+ "model_name": "mini-internvl-chat",
6051
+ "model_lang": [
6052
+ "en",
6053
+ "zh"
6054
+ ],
6055
+ "model_ability": [
6056
+ "chat",
6057
+ "vision"
6058
+ ],
6059
+ "model_description": "InternVL 1.5 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
6060
+ "model_specs": [
6061
+ {
6062
+ "model_format": "pytorch",
6063
+ "model_size_in_billions": 2,
6064
+ "quantizations": [
6065
+ "none"
6066
+ ],
6067
+ "model_id": "OpenGVLab/Mini-InternVL-Chat-2B-V1-5",
6068
+ "model_revision": "ce3f67acff17281bacbf4b156f402a0580fb9605"
6069
+ }
6070
+ ],
6071
+ "prompt_style": {
6072
+ "style_name": "INTERNLM2",
6073
+ "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
6074
+ "roles": [
6075
+ "<|im_start|>user",
6076
+ "<|im_start|>assistant"
6077
+ ],
6078
+ "intra_message_sep": "<|im_end|>",
6079
+ "stop_token_ids": [
6080
+ 92542
6081
+ ],
6082
+ "stop": [
6083
+ "<|im_end|>"
6084
+ ]
6085
+ }
6086
+ }
5592
6087
  ]