xinference 0.14.1.post1__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +44 -9
- xinference/core/model.py +4 -4
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +1 -1
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/llm/__init__.py +20 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +448 -1153
- xinference/model/llm/llm_family.py +14 -139
- xinference/model/llm/llm_family_modelscope.json +230 -313
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/core.py +2 -10
- xinference/model/llm/transformers/intern_vl.py +457 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +67 -22
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/utils.py +76 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.ffc26121.js +3 -0
- xinference/web/ui/build/static/js/main.ffc26121.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/METADATA +5 -8
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/RECORD +63 -70
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/build/static/js/main.17ca0398.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/cogvlm2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/glm4v.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/minicpmv25.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/yi_vl.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.ffc26121.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.2.dist-info}/top_level.txt +0 -0
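The rename list above also captures a package restructure: the `pytorch` backend package becomes `transformers`, and the `ggml` llama.cpp backend moves to `llama_cpp`. A minimal sketch of how downstream import paths shift, using only the module paths shown in the file list (class names inside these modules are not visible in this diff, so none are assumed):

# Sketch of the import-path change implied by the renames above.
# Only module paths come from the file list; nothing else is assumed.

# 0.14.1.post1:
#   import xinference.model.llm.pytorch.core       # HF transformers backend
#   import xinference.model.llm.ggml.llamacpp      # llama.cpp backend

# 0.14.2:
import xinference.model.llm.transformers.core     # was ...llm.pytorch.core
import xinference.model.llm.llama_cpp.core        # was ...llm.ggml.llamacpp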
xinference/model/llm/llm_family_modelscope.json CHANGED

@@ -503,78 +503,6 @@
             }
         ]
     },
-    {
-        "version": 1,
-        "context_length": 8192,
-        "model_name": "chatglm2",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "ChatGLM2 is the second generation of ChatGLM, still open-source and trained on Chinese and English data.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "ZhipuAI/chatglm2-6b",
-                "model_revision": "v1.0.12"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "CHATGLM",
-            "system_prompt": "",
-            "roles": [
-                "问",
-                "答"
-            ],
-            "intra_message_sep": "\n\n"
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 32768,
-        "model_name": "chatglm2-32k",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "ChatGLM2-32k is a special version of ChatGLM2, with a context window of 32k tokens instead of 8k.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 6,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "ZhipuAI/chatglm2-6b-32k",
-                "model_revision": "v1.0.2"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "CHATGLM",
-            "system_prompt": "",
-            "roles": [
-                "问",
-                "答"
-            ],
-            "intra_message_sep": "\n\n"
-        }
-    },
     {
         "version": 1,
         "context_length": 8192,
@@ -1060,166 +988,60 @@
     },
     {
         "version": 1,
-        "context_length":
-        "model_name": "
+        "context_length": 32768,
+        "model_name": "internlm2.5-chat",
         "model_lang": [
             "en",
             "zh"
         ],
         "model_ability": [
-            "
+            "chat"
         ],
-        "model_description": "
+        "model_description": "InternLM2.5 series of the InternLM model.",
         "model_specs": [
             {
                 "model_format": "pytorch",
-                "model_size_in_billions":
+                "model_size_in_billions": "1_8",
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope"
-
-            }
-        ]
-    },
-    {
-        "version": 1,
-        "context_length": 4096,
-        "model_name": "internlm-chat-7b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Internlm-chat is a fine-tuned version of the Internlm LLM, specializing in chatting.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-1_8b-chat",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 7,
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope"
-
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INTERNLM",
-            "system_prompt": "",
-            "roles": [
-                "<|User|>",
-                "<|Bot|>"
-            ],
-            "intra_message_sep": "<eoh>\n",
-            "inter_message_sep": "<eoa>\n",
-            "stop_token_ids": [
-                1,
-                103028
-            ],
-            "stop": [
-                "<eoa>"
-            ]
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 16384,
-        "model_name": "internlm-20b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "generate"
-        ],
-        "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+                "model_hub": "modelscope"
+            },
             {
-                "model_format": "
-                "model_size_in_billions":
+                "model_format": "ggufv2",
+                "model_size_in_billions": 7,
                 "quantizations": [
-                    "
-                    "
-                    "
+                    "q2_k",
+                    "q3_k_m",
+                    "q4_0",
+                    "q4_k_m",
+                    "q5_0",
+                    "q5_k_m",
+                    "q6_k",
+                    "q8_0",
+                    "fp16"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "
-                "
-            }
-        ]
-    },
-    {
-        "version": 1,
-        "context_length": 16384,
-        "model_name": "internlm-chat-20b",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Pre-trained on over 2.3T Tokens containing high-quality English, Chinese, and code data. The Chat version has undergone SFT and RLHF training.",
-        "model_specs": [
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat-gguf",
+                "model_file_name_template": "internlm2_5-7b-chat-{quantization}.gguf",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 20,
                 "quantizations": [
-                    "4-bit",
-                    "8-bit",
                     "none"
                 ],
-                "model_id": "Shanghai_AI_Laboratory/
-                "model_hub": "modelscope",
-                "model_revision": "v1.0.1"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INTERNLM",
-            "system_prompt": "",
-            "roles": [
-                "<|User|>",
-                "<|Bot|>"
-            ],
-            "intra_message_sep": "<eoh>\n",
-            "inter_message_sep": "<eoa>\n",
-            "stop_token_ids": [
-                1,
-                103028
-            ],
-            "stop": [
-                "<eoa>"
-            ]
-        }
-    },
-    {
-        "version": 1,
-        "context_length": 32768,
-        "model_name": "internlm2.5-chat",
-        "model_lang": [
-            "en",
-            "zh"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "InternLM2.5 series of the InternLM model.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 7,
-                "quantizations": [
-                    "none"
-                ],
-                "model_id": "Shanghai_AI_Laboratory/internlm2_5-7b-chat",
+                "model_id": "Shanghai_AI_Laboratory/internlm2_5-20b-chat",
                 "model_hub": "modelscope"
             }
         ],
@@ -2403,59 +2225,6 @@
             ]
         }
     },
-    {
-        "version": 1,
-        "context_length": 2048,
-        "model_name": "falcon-instruct",
-        "model_lang": [
-            "en"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "Falcon-instruct is a fine-tuned version of the Falcon LLM, specializing in chatting.",
-        "model_specs": [
-            {
-                "model_format": "pytorch",
-                "model_size_in_billions": 7,
-                "quantizations": [
-                    "4-bit",
-                    "8-bit",
-                    "none"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/falcon-7b-instruct",
-                "model_revision": "v1.0.0"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "FALCON",
-            "system_prompt": "",
-            "roles": [
-                "User",
-                "Assistant"
-            ],
-            "intra_message_sep": "\n",
-            "inter_message_sep": "<|endoftext|>",
-            "stop": [
-                "\nUser"
-            ],
-            "stop_token_ids": [
-                0,
-                1,
-                2,
-                3,
-                4,
-                5,
-                6,
-                7,
-                8,
-                9,
-                10,
-                11
-            ]
-        }
-    },
     {
         "version": 1,
         "context_length": 8192,
@@ -2540,53 +2309,6 @@
             ]
         }
     },
-    {
-        "version": 1,
-        "context_length": 2048,
-        "model_name": "OpenBuddy",
-        "model_lang": [
-            "en"
-        ],
-        "model_ability": [
-            "chat"
-        ],
-        "model_description": "OpenBuddy is a powerful open multilingual chatbot model aimed at global users.",
-        "model_specs": [
-            {
-                "model_format": "ggmlv3",
-                "model_size_in_billions": 13,
-                "quantizations": [
-                    "Q2_K",
-                    "Q3_K_S",
-                    "Q3_K_M",
-                    "Q3_K_L",
-                    "Q4_0",
-                    "Q4_1",
-                    "Q4_K_S",
-                    "Q4_K_M",
-                    "Q5_0",
-                    "Q5_1",
-                    "Q5_K_S",
-                    "Q5_K_M",
-                    "Q6_K",
-                    "Q8_0"
-                ],
-                "model_hub": "modelscope",
-                "model_id": "Xorbits/OpenBuddy-Llama2-13B-v11.1-GGML",
-                "model_file_name_template": "openbuddy-llama2-13b-v11.1.ggmlv3.{quantization}.bin"
-            }
-        ],
-        "prompt_style": {
-            "style_name": "INSTRUCTION",
-            "system_prompt": "You are a professional translator. Be faithful or accurate in translation. Make the translation readable or intelligible. Be elegant or natural in translation. Do not translate person's name. Do not add any additional text to the translation. Do not give me any comments or suggestions.\nUser:\n\n{0}\nAssistant:",
-            "roles": [
-                "User",
-                "Assistant"
-            ],
-            "intra_message_sep": "",
-            "inter_message_sep": ""
-        }
-    },
     {
         "version": 1,
         "context_length": 32768,
@@ -3416,6 +3138,24 @@
                 "model_id": "qwen/Qwen2-72B-Instruct-AWQ",
                 "model_hub": "modelscope"
             },
+            {
+                "model_format": "fp8",
+                "model_size_in_billions": 7,
+                "quantizations": [
+                    "fp8"
+                ],
+                "model_id": "liuzhenghua/Qwen2-7B-FP8-Instruct",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "fp8",
+                "model_size_in_billions": 72,
+                "quantizations": [
+                    "fp8"
+                ],
+                "model_id": "liuzhenghua/Qwen2-72B-FP8-Instruct",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "mlx",
                 "model_size_in_billions": "0_5",
@@ -4245,6 +3985,17 @@
         ],
         "model_description": "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models.",
         "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "none",
+                    "4-bit",
+                    "8-bit"
+                ],
+                "model_id": "LLM-Research/gemma-2-2b-it",
+                "model_hub": "modelscope"
+            },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 9,
@@ -4958,25 +4709,187 @@
                 "model_format": "pytorch",
                 "model_size_in_billions": 26,
                 "quantizations": [
-
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL-Chat-V1-5",
+                "model_revision": "master"
+            }
+        ],
+        "prompt_style": {
+            "style_name": "INTERNVL",
+            "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+            "roles": [
+                "<|im_start|>user",
+                "<|im_start|>assistant"
+            ],
+            "intra_message_sep": "<|im_end|>",
+            "stop_token_ids": [
+                2,
+                92543,
+                92542
+            ],
+            "stop": [
+                "</s>",
+                "<|im_end|>",
+                "<|im_start|>"
+            ]
+        }
+    },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "internvl2",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "vision"
+        ],
+        "model_description": "InternVL 2 is an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. ",
+        "model_specs": [
+
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 1,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-1B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-2B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 2,
+                "quantizations": [
+                    "none"
                 ],
-
-                "model_id": "
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-2B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 4,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-4B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-8B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-8B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 26,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-26B",
                 "model_revision": "master"
             },
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 26,
                 "quantizations": [
-
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-26B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 40,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-40B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 40,
+                "quantizations": [
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-40B-AWQ",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 76,
+                "quantizations": [
+                    "4-bit",
+                    "8-bit",
+                    "none"
+                ],
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-Llama3-76B",
+                "model_revision": "master"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 76,
+                "quantizations": [
+                    "none"
                 ],
-
-                "model_id": "
+                "model_hub": "modelscope",
+                "model_id": "OpenGVLab/InternVL2-Llama3-76B-AWQ",
                 "model_revision": "master"
             }
         ],
         "prompt_style": {
-            "style_name": "
+            "style_name": "INTERNVL",
             "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
             "roles": [
                 "<|im_start|>user",
@@ -4984,10 +4897,14 @@
             ],
             "intra_message_sep": "<|im_end|>",
             "stop_token_ids": [
+                2,
+                92543,
                 92542
             ],
             "stop": [
-                "
+                "</s>",
+                "<|im_end|>",
+                "<|im_start|>"
             ]
         }
     },
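The JSON hunks above drop the legacy ChatGLM2, InternLM, Falcon, and ggmlv3 OpenBuddy entries and register new modelscope specs: internlm2.5-chat (1.8B/7B/20B pytorch plus a 7B ggufv2 build), fp8 Qwen2 variants, gemma-2-2b-it, and the InternVL family. A minimal launch sketch for one of the new entries, assuming a running Xinference endpoint; the URL, the engine name, and the chat call are illustrative assumptions, not part of this diff:

from xinference.client import Client

# Endpoint URL is illustrative; point this at your running supervisor.
client = Client("http://localhost:9997")

# Launch the ggufv2 spec registered above; the quantization values
# ("q4_k_m", ...) come from the new "quantizations" list in the diff.
model_uid = client.launch_model(
    model_name="internlm2.5-chat",
    model_engine="llama.cpp",      # assumed engine name for gguf specs
    model_format="ggufv2",
    model_size_in_billions=7,
    quantization="q4_k_m",
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce InternLM2.5."))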
xinference/model/llm/memory.py CHANGED

@@ -61,7 +61,7 @@ class ModelMemInfo:
 
 QUANT_NORMALIZE = {"int4": "4-bit", "int8": "8-bit", "4-bit": "4-bit", "8-bit": "8-bit"}
 
-GGML_MULTI_FACTOR_DICT = {
+GGUF_MULTI_FACTOR_DICT = {
     "q4_0": 18,
     "q4_1": 20,
     "q5_0": 22,
@@ -70,14 +70,14 @@ GGML_MULTI_FACTOR_DICT = {
     "q8_1": 40,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_64 = {
     "q6_K": 54.0,
     "q3": 26.0,
     "q4": 38.0,
     "q5": 46.0,
 }
 
-
+GGUF_MULTI_FACTOR_DICT_COMBINE = {
     "q3_K_L": [38.0, 26.0],
     "q3_K_M": [46.0, 26.0],
     "q4_K_S": [46.0, 38.0],
@@ -136,9 +136,9 @@ def estimate_llm_gpu_memory_details(
     else:
         kv_dtype_size = 4
     overhead = 650.0
-    if model_format == "
+    if model_format == "ggufv2":
         assert quantization is not None and quantization != "none"
-        model_size_in_mb =
+        model_size_in_mb = _compute_model_size_gguf(info, quantization)
         inference_mem = float(
             context_length * kv_dtype_size * info.hidden_dim * info.num_layers
         )
@@ -291,7 +291,7 @@ def _compute_inference_only_activation_memory(
     return ret
 
 
-def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
+def _compute_model_size_gguf(info: ModelLayersInfo, quantization: str) -> float:
     assert quantization is not None
     vocab_size = info.vocab_size
     num_layers = info.num_layers
@@ -310,13 +310,13 @@ def _compute_model_size_ggml(info: ModelLayersInfo, quantization: str) -> float:
     )
 
     total = 0.0
-    v1 =
+    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
     if v1 is not None:
         total = (v1 * total_params) / (32 * 1024 * 1024)
-    v2 =
+    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
     if v2 is not None:
         total = (v2 * total_params) / (64 * 1024 * 1024)
-    v3 =
+    v3 = GGUF_MULTI_FACTOR_DICT_COMBINE.get(quantization)
     if v3 is not None:
         factors = v3
         if quantization == "q2_K":
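The memory.py hunk is a pure rename from GGML_* to GGUF_* terminology; the per-quantization byte factors are unchanged. A self-contained sketch of the size arithmetic those tables encode, with factor values copied from the diff and a 7B parameter count as a made-up example input:

# Sketch of the GGUF size estimate behind _compute_model_size_gguf, reduced
# to the arithmetic visible in this diff. The factors are bytes per weight
# block (e.g. q4_0 stores a 32-weight block in 18 bytes); total_params is a
# toy input, not taken from the diff.
GGUF_MULTI_FACTOR_DICT = {"q4_0": 18, "q4_1": 20, "q5_0": 22}                   # per 32 weights
GGUF_MULTI_FACTOR_DICT_64 = {"q6_K": 54.0, "q3": 26.0, "q4": 38.0, "q5": 46.0}  # per 64 weights

def estimate_size_mb(total_params: float, quantization: str) -> float:
    """Approximate model size in MB for a GGUF quantization."""
    v1 = GGUF_MULTI_FACTOR_DICT.get(quantization)
    if v1 is not None:
        return (v1 * total_params) / (32 * 1024 * 1024)   # bytes per 32-weight block -> MB
    v2 = GGUF_MULTI_FACTOR_DICT_64.get(quantization)
    if v2 is not None:
        return (v2 * total_params) / (64 * 1024 * 1024)   # bytes per 64-weight block -> MB
    raise ValueError(f"unsupported quantization: {quantization}")

print(round(estimate_size_mb(7e9, "q4_0")))   # ~3755 MB for a 7B model at q4_0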