xinference 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (42)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +31 -0
  3. xinference/client/common.py +2 -0
  4. xinference/client/restful/restful_client.py +25 -0
  5. xinference/core/supervisor.py +11 -1
  6. xinference/core/worker.py +16 -0
  7. xinference/deploy/cmdline.py +53 -1
  8. xinference/device_utils.py +0 -2
  9. xinference/model/core.py +13 -2
  10. xinference/model/image/core.py +16 -2
  11. xinference/model/image/stable_diffusion/core.py +25 -2
  12. xinference/model/llm/__init__.py +17 -0
  13. xinference/model/llm/core.py +18 -2
  14. xinference/model/llm/ggml/llamacpp.py +3 -19
  15. xinference/model/llm/llm_family.json +8 -3
  16. xinference/model/llm/llm_family.py +100 -29
  17. xinference/model/llm/llm_family_modelscope.json +7 -2
  18. xinference/model/llm/pytorch/baichuan.py +2 -0
  19. xinference/model/llm/pytorch/chatglm.py +2 -0
  20. xinference/model/llm/pytorch/core.py +23 -0
  21. xinference/model/llm/pytorch/falcon.py +4 -0
  22. xinference/model/llm/pytorch/internlm2.py +2 -0
  23. xinference/model/llm/pytorch/llama_2.py +4 -0
  24. xinference/model/llm/pytorch/qwen_vl.py +1 -0
  25. xinference/model/llm/pytorch/vicuna.py +2 -0
  26. xinference/model/llm/pytorch/yi_vl.py +1 -0
  27. xinference/web/ui/build/asset-manifest.json +3 -3
  28. xinference/web/ui/build/index.html +1 -1
  29. xinference/web/ui/build/static/js/{main.ebf7716d.js → main.78829790.js} +3 -3
  30. xinference/web/ui/build/static/js/main.78829790.js.map +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/e8687f75d2adacd34852b71c41ca17203d6fb4c8999ea55325bb2939f9d9ea90.json +1 -0
  33. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/METADATA +3 -1
  34. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/RECORD +39 -39
  35. xinference/web/ui/build/static/js/main.ebf7716d.js.map +0 -1
  36. xinference/web/ui/node_modules/.cache/babel-loader/0738899eefad7f90261125823d87ea9f0d53667b1479a0c1f398aff14f2bbd2a.json +0 -1
  37. xinference/web/ui/node_modules/.cache/babel-loader/77d4d795f078408fa2dd49da26d1ba1543d51b63cc253e736f4bef2e6014e888.json +0 -1
  38. /xinference/web/ui/build/static/js/{main.ebf7716d.js.LICENSE.txt → main.78829790.js.LICENSE.txt} +0 -0
  39. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/LICENSE +0 -0
  40. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/WHEEL +0 -0
  41. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/entry_points.txt +0 -0
  42. {xinference-0.9.1.dist-info → xinference-0.9.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/llm_family.py

@@ -59,6 +59,8 @@ class GgmlLLMSpecV1(BaseModel):
     quantizations: List[str]
     model_id: Optional[str]
     model_file_name_template: str
+    model_file_name_split_template: Optional[str]
+    quantization_parts: Optional[Dict[str, List[str]]]
     model_hub: str = "huggingface"
     model_uri: Optional[str]
     model_revision: Optional[str]
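Both new fields are optional, so existing single-file GGUF specs are unaffected: quantization_parts maps a quantization name to the ordered list of its part suffixes, and model_file_name_split_template names each individual part. A hypothetical spec fragment using them (the model and file names below are placeholders, not a built-in entry) could look like this:

# Hypothetical fragment of a GGUF spec; only the last two keys are new in 0.9.2.
spec_fragment = {
    "quantizations": ["q4_k_m"],
    "model_file_name_template": "example-chat-{quantization}.gguf",
    "model_file_name_split_template": "example-chat-{quantization}.gguf.{part}",
    "quantization_parts": {"q4_k_m": ["a", "b"]},
}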
@@ -210,6 +212,7 @@ CustomLLMFamilyV1.update_forward_refs()
 
 
 LLM_CLASSES: List[Type[LLM]] = []
+PEFT_SUPPORTED_CLASSES: List[Type[LLM]] = []
 
 BUILTIN_LLM_FAMILIES: List["LLMFamilyV1"] = []
 BUILTIN_MODELSCOPE_LLM_FAMILIES: List["LLMFamilyV1"] = []
@@ -522,6 +525,52 @@ def _generate_meta_file(
         json.dump(desc.to_dict(), f)
 
 
+def _generate_model_file_names(
+    llm_spec: "LLMSpecV1", quantization: Optional[str] = None
+) -> Tuple[List[str], str, bool]:
+    file_names = []
+    final_file_name = llm_spec.model_file_name_template.format(
+        quantization=quantization
+    )
+    need_merge = False
+
+    if llm_spec.quantization_parts is None:
+        file_names.append(final_file_name)
+    elif quantization is not None and quantization in llm_spec.quantization_parts:
+        parts = llm_spec.quantization_parts[quantization]
+        need_merge = True
+
+        logger.info(
+            f"Model {llm_spec.model_id} {llm_spec.model_format} {quantization} has {len(parts)} parts."
+        )
+
+        if llm_spec.model_file_name_split_template is None:
+            raise ValueError(
+                f"No model_file_name_split_template for model spec {llm_spec.model_id}"
+            )
+
+        for part in parts:
+            file_name = llm_spec.model_file_name_split_template.format(
+                quantization=quantization, part=part
+            )
+            file_names.append(file_name)
+
+    return file_names, final_file_name, need_merge
+
+
+def _merge_cached_files(
+    cache_dir: str, input_file_names: List[str], output_file_name: str
+):
+    with open(os.path.join(cache_dir, output_file_name), "wb") as output_file:
+        for file_name in input_file_names:
+            logger.info(f"Merging file {file_name} into {output_file_name} ...")
+
+            with open(os.path.join(cache_dir, file_name), "rb") as input_file:
+                shutil.copyfileobj(input_file, output_file)
+
+    logger.info(f"Merge complete.")
+
+
 def cache_from_modelscope(
     llm_family: LLMFamilyV1,
     llm_spec: "LLMSpecV1",
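_generate_model_file_names expands a spec into the list of files to download: a single file for ordinary quantizations, or one file per part (plus the merged target name) when the quantization appears in quantization_parts. _merge_cached_files then reassembles the split download by plain byte concatenation in part order. A minimal standalone sketch of that merge step, using throwaway files instead of real GGUF shards (all names below are made up for illustration):

import os
import shutil
import tempfile

# Scratch directory with two fake "parts" standing in for GGUF shards.
cache_dir = tempfile.mkdtemp()
part_names = ["example-q4_k_m.gguf.a", "example-q4_k_m.gguf.b"]
for name, payload in zip(part_names, [b"first-half ", b"second-half"]):
    with open(os.path.join(cache_dir, name), "wb") as f:
        f.write(payload)

# Concatenate the parts into the final file, in order, as raw byte streams.
output_name = "example-q4_k_m.gguf"
with open(os.path.join(cache_dir, output_name), "wb") as output_file:
    for name in part_names:
        with open(os.path.join(cache_dir, name), "rb") as input_file:
            shutil.copyfileobj(input_file, output_file)

with open(os.path.join(cache_dir, output_name), "rb") as f:
    print(f.read())  # b'first-half second-half'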
@@ -560,19 +609,26 @@ def cache_from_modelscope
             symlink_local_file(os.path.join(subdir, file), cache_dir, relpath)
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
-        filename = llm_spec.model_file_name_template.format(quantization=quantization)
-        download_path = retry_download(
-            model_file_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            filename,
-            revision=llm_spec.model_revision,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
-        symlink_local_file(download_path, cache_dir, filename)
+
+        for filename in file_names:
+            download_path = retry_download(
+                model_file_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                filename,
+                revision=llm_spec.model_revision,
+            )
+            symlink_local_file(download_path, cache_dir, filename)
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported format: {llm_spec.model_format}")
 
@@ -621,20 +677,27 @@ def cache_from_huggingface(
 
     elif llm_spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(llm_spec, GgmlLLMSpecV1)
-        file_name = llm_spec.model_file_name_template.format(quantization=quantization)
-        retry_download(
-            huggingface_hub.hf_hub_download,
-            llm_family.model_name,
-            {
-                "model_size": llm_spec.model_size_in_billions,
-                "model_format": llm_spec.model_format,
-            },
-            llm_spec.model_id,
-            revision=llm_spec.model_revision,
-            filename=file_name,
-            local_dir=cache_dir,
-            local_dir_use_symlinks=True,
+        file_names, final_file_name, need_merge = _generate_model_file_names(
+            llm_spec, quantization
         )
+
+        for file_name in file_names:
+            retry_download(
+                huggingface_hub.hf_hub_download,
+                llm_family.model_name,
+                {
+                    "model_size": llm_spec.model_size_in_billions,
+                    "model_format": llm_spec.model_format,
+                },
+                llm_spec.model_id,
+                revision=llm_spec.model_revision,
+                filename=file_name,
+                local_dir=cache_dir,
+                local_dir_use_symlinks=True,
+            )
+
+        if need_merge:
+            _merge_cached_files(cache_dir, file_names, final_file_name)
     else:
         raise ValueError(f"Unsupported model format: {llm_spec.model_format}")
 
@@ -873,12 +936,20 @@ def unregister_llm(model_name: str, raise_error: bool = True):
 
 
 def match_llm_cls(
-    family: LLMFamilyV1, llm_spec: "LLMSpecV1", quantization: str
+    family: LLMFamilyV1,
+    llm_spec: "LLMSpecV1",
+    quantization: str,
+    peft_model_path: Optional[str] = None,
 ) -> Optional[Type[LLM]]:
     """
     Find an LLM implementation for given LLM family and spec.
     """
-    for cls in LLM_CLASSES:
-        if cls.match(family, llm_spec, quantization):
-            return cls
+    if peft_model_path is not None:
+        for cls in PEFT_SUPPORTED_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
+    else:
+        for cls in LLM_CLASSES:
+            if cls.match(family, llm_spec, quantization):
+                return cls
     return None
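With this change, loading a model together with a PEFT (LoRA) adapter only considers implementations registered in PEFT_SUPPORTED_CLASSES, while plain loads keep using LLM_CLASSES. A rough sketch of the intended call, assuming family and spec are a matching LLMFamilyV1/LLMSpecV1 pair for an already-registered PyTorch model and the adapter path is a placeholder:

# Hypothetical call: pick an implementation class for a LoRA-adapted model.
cls = match_llm_cls(family, spec, "none", peft_model_path="/data/adapters/my-lora")
if cls is None:
    raise ValueError("no PEFT-capable implementation matches this model spec")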
xinference/model/llm/llm_family_modelscope.json

@@ -1909,11 +1909,16 @@
         "model_size_in_billions": 72,
         "quantizations": [
           "q2_k",
-          "q3_k_m"
+          "q3_k_m",
+          "q4_k_m"
         ],
         "model_id": "qwen/Qwen1.5-72B-Chat-GGUF",
         "model_hub": "modelscope",
-        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf"
+        "model_file_name_template": "qwen1_5-72b-chat-{quantization}.gguf",
+        "model_file_name_split_template": "qwen1_5-72b-chat-{quantization}.gguf.{part}",
+        "quantization_parts": {
+          "q4_k_m": ["a", "b"]
+        }
       }
     ],
     "prompt_style": {
xinference/model/llm/pytorch/baichuan.py

@@ -27,6 +27,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class BaichuanPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/chatglm.py

@@ -39,6 +39,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -47,6 +48,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/core.py

@@ -52,12 +52,14 @@ class PytorchModel(LLM):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
         self._use_fast_tokenizer = True
         self._pytorch_model_config: PytorchModelConfig = self._sanitize_model_config(
             pytorch_model_config
         )
+        self._peft_model_path = peft_model_path
 
     def _sanitize_model_config(
         self, pytorch_model_config: Optional[PytorchModelConfig]
@@ -112,6 +114,24 @@ class PytorchModel(LLM):
         )
         return model, tokenizer
 
+    def _apply_lora(self):
+        if self._peft_model_path is not None:
+            try:
+                from peft import PeftModel
+            except ImportError:
+                raise ImportError(
+                    f"Failed to import 'PeftModel' from 'peft'. Please make sure 'peft' is installed.\n\n"
+                )
+
+            # Apply LoRA
+            self._model = PeftModel.from_pretrained(
+                self._model,
+                self._peft_model_path,
+            )
+            logger.info(
+                f"Successfully loaded the PEFT adaptor for model {self.model_uid}."
+            )
+
     def load(self):
         try:
             import torch
@@ -200,6 +220,7 @@ class PytorchModel(LLM):
             is_device_map_auto = True
 
         self._model, self._tokenizer = self._load_model(**kwargs)
+        self._apply_lora()
 
         if not is_device_map_auto:
             self._model.to(self._device)
@@ -391,6 +412,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -399,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             quantization,
             model_path,
             pytorch_model_config,
+            peft_model_path,
         )
 
     def _sanitize_generate_config(
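The new _apply_lora hook wraps the already-loaded base model with the adapter via peft's PeftModel.from_pretrained, and PytorchChatModel (like the model subclasses below) simply threads peft_model_path through to PytorchModel. Outside of xinference, the same wrapping step looks roughly like this (the base model name and adapter path are placeholders; transformers and peft must be installed):

# Standalone sketch of applying a LoRA adapter to a loaded Hugging Face model.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder base model
model = PeftModel.from_pretrained(base_model, "/path/to/lora-adapter")  # placeholder adapter dir
model.eval()  # the adapted model is then used for generation as usual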
xinference/model/llm/pytorch/falcon.py

@@ -27,6 +27,7 @@ class FalconPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class FalconPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -84,6 +86,7 @@ class FalconPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -92,6 +95,7 @@ class FalconPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/internlm2.py

@@ -38,6 +38,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -46,6 +47,7 @@ class Internlm2PytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
xinference/model/llm/pytorch/llama_2.py

@@ -27,6 +27,7 @@ class LlamaPytorchModel(PytorchModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -35,6 +36,7 @@ class LlamaPytorchModel(PytorchModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
 
     def _load_model(self, **kwargs):
@@ -67,6 +69,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
         model_spec: "LLMSpecV1",
         quantization: str,
         model_path: str,
+        peft_model_path: Optional[str] = None,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
     ):
         super().__init__(
@@ -75,6 +78,7 @@ class LlamaPytorchChatModel(PytorchChatModel):
             model_spec,
             quantization,
             model_path,
+            peft_model_path=peft_model_path,
             pytorch_model_config=pytorch_model_config,
         )
         self._use_fast_tokenizer = False
xinference/model/llm/pytorch/qwen_vl.py

@@ -71,6 +71,7 @@ class QwenVLChatModel(PytorchChatModel):
             trust_remote_code=True,
             code_revision=self.model_spec.model_revision,
         )
+        self._apply_lora()
 
     def _message_content_to_qwen(self, content) -> str:
         def _ensure_url(_url):
xinference/model/llm/pytorch/vicuna.py

@@ -41,6 +41,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
         quantization: str,
         model_path: str,
         pytorch_model_config: Optional["PytorchModelConfig"] = None,
+        peft_model_path: Optional[str] = None,
     ):
         super().__init__(
             model_uid,
@@ -49,6 +50,7 @@ class VicunaPytorchChatModel(PytorchChatModel):
             quantization,
             model_path,
             pytorch_model_config=pytorch_model_config,
+            peft_model_path=peft_model_path,
         )
         self._use_fast_tokenizer = False
 
xinference/model/llm/pytorch/yi_vl.py

@@ -69,6 +69,7 @@ class YiVLChatModel(PytorchChatModel):
             self._image_processor,
             _,
         ) = load_pretrained_model(self.model_path, device_map=self._device)
+        self._apply_lora()
 
     @staticmethod
     def _message_content_to_yi(content) -> Union[str, tuple]:
xinference/web/ui/build/asset-manifest.json

@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.ebf7716d.js",
+    "main.js": "./static/js/main.78829790.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.ebf7716d.js.map": "./static/js/main.ebf7716d.js.map"
+    "main.78829790.js.map": "./static/js/main.78829790.js.map"
   },
   "entrypoints": [
-    "static/js/main.ebf7716d.js"
+    "static/js/main.78829790.js"
   ]
 }
xinference/web/ui/build/index.html

@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ebf7716d.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.78829790.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>