xinference 1.5.1__py3-none-any.whl → 1.6.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/METADATA +59 -39
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.post1.dist-info}/top_level.txt +0 -0
--- a/xinference/model/video/diffusers.py
+++ b/xinference/model/video/diffusers.py
@@ -14,12 +14,13 @@
 
 import base64
 import logging
+import operator
 import os
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
-from typing import TYPE_CHECKING, List, Union
+from functools import partial, reduce
+from typing import TYPE_CHECKING, Any, List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -29,6 +30,7 @@ from ...device_utils import gpu_count, move_model_to_available_device
 from ...types import Video, VideoList
 
 if TYPE_CHECKING:
+    from ....core.progress_tracker import Progressor
     from .core import VideoModelFamilyV1
 
 
@@ -53,7 +55,7 @@ def export_to_video_imageio(
     return output_video_path
 
 
-class DiffUsersVideoModel:
+class DiffusersVideoModel:
     def __init__(
         self,
         model_uid: str,
@@ -111,11 +113,27 @@ class DiffUsersVideoModel:
                 self._model_path, transformer=transformer, **kwargs
             )
         elif self.model_spec.model_family == "Wan":
-            from diffusers import WanPipeline
+            from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, WanPipeline
+            from transformers import CLIPVisionModel
 
-            pipeline = self._model = WanPipeline.from_pretrained(
-                self._model_path, **kwargs
-            )
+            if "text2video" in self.model_spec.model_ability:
+                pipeline = self._model = WanPipeline.from_pretrained(
+                    self._model_path, **kwargs
+                )
+            else:
+                assert "image2video" in self.model_spec.model_ability
+
+                image_encoder = CLIPVisionModel.from_pretrained(
+                    self._model_path,
+                    subfolder="image_encoder",
+                    torch_dtype=torch.float32,
+                )
+                vae = AutoencoderKLWan.from_pretrained(
+                    self._model_path, subfolder="vae", torch_dtype=torch.float32
+                )
+                pipeline = self._model = WanImageToVideoPipeline.from_pretrained(
+                    self._model_path, vae=vae, image_encoder=image_encoder, **kwargs
+                )
         else:
             raise Exception(
                 f"Unsupported model family: {self._model_spec.model_family}"
@@ -130,6 +148,11 @@ class DiffUsersVideoModel:
             pipeline.transformer = torch.compile(
                 pipeline.transformer, mode="max-autotune", fullgraph=True
             )
+        if kwargs.get("layerwise_cast", False):
+            compute_dtype = pipeline.transformer.dtype
+            pipeline.transformer.enable_layerwise_casting(
+                storage_dtype=torch.float8_e4m3fn, compute_dtype=compute_dtype
+            )
         if kwargs.get("cpu_offload", False):
             logger.debug("CPU offloading model")
             pipeline.enable_model_cpu_offload()
@@ -145,6 +168,33 @@ class DiffUsersVideoModel:
             except AttributeError:
                 # model does support tiling
                 pass
+        elif kwargs.get("group_offload", False):
+            from diffusers.hooks.group_offloading import apply_group_offloading
+
+            onload_device = torch.device("cuda")
+            offload_device = torch.device("cpu")
+
+            apply_group_offloading(
+                pipeline.text_encoder,
+                onload_device=onload_device,
+                offload_device=offload_device,
+                offload_type="block_level",
+                num_blocks_per_group=4,
+            )
+            group_offload_kwargs = {}
+            if kwargs.get("use_stream", False):
+                group_offload_kwargs["offload_type"] = "block_level"
+                group_offload_kwargs["num_blocks_per_group"] = 4
+            else:
+                group_offload_kwargs["offload_type"] = "leaf_level"
+                group_offload_kwargs["use_stream"] = True
+            pipeline.transformer.enable_group_offload(
+                onload_device=onload_device,
+                offload_device=offload_device,
+                **group_offload_kwargs,
+            )
+            # Since we've offloaded the larger models already, we can move the rest of the model components to GPU
+            pipeline = move_model_to_available_device(pipeline)
         elif not kwargs.get("device_map"):
            logger.debug("Loading model to available device")
            if gpu_count() > 1:
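The new load-time switches above (layerwise_cast, cpu_offload, group_offload, use_stream) are read straight from the kwargs supplied when the video model is loaded. A minimal sketch of setting them from the Python client follows; the pass-through of extra launch kwargs into DiffusersVideoModel.load() is an assumption here, not something this diff shows:

    from xinference.client import Client

    client = Client("http://localhost:9997")
    # Hypothetical launch: extra keyword arguments are assumed to reach the
    # diffusers loader, enabling the offloading branches added above.
    model_uid = client.launch_model(
        model_name="Wan2.1-i2v-14B-480p",
        model_type="video",
        group_offload=True,
        use_stream=True,
    )
    video_model = client.get_model(model_uid)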
@@ -154,6 +204,26 @@ class DiffUsersVideoModel:
         # Recommended if your computer has < 64 GB of RAM
         pipeline.enable_attention_slicing()
 
+    @staticmethod
+    def _process_progressor(kwargs: dict):
+        import diffusers
+
+        progressor: Progressor = kwargs.pop("progressor", None)
+
+        def report_status_callback(
+            pipe: diffusers.DiffusionPipeline,
+            step: int,
+            timestep: int,
+            callback_kwargs: dict,
+        ):
+            num_steps = pipe.num_timesteps
+            progressor.set_progress((step + 1) / num_steps)
+
+            return callback_kwargs
+
+        if progressor and progressor.request_id:
+            kwargs["callback_on_step_end"] = report_status_callback
+
     def text_to_video(
         self,
         prompt: str,
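The _process_progressor helper relies on diffusers' callback_on_step_end hook: the pipeline calls the function after every denoising step with the current step index, so progress can be reported as a fraction of num_timesteps. A standalone sketch of the same pattern, with a placeholder model id; any diffusers pipeline that accepts callback_on_step_end behaves this way:

    import torch
    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained("some/model-id", torch_dtype=torch.float16)  # placeholder id

    def on_step_end(pipe, step, timestep, callback_kwargs):
        # Same arithmetic as report_status_callback above: completed steps over total steps.
        print(f"progress: {(step + 1) / pipe.num_timesteps:.0%}")
        return callback_kwargs

    result = pipe(prompt="a red panda", callback_on_step_end=on_step_end)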
@@ -162,15 +232,6 @@
         response_format: str = "b64_json",
         **kwargs,
     ) -> VideoList:
-        import gc
-
-        from diffusers.utils import export_to_video
-
-        # cv2 bug will cause the video cannot be normally displayed
-        # thus we use the imageio one
-        # from diffusers.utils import export_to_video
-        from ...device_utils import empty_cache
-
         assert self._model is not None
         assert callable(self._model)
         generate_kwargs = self._model_spec.default_generate_config.copy()
@@ -181,11 +242,67 @@
             "diffusers text_to_video args: %s",
             generate_kwargs,
         )
+        self._process_progressor(generate_kwargs)
         output = self._model(
             prompt=prompt,
             num_inference_steps=num_inference_steps,
             **generate_kwargs,
         )
+        return self._output_to_video(output, fps, response_format)
+
+    def image_to_video(
+        self,
+        image: PIL.Image,
+        prompt: str,
+        n: int = 1,
+        num_inference_steps: Optional[int] = None,
+        response_format: str = "b64_json",
+        **kwargs,
+    ):
+        assert self._model is not None
+        assert callable(self._model)
+        generate_kwargs = self._model_spec.default_generate_config.copy()
+        generate_kwargs.update(kwargs)
+        generate_kwargs["num_videos_per_prompt"] = n
+        if num_inference_steps:
+            generate_kwargs["num_inference_steps"] = num_inference_steps
+        fps = generate_kwargs.pop("fps", 10)
+
+        # process image
+        max_area = generate_kwargs.pop("max_area")
+        if isinstance(max_area, str):
+            max_area = [int(v) for v in max_area.split("*")]
+        max_area = reduce(operator.mul, max_area, 1)
+        image = self._process_image(image, max_area)
+
+        height, width = image.height, image.width
+        generate_kwargs.pop("width", None)
+        generate_kwargs.pop("height", None)
+        self._process_progressor(generate_kwargs)
+        output = self._model(
+            image=image, prompt=prompt, height=height, width=width, **generate_kwargs
+        )
+        return self._output_to_video(output, fps, response_format)
+
+    def _process_image(self, image: PIL.Image, max_area: int) -> PIL.Image:
+        assert self._model is not None
+        aspect_ratio = image.height / image.width
+        mod_value = (
+            self._model.vae_scale_factor_spatial
+            * self._model.transformer.config.patch_size[1]
+        )
+        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+        return image.resize((width, height))
+
+    def _output_to_video(self, output: Any, fps: int, response_format: str):
+        import gc
+
+        # cv2 bug will cause the video cannot be normally displayed
+        # thus we use the imageio one
+        from diffusers.utils import export_to_video
+
+        from ...device_utils import empty_cache
 
         # clean cache
         gc.collect()
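The resizing rule in _process_image keeps the input aspect ratio, caps the total pixel count at max_area, and snaps both sides down to a multiple of mod_value (the VAE spatial scale factor times the transformer patch size) so the frame lines up with the model's latent grid. A self-contained sketch of the same arithmetic; the mod_value of 16 used below is an illustrative assumption:

    import numpy as np

    def fit_to_area(height: int, width: int, max_area: int, mod_value: int):
        # Mirrors _process_image: preserve aspect ratio, cap total pixels,
        # then round both sides down to a multiple of mod_value.
        aspect_ratio = height / width
        new_h = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
        new_w = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
        return int(new_h), int(new_w)

    print(fit_to_area(1080, 1920, 480 * 832, 16))  # -> (464, 832)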
--- a/xinference/model/video/model_spec.json
+++ b/xinference/model/video/model_spec.json
@@ -91,5 +91,59 @@
                 "numpy==1.26.4"
             ]
         }
+    },
+    {
+        "model_name": "Wan2.1-i2v-14B-480p",
+        "model_family": "Wan",
+        "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+        "model_revision": "b184e23a8a16b20f108f727c902e769e873ffc73",
+        "model_ability": [
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {
+            "max_area": [
+                480,
+                832
+            ]
+        },
+        "virtualenv": {
+            "packages": [
+                "diffusers>=0.33.0",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "numpy==1.26.4"
+            ]
+        }
+    },
+    {
+        "model_name": "Wan2.1-i2v-14B-720p",
+        "model_family": "Wan",
+        "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+        "model_revision": "eb849f76dfa246545b65774a9e25943ee69b3fa3",
+        "model_ability": [
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {
+            "max_area": [
+                720,
+                1280
+            ]
+        },
+        "virtualenv": {
+            "packages": [
+                "diffusers>=0.33.0",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "numpy==1.26.4"
+            ]
+        }
     }
 ]
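In these specs, max_area is stored as a pair like [480, 832] whose product bounds the output resolution; image_to_video above also accepts a string such as "480*832" and multiplies the factors with reduce(operator.mul, ...). A small sketch of that resolution step, mirroring the handling in image_to_video:

    import operator
    from functools import reduce

    def resolve_max_area(max_area):
        # Spec default is a list like [480, 832]; a request may pass "480*832" instead.
        if isinstance(max_area, str):
            max_area = [int(v) for v in max_area.split("*")]
        return reduce(operator.mul, max_area, 1)

    assert resolve_max_area([480, 832]) == resolve_max_area("480*832") == 399360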
--- a/xinference/model/video/model_spec_modelscope.json
+++ b/xinference/model/video/model_spec_modelscope.json
@@ -96,5 +96,61 @@
                 "numpy==1.26.4"
             ]
         }
+    },
+    {
+        "model_name": "Wan2.1-i2v-14B-480p",
+        "model_family": "Wan",
+        "model_hub": "modelscope",
+        "model_id": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers",
+        "model_revision": "master",
+        "model_ability": [
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {
+            "max_area": [
+                480,
+                832
+            ]
+        },
+        "virtualenv": {
+            "packages": [
+                "diffusers>=0.33.0",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "numpy==1.26.4"
+            ]
+        }
+    },
+    {
+        "model_name": "Wan2.1-i2v-14B-720p",
+        "model_family": "Wan",
+        "model_hub": "modelscope",
+        "model_id": "Wan-AI/Wan2.1-I2V-14B-720P-Diffusers",
+        "model_revision": "master",
+        "model_ability": [
+            "image2video"
+        ],
+        "default_model_config": {
+            "torch_dtype": "bfloat16"
+        },
+        "default_generate_config": {
+            "max_area": [
+                720,
+                1280
+            ]
+        },
+        "virtualenv": {
+            "packages": [
+                "diffusers>=0.33.0",
+                "ftfy",
+                "imageio-ffmpeg",
+                "imageio",
+                "numpy==1.26.4"
+            ]
+        }
     }
 ]
--- a/xinference/thirdparty/cosyvoice/bin/average_model.py
+++ b/xinference/thirdparty/cosyvoice/bin/average_model.py
@@ -75,10 +75,11 @@ def main():
         print('Processing {}'.format(path))
         states = torch.load(path, map_location=torch.device('cpu'))
         for k in states.keys():
-            if k not in avg.keys():
-                avg[k] = states[k].clone()
-            else:
-                avg[k] += states[k]
+            if k not in ['step', 'epoch']:
+                if k not in avg.keys():
+                    avg[k] = states[k].clone()
+                else:
+                    avg[k] += states[k]
     # average
     for k in avg.keys():
         if avg[k] is not None:
--- a/xinference/thirdparty/cosyvoice/bin/export_jit.py
+++ b/xinference/thirdparty/cosyvoice/bin/export_jit.py
@@ -23,7 +23,8 @@ import torch
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_args():
@@ -37,6 +38,16 @@
     return args
 
 
+def get_optimized_script(model, preserved_attrs=[]):
+    script = torch.jit.script(model)
+    if preserved_attrs != []:
+        script = torch.jit.freeze(script, preserved_attrs=preserved_attrs)
+    else:
+        script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    return script
+
+
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
@@ -46,28 +57,47 @@
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
 
-    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+        except Exception:
+            raise TypeError('no valid model_type!')
 
-    # 1. export llm text_encoder
-    llm_text_encoder = cosyvoice.model.llm.text_encoder.half()
-    script = torch.jit.script(llm_text_encoder)
-    script = torch.jit.freeze(script)
-    script = torch.jit.optimize_for_inference(script)
-    script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+    if not isinstance(model, CosyVoice2):
+        # 1. export llm text_encoder
+        llm_text_encoder = model.model.llm.text_encoder
+        script = get_optimized_script(llm_text_encoder)
+        script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_text_encoder.half())
+        script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_text_encoder')
 
-    # 2. export llm llm
-    llm_llm = cosyvoice.model.llm.llm.half()
-    script = torch.jit.script(llm_llm)
-    script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
-    script = torch.jit.optimize_for_inference(script)
-    script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        # 2. export llm llm
+        llm_llm = model.model.llm.llm
+        script = get_optimized_script(llm_llm, ['forward_chunk'])
+        script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
+        script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export llm_llm')
 
-    # 3. export flow encoder
-    flow_encoder = cosyvoice.model.flow.encoder
-    script = torch.jit.script(flow_encoder)
-    script = torch.jit.freeze(script)
-    script = torch.jit.optimize_for_inference(script)
-    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder)
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half())
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
+    else:
+        # 3. export flow encoder
+        flow_encoder = model.model.flow.encoder
+        script = get_optimized_script(flow_encoder, ['forward_chunk'])
+        script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(flow_encoder.half(), ['forward_chunk'])
+        script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+        logging.info('successfully export flow_encoder')
 
 
 if __name__ == '__main__':
--- a/xinference/thirdparty/cosyvoice/bin/export_onnx.py
+++ b/xinference/thirdparty/cosyvoice/bin/export_onnx.py
@@ -27,7 +27,8 @@ from tqdm import tqdm
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/../..'.format(ROOT_DIR))
 sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+from cosyvoice.utils.file_utils import logging
 
 
 def get_dummy_input(batch_size, seq_len, out_channels, device):
@@ -51,61 +52,145 @@
     return args
 
 
+@torch.no_grad()
 def main():
     args = get_args()
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(levelname)s %(message)s')
 
-    cosyvoice = CosyVoice(args.model_dir, load_jit=False, load_onnx=False)
-
-    # 1. export flow decoder estimator
-    estimator = cosyvoice.model.flow.decoder.estimator
-
-    device = cosyvoice.model.device
-    batch_size, seq_len = 1, 256
-    out_channels = cosyvoice.model.flow.decoder.estimator.out_channels
-    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
-    torch.onnx.export(
-        estimator,
-        (x, mask, mu, t, spks, cond),
-        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-        export_params=True,
-        opset_version=18,
-        do_constant_folding=True,
-        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
-        output_names=['estimator_out'],
-        dynamic_axes={
-            'x': {0: 'batch_size', 2: 'seq_len'},
-            'mask': {0: 'batch_size', 2: 'seq_len'},
-            'mu': {0: 'batch_size', 2: 'seq_len'},
-            'cond': {0: 'batch_size', 2: 'seq_len'},
-            't': {0: 'batch_size'},
-            'spks': {0: 'batch_size'},
-            'estimator_out': {0: 'batch_size', 2: 'seq_len'},
-        }
-    )
-
-    # 2. test computation consistency
-    option = onnxruntime.SessionOptions()
-    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-    option.intra_op_num_threads = 1
-    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
-    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
-                                                  sess_options=option, providers=providers)
-
-    for _ in tqdm(range(10)):
-        x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
-        output_pytorch = estimator(x, mask, mu, t, spks, cond)
-        ort_inputs = {
-            'x': x.cpu().numpy(),
-            'mask': mask.cpu().numpy(),
-            'mu': mu.cpu().numpy(),
-            't': t.cpu().numpy(),
-            'spks': spks.cpu().numpy(),
-            'cond': cond.cpu().numpy()
-        }
-        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
-        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            # NOTE set use_flow_cache=True when export jit for cache inference
+            model = CosyVoice2(args.model_dir, use_flow_cache=True)
+        except Exception:
+            raise TypeError('no valid model_type!')
+
+    if not isinstance(model, CosyVoice2):
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+            output_names=['estimator_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'estimator_out': {2: 'seq_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for _ in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            output_pytorch = estimator(x, mask, mu, t, spks, cond)
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+            torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
+    else:
+        # 1. export flow decoder estimator
+        estimator = model.model.flow.decoder.estimator
+        estimator.forward = estimator.forward_chunk
+        estimator.eval()
+
+        device = model.model.device
+        batch_size, seq_len = 2, 256
+        out_channels = model.model.flow.decoder.estimator.out_channels
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+        cache = model.model.init_flow_cache()['decoder_cache']
+        cache.pop('offset')
+        cache = {k: v[0] for k, v in cache.items()}
+        torch.onnx.export(
+            estimator,
+            (x, mask, mu, t, spks, cond,
+             cache['down_blocks_conv_cache'],
+             cache['down_blocks_kv_cache'],
+             cache['mid_blocks_conv_cache'],
+             cache['mid_blocks_kv_cache'],
+             cache['up_blocks_conv_cache'],
+             cache['up_blocks_kv_cache'],
+             cache['final_blocks_conv_cache']),
+            '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+            export_params=True,
+            opset_version=18,
+            do_constant_folding=True,
+            input_names=['x', 'mask', 'mu', 't', 'spks', 'cond', 'down_blocks_conv_cache', 'down_blocks_kv_cache', 'mid_blocks_conv_cache', 'mid_blocks_kv_cache',
+                         'up_blocks_conv_cache', 'up_blocks_kv_cache', 'final_blocks_conv_cache'],
+            output_names=['estimator_out', 'down_blocks_conv_cache_out', 'down_blocks_kv_cache_out', 'mid_blocks_conv_cache_out', 'mid_blocks_kv_cache_out',
+                          'up_blocks_conv_cache_out', 'up_blocks_kv_cache_out', 'final_blocks_conv_cache_out'],
+            dynamic_axes={
+                'x': {2: 'seq_len'},
+                'mask': {2: 'seq_len'},
+                'mu': {2: 'seq_len'},
+                'cond': {2: 'seq_len'},
+                'down_blocks_kv_cache': {3: 'cache_in_len'},
+                'mid_blocks_kv_cache': {3: 'cache_in_len'},
+                'up_blocks_kv_cache': {3: 'cache_in_len'},
+                'estimator_out': {2: 'seq_len'},
+                'down_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'mid_blocks_kv_cache_out': {3: 'cache_out_len'},
+                'up_blocks_kv_cache_out': {3: 'cache_out_len'},
+            }
+        )
+
+        # 2. test computation consistency
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                      sess_options=option, providers=providers)
+
+        for iter in tqdm(range(10)):
+            x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+            cache = model.model.init_flow_cache()['decoder_cache']
+            cache.pop('offset')
+            cache = {k: v[0] for k, v in cache.items()}
+            output_pytorch = estimator(x, mask, mu, t, spks, cond, **{k: v.clone() for k, v in cache.items()})
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy(),
+            }
+            output_onnx = estimator_onnx.run(None, {**ort_inputs, **{k: v.clone().cpu().numpy() for k, v in cache.items()}})
+            if iter == 0:
+                # NOTE why can not pass first iteration check?
+                continue
+            for i, j in zip(output_pytorch, output_onnx):
+                torch.testing.assert_allclose(i, torch.from_numpy(j).to(device), rtol=1e-2, atol=1e-4)
+        logging.info('successfully export estimator')
 
 
 if __name__ == "__main__":
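For reference, the exported flow.decoder.estimator.fp32.onnx can be exercised directly with onnxruntime. A minimal sketch for the non-streaming (CosyVoice) export; the tensor shapes are assumptions based on get_dummy_input (batch, mel channels, frames) and are not pinned down by this diff:

    import numpy as np
    import onnxruntime

    sess = onnxruntime.InferenceSession('flow.decoder.estimator.fp32.onnx',
                                        providers=['CPUExecutionProvider'])
    print([i.name for i in sess.get_inputs()])  # ['x', 'mask', 'mu', 't', 'spks', 'cond']

    batch, channels, seq_len = 2, 80, 256  # channels assumed to match the estimator's out_channels
    feeds = {
        'x': np.random.rand(batch, channels, seq_len).astype(np.float32),
        'mask': np.ones((batch, 1, seq_len), dtype=np.float32),
        'mu': np.random.rand(batch, channels, seq_len).astype(np.float32),
        't': np.random.rand(batch).astype(np.float32),
        'spks': np.random.rand(batch, channels).astype(np.float32),
        'cond': np.random.rand(batch, channels, seq_len).astype(np.float32),
    }
    estimator_out = sess.run(None, feeds)[0]
    print(estimator_out.shape)  # expected (batch, channels, seq_len)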