xinference 1.5.1__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (96)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +97 -8
  3. xinference/client/restful/restful_client.py +51 -11
  4. xinference/core/media_interface.py +758 -0
  5. xinference/core/model.py +49 -9
  6. xinference/core/worker.py +31 -37
  7. xinference/deploy/utils.py +0 -3
  8. xinference/model/audio/__init__.py +16 -27
  9. xinference/model/audio/core.py +1 -0
  10. xinference/model/audio/cosyvoice.py +4 -2
  11. xinference/model/audio/model_spec.json +20 -3
  12. xinference/model/audio/model_spec_modelscope.json +18 -1
  13. xinference/model/embedding/__init__.py +16 -24
  14. xinference/model/image/__init__.py +15 -25
  15. xinference/model/llm/__init__.py +37 -110
  16. xinference/model/llm/core.py +15 -6
  17. xinference/model/llm/llama_cpp/core.py +25 -353
  18. xinference/model/llm/llm_family.json +613 -89
  19. xinference/model/llm/llm_family.py +9 -1
  20. xinference/model/llm/llm_family_modelscope.json +540 -90
  21. xinference/model/llm/mlx/core.py +6 -3
  22. xinference/model/llm/reasoning_parser.py +281 -5
  23. xinference/model/llm/sglang/core.py +16 -3
  24. xinference/model/llm/transformers/chatglm.py +2 -2
  25. xinference/model/llm/transformers/cogagent.py +1 -1
  26. xinference/model/llm/transformers/cogvlm2.py +1 -1
  27. xinference/model/llm/transformers/core.py +9 -3
  28. xinference/model/llm/transformers/glm4v.py +1 -1
  29. xinference/model/llm/transformers/minicpmv26.py +1 -1
  30. xinference/model/llm/transformers/qwen-omni.py +6 -0
  31. xinference/model/llm/transformers/qwen_vl.py +1 -1
  32. xinference/model/llm/utils.py +68 -45
  33. xinference/model/llm/vllm/core.py +38 -18
  34. xinference/model/llm/vllm/xavier/test/test_xavier.py +1 -10
  35. xinference/model/rerank/__init__.py +13 -24
  36. xinference/model/video/__init__.py +15 -25
  37. xinference/model/video/core.py +3 -3
  38. xinference/model/video/diffusers.py +133 -16
  39. xinference/model/video/model_spec.json +54 -0
  40. xinference/model/video/model_spec_modelscope.json +56 -0
  41. xinference/thirdparty/cosyvoice/bin/average_model.py +5 -4
  42. xinference/thirdparty/cosyvoice/bin/export_jit.py +50 -20
  43. xinference/thirdparty/cosyvoice/bin/export_onnx.py +136 -51
  44. xinference/thirdparty/cosyvoice/bin/inference.py +15 -5
  45. xinference/thirdparty/cosyvoice/bin/train.py +7 -2
  46. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +72 -52
  47. xinference/thirdparty/cosyvoice/cli/frontend.py +58 -58
  48. xinference/thirdparty/cosyvoice/cli/model.py +140 -155
  49. xinference/thirdparty/cosyvoice/dataset/processor.py +9 -5
  50. xinference/thirdparty/cosyvoice/flow/decoder.py +656 -54
  51. xinference/thirdparty/cosyvoice/flow/flow.py +69 -11
  52. xinference/thirdparty/cosyvoice/flow/flow_matching.py +167 -63
  53. xinference/thirdparty/cosyvoice/flow/length_regulator.py +1 -0
  54. xinference/thirdparty/cosyvoice/hifigan/discriminator.py +91 -1
  55. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +4 -1
  56. xinference/thirdparty/cosyvoice/hifigan/generator.py +4 -1
  57. xinference/thirdparty/cosyvoice/hifigan/hifigan.py +2 -2
  58. xinference/thirdparty/cosyvoice/llm/llm.py +198 -18
  59. xinference/thirdparty/cosyvoice/transformer/embedding.py +12 -4
  60. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +124 -21
  61. xinference/thirdparty/cosyvoice/utils/class_utils.py +13 -0
  62. xinference/thirdparty/cosyvoice/utils/common.py +1 -1
  63. xinference/thirdparty/cosyvoice/utils/file_utils.py +40 -2
  64. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +7 -0
  65. xinference/thirdparty/cosyvoice/utils/mask.py +4 -0
  66. xinference/thirdparty/cosyvoice/utils/train_utils.py +5 -1
  67. xinference/thirdparty/matcha/hifigan/xutils.py +3 -3
  68. xinference/types.py +0 -71
  69. xinference/web/ui/build/asset-manifest.json +3 -3
  70. xinference/web/ui/build/index.html +1 -1
  71. xinference/web/ui/build/static/js/main.ae579a97.js +3 -0
  72. xinference/web/ui/build/static/js/main.ae579a97.js.map +1 -0
  73. xinference/web/ui/node_modules/.cache/babel-loader/0196a4b09e3264614e54360d5f832c46b31d964ec58296765ebff191ace6adbf.json +1 -0
  74. xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +1 -0
  75. xinference/web/ui/node_modules/.cache/babel-loader/18fa271456b31cded36c05c4c71c6b2b1cf4e4128c1e32f0e45d8b9f21764397.json +1 -0
  76. xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +1 -0
  77. xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +1 -0
  78. xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +1 -0
  79. xinference/web/ui/src/locales/en.json +6 -4
  80. xinference/web/ui/src/locales/zh.json +6 -4
  81. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/METADATA +56 -36
  82. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/RECORD +87 -87
  83. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/WHEEL +1 -1
  84. xinference/core/image_interface.py +0 -377
  85. xinference/thirdparty/cosyvoice/bin/export_trt.sh +0 -9
  86. xinference/web/ui/build/static/js/main.91e77b5c.js +0 -3
  87. xinference/web/ui/build/static/js/main.91e77b5c.js.map +0 -1
  88. xinference/web/ui/node_modules/.cache/babel-loader/0f0adb2283a8f469d097a7a0ebb754624fa52414c83b83696c41f2e6a737ceda.json +0 -1
  89. xinference/web/ui/node_modules/.cache/babel-loader/5e6edb0fb87e3798f142e9abf8dd2dc46bab33a60d31dff525797c0c99887097.json +0 -1
  90. xinference/web/ui/node_modules/.cache/babel-loader/6087820be1bd5c02c42dff797e7df365448ef35ab26dd5d6bd33e967e05cbfd4.json +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/8157db83995c671eb57abc316c337f867d1dc63fb83520bb4ff351fee57dcce2.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/f04f666b77b44d7be3e16034d6b0074de2ba9c254f1fae15222b3148608fa8b3.json +0 -1
  93. /xinference/web/ui/build/static/js/{main.91e77b5c.js.LICENSE.txt → main.ae579a97.js.LICENSE.txt} +0 -0
  94. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/entry_points.txt +0 -0
  95. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/licenses/LICENSE +0 -0
  96. {xinference-1.5.1.dist-info → xinference-1.6.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/flow/flow.py
@@ -91,6 +91,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
  conds = conds.transpose(1, 2)

  mask = (~make_pad_mask(feat_len)).to(h)
+ # NOTE this is unnecessary, feat/h already same shape
  feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
  loss, _ = self.decoder.compute_loss(
      feat.transpose(1, 2).contiguous(),
@@ -116,7 +117,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
  embedding = F.normalize(embedding, dim=1)
  embedding = self.spk_embed_affine_layer(embedding)

- # concat text and prompt_text
+ # concat speech token and prompt speech token
  token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
  token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
  mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
@@ -129,7 +130,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
  h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)

  # get conditions
- conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
  conds[:, :mel_len1] = prompt_feat
  conds = conds.transpose(1, 2)

@@ -141,11 +142,11 @@ class MaskedDiffWithXvec(torch.nn.Module):
              cond=conds,
              n_timesteps=10,
              prompt_len=mel_len1,
-             flow_cache=flow_cache
+             cache=flow_cache
          )
          feat = feat[:, :, mel_len1:]
          assert feat.shape[2] == mel_len2
-         return feat, flow_cache
+         return feat.float(), flow_cache


  class CausalMaskedDiffWithXvec(torch.nn.Module):
@@ -186,6 +187,53 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
      self.token_mel_ratio = token_mel_ratio
      self.pre_lookahead_len = pre_lookahead_len

+ def forward(
+         self,
+         batch: dict,
+         device: torch.device,
+ ) -> Dict[str, Optional[torch.Tensor]]:
+     token = batch['speech_token'].to(device)
+     token_len = batch['speech_token_len'].to(device)
+     feat = batch['speech_feat'].to(device)
+     feat_len = batch['speech_feat_len'].to(device)
+     embedding = batch['embedding'].to(device)
+
+     # NOTE unified training, static_chunk_size > 0 or = 0
+     streaming = True if random.random() < 0.5 else False
+
+     # xvec projection
+     embedding = F.normalize(embedding, dim=1)
+     embedding = self.spk_embed_affine_layer(embedding)
+
+     # concat text and prompt_text
+     mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
+     token = self.input_embedding(torch.clamp(token, min=0)) * mask
+
+     # text encode
+     h, h_lengths = self.encoder(token, token_len, streaming=streaming)
+     h = self.encoder_proj(h)
+
+     # get conditions
+     feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
+     conds = torch.zeros(feat.shape, device=token.device)
+     for i, j in enumerate(feat_len):
+         if random.random() < 0.5:
+             continue
+         index = random.randint(0, int(0.3 * j))
+         conds[i, :index] = feat[i, :index]
+     conds = conds.transpose(1, 2)
+
+     mask = (~make_pad_mask(h_lengths.sum(dim=-1).squeeze(dim=1))).to(h)
+     loss, _ = self.decoder.compute_loss(
+         feat.transpose(1, 2).contiguous(),
+         mask.unsqueeze(1),
+         h.transpose(1, 2).contiguous(),
+         embedding,
+         cond=conds,
+         streaming=streaming,
+     )
+     return {'loss': loss}
+
  @torch.inference_mode()
  def inference(self,
                token,
@@ -195,6 +243,7 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
                prompt_feat,
                prompt_feat_len,
                embedding,
+               cache,
                finalize):
      assert token.shape[0] == 1
      # xvec projection
@@ -207,25 +256,34 @@ class CausalMaskedDiffWithXvec(torch.nn.Module):
  token = self.input_embedding(torch.clamp(token, min=0)) * mask

  # text encode
- h, h_lengths = self.encoder(token, token_len)
- if finalize is False:
-     h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
+ if finalize is True:
+     h, h_lengths, encoder_cache = self.encoder.forward_chunk(token, token_len, **cache['encoder_cache'])
+ else:
+     token, context = token[:, :-self.pre_lookahead_len], token[:, -self.pre_lookahead_len:]
+     h, h_lengths, encoder_cache = self.encoder.forward_chunk(token, token_len, context=context, **cache['encoder_cache'])
+ cache['encoder_cache']['offset'] = encoder_cache[0]
+ cache['encoder_cache']['pre_lookahead_layer_conv2_cache'] = encoder_cache[1]
+ cache['encoder_cache']['encoders_kv_cache'] = encoder_cache[2]
+ cache['encoder_cache']['upsample_offset'] = encoder_cache[3]
+ cache['encoder_cache']['upsample_conv_cache'] = encoder_cache[4]
+ cache['encoder_cache']['upsample_kv_cache'] = encoder_cache[5]
  mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
  h = self.encoder_proj(h)

  # get conditions
- conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device)
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
  conds[:, :mel_len1] = prompt_feat
  conds = conds.transpose(1, 2)

  mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
- feat, _ = self.decoder(
+ feat, cache['decoder_cache'] = self.decoder(
      mu=h.transpose(1, 2).contiguous(),
      mask=mask.unsqueeze(1),
      spks=embedding,
      cond=conds,
-     n_timesteps=10
+     n_timesteps=10,
+     cache=cache['decoder_cache']
  )
  feat = feat[:, :, mel_len1:]
  assert feat.shape[2] == mel_len2
- return feat, None
+ return feat.float(), cache
xinference/thirdparty/cosyvoice/flow/flow_matching.py
@@ -11,7 +11,7 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import onnxruntime
+ import threading
  import torch
  import torch.nn.functional as F
  from matcha.models.components.flow_matching import BASECFM
@@ -31,9 +31,10 @@ class ConditionalCFM(BASECFM):
      in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
      # Just change the architecture of the estimator here
      self.estimator = estimator
+     self.lock = threading.Lock()

  @torch.inference_mode()
- def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, cache=torch.zeros(1, 80, 0, 2)):
      """Forward diffusion

      Args:
@@ -52,20 +53,20 @@ class ConditionalCFM(BASECFM):
              shape: (batch_size, n_feats, mel_timesteps)
      """

-     z = torch.randn_like(mu) * temperature
-     cache_size = flow_cache.shape[2]
+     z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
+     cache_size = cache.shape[2]
      # fix prompt and overlap part mu and z
      if cache_size != 0:
-         z[:, :, :cache_size] = flow_cache[:, :, :, 0]
-         mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
+         z[:, :, :cache_size] = cache[:, :, :, 0]
+         mu[:, :, :cache_size] = cache[:, :, :, 1]
      z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
      mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
-     flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
+     cache = torch.stack([z_cache, mu_cache], dim=-1)

      t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
      if self.t_scheduler == 'cosine':
          t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-     return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
+     return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), cache

  def solve_euler(self, x, t_span, mu, mask, spks, cond):
      """
@@ -89,36 +90,29 @@ class ConditionalCFM(BASECFM):
  # Or in future might add like a return_all_steps flag
  sol = []

- if self.inference_cfg_rate > 0:
-     # Do not use concat, it may cause memory format changed and trt infer with wrong results!
-     x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-     mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
-     mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
-     t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
-     spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
-     cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
- else:
-     x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond
+ # Do not use concat, it may cause memory format changed and trt infer with wrong results!
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
  for step in range(1, len(t_span)):
      # Classifier-Free Guidance inference introduced in VoiceBox
-     if self.inference_cfg_rate > 0:
-         x_in[:] = x
-         mask_in[:] = mask
-         mu_in[0] = mu
-         t_in[:] = t.unsqueeze(0)
-         spks_in[0] = spks
-         cond_in[0] = cond
-     else:
-         x_in, mask_in, mu_in, t_in, spks_in, cond_in = x, mask, mu, t, spks, cond
+     x_in[:] = x
+     mask_in[:] = mask
+     mu_in[0] = mu
+     t_in[:] = t.unsqueeze(0)
+     spks_in[0] = spks
+     cond_in[0] = cond
      dphi_dt = self.forward_estimator(
          x_in, mask_in,
          mu_in, t_in,
          spks_in,
          cond_in
      )
-     if self.inference_cfg_rate > 0:
-         dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
-         dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
+     dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
+     dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
      x = x + dt * dphi_dt
      t = t + dt
      sol.append(x)
@@ -129,36 +123,26 @@ class ConditionalCFM(BASECFM):

  def forward_estimator(self, x, mask, mu, t, spks, cond):
      if isinstance(self.estimator, torch.nn.Module):
-         return self.estimator.forward(x, mask, mu, t, spks, cond)
-     elif isinstance(self.estimator, onnxruntime.InferenceSession):
-         ort_inputs = {
-             'x': x.cpu().numpy(),
-             'mask': mask.cpu().numpy(),
-             'mu': mu.cpu().numpy(),
-             't': t.cpu().numpy(),
-             'spks': spks.cpu().numpy(),
-             'cond': cond.cpu().numpy()
-         }
-         output = self.estimator.run(None, ort_inputs)[0]
-         return torch.tensor(output, dtype=x.dtype, device=x.device)
+         return self.estimator(x, mask, mu, t, spks, cond)
      else:
-         self.estimator.set_input_shape('x', (2, 80, x.size(2)))
-         self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
-         self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
-         self.estimator.set_input_shape('t', (2,))
-         self.estimator.set_input_shape('spks', (2, 80))
-         self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
-         # run trt engine
-         self.estimator.execute_v2([x.contiguous().data_ptr(),
-                                    mask.contiguous().data_ptr(),
-                                    mu.contiguous().data_ptr(),
-                                    t.contiguous().data_ptr(),
-                                    spks.contiguous().data_ptr(),
-                                    cond.contiguous().data_ptr(),
-                                    x.data_ptr()])
+         with self.lock:
+             self.estimator.set_input_shape('x', (2, 80, x.size(2)))
+             self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
+             self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
+             self.estimator.set_input_shape('t', (2,))
+             self.estimator.set_input_shape('spks', (2, 80))
+             self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
+             # run trt engine
+             assert self.estimator.execute_v2([x.contiguous().data_ptr(),
+                                               mask.contiguous().data_ptr(),
+                                               mu.contiguous().data_ptr(),
+                                               t.contiguous().data_ptr(),
+                                               spks.contiguous().data_ptr(),
+                                               cond.contiguous().data_ptr(),
+                                               x.data_ptr()]) is True
      return x

- def compute_loss(self, x1, mask, mu, spks=None, cond=None):
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None, streaming=False):
      """Computes diffusion loss

      Args:
@@ -195,7 +179,7 @@ class ConditionalCFM(BASECFM):
  spks = spks * cfg_mask.view(-1, 1)
  cond = cond * cfg_mask.view(-1, 1, 1)

- pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond, streaming=streaming)
  loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
  return loss, y

@@ -206,7 +190,7 @@ class CausalConditionalCFM(ConditionalCFM):
      self.rand_noise = torch.randn([1, 80, 50 * 300])

  @torch.inference_mode()
- def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, cache={}):
      """Forward diffusion

      Args:
@@ -225,11 +209,131 @@ class CausalConditionalCFM(ConditionalCFM):
              shape: (batch_size, n_feats, mel_timesteps)
      """

-     z = self.rand_noise[:, :, :mu.size(2)].to(mu.device) * temperature
-     if self.fp16 is True:
-         z = z.half()
+     offset = cache.pop('offset')
+     z = self.rand_noise[:, :, :mu.size(2) + offset].to(mu.device).to(mu.dtype) * temperature
+     z = z[:, :, offset:]
+     offset += mu.size(2)
      # fix prompt and overlap part mu and z
      t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
      if self.t_scheduler == 'cosine':
          t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
-     return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
+     mel, cache = self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond, cache=cache)
+     cache['offset'] = offset
+     return mel, cache
+
+ def solve_euler(self, x, t_span, mu, mask, spks, cond, cache):
+     """
+     Fixed euler solver for ODEs.
+     Args:
+         x (torch.Tensor): random noise
+         t_span (torch.Tensor): n_timesteps interpolated
+             shape: (n_timesteps + 1,)
+         mu (torch.Tensor): output of encoder
+             shape: (batch_size, n_feats, mel_timesteps)
+         mask (torch.Tensor): output_mask
+             shape: (batch_size, 1, mel_timesteps)
+         spks (torch.Tensor, optional): speaker ids. Defaults to None.
+             shape: (batch_size, spk_emb_dim)
+         cond: Not used but kept for future purposes
+     """
+     t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
+     t = t.unsqueeze(dim=0)
+
+     # I am storing this because I can later plot it by putting a debugger here and saving it to a file
+     # Or in future might add like a return_all_steps flag
+     sol = []
+
+     # Do not use concat, it may cause memory format changed and trt infer with wrong results!
+     x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+     mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
+     mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+     t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
+     spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
+     cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
+     flow_cache_size = cache['down_blocks_kv_cache'].shape[4]
+     for step in range(1, len(t_span)):
+         # Classifier-Free Guidance inference introduced in VoiceBox
+         x_in[:] = x
+         mask_in[:] = mask
+         mu_in[0] = mu
+         t_in[:] = t.unsqueeze(0)
+         spks_in[0] = spks
+         cond_in[0] = cond
+         cache_step = {k: v[step - 1] for k, v in cache.items()}
+         dphi_dt, cache_step = self.forward_estimator(
+             x_in, mask_in,
+             mu_in, t_in,
+             spks_in,
+             cond_in,
+             cache_step
+         )
+         # NOTE if smaller than flow_cache_size, means last chunk, no need to cache
+         if flow_cache_size != 0 and x_in.shape[2] >= flow_cache_size:
+             cache['down_blocks_conv_cache'][step - 1] = cache_step[0]
+             cache['down_blocks_kv_cache'][step - 1] = cache_step[1][:, :, :, -flow_cache_size:]
+             cache['mid_blocks_conv_cache'][step - 1] = cache_step[2]
+             cache['mid_blocks_kv_cache'][step - 1] = cache_step[3][:, :, :, -flow_cache_size:]
+             cache['up_blocks_conv_cache'][step - 1] = cache_step[4]
+             cache['up_blocks_kv_cache'][step - 1] = cache_step[5][:, :, :, -flow_cache_size:]
+             cache['final_blocks_conv_cache'][step - 1] = cache_step[6]
+         dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
+         dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
+         x = x + dt * dphi_dt
+         t = t + dt
+         sol.append(x)
+         if step < len(t_span) - 1:
+             dt = t_span[step + 1] - t
+     return sol[-1].float(), cache
+
+ def forward_estimator(self, x, mask, mu, t, spks, cond, cache):
+     if isinstance(self.estimator, torch.nn.Module):
+         x, cache1, cache2, cache3, cache4, cache5, cache6, cache7 = self.estimator.forward_chunk(x, mask, mu, t, spks, cond, **cache)
+         cache = (cache1, cache2, cache3, cache4, cache5, cache6, cache7)
+     else:
+         with self.lock:
+             self.estimator.set_input_shape('x', (2, 80, x.size(2)))
+             self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
+             self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
+             self.estimator.set_input_shape('t', (2,))
+             self.estimator.set_input_shape('spks', (2, 80))
+             self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
+             self.estimator.set_input_shape('down_blocks_conv_cache', cache['down_blocks_conv_cache'].shape)
+             self.estimator.set_input_shape('down_blocks_kv_cache', cache['down_blocks_kv_cache'].shape)
+             self.estimator.set_input_shape('mid_blocks_conv_cache', cache['mid_blocks_conv_cache'].shape)
+             self.estimator.set_input_shape('mid_blocks_kv_cache', cache['mid_blocks_kv_cache'].shape)
+             self.estimator.set_input_shape('up_blocks_conv_cache', cache['up_blocks_conv_cache'].shape)
+             self.estimator.set_input_shape('up_blocks_kv_cache', cache['up_blocks_kv_cache'].shape)
+             self.estimator.set_input_shape('final_blocks_conv_cache', cache['final_blocks_conv_cache'].shape)
+             # run trt engine
+             down_blocks_kv_cache_out = torch.zeros(1, 4, 2, x.size(2), 512, 2).to(x)
+             mid_blocks_kv_cache_out = torch.zeros(12, 4, 2, x.size(2), 512, 2).to(x)
+             up_blocks_kv_cache_out = torch.zeros(1, 4, 2, x.size(2), 512, 2).to(x)
+             assert self.estimator.execute_v2([x.contiguous().data_ptr(),
+                                               mask.contiguous().data_ptr(),
+                                               mu.contiguous().data_ptr(),
+                                               t.contiguous().data_ptr(),
+                                               spks.contiguous().data_ptr(),
+                                               cond.contiguous().data_ptr(),
+                                               cache['down_blocks_conv_cache'].contiguous().data_ptr(),
+                                               cache['down_blocks_kv_cache'].contiguous().data_ptr(),
+                                               cache['mid_blocks_conv_cache'].contiguous().data_ptr(),
+                                               cache['mid_blocks_kv_cache'].contiguous().data_ptr(),
+                                               cache['up_blocks_conv_cache'].contiguous().data_ptr(),
+                                               cache['up_blocks_kv_cache'].contiguous().data_ptr(),
+                                               cache['final_blocks_conv_cache'].contiguous().data_ptr(),
+                                               x.data_ptr(),
+                                               cache['down_blocks_conv_cache'].data_ptr(),
+                                               down_blocks_kv_cache_out.data_ptr(),
+                                               cache['mid_blocks_conv_cache'].data_ptr(),
+                                               mid_blocks_kv_cache_out.data_ptr(),
+                                               cache['up_blocks_conv_cache'].data_ptr(),
+                                               up_blocks_kv_cache_out.data_ptr(),
+                                               cache['final_blocks_conv_cache'].data_ptr()]) is True
+             cache = (cache['down_blocks_conv_cache'],
+                      down_blocks_kv_cache_out,
+                      cache['mid_blocks_conv_cache'],
+                      mid_blocks_kv_cache_out,
+                      cache['up_blocks_conv_cache'],
+                      up_blocks_kv_cache_out,
+                      cache['final_blocks_conv_cache'])
+     return x, cache
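
For context, the solve_euler loops above always push a batch of two through the estimator (conditional inputs at index 0, unconditional at index 1) and then blend the two predictions with classifier-free guidance. A minimal sketch of that blending step, using illustrative names and shapes that are not part of the package:

    import torch

    def cfg_mix(cond_pred: torch.Tensor, uncond_pred: torch.Tensor, cfg_rate: float) -> torch.Tensor:
        # Extrapolate from the unconditional prediction toward the conditional one,
        # mirroring the dphi_dt update in the loops above.
        return (1.0 + cfg_rate) * cond_pred - cfg_rate * uncond_pred

    estimator_out = torch.randn(2, 80, 120)                        # stand-in for one estimator call on the batch of two
    cond_pred, uncond_pred = torch.split(estimator_out, [1, 1], dim=0)
    dphi_dt = cfg_mix(cond_pred, uncond_pred, cfg_rate=0.7)
    print(dphi_dt.shape)                                           # torch.Size([1, 80, 120])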
xinference/thirdparty/cosyvoice/flow/length_regulator.py
@@ -51,6 +51,7 @@ class InterpolateRegulator(nn.Module):

  def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
      # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
+     # NOTE 20 corresponds to token_overlap_len in cosyvoice/cli/model.py
      # x in (B, T, D)
      if x2.shape[1] > 40:
          x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
xinference/thirdparty/cosyvoice/hifigan/discriminator.py
@@ -1,10 +1,16 @@
  import torch
  import torch.nn as nn
- from torch.nn.utils import weight_norm
+ import torch.nn.functional as F
+ try:
+     from torch.nn.utils.parametrizations import weight_norm, spectral_norm
+ except ImportError:
+     from torch.nn.utils import weight_norm, spectral_norm
  from typing import List, Optional, Tuple
  from einops import rearrange
  from torchaudio.transforms import Spectrogram

+ LRELU_SLOPE = 0.1
+

  class MultipleDiscriminator(nn.Module):
      def __init__(
@@ -138,3 +144,87 @@ class DiscriminatorR(nn.Module):
          x += h

          return x, fmap
+
+
+ class MultiResSpecDiscriminator(torch.nn.Module):
+
+     def __init__(self,
+                  fft_sizes=[1024, 2048, 512],
+                  hop_sizes=[120, 240, 50],
+                  win_lengths=[600, 1200, 240],
+                  window="hann_window"):
+
+         super(MultiResSpecDiscriminator, self).__init__()
+         self.discriminators = nn.ModuleList([
+             SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
+             SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
+             SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window)])
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for _, d in enumerate(self.discriminators):
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             fmap_rs.append(fmap_r)
+             y_d_gs.append(y_d_g)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ def stft(x, fft_size, hop_size, win_length, window):
+     """Perform STFT and convert to magnitude spectrogram.
+     Args:
+         x (Tensor): Input signal tensor (B, T).
+         fft_size (int): FFT size.
+         hop_size (int): Hop size.
+         win_length (int): Window length.
+         window (str): Window function type.
+     Returns:
+         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+     """
+     x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
+
+     # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+     return torch.abs(x_stft).transpose(2, 1)
+
+
+ class SpecDiscriminator(nn.Module):
+     """docstring for Discriminator."""
+
+     def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window", use_spectral_norm=False):
+         super(SpecDiscriminator, self).__init__()
+         norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+         self.fft_size = fft_size
+         self.shift_size = shift_size
+         self.win_length = win_length
+         self.window = getattr(torch, window)(win_length)
+         self.discriminators = nn.ModuleList([
+             norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
+             norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
+             norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
+             norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
+             norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
+         ])
+
+         self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
+
+     def forward(self, y):
+
+         fmap = []
+         y = y.squeeze(1)
+         y = stft(y, self.fft_size, self.shift_size, self.win_length, self.window.to(y.device))
+         y = y.unsqueeze(1)
+         for _, d in enumerate(self.discriminators):
+             y = d(y)
+             y = F.leaky_relu(y, LRELU_SLOPE)
+             fmap.append(y)
+
+         y = self.out(y)
+         fmap.append(y)
+
+         return torch.flatten(y, 1, -1), fmap
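
The stft helper added above returns a magnitude spectrogram shaped (B, #frames, fft_size // 2 + 1). A quick shape check under the same defaults SpecDiscriminator uses (illustrative values only, not taken from the package's tests):

    import torch

    x = torch.randn(2, 22050)                      # (B, T) batch of 1-second waveforms
    window = torch.hann_window(600)
    x_stft = torch.stft(x, 1024, 120, 600, window, return_complex=True)
    mag = torch.abs(x_stft).transpose(2, 1)        # matches the helper's return value
    print(mag.shape)                               # (2, #frames, 513)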
xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py
@@ -13,7 +13,10 @@
  # limitations under the License.
  import torch
  import torch.nn as nn
- from torch.nn.utils import weight_norm
+ try:
+     from torch.nn.utils.parametrizations import weight_norm
+ except ImportError:
+     from torch.nn.utils import weight_norm


  class ConvRNNF0Predictor(nn.Module):
xinference/thirdparty/cosyvoice/hifigan/generator.py
@@ -23,7 +23,10 @@ import torch.nn.functional as F
  from torch.nn import Conv1d
  from torch.nn import ConvTranspose1d
  from torch.nn.utils import remove_weight_norm
- from torch.nn.utils import weight_norm
+ try:
+     from torch.nn.utils.parametrizations import weight_norm
+ except ImportError:
+     from torch.nn.utils import weight_norm
  from torch.distributions.uniform import Uniform

  from cosyvoice.transformer.activation import Snake
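
The try/except blocks above (discriminator.py, f0_predictor.py, generator.py) all guard the same relocation: recent PyTorch releases expose weight_norm under torch.nn.utils.parametrizations, while older ones only ship torch.nn.utils.weight_norm. A minimal standalone sketch of that compatibility pattern, assuming PyTorch >= 2.1 for the first branch:

    import torch
    import torch.nn as nn

    try:
        from torch.nn.utils.parametrizations import weight_norm   # newer PyTorch
    except ImportError:
        from torch.nn.utils import weight_norm                    # older PyTorch fallback

    conv = weight_norm(nn.Conv1d(80, 256, kernel_size=3, padding=1))
    y = conv(torch.randn(1, 80, 100))
    print(y.shape)                                                 # torch.Size([1, 256, 100])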
xinference/thirdparty/cosyvoice/hifigan/hifigan.py
@@ -41,7 +41,7 @@ class HiFiGan(nn.Module):
  loss_fm = feature_loss(fmap_rs, fmap_gs)
  loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
  if self.tpr_loss_weight != 0:
-     loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+     loss_tpr = tpr_loss(y_d_gs, y_d_rs, self.tpr_loss_tau)
  else:
      loss_tpr = torch.zeros(1).to(device)
  loss_f0 = F.l1_loss(generated_f0, pitch_feat)
@@ -56,7 +56,7 @@ class HiFiGan(nn.Module):
  with torch.no_grad():
      generated_speech, generated_f0 = self.generator(batch, device)
  # 2. calculate discriminator outputs
- y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech.detach())
  # 3. calculate discriminator losses, tpr losses [Optional]
  loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
  if self.tpr_loss_weight != 0:
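
The .detach() added above follows the usual GAN training split: the discriminator loss must not backpropagate into the generator. A generic illustration of that pattern, not the package's actual trainer:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    G, D = nn.Linear(8, 8), nn.Linear(8, 1)
    real, z = torch.randn(4, 8), torch.randn(4, 8)
    fake = G(z)

    # Discriminator step: detach the generated sample so only D receives gradients.
    d_loss = F.binary_cross_entropy_with_logits(
        torch.cat([D(real), D(fake.detach())]),
        torch.cat([torch.ones(4, 1), torch.zeros(4, 1)]),
    )
    d_loss.backward()
    assert all(p.grad is None for p in G.parameters())   # generator untouched by the D step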