xinference 0.14.3__py3-none-any.whl → 0.14.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (70)
  1. xinference/_version.py +3 -3
  2. xinference/core/worker.py +18 -9
  3. xinference/model/audio/chattts.py +4 -3
  4. xinference/model/audio/cosyvoice.py +4 -3
  5. xinference/model/audio/custom.py +4 -5
  6. xinference/model/embedding/core.py +2 -0
  7. xinference/model/embedding/custom.py +4 -5
  8. xinference/model/flexible/core.py +5 -1
  9. xinference/model/image/custom.py +4 -5
  10. xinference/model/image/stable_diffusion/core.py +21 -6
  11. xinference/model/llm/llm_family.py +5 -6
  12. xinference/model/llm/sglang/core.py +7 -1
  13. xinference/model/llm/transformers/core.py +2 -0
  14. xinference/model/llm/utils.py +3 -0
  15. xinference/model/llm/vllm/core.py +0 -33
  16. xinference/model/rerank/custom.py +4 -5
  17. xinference/model/utils.py +41 -1
  18. xinference/model/video/core.py +3 -1
  19. xinference/model/video/diffusers.py +41 -38
  20. xinference/model/video/model_spec.json +24 -1
  21. xinference/model/video/model_spec_modelscope.json +25 -1
  22. xinference/thirdparty/fish_speech/tools/api.py +1 -1
  23. xinference/thirdparty/matcha/__init__.py +0 -0
  24. xinference/thirdparty/matcha/app.py +357 -0
  25. xinference/thirdparty/matcha/cli.py +419 -0
  26. xinference/thirdparty/matcha/data/__init__.py +0 -0
  27. xinference/thirdparty/matcha/data/components/__init__.py +0 -0
  28. xinference/thirdparty/matcha/data/text_mel_datamodule.py +274 -0
  29. xinference/thirdparty/matcha/hifigan/__init__.py +0 -0
  30. xinference/thirdparty/matcha/hifigan/config.py +28 -0
  31. xinference/thirdparty/matcha/hifigan/denoiser.py +64 -0
  32. xinference/thirdparty/matcha/hifigan/env.py +17 -0
  33. xinference/thirdparty/matcha/hifigan/meldataset.py +217 -0
  34. xinference/thirdparty/matcha/hifigan/models.py +368 -0
  35. xinference/thirdparty/matcha/hifigan/xutils.py +60 -0
  36. xinference/thirdparty/matcha/models/__init__.py +0 -0
  37. xinference/thirdparty/matcha/models/baselightningmodule.py +210 -0
  38. xinference/thirdparty/matcha/models/components/__init__.py +0 -0
  39. xinference/thirdparty/matcha/models/components/decoder.py +443 -0
  40. xinference/thirdparty/matcha/models/components/flow_matching.py +132 -0
  41. xinference/thirdparty/matcha/models/components/text_encoder.py +410 -0
  42. xinference/thirdparty/matcha/models/components/transformer.py +316 -0
  43. xinference/thirdparty/matcha/models/matcha_tts.py +244 -0
  44. xinference/thirdparty/matcha/onnx/__init__.py +0 -0
  45. xinference/thirdparty/matcha/onnx/export.py +181 -0
  46. xinference/thirdparty/matcha/onnx/infer.py +168 -0
  47. xinference/thirdparty/matcha/text/__init__.py +53 -0
  48. xinference/thirdparty/matcha/text/cleaners.py +121 -0
  49. xinference/thirdparty/matcha/text/numbers.py +71 -0
  50. xinference/thirdparty/matcha/text/symbols.py +17 -0
  51. xinference/thirdparty/matcha/train.py +122 -0
  52. xinference/thirdparty/matcha/utils/__init__.py +5 -0
  53. xinference/thirdparty/matcha/utils/audio.py +82 -0
  54. xinference/thirdparty/matcha/utils/generate_data_statistics.py +112 -0
  55. xinference/thirdparty/matcha/utils/get_durations_from_trained_model.py +195 -0
  56. xinference/thirdparty/matcha/utils/instantiators.py +56 -0
  57. xinference/thirdparty/matcha/utils/logging_utils.py +53 -0
  58. xinference/thirdparty/matcha/utils/model.py +90 -0
  59. xinference/thirdparty/matcha/utils/monotonic_align/__init__.py +22 -0
  60. xinference/thirdparty/matcha/utils/monotonic_align/core.pyx +47 -0
  61. xinference/thirdparty/matcha/utils/monotonic_align/setup.py +7 -0
  62. xinference/thirdparty/matcha/utils/pylogger.py +21 -0
  63. xinference/thirdparty/matcha/utils/rich_utils.py +101 -0
  64. xinference/thirdparty/matcha/utils/utils.py +259 -0
  65. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/METADATA +20 -12
  66. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/RECORD +70 -28
  67. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/LICENSE +0 -0
  68. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/WHEEL +0 -0
  69. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/entry_points.txt +0 -0
  70. {xinference-0.14.3.dist-info → xinference-0.14.4.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/xinference/thirdparty/matcha/models/components/text_encoder.py
@@ -0,0 +1,410 @@
+""" from https://github.com/jaywalnut310/glow-tts """
+
+import math
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+import matcha.utils as utils
+from matcha.utils.model import sequence_mask
+
+log = utils.get_pylogger(__name__)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-4):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = torch.nn.Parameter(torch.ones(channels))
+        self.beta = torch.nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        n_dims = len(x.shape)
+        mean = torch.mean(x, 1, keepdim=True)
+        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+        x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+        shape = [1, -1] + [1] * (n_dims - 2)
+        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+        return x
+
+
+class ConvReluNorm(nn.Module):
+    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.p_dropout = p_dropout
+
+        self.conv_layers = torch.nn.ModuleList()
+        self.norm_layers = torch.nn.ModuleList()
+        self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+        self.norm_layers.append(LayerNorm(hidden_channels))
+        self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
+        for _ in range(n_layers - 1):
+            self.conv_layers.append(
+                torch.nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
+            )
+            self.norm_layers.append(LayerNorm(hidden_channels))
+        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
+        self.proj.weight.data.zero_()
+        self.proj.bias.data.zero_()
+
+    def forward(self, x, x_mask):
+        x_org = x
+        for i in range(self.n_layers):
+            x = self.conv_layers[i](x * x_mask)
+            x = self.norm_layers[i](x)
+            x = self.relu_drop(x)
+        x = x_org + self.proj(x)
+        return x * x_mask
+
+
+class DurationPredictor(nn.Module):
+    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.p_dropout = p_dropout
+
+        self.drop = torch.nn.Dropout(p_dropout)
+        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
+        self.norm_1 = LayerNorm(filter_channels)
+        self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
+        self.norm_2 = LayerNorm(filter_channels)
+        self.proj = torch.nn.Conv1d(filter_channels, 1, 1)
+
+    def forward(self, x, x_mask):
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_1(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_2(x)
+        x = self.drop(x)
+        x = self.proj(x * x_mask)
+        return x * x_mask
+
+
+class RotaryPositionalEmbeddings(nn.Module):
+    """
+    ## RoPE module
+
+    Rotary encoding transforms pairs of features by rotating in the 2D plane.
+    That is, it organizes the $d$ features as $\frac{d}{2}$ pairs.
+    Each pair can be considered a coordinate in a 2D plane, and the encoding will rotate it
+    by an angle depending on the position of the token.
+    """
+
+    def __init__(self, d: int, base: int = 10_000):
+        r"""
+        * `d` is the number of features $d$
+        * `base` is the constant used for calculating $\Theta$
+        """
+        super().__init__()
+
+        self.base = base
+        self.d = int(d)
+        self.cos_cached = None
+        self.sin_cached = None
+
+    def _build_cache(self, x: torch.Tensor):
+        r"""
+        Cache $\cos$ and $\sin$ values
+        """
+        # Return if cache is already built
+        if self.cos_cached is not None and x.shape[0] <= self.cos_cached.shape[0]:
+            return
+
+        # Get sequence length
+        seq_len = x.shape[0]
+
+        # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        theta = 1.0 / (self.base ** (torch.arange(0, self.d, 2).float() / self.d)).to(x.device)
+
+        # Create position indexes `[0, 1, ..., seq_len - 1]`
+        seq_idx = torch.arange(seq_len, device=x.device).float().to(x.device)
+
+        # Calculate the product of position index and $\theta_i$
+        idx_theta = torch.einsum("n,d->nd", seq_idx, theta)
+
+        # Concatenate so that for row $m$ we have
+        # $[m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}, m \theta_0, m \theta_1, ..., m \theta_{\frac{d}{2}}]$
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=1)
+
+        # Cache them
+        self.cos_cached = idx_theta2.cos()[:, None, None, :]
+        self.sin_cached = idx_theta2.sin()[:, None, None, :]
+
+    def _neg_half(self, x: torch.Tensor):
+        # $\frac{d}{2}$
+        d_2 = self.d // 2
+
+        # Calculate $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
+        return torch.cat([-x[:, :, :, d_2:], x[:, :, :, :d_2]], dim=-1)
+
+    def forward(self, x: torch.Tensor):
+        """
+        * `x` is the Tensor at the head of a key or a query with shape `[seq_len, batch_size, n_heads, d]`
+        """
+        # Cache $\cos$ and $\sin$ values
+        x = rearrange(x, "b h t d -> t b h d")
+
+        self._build_cache(x)
+
+        # Split the features, we can choose to apply rotary embeddings only to a partial set of features.
+        x_rope, x_pass = x[..., : self.d], x[..., self.d :]
+
+        # Calculate
+        # $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
+        neg_half_x = self._neg_half(x_rope)
+
+        x_rope = (x_rope * self.cos_cached[: x.shape[0]]) + (neg_half_x * self.sin_cached[: x.shape[0]])
+
+        return rearrange(torch.cat((x_rope, x_pass), dim=-1), "t b h d -> b h t d")
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        n_heads,
+        heads_share=True,
+        p_dropout=0.0,
+        proximal_bias=False,
+        proximal_init=False,
+    ):
+        super().__init__()
+        assert channels % n_heads == 0
+
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.heads_share = heads_share
+        self.proximal_bias = proximal_bias
+        self.p_dropout = p_dropout
+        self.attn = None
+
+        self.k_channels = channels // n_heads
+        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+
+        # from https://nn.labml.ai/transformers/rope/index.html
+        self.query_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5)
+        self.key_rotary_pe = RotaryPositionalEmbeddings(self.k_channels * 0.5)
+
+        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+        self.drop = torch.nn.Dropout(p_dropout)
+
+        torch.nn.init.xavier_uniform_(self.conv_q.weight)
+        torch.nn.init.xavier_uniform_(self.conv_k.weight)
+        if proximal_init:
+            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
+            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
+        torch.nn.init.xavier_uniform_(self.conv_v.weight)
+
+    def forward(self, x, c, attn_mask=None):
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+        x = self.conv_o(x)
+        return x
+
+    def attention(self, query, key, value, mask=None):
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = rearrange(query, "b (h c) t-> b h t c", h=self.n_heads)
+        key = rearrange(key, "b (h c) t-> b h t c", h=self.n_heads)
+        value = rearrange(value, "b (h c) t-> b h t c", h=self.n_heads)
+
+        query = self.query_rotary_pe(query)
+        key = self.key_rotary_pe(key)
+
+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
+
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, -1e4)
+        p_attn = torch.nn.functional.softmax(scores, dim=-1)
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+        return output, p_attn
+
+    @staticmethod
+    def _attention_bias_proximal(length):
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
+        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.drop = torch.nn.Dropout(p_dropout)
+
+    def forward(self, x, x_mask):
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        return x * x_mask
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+
+        self.drop = torch.nn.Dropout(p_dropout)
+        self.attn_layers = torch.nn.ModuleList()
+        self.norm_layers_1 = torch.nn.ModuleList()
+        self.ffn_layers = torch.nn.ModuleList()
+        self.norm_layers_2 = torch.nn.ModuleList()
+        for _ in range(self.n_layers):
+            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(
+                    hidden_channels,
+                    hidden_channels,
+                    filter_channels,
+                    kernel_size,
+                    p_dropout=p_dropout,
+                )
+            )
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        for i in range(self.n_layers):
+            x = x * x_mask
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+
+
+class TextEncoder(nn.Module):
+    def __init__(
+        self,
+        encoder_type,
+        encoder_params,
+        duration_predictor_params,
+        n_vocab,
+        n_spks=1,
+        spk_emb_dim=128,
+    ):
+        super().__init__()
+        self.encoder_type = encoder_type
+        self.n_vocab = n_vocab
+        self.n_feats = encoder_params.n_feats
+        self.n_channels = encoder_params.n_channels
+        self.spk_emb_dim = spk_emb_dim
+        self.n_spks = n_spks
+
+        self.emb = torch.nn.Embedding(n_vocab, self.n_channels)
+        torch.nn.init.normal_(self.emb.weight, 0.0, self.n_channels**-0.5)
+
+        if encoder_params.prenet:
+            self.prenet = ConvReluNorm(
+                self.n_channels,
+                self.n_channels,
+                self.n_channels,
+                kernel_size=5,
+                n_layers=3,
+                p_dropout=0.5,
+            )
+        else:
+            self.prenet = lambda x, x_mask: x
+
+        self.encoder = Encoder(
+            encoder_params.n_channels + (spk_emb_dim if n_spks > 1 else 0),
+            encoder_params.filter_channels,
+            encoder_params.n_heads,
+            encoder_params.n_layers,
+            encoder_params.kernel_size,
+            encoder_params.p_dropout,
+        )
+
+        self.proj_m = torch.nn.Conv1d(self.n_channels + (spk_emb_dim if n_spks > 1 else 0), self.n_feats, 1)
+        self.proj_w = DurationPredictor(
+            self.n_channels + (spk_emb_dim if n_spks > 1 else 0),
+            duration_predictor_params.filter_channels_dp,
+            duration_predictor_params.kernel_size,
+            duration_predictor_params.p_dropout,
+        )
+
+    def forward(self, x, x_lengths, spks=None):
+        """Run forward pass to the transformer based encoder and duration predictor
+
+        Args:
+            x (torch.Tensor): text input
+                shape: (batch_size, max_text_length)
+            x_lengths (torch.Tensor): text input lengths
+                shape: (batch_size,)
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size,)
+
+        Returns:
+            mu (torch.Tensor): average output of the encoder
+                shape: (batch_size, n_feats, max_text_length)
+            logw (torch.Tensor): log duration predicted by the duration predictor
+                shape: (batch_size, 1, max_text_length)
+            x_mask (torch.Tensor): mask for the text input
+                shape: (batch_size, 1, max_text_length)
+        """
+        x = self.emb(x) * math.sqrt(self.n_channels)
+        x = torch.transpose(x, 1, -1)
+        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+
+        x = self.prenet(x, x_mask)
+        if self.n_spks > 1:
+            x = torch.cat([x, spks.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)
+        x = self.encoder(x, x_mask)
+        mu = self.proj_m(x) * x_mask
+
+        x_dp = torch.detach(x)
+        logw = self.proj_w(x_dp, x_mask)
+
+        return mu, logw, x_mask
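
The RotaryPositionalEmbeddings docstring in the added file describes rotating feature pairs by a position-dependent angle. The following standalone sketch (not part of the diff or of the vendored matcha code; it assumes only torch is installed and uses a hypothetical rope() helper) applies the same pairwise rotation to a [seq_len, d] tensor and checks that, being a rotation, it preserves each position's feature norm:

    # Standalone sketch of the RoPE idea from the docstring above; hypothetical helper.
    import torch

    def rope(x: torch.Tensor, base: float = 10_000.0) -> torch.Tensor:
        """Rotate feature pairs (x_i, x_{i+d/2}) by position-dependent angles."""
        seq_len, d = x.shape
        theta = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))      # [d/2] frequencies
        angles = torch.outer(torch.arange(seq_len).float(), theta)       # [seq_len, d/2]
        cos = torch.cat([angles, angles], dim=-1).cos()                  # [seq_len, d]
        sin = torch.cat([angles, angles], dim=-1).sin()
        neg_half = torch.cat([-x[:, d // 2:], x[:, : d // 2]], dim=-1)   # mirrors _neg_half
        return x * cos + neg_half * sin

    q = torch.randn(8, 16)
    # A rotation preserves the norm of each position's feature vector.
    assert torch.allclose(rope(q).norm(dim=-1), q.norm(dim=-1), atol=1e-5)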