xinference 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (132)
  1. xinference/_compat.py +1 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +54 -1
  4. xinference/client/restful/restful_client.py +82 -2
  5. xinference/constants.py +3 -0
  6. xinference/core/chat_interface.py +297 -83
  7. xinference/core/model.py +24 -3
  8. xinference/core/progress_tracker.py +16 -8
  9. xinference/core/supervisor.py +51 -1
  10. xinference/core/worker.py +315 -47
  11. xinference/deploy/cmdline.py +33 -1
  12. xinference/model/audio/core.py +11 -1
  13. xinference/model/audio/megatts.py +105 -0
  14. xinference/model/audio/model_spec.json +24 -1
  15. xinference/model/audio/model_spec_modelscope.json +26 -1
  16. xinference/model/core.py +14 -0
  17. xinference/model/embedding/core.py +6 -1
  18. xinference/model/flexible/core.py +6 -1
  19. xinference/model/image/core.py +6 -1
  20. xinference/model/image/model_spec.json +17 -1
  21. xinference/model/image/model_spec_modelscope.json +17 -1
  22. xinference/model/llm/__init__.py +4 -6
  23. xinference/model/llm/core.py +5 -0
  24. xinference/model/llm/llama_cpp/core.py +46 -17
  25. xinference/model/llm/llm_family.json +530 -85
  26. xinference/model/llm/llm_family.py +24 -1
  27. xinference/model/llm/llm_family_modelscope.json +572 -1
  28. xinference/model/llm/mlx/core.py +16 -2
  29. xinference/model/llm/reasoning_parser.py +3 -3
  30. xinference/model/llm/sglang/core.py +111 -13
  31. xinference/model/llm/transformers/__init__.py +14 -0
  32. xinference/model/llm/transformers/core.py +31 -6
  33. xinference/model/llm/transformers/deepseek_vl.py +1 -1
  34. xinference/model/llm/transformers/deepseek_vl2.py +287 -0
  35. xinference/model/llm/transformers/gemma3.py +17 -2
  36. xinference/model/llm/transformers/intern_vl.py +28 -18
  37. xinference/model/llm/transformers/minicpmv26.py +21 -2
  38. xinference/model/llm/transformers/qwen-omni.py +308 -0
  39. xinference/model/llm/transformers/qwen2_audio.py +1 -1
  40. xinference/model/llm/transformers/qwen2_vl.py +20 -4
  41. xinference/model/llm/utils.py +37 -15
  42. xinference/model/llm/vllm/core.py +184 -8
  43. xinference/model/llm/vllm/distributed_executor.py +320 -0
  44. xinference/model/rerank/core.py +22 -12
  45. xinference/model/utils.py +118 -1
  46. xinference/model/video/core.py +6 -1
  47. xinference/thirdparty/deepseek_vl2/__init__.py +31 -0
  48. xinference/thirdparty/deepseek_vl2/models/__init__.py +26 -0
  49. xinference/thirdparty/deepseek_vl2/models/configuration_deepseek.py +210 -0
  50. xinference/thirdparty/deepseek_vl2/models/conversation.py +310 -0
  51. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek.py +1975 -0
  52. xinference/thirdparty/deepseek_vl2/models/modeling_deepseek_vl_v2.py +697 -0
  53. xinference/thirdparty/deepseek_vl2/models/processing_deepseek_vl_v2.py +675 -0
  54. xinference/thirdparty/deepseek_vl2/models/siglip_vit.py +661 -0
  55. xinference/thirdparty/deepseek_vl2/serve/__init__.py +0 -0
  56. xinference/thirdparty/deepseek_vl2/serve/app_modules/__init__.py +0 -0
  57. xinference/thirdparty/deepseek_vl2/serve/app_modules/gradio_utils.py +83 -0
  58. xinference/thirdparty/deepseek_vl2/serve/app_modules/overwrites.py +81 -0
  59. xinference/thirdparty/deepseek_vl2/serve/app_modules/presets.py +115 -0
  60. xinference/thirdparty/deepseek_vl2/serve/app_modules/utils.py +333 -0
  61. xinference/thirdparty/deepseek_vl2/serve/assets/Kelpy-Codos.js +100 -0
  62. xinference/thirdparty/deepseek_vl2/serve/assets/avatar.png +0 -0
  63. xinference/thirdparty/deepseek_vl2/serve/assets/custom.css +355 -0
  64. xinference/thirdparty/deepseek_vl2/serve/assets/custom.js +22 -0
  65. xinference/thirdparty/deepseek_vl2/serve/assets/favicon.ico +0 -0
  66. xinference/thirdparty/deepseek_vl2/serve/assets/simsun.ttc +0 -0
  67. xinference/thirdparty/deepseek_vl2/serve/inference.py +197 -0
  68. xinference/thirdparty/deepseek_vl2/utils/__init__.py +18 -0
  69. xinference/thirdparty/deepseek_vl2/utils/io.py +80 -0
  70. xinference/thirdparty/megatts3/__init__.py +0 -0
  71. xinference/thirdparty/megatts3/tts/frontend_function.py +175 -0
  72. xinference/thirdparty/megatts3/tts/gradio_api.py +93 -0
  73. xinference/thirdparty/megatts3/tts/infer_cli.py +277 -0
  74. xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py +318 -0
  75. xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py +362 -0
  76. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/layers.py +64 -0
  77. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/nar_tts_modules.py +73 -0
  78. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rel_transformer.py +403 -0
  79. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/rot_transformer.py +649 -0
  80. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/seq_utils.py +342 -0
  81. xinference/thirdparty/megatts3/tts/modules/ar_dur/commons/transformer.py +767 -0
  82. xinference/thirdparty/megatts3/tts/modules/llm_dit/cfm.py +309 -0
  83. xinference/thirdparty/megatts3/tts/modules/llm_dit/dit.py +180 -0
  84. xinference/thirdparty/megatts3/tts/modules/llm_dit/time_embedding.py +44 -0
  85. xinference/thirdparty/megatts3/tts/modules/llm_dit/transformer.py +230 -0
  86. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/diag_gaussian.py +67 -0
  87. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/hifigan_modules.py +283 -0
  88. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/seanet_encoder.py +38 -0
  89. xinference/thirdparty/megatts3/tts/modules/wavvae/decoder/wavvae_v3.py +60 -0
  90. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/conv.py +154 -0
  91. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/lstm.py +51 -0
  92. xinference/thirdparty/megatts3/tts/modules/wavvae/encoder/common_modules/seanet.py +126 -0
  93. xinference/thirdparty/megatts3/tts/utils/audio_utils/align.py +36 -0
  94. xinference/thirdparty/megatts3/tts/utils/audio_utils/io.py +95 -0
  95. xinference/thirdparty/megatts3/tts/utils/audio_utils/plot.py +90 -0
  96. xinference/thirdparty/megatts3/tts/utils/commons/ckpt_utils.py +171 -0
  97. xinference/thirdparty/megatts3/tts/utils/commons/hparams.py +215 -0
  98. xinference/thirdparty/megatts3/tts/utils/text_utils/dict.json +1 -0
  99. xinference/thirdparty/megatts3/tts/utils/text_utils/ph_tone_convert.py +94 -0
  100. xinference/thirdparty/megatts3/tts/utils/text_utils/split_text.py +90 -0
  101. xinference/thirdparty/megatts3/tts/utils/text_utils/text_encoder.py +280 -0
  102. xinference/types.py +10 -0
  103. xinference/utils.py +54 -0
  104. xinference/web/ui/build/asset-manifest.json +6 -6
  105. xinference/web/ui/build/index.html +1 -1
  106. xinference/web/ui/build/static/css/main.0f6523be.css +2 -0
  107. xinference/web/ui/build/static/css/main.0f6523be.css.map +1 -0
  108. xinference/web/ui/build/static/js/main.58bd483c.js +3 -0
  109. xinference/web/ui/build/static/js/main.58bd483c.js.map +1 -0
  110. xinference/web/ui/node_modules/.cache/babel-loader/3bff8cbe9141f937f4d98879a9771b0f48e0e4e0dbee8e647adbfe23859e7048.json +1 -0
  111. xinference/web/ui/node_modules/.cache/babel-loader/4500b1a622a031011f0a291701e306b87e08cbc749c50e285103536b85b6a914.json +1 -0
  112. xinference/web/ui/node_modules/.cache/babel-loader/51709f5d3e53bcf19e613662ef9b91fb9174942c5518987a248348dd4e1e0e02.json +1 -0
  113. xinference/web/ui/node_modules/.cache/babel-loader/69081049f0c7447544b7cfd73dd13d8846c02fe5febe4d81587e95c89a412d5b.json +1 -0
  114. xinference/web/ui/node_modules/.cache/babel-loader/b8551e9775a01b28ae674125c688febe763732ea969ae344512e64ea01bf632e.json +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/bf2b211b0d1b6465eff512d64c869d748f803c5651a7c24e48de6ea3484a7bfe.json +1 -0
  116. xinference/web/ui/src/locales/en.json +2 -1
  117. xinference/web/ui/src/locales/zh.json +2 -1
  118. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/METADATA +128 -115
  119. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/RECORD +124 -63
  120. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/WHEEL +1 -1
  121. xinference/web/ui/build/static/css/main.b494ae7e.css +0 -2
  122. xinference/web/ui/build/static/css/main.b494ae7e.css.map +0 -1
  123. xinference/web/ui/build/static/js/main.3cea968e.js +0 -3
  124. xinference/web/ui/build/static/js/main.3cea968e.js.map +0 -1
  125. xinference/web/ui/node_modules/.cache/babel-loader/27bcada3ee8f89d21184b359f022fc965f350ffaca52c9814c29f1fc37121173.json +0 -1
  126. xinference/web/ui/node_modules/.cache/babel-loader/7f59e45e3f268ab8a4788b6fb024cf8dab088736dff22f5a3a39c122a83ab930.json +0 -1
  127. xinference/web/ui/node_modules/.cache/babel-loader/dcd60488509450bfff37bfff56de2c096d51de17dd00ec60d4db49c8b483ada1.json +0 -1
  128. xinference/web/ui/node_modules/.cache/babel-loader/e547bbb18abb4a474b675a8d5782d25617566bea0af8caa9b836ce5649e2250a.json +0 -1
  129. /xinference/web/ui/build/static/js/{main.3cea968e.js.LICENSE.txt → main.58bd483c.js.LICENSE.txt} +0 -0
  130. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/entry_points.txt +0 -0
  131. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info/licenses}/LICENSE +0 -0
  132. {xinference-1.4.0.dist-info → xinference-1.5.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/megatts3/tts/modules/aligner/whisper_small.py
@@ -0,0 +1,318 @@
+ # MIT License
+
+ # Copyright (c) 2022 OpenAI
+
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+
+ # Copyright (c) [2022] [OpenAI]
+ # Copyright (c) [2025] [Ziyue Jiang]
+ # SPDX-License-Identifier: MIT
+ # This file has been modified by Ziyue Jiang on 2025/03/19
+ # Original file was released under MIT, with the full license text # available at https://github.com/openai/whisper/blob/v20240930/LICENSE.
+ # This modified file is released under the same license.
+
+ from contextlib import contextmanager
+ from typing import Dict, Iterable, Optional, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+
+ from torch.nn.functional import scaled_dot_product_attention
+ SDPA_AVAILABLE = True
+
+
+ class LayerNorm(nn.LayerNorm):
+     def forward(self, x: Tensor) -> Tensor:
+         return super().forward(x.float()).type(x.dtype)
+
+
+ class Linear(nn.Linear):
+     def forward(self, x: Tensor) -> Tensor:
+         return F.linear(
+             x,
+             self.weight.to(x.dtype),
+             None if self.bias is None else self.bias.to(x.dtype),
+         )
+
+
+ class Conv1d(nn.Conv1d):
+     def _conv_forward(
+         self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+     ) -> Tensor:
+         return super()._conv_forward(
+             x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+         )
+
+
+ def sinusoids(length, channels, max_timescale=10000):
+     """Returns sinusoids for positional embedding"""
+     assert channels % 2 == 0
+     log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+     inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+ @contextmanager
+ def disable_sdpa():
+     prev_state = MultiHeadAttention.use_sdpa
+     try:
+         MultiHeadAttention.use_sdpa = False
+         yield
+     finally:
+         MultiHeadAttention.use_sdpa = prev_state
+
+
+ class MultiHeadAttention(nn.Module):
+     use_sdpa = True
+
+     def __init__(self, n_state: int, n_head: int):
+         super().__init__()
+         self.n_head = n_head
+         self.query = Linear(n_state, n_state)
+         self.key = Linear(n_state, n_state, bias=False)
+         self.value = Linear(n_state, n_state)
+         self.out = Linear(n_state, n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None
+     ):
+         q = self.query(x)
+
+         if kv_cache is None or xa is None or self.key not in kv_cache:
+             # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+             # otherwise, perform key/value projections for self- or cross-attention as usual.
+             k = self.key(x if xa is None else xa)
+             v = self.value(x if xa is None else xa)
+         else:
+             # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+             k = kv_cache[self.key]
+             v = kv_cache[self.value]
+
+         wv = self.qkv_attention(q, k, v, mask, casual)
+         return self.out(wv)
+
+     def qkv_attention(
+         self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None, casual: Optional[bool] = None
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         n_batch, n_ctx, n_state = q.shape
+         scale = (n_state // self.n_head) ** -0.25
+         q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+         a = scaled_dot_product_attention(
+             q, k, v, is_causal=casual and n_ctx > 1, attn_mask=mask[:, None, None, :] if mask is not None else None
+         )
+         out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
+         return out
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+         super().__init__()
+
+         self.attn = MultiHeadAttention(n_state, n_head)
+         self.attn_ln = LayerNorm(n_state)
+
+         self.cross_attn = (
+             MultiHeadAttention(n_state, n_head) if cross_attention else None
+         )
+         self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+
+         n_mlp = n_state * 4
+         self.mlp = nn.Sequential(
+             Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+         )
+         self.mlp_ln = LayerNorm(n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         xa: Optional[Tensor] = None,
+         mask: Optional[Tensor] = None,
+         kv_cache: Optional[dict] = None,
+         casual: Optional[bool] = None,
+     ):
+         x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache, casual=casual)
+         if self.cross_attn:
+             # TODO: Cross attention mask
+             x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache, casual=False)
+         x = x + self.mlp(self.mlp_ln(x))
+         return x
+
+
+ class AudioEncoder(nn.Module):
+     def __init__(
+         self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+         self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+         self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+         self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+         )
+         self.ln_post = LayerNorm(n_state)
+
+     def forward(self, x: Tensor, attn_mask: Tensor):
+         """
+         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+             the mel spectrogram of the audio
+         """
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = x.permute(0, 2, 1)
+
+         # assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
+         x = (x + self.positional_embedding[:x.size(1)]).to(x.dtype)
+
+         for block in self.blocks:
+             x = block(x, mask=attn_mask, casual=False)
+
+         x = self.ln_post(x)
+         return x
+
+
+ class TextDecoder(nn.Module):
+     def __init__(
+         self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+
+         self.token_embedding = nn.Embedding(n_vocab, n_state)
+         self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [
+                 ResidualAttentionBlock(n_state, n_head, cross_attention=True)
+                 for _ in range(n_layer)
+             ]
+         )
+         self.ln = LayerNorm(n_state)
+
+         self.out_proj = nn.Linear(n_state, n_vocab)
+
+     def forward(self, x: Tensor, attn_mask: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+         """
+         x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+             the text tokens
+         xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
+             the encoded audio features to be attended on
+         """
+         offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+         x = (
+             self.token_embedding(x)
+             + self.positional_embedding[offset : offset + x.shape[-1]]
+         )
+         x = x.to(xa.dtype)
+
+         for block in self.blocks:
+             x = block(x, xa, mask=attn_mask, kv_cache=kv_cache, casual=True)
+
+         x = self.ln(x)
+         # logits = (
+         #     x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
+         # ).float()
+         logits = self.out_proj(x)
+
+         return logits
+
+
+ class Whisper(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.n_vocab = 6800
+         self.n_text_layer = 6
+         self.n_text_head = 8
+         self.n_text_ctx = 2048
+
+         self.encoder = AudioEncoder(
+             n_mels=80, n_ctx=3000, n_state=512, n_head=8, n_layer=6,
+         )
+         self.decoder = TextDecoder(
+             n_vocab=6800, n_ctx=2048, n_state=512, n_head=8, n_layer=6,
+         )
+
+     def embed_audio(self, mel: torch.Tensor):
+         return self.encoder(mel, None)
+
+     def logits(self, tokens, audio_features, kv_cache=None):
+         return self.decoder(tokens, None, audio_features, kv_cache=kv_cache)
+
+     def forward(
+         self, mel, mel_len, token, token_len
+     ) -> Dict[str, torch.Tensor]:
+         attn_mask_enc = self.sequence_mask(mel_len//2, device=mel.device) > 0
+         attn_mask_dec = self.sequence_mask(token_len, device=mel.device) > 0
+         return self.decoder(token, attn_mask_dec, self.encoder(mel, attn_mask_enc))
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+         """
+         The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+         tensors calculated for the previous positions. This method returns a dictionary that stores
+         all caches, and the necessary hooks for the key and value projection modules that save the
+         intermediate tensors to be reused during later calculations.
+
+         Returns
+         -------
+         cache : Dict[nn.Module, torch.Tensor]
+             A dictionary object mapping the key/value projection modules to its cache
+         hooks : List[RemovableHandle]
+             List of PyTorch RemovableHandle objects to stop the hooks to be called
+         """
+         cache = {**cache} if cache is not None else {}
+         hooks = []
+
+         def save_to_cache(module, _, output):
+             if module not in cache or output.shape[1] > self.n_text_ctx:
+                 # save as-is, for the first token or cross attention
+                 cache[module] = output
+             else:
+                 cache[module] = torch.cat([cache[module], output], dim=1).detach()
+             return cache[module]
+
+         def install_hooks(layer: nn.Module):
+             if isinstance(layer, MultiHeadAttention):
+                 hooks.append(layer.key.register_forward_hook(save_to_cache))
+                 hooks.append(layer.value.register_forward_hook(save_to_cache))
+
+         self.decoder.apply(install_hooks)
+         return cache, hooks
+
+     def sequence_mask(self, seq_lens, max_len=None, device='cpu'):
+         b = seq_lens.shape[0]
+         if max_len is None:
+             max_len = seq_lens.max()
+         mask = torch.arange(max_len).unsqueeze(0).to(device) # [1, t]
+         mask = mask < (seq_lens.unsqueeze(1)) # [1, t] + [b, 1] = [b, t]
+         mask = mask.float()
+         return mask
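
For orientation, below is a minimal, hypothetical sketch (not part of the package diff) of how the vendored aligner above fits together. It uses randomly initialized weights and dummy tensors, so it only demonstrates the expected tensor shapes, not real alignment:

    import torch

    model = Whisper().eval()                   # 80-mel encoder, 6800-token decoder (hard-coded above)
    mel = torch.randn(1, 80, 3000)             # (batch, n_mels, frames)
    tokens = torch.randint(1, 6800, (1, 16))   # (batch, text_len)

    with torch.no_grad():
        audio_features = model.embed_audio(mel)        # (1, 1500, 512); conv2 halves the frame axis
        logits = model.logits(tokens, audio_features)  # (1, 16, 6800)

    # For incremental decoding, install_kv_cache_hooks() returns a cache dict plus
    # forward hooks that append each step's key/value projections for later reuse.
    cache, hooks = model.install_kv_cache_hooks()
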
xinference/thirdparty/megatts3/tts/modules/ar_dur/ar_dur_predictor.py
@@ -0,0 +1,362 @@
+ # Copyright 2025 ByteDance and/or its affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import random
+ from copy import deepcopy
+
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ from torch.nn import Linear
+ from tqdm import tqdm
+
+ from tts.modules.ar_dur.commons.layers import Embedding, LayerNorm
+ from tts.modules.ar_dur.commons.nar_tts_modules import PosEmb
+ from tts.modules.ar_dur.commons.rot_transformer import RotTransformerDecoderLayer
+ from tts.modules.ar_dur.commons.transformer import SinusoidalPositionalEmbedding
+ from tts.modules.ar_dur.commons.rel_transformer import RelTransformerEncoder
+
+ FS_ENCODERS = {
+     'rel_fft': lambda hp, dict_size: RelTransformerEncoder(
+         dict_size, hp['hidden_size'], hp['hidden_size'],
+         hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'],
+         hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']),
+ }
+
+ def fill_with_neg_inf2(t):
+     """FP16-compatible function that fills a tensor with -inf."""
+     return t.float().fill_(-1e8).type_as(t)
+
+ def expand_states(h, mel2token):
+     h = F.pad(h, [0, 0, 1, 0])
+     mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]])
+     h = torch.gather(h, 1, mel2token_)  # [B, T, H]
+     return h
+
+
+ class CodePredictor(nn.Module):
+     def __init__(self, hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size):
+         super().__init__()
+         self.hparams = deepcopy(hparams)
+         self.hparams['hidden_size'] = hidden_size
+         self.hidden_size = hidden_size
+         char_dict_size = hparams.get('char_dict_size', 4000)
+         if not hparams.get('lm_use_enc'):
+             self.encoder = nn.Embedding(dict_size, self.hidden_size, padding_idx=0)
+             if hparams.get('mega_use_char', True):
+                 self.char_encoder = nn.Embedding(char_dict_size,
+                                                  self.hidden_size, padding_idx=0)
+         else:
+             self.encoder = FS_ENCODERS[self.hparams['encoder_type']](self.hparams, dict_size)
+             if hparams.get('mega_use_char', True):
+                 self.char_encoder = FS_ENCODERS[self.hparams['encoder_type']](self.hparams, char_dict_size)
+             if hparams['use_ph_pos_embed']:
+                 self.ph_pos_embed = PosEmb(self.hidden_size)
+
+         self.char_empty_embed = nn.Embedding(1, self.hidden_size)
+         if hparams.get('use_bert_input'):
+             self.bert_input_proj = nn.Linear(768, self.hidden_size)
+         self.ling_label_embed_layers = nn.ModuleDict()
+         for k, s in zip(hparams['ling_labels'], hparams['ling_label_dict_size']):
+             self.ling_label_embed_layers[k] = Embedding(s + 3, self.hidden_size, padding_idx=0)
+
+         self.dec_hidden_size = dec_hidden_size
+         self.enc_proj = nn.Linear(self.hidden_size, dec_hidden_size)
+         self.code_emb = Embedding(code_size + 2, dec_hidden_size, 0)
+         self.use_pos_embed = hparams.get('use_pos_embed', False)
+         if self.use_pos_embed:
+             self.embed_positions = SinusoidalPositionalEmbedding(dec_hidden_size, 0, init_size=1024)
+         self.use_post_ln = hparams.get('use_post_ln', False)
+         self.layers = None
+         if not self.use_post_ln:
+             self.layer_norm = LayerNorm(dec_hidden_size)
+         self.code_size = code_size
+         self.project_out_dim = Linear(dec_hidden_size, code_size + 1, bias=True)
+
+     def forward_ling_encoder(
+             self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed, spk_id, spk_embed, mels_timbre):
+         ph_tokens = txt_tokens
+         hparams = self.hparams
+         ph_nonpadding = (ph_tokens > 0).float()[:, :, None]  # [B, T_phone, 1]
+         x_spk = self.forward_style_embed(spk_embed, spk_id, mels_timbre)
+
+         # enc_ph
+         if not hparams.get('lm_use_enc'):
+             x_ph = self.encoder(ph_tokens)
+             x_ph = x_ph + sum(
+                 [self.ling_label_embed_layers[k](ling_feas[k]) for k in hparams['ling_labels']]) \
+                 if len(hparams['ling_labels']) > 0 else 0
+             x_ph = x_ph + x_spk
+         else:
+             # enc_ph
+             ph_enc_oembed = sum(
+                 [self.ling_label_embed_layers[k](ling_feas[k]) for k in hparams['ling_labels']]) \
+                 if len(hparams['ling_labels']) > 0 else 0
+             ph_enc_oembed = ph_enc_oembed + self.ph_pos_embed(
+                 torch.arange(0, ph_tokens.shape[1])[None,].to(ph_tokens.device))
+             ph_enc_oembed = ph_enc_oembed + x_spk
+             ph_enc_oembed = ph_enc_oembed * ph_nonpadding
+             x_ph = self.encoder(ph_tokens, other_embeds=ph_enc_oembed)
+
+         # enc_char
+         if char_tokens is not None and ph2char is not None:
+             char_nonpadding = (char_tokens > 0).float()[:, :, None]
+             x_char = self.char_encoder(char_tokens)
+             empty_char = (ph2char > 100000).long()
+             ph2char = ph2char * (1 - empty_char)
+             x_char_phlevel = \
+                 expand_states(x_char * char_nonpadding, ph2char) \
+                 * (1 - empty_char)[..., None] + \
+                 self.char_empty_embed(torch.zeros_like(ph_tokens)) * empty_char[..., None]
+         else:
+             x_char_phlevel = 0
+         # x_ling
+         x_ling = x_ph + x_char_phlevel
+         x_ling = x_ling * ph_nonpadding
+         x_ling = self.enc_proj(x_ling)
+         return x_ling
+
+     def sample_one_step(self, vq_pred):
+         hparams = self.hparams
+         if hparams.get('infer_top_k'):
+             top_k = hparams.get('infer_top_k')
+             temperature = hparams.get('infer_temperature', 1)
+             vq_pred = vq_pred[:, -1] / temperature
+             # optionally crop the logits to only the top k options
+             if top_k is not None:
+                 v, _ = torch.topk(vq_pred, min(top_k, vq_pred.size(-1)))
+                 vq_pred[vq_pred < v[:, [-1]]] = -float('Inf')
+             # apply softmax to convert logits to (normalized) probabilities
+             probs = F.softmax(vq_pred, dim=-1)
+             # sample from the distribution
+             vq_pred = torch.multinomial(probs, num_samples=1)
+         else:
+             vq_pred = torch.argmax(F.softmax(vq_pred[:, -1], dim=-1), 1)
+         return vq_pred
+
+     def forward_style_embed(self, spk_embed=None, spk_id=None, mel_ref=None):
+         # add spk embed
+         style_embed = 0
+         if self.hparams['use_spk_embed']:
+             style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :]
+         if self.hparams['use_spk_id']:
+             style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :]
+         if self.hparams['use_spk_enc']:
+             style_embed = style_embed + self.spk_enc(mel_ref)[:, None, :]
+         return style_embed
+
+     def buffered_future_mask(self, tensor):
+         dim = tensor.size(0)
+         if (
+                 not hasattr(self, '_future_mask')
+                 or self._future_mask is None
+                 or self._future_mask.device != tensor.device
+                 or self._future_mask.size(0) < dim
+         ):
+             self._future_mask = torch.triu(fill_with_neg_inf2(tensor.new(dim, dim)), 1)
+         return self._future_mask[:dim, :dim]
+
+
+ class ARDurPredictor(CodePredictor):
+     def __init__(self, hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size, use_rot_embed=True,
+                  op_version=1):
+         super().__init__(hparams, hidden_size, dec_hidden_size, lm_num_layers, dict_size, code_size)
+         self.use_rot_embed = use_rot_embed
+         bias = hparams.get('lm_bias', True)
+         if self.use_rot_embed:
+             self.layers = nn.ModuleList([])
+             self.layers.extend([
+                 RotTransformerDecoderLayer(
+                     dec_hidden_size, 0.0, kernel_size=1, ffn_hidden_size=dec_hidden_size * 4,
+                     post_ln=self.use_post_ln, op_version=op_version, bias=bias)
+                 for _ in range(lm_num_layers)
+             ])
+         if hparams['dur_model_type'] == 'ar_mse':
+             self.project_out_dim = nn.Sequential(torch.nn.Linear(dec_hidden_size, 1), nn.Softplus())
+         else:
+             self.project_out_dim = torch.nn.Linear(dec_hidden_size, code_size + 1)
+
+     def forward(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+                 prev_code, spk_id=None, spk_embed=None, mels_timbre=None, mel2ph=None,
+                 incremental_state=None, x_ling=None, attn_mask=None, spk_pos_ids_flat=None,
+                 prompt_length=None, cache_size=20, streaming=False):
+         x = self.code_emb(prev_code)
+         if x_ling is None:
+             x_ling = self.forward_ling_encoder(
+                 txt_tokens, ling_feas, char_tokens, ph2char, bert_embed, spk_id, spk_embed, mels_timbre)
+             x_ling = x_ling.flatten(0, 1)
+             txt_tokens = txt_tokens.flatten(0, 1)
+             x_ling = x_ling[txt_tokens > 0][None]
+
+         # run decoder
+         self_attn_padding_mask = None
+         if self.use_pos_embed:
+             positions = self.embed_positions(
+                 prev_code,
+                 incremental_state=incremental_state
+             )
+         if incremental_state is not None:
+             x_ling = x_ling[:, x.shape[1] - 1:x.shape[1]]
+             if spk_pos_ids_flat is not None:
+                 spk_pos_ids_flat = spk_pos_ids_flat[:, x.shape[1] - 1:x.shape[1]]
+             x = x[:, -1:]
+             if self.use_pos_embed:
+                 positions = positions[:, -1:]
+             if streaming:
+                 # Shift Pos: query pos is min(cache_size, idx)
+                 spk_pos_ids_flat = torch.min(torch.LongTensor([prompt_length + cache_size]).to(x.device),
+                                              spk_pos_ids_flat)
+
+         # # B x T x C -> T x B x C
+         if self.use_pos_embed:
+             x = x + positions
+         x_ling = x_ling[:, :self.hparams['max_tokens']].contiguous()
+         T = min(self.hparams.get('max_tokens_per_item', 1e9), x_ling.shape[1])
+         x_ling = x_ling.reshape(-1, T, x_ling.shape[-1])
+         x = x + x_ling
+         x = x.transpose(0, 1)
+
+         for idx, layer in enumerate(self.layers):
+             if incremental_state is None:
+                 self_attn_mask = self.buffered_future_mask(x)
+                 if attn_mask is not None:
+                     self_attn_mask = self_attn_mask + (1 - attn_mask.float()) * -1e8
+                 self_attn_mask = self_attn_mask.clamp_min(-1e8)
+             else:
+                 self_attn_mask = None
+
+             x, attn_weights = layer(
+                 x,
+                 incremental_state=incremental_state,
+                 self_attn_mask=self_attn_mask,
+                 self_attn_padding_mask=self_attn_padding_mask,
+                 spk_pos_ids_flat=spk_pos_ids_flat
+             )
+
+             if streaming and incremental_state != {}:
+                 for k, v in incremental_state.items():
+                     if 'attn_state' in k:
+                         prev_key, prev_value = incremental_state[k]['prev_key'], incremental_state[k]['prev_value']
+                         cur_length = prev_key.shape[2]
+                         if cur_length - prompt_length > cache_size:
+                             prev_key = torch.cat((prev_key[:, :, :prompt_length], prev_key[:, :, -cache_size:]), dim=2)
+                             prev_value = torch.cat((prev_value[:, :, :prompt_length], prev_value[:, :, -cache_size:]),
+                                                    dim=2)
+                         incremental_state[k]['prev_key'], incremental_state[k]['prev_value'] = prev_key, prev_value
+
+         if not self.use_post_ln:
+             x = self.layer_norm(x)
+         # T x B x C -> B x T x C
+         x = x.transpose(0, 1)
+         x = self.project_out_dim(x)
+         return x
+
+     def infer(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+               spk_id=None, spk_embed=None, mels_timbre=None,
+               incremental_state=None, ctx_vqcodes=None, spk_pos_ids_flat=None, return_state=False,
+               first_step_min=0, return_probs=False, first_decoder_inp=None, dur_disturb=0.0, **kwargs):
+         if incremental_state is None:
+             incremental_state = {}
+         x_ling = self.forward_ling_encoder(
+             txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+             spk_id, spk_embed, mels_timbre)
+         x_ling = x_ling.flatten(0, 1)
+         txt_tokens_ori = txt_tokens
+         txt_tokens_withpad = txt_tokens = txt_tokens.flatten(0, 1)
+         x_ling = x_ling[txt_tokens > 0][None]
+         txt_tokens = txt_tokens[txt_tokens > 0][None]
+
+         decoded = torch.zeros_like(txt_tokens)
+         decoded = F.pad(decoded, [1, 0], value=self.code_size + 1)
+         if incremental_state != {}:
+             if first_decoder_inp is None:
+                 assert ctx_vqcodes is not None
+                 decoded[:, :ctx_vqcodes.shape[1]] = ctx_vqcodes
+                 ctx_vqcodes = None
+             else:
+                 decoded[:, :1] = first_decoder_inp
+         probs = []
+         for step in range(decoded.shape[1] - 1):
+             vq_pred = self(txt_tokens, None, None, None, None,
+                            decoded[:, :step + 1], None, None, None,
+                            incremental_state=incremental_state, x_ling=x_ling,
+                            spk_pos_ids_flat=spk_pos_ids_flat, **kwargs)
+             probs.append(vq_pred.cpu())
+             if ctx_vqcodes is None or step >= ctx_vqcodes.shape[1]:
+                 if self.hparams['dur_model_type'] == 'ar_mse':
+                     d = vq_pred[:, -1, 0]
+                     if dur_disturb > 0 and step >= 1:
+                         if random.random() > 0.5:
+                             d = d * (1 + random.random() * dur_disturb)
+                         else:
+                             d = d / (1 + random.random() * dur_disturb)
+                     d = torch.clamp_max(d, self.code_size - 1)
+                     vq_pred = torch.round(d).long()
+                 else:
+                     vq_pred = self.sample_one_step(vq_pred)
+                 decoded[:, step + 1] = torch.clamp_min(vq_pred, 1)
+                 if step == 0:
+                     decoded[:, step + 1] = torch.clamp_min(vq_pred, first_step_min)
+             else:
+                 decoded[:, step + 1] = ctx_vqcodes[:, step]
+         decoded = decoded[:, 1:]
+         decoded_2d = torch.zeros_like(txt_tokens_ori)
+         decoded_2d.flatten(0, 1)[txt_tokens_withpad > 0] = decoded
+         if return_state:
+             return decoded_2d, incremental_state
+         if return_probs:
+             return decoded_2d, torch.cat(probs, 1)
+         return decoded_2d
+
+     def streaming_infer(self, txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+                         spk_id=None, spk_embed=None, mels_timbre=None,
+                         incremental_state=None, ctx_vqcodes=None, spk_pos_ids_flat=None, return_state=False,
+                         **kwargs):
+         if incremental_state is None:
+             incremental_state = {}
+         x_ling = self.forward_ling_encoder(
+             txt_tokens, ling_feas, char_tokens, ph2char, bert_embed,
+             spk_id, spk_embed, mels_timbre)
+         x_ling = x_ling.flatten(0, 1)
+         txt_tokens_ori = txt_tokens
+         txt_tokens_withpad = txt_tokens = txt_tokens.flatten(0, 1)
+         x_ling = x_ling[txt_tokens > 0][None]
+         txt_tokens = txt_tokens[txt_tokens > 0][None]
+
+         vq_decoded = torch.zeros_like(txt_tokens)
+         vq_decoded = F.pad(vq_decoded, [1, 0], value=self.code_size + 1)
+         if incremental_state != {}:
+             assert ctx_vqcodes is not None
+             vq_decoded[:, :ctx_vqcodes.shape[1]] = ctx_vqcodes
+             ctx_vqcodes = None
+         prompt_length = list(incremental_state.items())[0][1]['prev_key'].shape[2]
+         for step in tqdm(range(vq_decoded.shape[1] - 1), desc='AR Duration Predictor inference...'):
+             vq_pred = self(txt_tokens, None, None, None, None,
+                            vq_decoded[:, :step + 1], None, None, None,
+                            incremental_state=incremental_state, x_ling=x_ling,
+                            spk_pos_ids_flat=spk_pos_ids_flat, prompt_length=prompt_length, streaming=True, **kwargs)
+             if ctx_vqcodes is None or step >= ctx_vqcodes.shape[1]:
+                 if self.hparams['dur_model_type'] == 'ar_mse':
+                     vq_pred = torch.round(vq_pred[:, -1, 0]).long()
+                 else:
+                     vq_pred = self.sample_one_step(vq_pred)
+                 vq_decoded[:, step + 1] = vq_pred
+             else:
+                 vq_decoded[:, step + 1] = ctx_vqcodes[:, step]
+         vq_decoded = vq_decoded[:, 1:]
+         vq_decoded_2d = torch.zeros_like(txt_tokens_ori)
+         vq_decoded_2d.flatten(0, 1)[txt_tokens_withpad > 0] = vq_decoded
+         if return_state:
+             return vq_decoded_2d, incremental_state
+         return vq_decoded_2d
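
As a reading aid (not part of the package diff), here is a small, self-contained Python sketch of what the expand_states() helper above computes: it prepends a zero row so that index 0 in the map selects an "empty" state, then gathers one hidden state per output position. The tensors below are made up for illustration:

    import torch
    import torch.nn.functional as F

    def expand_states(h, mel2token):               # same logic as in the hunk above
        h = F.pad(h, [0, 0, 1, 0])                 # prepend a zero vector at index 0
        mel2token_ = mel2token[..., None].repeat([1, 1, h.shape[-1]])
        return torch.gather(h, 1, mel2token_)      # [B, T_out, H]

    h = torch.arange(6.0).reshape(1, 3, 2)         # three source states of width 2
    idx = torch.tensor([[1, 1, 2, 3, 0]])          # repeat state 1, then states 2 and 3, then the zero pad
    print(expand_states(h, idx))
    # rows: [0, 1], [0, 1], [2, 3], [4, 5], [0, 0]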