xinference-0.13.2-py3-none-any.whl → xinference-0.13.3-py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of xinference has been flagged by the registry.
- xinference/__init__.py +0 -1
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +26 -4
- xinference/client/restful/restful_client.py +16 -1
- xinference/core/chat_interface.py +2 -2
- xinference/core/model.py +8 -3
- xinference/core/scheduler.py +4 -4
- xinference/model/audio/core.py +5 -2
- xinference/model/audio/cosyvoice.py +136 -0 (new CosyVoice audio model; usage sketch after this file list)
- xinference/model/audio/model_spec.json +24 -0
- xinference/model/audio/model_spec_modelscope.json +27 -0
- xinference/model/flexible/launchers/__init__.py +1 -0
- xinference/model/flexible/launchers/image_process_launcher.py +70 -0
- xinference/model/image/model_spec.json +7 -0
- xinference/model/image/stable_diffusion/core.py +6 -1
- xinference/model/llm/llm_family.json +802 -82
- xinference/model/llm/llm_family_csghub.json +39 -0
- xinference/model/llm/llm_family_modelscope.json +295 -47
- xinference/model/llm/pytorch/chatglm.py +243 -5
- xinference/model/llm/pytorch/cogvlm2.py +1 -1
- xinference/model/llm/utils.py +78 -1
- xinference/model/llm/vllm/core.py +8 -0
- xinference/thirdparty/cosyvoice/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
- xinference/thirdparty/cosyvoice/bin/train.py +136 -0
- xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
- xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
- xinference/thirdparty/cosyvoice/cli/model.py +60 -0
- xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
- xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
- xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
- xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
- xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
- xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
- xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
- xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
- xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
- xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
- xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
- xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
- xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
- xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
- xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
- xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
- xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
- xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
- xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
- xinference/thirdparty/cosyvoice/utils/common.py +103 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
- xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
- xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
- xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.95c1d652.js → main.2ef0cfaf.js} +3 -3
- xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +1 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/METADATA +16 -8
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/RECORD +76 -32
- xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
- /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.2ef0cfaf.js.LICENSE.txt} +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/LICENSE +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/WHEEL +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/top_level.txt +0 -0
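
For reviewers of the audio additions listed above, here is a minimal, hedged sketch of launching the new CosyVoice model through the RESTful Python client. It assumes a running local Xinference 0.13.3 server; the registered model name "CosyVoice-300M-SFT", the voice value, and the speech() call are assumptions to verify against this release's model_spec.json and restful_client.py.

# Hedged sketch, not part of the diff: exercising the CosyVoice audio model
# added in 0.13.3 via the Python client.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumes a local server is running

# "CosyVoice-300M-SFT" is an assumed registered name; see model_spec.json above.
model_uid = client.launch_model(model_name="CosyVoice-300M-SFT", model_type="audio")
model = client.get_model(model_uid)

# speech() is assumed to return audio bytes; the voice value is an assumed SFT speaker id.
audio = model.speech("你好，欢迎使用 Xinference。", voice="中文女")
with open("hello.mp3", "wb") as f:
    f.write(audio)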
--- /dev/null
+++ b/xinference/thirdparty/cosyvoice/transformer/embedding.py
@@ -0,0 +1,293 @@
+# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Positonal Encoding Module."""
+
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+
+class PositionalEncoding(torch.nn.Module):
+    """Positional encoding.
+
+    :param int d_model: embedding dim
+    :param float dropout_rate: dropout rate
+    :param int max_len: maximum input length
+
+    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
+    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+    """
+
+    def __init__(self,
+                 d_model: int,
+                 dropout_rate: float,
+                 max_len: int = 5000,
+                 reverse: bool = False):
+        """Construct an PositionalEncoding object."""
+        super().__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.max_len = max_len
+
+        self.pe = torch.zeros(self.max_len, self.d_model)
+        position = torch.arange(0, self.max_len,
+                                dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
+            -(math.log(10000.0) / self.d_model))
+        self.pe[:, 0::2] = torch.sin(position * div_term)
+        self.pe[:, 1::2] = torch.cos(position * div_term)
+        self.pe = self.pe.unsqueeze(0)
+
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """Add positional encoding.
+
+        Args:
+            x (torch.Tensor): Input. Its shape is (batch, time, ...)
+            offset (int, torch.tensor): position offset
+
+        Returns:
+            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+            torch.Tensor: for compatibility to RelPositionalEncoding
+        """
+
+        self.pe = self.pe.to(x.device)
+        pos_emb = self.position_encoding(offset, x.size(1), False)
+        x = x * self.xscale + pos_emb
+        return self.dropout(x), self.dropout(pos_emb)
+
+    def position_encoding(self,
+                          offset: Union[int, torch.Tensor],
+                          size: int,
+                          apply_dropout: bool = True) -> torch.Tensor:
+        """ For getting encoding in a streaming fashion
+
+        Attention!!!!!
+        we apply dropout only once at the whole utterance level in a none
+        streaming way, but will call this function several times with
+        increasing input size in a streaming scenario, so the dropout will
+        be applied several times.
+
+        Args:
+            offset (int or torch.tensor): start offset
+            size (int): required size of position encoding
+
+        Returns:
+            torch.Tensor: Corresponding encoding
+        """
+        # How to subscript a Union type:
+        #   https://github.com/pytorch/pytorch/issues/69434
+        if isinstance(offset, int):
+            assert offset + size <= self.max_len
+            pos_emb = self.pe[:, offset:offset + size]
+        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
+            assert offset + size <= self.max_len
+            pos_emb = self.pe[:, offset:offset + size]
+        else:  # for batched streaming decoding on GPU
+            assert torch.max(offset) + size <= self.max_len
+            index = offset.unsqueeze(1) + \
+                torch.arange(0, size).to(offset.device)  # B X T
+            flag = index > 0
+            # remove negative offset
+            index = index * flag
+            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
+
+        if apply_dropout:
+            pos_emb = self.dropout(pos_emb)
+        return pos_emb
+
+
+class RelPositionalEncoding(PositionalEncoding):
+    """Relative positional encoding module.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+        """Initialize class."""
+        super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+            torch.Tensor: Positional embedding tensor (1, time, `*`).
+        """
+        self.pe = self.pe.to(x.device)
+        x = x * self.xscale
+        pos_emb = self.position_encoding(offset, x.size(1), False)
+        return self.dropout(x), self.dropout(pos_emb)
+
+
+class WhisperPositionalEncoding(PositionalEncoding):
+    """ Sinusoids position encoding used in openai-whisper.encoder
+    """
+
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
+        super().__init__(d_model, dropout_rate, max_len)
+        self.xscale = 1.0
+        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment *
+                                   torch.arange(d_model // 2))
+        scaled_time = torch.arange(max_len)[:, np.newaxis] * \
+            inv_timescales[np.newaxis, :]
+        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+        delattr(self, "pe")
+        self.register_buffer("pe", pe.unsqueeze(0))
+
+
+class LearnablePositionalEncoding(PositionalEncoding):
+    """ Learnable position encoding used in openai-whisper.decoder
+    """
+
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
+        super().__init__(d_model, dropout_rate, max_len)
+        # NOTE(xcsong): overwrite self.pe & self.xscale
+        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
+        self.xscale = 1.0
+
+
+class NoPositionalEncoding(torch.nn.Module):
+    """ No position encoding
+    """
+
+    def __init__(self, d_model: int, dropout_rate: float):
+        super().__init__()
+        self.d_model = d_model
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """ Just return zero vector for interface compatibility
+        """
+        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
+        return self.dropout(x), pos_emb
+
+    def position_encoding(self, offset: Union[int, torch.Tensor],
+                          size: int) -> torch.Tensor:
+        return torch.zeros(1, size, self.d_model)
+
+
+class EspnetRelPositionalEncoding(torch.nn.Module):
+    """Relative positional encoding module (new implementation).
+
+    Details can be found in https://github.com/espnet/espnet/pull/2816.
+
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+
+    """
+
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """Construct an PositionalEncoding object."""
+        super(EspnetRelPositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+    def extend_pe(self, x):
+        """Reset the positional encodings."""
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` means to the position of query vecotr and `j` means the
+        # position of key vector. We use position relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reserve the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://arxiv.org/abs/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
+        """Add positional encoding.
+
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+
+        """
+        self.extend_pe(x)
+        x = x * self.xscale
+        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
+        return self.dropout(x), self.dropout(pos_emb)
+
+    def position_encoding(self,
+                          offset: Union[int, torch.Tensor],
+                          size: int) -> torch.Tensor:
+        """ For getting encoding in a streaming fashion
+
+        Attention!!!!!
+        we apply dropout only once at the whole utterance level in a none
+        streaming way, but will call this function several times with
+        increasing input size in a streaming scenario, so the dropout will
+        be applied several times.
+
+        Args:
+            offset (int or torch.tensor): start offset
+            size (int): required size of position encoding
+
+        Returns:
+            torch.Tensor: Corresponding encoding
+        """
+        pos_emb = self.pe[
+            :,
+            self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size,
+        ]
+        return pos_emb
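
As a quick sanity check of the module above, a minimal sketch (not part of the release) of how the added encoding classes behave. It assumes only that the 0.13.3 wheel and torch are installed; d_model=256, batch=2, and time=10 are illustrative values.

# Sketch: the absolute encoding keeps the input shape, while the ESPnet-style
# relative encoding returns a separate embedding spanning 2*time-1 positions.
import torch

from xinference.thirdparty.cosyvoice.transformer.embedding import (
    EspnetRelPositionalEncoding,
    PositionalEncoding,
)

x = torch.zeros(2, 10, 256)  # (batch, time, d_model)

# Absolute sinusoidal encoding: pos_emb is the slice of the precomputed table
# that was added to the scaled input.
abs_pe = PositionalEncoding(d_model=256, dropout_rate=0.0)
y, pos_emb = abs_pe(x)
assert y.shape == (2, 10, 256) and pos_emb.shape == (1, 10, 256)

# Relative encoding: the embedding is returned on its own (it is consumed by
# relative-attention layers) and covers 2*10-1 = 19 positions around the center.
rel_pe = EspnetRelPositionalEncoding(d_model=256, dropout_rate=0.0)
y, rel_emb = rel_pe(x)
assert y.shape == (2, 10, 256) and rel_emb.shape == (1, 19, 256)

The relative variant's window is sliced symmetrically around the midpoint of its table, which is why it holds both "positive" and "negative" positions as described in extend_pe.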