xinference 0.13.2__py3-none-any.whl → 0.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (78)
  1. xinference/__init__.py +0 -1
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +26 -4
  4. xinference/client/restful/restful_client.py +16 -1
  5. xinference/core/chat_interface.py +2 -2
  6. xinference/core/model.py +8 -3
  7. xinference/core/scheduler.py +4 -4
  8. xinference/model/audio/core.py +5 -2
  9. xinference/model/audio/cosyvoice.py +136 -0
  10. xinference/model/audio/model_spec.json +24 -0
  11. xinference/model/audio/model_spec_modelscope.json +27 -0
  12. xinference/model/flexible/launchers/__init__.py +1 -0
  13. xinference/model/flexible/launchers/image_process_launcher.py +70 -0
  14. xinference/model/image/model_spec.json +7 -0
  15. xinference/model/image/stable_diffusion/core.py +6 -1
  16. xinference/model/llm/llm_family.json +802 -82
  17. xinference/model/llm/llm_family_csghub.json +39 -0
  18. xinference/model/llm/llm_family_modelscope.json +295 -47
  19. xinference/model/llm/pytorch/chatglm.py +243 -5
  20. xinference/model/llm/pytorch/cogvlm2.py +1 -1
  21. xinference/model/llm/utils.py +78 -1
  22. xinference/model/llm/vllm/core.py +8 -0
  23. xinference/thirdparty/cosyvoice/__init__.py +0 -0
  24. xinference/thirdparty/cosyvoice/bin/__init__.py +0 -0
  25. xinference/thirdparty/cosyvoice/bin/inference.py +114 -0
  26. xinference/thirdparty/cosyvoice/bin/train.py +136 -0
  27. xinference/thirdparty/cosyvoice/cli/__init__.py +0 -0
  28. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +83 -0
  29. xinference/thirdparty/cosyvoice/cli/frontend.py +168 -0
  30. xinference/thirdparty/cosyvoice/cli/model.py +60 -0
  31. xinference/thirdparty/cosyvoice/dataset/__init__.py +0 -0
  32. xinference/thirdparty/cosyvoice/dataset/dataset.py +160 -0
  33. xinference/thirdparty/cosyvoice/dataset/processor.py +369 -0
  34. xinference/thirdparty/cosyvoice/flow/__init__.py +0 -0
  35. xinference/thirdparty/cosyvoice/flow/decoder.py +222 -0
  36. xinference/thirdparty/cosyvoice/flow/flow.py +135 -0
  37. xinference/thirdparty/cosyvoice/flow/flow_matching.py +138 -0
  38. xinference/thirdparty/cosyvoice/flow/length_regulator.py +49 -0
  39. xinference/thirdparty/cosyvoice/hifigan/__init__.py +0 -0
  40. xinference/thirdparty/cosyvoice/hifigan/f0_predictor.py +55 -0
  41. xinference/thirdparty/cosyvoice/hifigan/generator.py +391 -0
  42. xinference/thirdparty/cosyvoice/llm/__init__.py +0 -0
  43. xinference/thirdparty/cosyvoice/llm/llm.py +206 -0
  44. xinference/thirdparty/cosyvoice/transformer/__init__.py +0 -0
  45. xinference/thirdparty/cosyvoice/transformer/activation.py +84 -0
  46. xinference/thirdparty/cosyvoice/transformer/attention.py +326 -0
  47. xinference/thirdparty/cosyvoice/transformer/convolution.py +145 -0
  48. xinference/thirdparty/cosyvoice/transformer/decoder.py +396 -0
  49. xinference/thirdparty/cosyvoice/transformer/decoder_layer.py +132 -0
  50. xinference/thirdparty/cosyvoice/transformer/embedding.py +293 -0
  51. xinference/thirdparty/cosyvoice/transformer/encoder.py +472 -0
  52. xinference/thirdparty/cosyvoice/transformer/encoder_layer.py +236 -0
  53. xinference/thirdparty/cosyvoice/transformer/label_smoothing_loss.py +96 -0
  54. xinference/thirdparty/cosyvoice/transformer/positionwise_feed_forward.py +115 -0
  55. xinference/thirdparty/cosyvoice/transformer/subsampling.py +383 -0
  56. xinference/thirdparty/cosyvoice/utils/__init__.py +0 -0
  57. xinference/thirdparty/cosyvoice/utils/class_utils.py +70 -0
  58. xinference/thirdparty/cosyvoice/utils/common.py +103 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +110 -0
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +41 -0
  61. xinference/thirdparty/cosyvoice/utils/frontend_utils.py +125 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +227 -0
  63. xinference/thirdparty/cosyvoice/utils/scheduler.py +739 -0
  64. xinference/thirdparty/cosyvoice/utils/train_utils.py +289 -0
  65. xinference/web/ui/build/asset-manifest.json +3 -3
  66. xinference/web/ui/build/index.html +1 -1
  67. xinference/web/ui/build/static/js/{main.95c1d652.js → main.2ef0cfaf.js} +3 -3
  68. xinference/web/ui/build/static/js/main.2ef0cfaf.js.map +1 -0
  69. xinference/web/ui/node_modules/.cache/babel-loader/b6807ecc0c231fea699533518a0eb2a2bf68a081ce00d452be40600dbffa17a7.json +1 -0
  70. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/METADATA +16 -8
  71. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/RECORD +76 -32
  72. xinference/web/ui/build/static/js/main.95c1d652.js.map +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/709711edada3f1596b309d571285fd31f1c364d66f4425bc28723d0088cc351a.json +0 -1
  74. /xinference/web/ui/build/static/js/{main.95c1d652.js.LICENSE.txt → main.2ef0cfaf.js.LICENSE.txt} +0 -0
  75. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/LICENSE +0 -0
  76. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/WHEEL +0 -0
  77. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/entry_points.txt +0 -0
  78. {xinference-0.13.2.dist-info → xinference-0.13.3.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/transformer/embedding.py
@@ -0,0 +1,293 @@
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
+ #               2024 Alibaba Inc (Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # Modified from ESPnet(https://github.com/espnet/espnet)
+ """Positional Encoding Module."""
+
+ import math
+ from typing import Tuple, Union
+
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+
+
+ class PositionalEncoding(torch.nn.Module):
+     """Positional encoding.
+
+     :param int d_model: embedding dim
+     :param float dropout_rate: dropout rate
+     :param int max_len: maximum input length
+
+     PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
+     PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+     """
+
+     def __init__(self,
+                  d_model: int,
+                  dropout_rate: float,
+                  max_len: int = 5000,
+                  reverse: bool = False):
+         """Construct a PositionalEncoding object."""
+         super().__init__()
+         self.d_model = d_model
+         self.xscale = math.sqrt(self.d_model)
+         self.dropout = torch.nn.Dropout(p=dropout_rate)
+         self.max_len = max_len
+
+         self.pe = torch.zeros(self.max_len, self.d_model)
+         position = torch.arange(0, self.max_len,
+                                 dtype=torch.float32).unsqueeze(1)
+         div_term = torch.exp(
+             torch.arange(0, self.d_model, 2, dtype=torch.float32) *
+             -(math.log(10000.0) / self.d_model))
+         self.pe[:, 0::2] = torch.sin(position * div_term)
+         self.pe[:, 1::2] = torch.cos(position * div_term)
+         self.pe = self.pe.unsqueeze(0)
+
+     def forward(self,
+                 x: torch.Tensor,
+                 offset: Union[int, torch.Tensor] = 0) \
+             -> Tuple[torch.Tensor, torch.Tensor]:
+         """Add positional encoding.
+
+         Args:
+             x (torch.Tensor): Input. Its shape is (batch, time, ...)
+             offset (int, torch.tensor): position offset
+
+         Returns:
+             torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+             torch.Tensor: for compatibility to RelPositionalEncoding
+         """
+
+         self.pe = self.pe.to(x.device)
+         pos_emb = self.position_encoding(offset, x.size(1), False)
+         x = x * self.xscale + pos_emb
+         return self.dropout(x), self.dropout(pos_emb)
+
+     def position_encoding(self,
+                           offset: Union[int, torch.Tensor],
+                           size: int,
+                           apply_dropout: bool = True) -> torch.Tensor:
+         """ For getting encoding in a streaming fashion
+
+         Attention!!!!!
+         we apply dropout only once at the whole utterance level in a
+         non-streaming way, but will call this function several times with
+         increasing input size in a streaming scenario, so the dropout will
+         be applied several times.
+
+         Args:
+             offset (int or torch.tensor): start offset
+             size (int): required size of position encoding
+
+         Returns:
+             torch.Tensor: Corresponding encoding
+         """
+         # How to subscript a Union type:
+         # https://github.com/pytorch/pytorch/issues/69434
+         if isinstance(offset, int):
+             assert offset + size <= self.max_len
+             pos_emb = self.pe[:, offset:offset + size]
+         elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
+             assert offset + size <= self.max_len
+             pos_emb = self.pe[:, offset:offset + size]
+         else:  # for batched streaming decoding on GPU
+             assert torch.max(offset) + size <= self.max_len
+             index = offset.unsqueeze(1) + \
+                 torch.arange(0, size).to(offset.device)  # B X T
+             flag = index > 0
+             # remove negative offset
+             index = index * flag
+             pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
+
+         if apply_dropout:
+             pos_emb = self.dropout(pos_emb)
+         return pos_emb
+
+
+ class RelPositionalEncoding(PositionalEncoding):
+     """Relative positional encoding module.
+     See : Appendix B in https://arxiv.org/abs/1901.02860
+     Args:
+         d_model (int): Embedding dimension.
+         dropout_rate (float): Dropout rate.
+         max_len (int): Maximum input length.
+     """
+
+     def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+         """Initialize class."""
+         super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+     def forward(self,
+                 x: torch.Tensor,
+                 offset: Union[int, torch.Tensor] = 0) \
+             -> Tuple[torch.Tensor, torch.Tensor]:
+         """Compute positional encoding.
+         Args:
+             x (torch.Tensor): Input tensor (batch, time, `*`).
+         Returns:
+             torch.Tensor: Encoded tensor (batch, time, `*`).
+             torch.Tensor: Positional embedding tensor (1, time, `*`).
+         """
+         self.pe = self.pe.to(x.device)
+         x = x * self.xscale
+         pos_emb = self.position_encoding(offset, x.size(1), False)
+         return self.dropout(x), self.dropout(pos_emb)
+
+
+ class WhisperPositionalEncoding(PositionalEncoding):
+     """ Sinusoids position encoding used in openai-whisper.encoder
+     """
+
+     def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
+         super().__init__(d_model, dropout_rate, max_len)
+         self.xscale = 1.0
+         log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
+         inv_timescales = torch.exp(-log_timescale_increment *
+                                    torch.arange(d_model // 2))
+         scaled_time = torch.arange(max_len)[:, np.newaxis] * \
+             inv_timescales[np.newaxis, :]
+         pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+         delattr(self, "pe")
+         self.register_buffer("pe", pe.unsqueeze(0))
+
+
+ class LearnablePositionalEncoding(PositionalEncoding):
+     """ Learnable position encoding used in openai-whisper.decoder
+     """
+
+     def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
+         super().__init__(d_model, dropout_rate, max_len)
+         # NOTE(xcsong): overwrite self.pe & self.xscale
+         self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
+         self.xscale = 1.0
+
+
+ class NoPositionalEncoding(torch.nn.Module):
+     """ No position encoding
+     """
+
+     def __init__(self, d_model: int, dropout_rate: float):
+         super().__init__()
+         self.d_model = d_model
+         self.dropout = torch.nn.Dropout(p=dropout_rate)
+
+     def forward(self,
+                 x: torch.Tensor,
+                 offset: Union[int, torch.Tensor] = 0) \
+             -> Tuple[torch.Tensor, torch.Tensor]:
+         """ Just return zero vector for interface compatibility
+         """
+         pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
+         return self.dropout(x), pos_emb
+
+     def position_encoding(self, offset: Union[int, torch.Tensor],
+                           size: int) -> torch.Tensor:
+         return torch.zeros(1, size, self.d_model)
+
+
+ class EspnetRelPositionalEncoding(torch.nn.Module):
+     """Relative positional encoding module (new implementation).
+
+     Details can be found in https://github.com/espnet/espnet/pull/2816.
+
+     See : Appendix B in https://arxiv.org/abs/1901.02860
+
+     Args:
+         d_model (int): Embedding dimension.
+         dropout_rate (float): Dropout rate.
+         max_len (int): Maximum input length.
+
+     """
+
+     def __init__(self, d_model, dropout_rate, max_len=5000):
+         """Construct a PositionalEncoding object."""
+         super(EspnetRelPositionalEncoding, self).__init__()
+         self.d_model = d_model
+         self.xscale = math.sqrt(self.d_model)
+         self.dropout = torch.nn.Dropout(p=dropout_rate)
+         self.pe = None
+         self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+
+     def extend_pe(self, x):
+         """Reset the positional encodings."""
+         if self.pe is not None:
+             # self.pe contains both positive and negative parts
+             # the length of self.pe is 2 * input_len - 1
+             if self.pe.size(1) >= x.size(1) * 2 - 1:
+                 if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                     self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                 return
+         # Suppose `i` is the position of the query vector and `j` the position
+         # of the key vector. We use positive relative positions when keys are
+         # to the left (i>j) and negative relative positions otherwise (i<j).
+         pe_positive = torch.zeros(x.size(1), self.d_model)
+         pe_negative = torch.zeros(x.size(1), self.d_model)
+         position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+         div_term = torch.exp(
+             torch.arange(0, self.d_model, 2, dtype=torch.float32)
+             * -(math.log(10000.0) / self.d_model)
+         )
+         pe_positive[:, 0::2] = torch.sin(position * div_term)
+         pe_positive[:, 1::2] = torch.cos(position * div_term)
+         pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+         pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+         # Reverse the order of positive indices and concat both positive and
+         # negative indices. This is used to support the shifting trick
+         # as in https://arxiv.org/abs/1901.02860
+         pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+         pe_negative = pe_negative[1:].unsqueeze(0)
+         pe = torch.cat([pe_positive, pe_negative], dim=1)
+         self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+     def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
+         """Add positional encoding.
+
+         Args:
+             x (torch.Tensor): Input tensor (batch, time, `*`).
+
+         Returns:
+             torch.Tensor: Encoded tensor (batch, time, `*`).
+
+         """
+         self.extend_pe(x)
+         x = x * self.xscale
+         pos_emb = self.position_encoding(size=x.size(1), offset=offset)
+         return self.dropout(x), self.dropout(pos_emb)
+
+     def position_encoding(self,
+                           offset: Union[int, torch.Tensor],
+                           size: int) -> torch.Tensor:
+         """ For getting encoding in a streaming fashion
+
+         Attention!!!!!
+         we apply dropout only once at the whole utterance level in a
+         non-streaming way, but will call this function several times with
+         increasing input size in a streaming scenario, so the dropout will
+         be applied several times.
+
+         Args:
+             offset (int or torch.tensor): start offset
+             size (int): required size of position encoding
+
+         Returns:
+             torch.Tensor: Corresponding encoding
+         """
+         pos_emb = self.pe[
+             :,
+             self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size,
+         ]
+         return pos_emb
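
For orientation, here is a minimal usage sketch of the vendored module above. It is illustrative only and not part of the diff: it assumes torch is installed and that the file ships at the path shown in entry 50 of the file list, so the import below resolves; expected shapes follow the docstrings.

import torch

from xinference.thirdparty.cosyvoice.transformer.embedding import (
    EspnetRelPositionalEncoding,
    PositionalEncoding,
)

x = torch.randn(2, 10, 256)  # (batch, time, d_model)

# Absolute sinusoidal encoding: scales the input by sqrt(d_model), adds the
# positional slice, and also returns that slice for API parity with the
# relative variants.
abs_pe = PositionalEncoding(d_model=256, dropout_rate=0.0)
y, pos = abs_pe(x)
print(y.shape, pos.shape)  # torch.Size([2, 10, 256]) torch.Size([1, 10, 256])

# Streaming-style lookup: fetch the encoding for frames 10..19 directly,
# as a chunk-by-chunk decoder with a running offset would.
chunk = abs_pe.position_encoding(offset=10, size=10, apply_dropout=False)
print(chunk.shape)  # torch.Size([1, 10, 256])

# ESPnet-style relative encoding: the returned pos_emb spans relative
# positions -(T-1)..(T-1), i.e. 2*T - 1 entries, to support the shifting
# trick from https://arxiv.org/abs/1901.02860.
rel_pe = EspnetRelPositionalEncoding(d_model=256, dropout_rate=0.0)
y_rel, rel_pos = rel_pe(x)
print(rel_pos.shape)  # torch.Size([1, 19, 256])

Note the design difference: PositionalEncoding precomputes a fixed (1, max_len, d_model) table and slices it at the given offset, while EspnetRelPositionalEncoding keeps a symmetric table of 2 * max_len - 1 entries and always slices around its midpoint, so the same lookup serves any chunk length up to max_len.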