xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +415 -1
- xinference/constants.py +2 -0
- xinference/core/model.py +3 -4
- xinference/core/supervisor.py +29 -1
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +2 -0
- xinference/deploy/test/test_cmdline.py +1 -1
- xinference/model/audio/core.py +5 -0
- xinference/model/audio/cosyvoice.py +0 -1
- xinference/model/audio/kokoro.py +1 -1
- xinference/model/audio/kokoro_zh.py +124 -0
- xinference/model/audio/model_spec.json +64 -20
- xinference/model/embedding/flag/core.py +5 -0
- xinference/model/embedding/llama_cpp/core.py +22 -19
- xinference/model/embedding/sentence_transformers/core.py +19 -4
- xinference/model/embedding/vllm/core.py +40 -8
- xinference/model/image/cache_manager.py +56 -0
- xinference/model/image/core.py +9 -0
- xinference/model/image/model_spec.json +116 -9
- xinference/model/image/stable_diffusion/core.py +141 -31
- xinference/model/llm/core.py +10 -0
- xinference/model/llm/llama_cpp/core.py +42 -40
- xinference/model/llm/llm_family.json +435 -23
- xinference/model/llm/llm_family.py +1 -0
- xinference/model/llm/mlx/core.py +52 -33
- xinference/model/llm/sglang/core.py +2 -44
- xinference/model/llm/tool_parsers/__init__.py +58 -0
- xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
- xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
- xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
- xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
- xinference/model/llm/transformers/core.py +6 -12
- xinference/model/llm/utils.py +128 -46
- xinference/model/llm/vllm/core.py +8 -61
- xinference/model/rerank/core.py +3 -0
- xinference/model/rerank/sentence_transformers/core.py +1 -1
- xinference/model/rerank/vllm/core.py +56 -6
- xinference/model/utils.py +1 -2
- xinference/model/video/model_spec.json +95 -1
- xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
- xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
- xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
- xinference/thirdparty/cosyvoice/bin/train.py +23 -3
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
- xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
- xinference/thirdparty/cosyvoice/cli/model.py +53 -75
- xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
- xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
- xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
- xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
- xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
- xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
- xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
- xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
- xinference/thirdparty/cosyvoice/utils/common.py +20 -0
- xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
- xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
- xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
- xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
- xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
- xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
- xinference/types.py +105 -2
- xinference/ui/gradio/chat_interface.py +2 -0
- xinference/ui/gradio/media_interface.py +353 -7
- xinference/ui/web/ui/build/asset-manifest.json +3 -3
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
- xinference/ui/web/ui/src/locales/en.json +2 -0
- xinference/ui/web/ui/src/locales/ja.json +2 -0
- xinference/ui/web/ui/src/locales/ko.json +2 -0
- xinference/ui/web/ui/src/locales/zh.json +2 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
- xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
- xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
- /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/dataset/dataset.py

@@ -14,14 +14,13 @@
 # limitations under the License.
 
 import random
-import json
 import math
 from functools import partial
 
 import torch
 import torch.distributed as dist
 from torch.utils.data import IterableDataset
-from cosyvoice.utils.file_utils import read_lists, read_json_lists
+from cosyvoice.utils.file_utils import read_lists
 
 
 class Processor(IterableDataset):
@@ -127,10 +126,9 @@ def Dataset(data_list_file,
             data_pipeline,
             mode='train',
             gan=False,
+            dpo=False,
             shuffle=True,
-            partition=True,
-            tts_file='',
-            prompt_utt2data=''):
+            partition=True):
     """ Construct dataset from arguments
 
         We have two shuffle stage in the Dataset. The first is global
@@ -142,23 +140,12 @@ def Dataset(data_list_file,
             tokenizer (BaseTokenizer): tokenizer to tokenize
             partition(bool): whether to do data partition in terms of rank
     """
-    assert mode in ['train', 'inference']
     lists = read_lists(data_list_file)
-    if mode == 'inference':
-        with open(tts_file) as f:
-            tts_data = json.load(f)
-        utt2lists = read_json_lists(prompt_utt2data)
-        # filter unnecessary file in inference mode
-        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
     dataset = DataList(lists,
                        shuffle=shuffle,
                        partition=partition)
-    if mode == 'inference':
-        # map partial arg to parquet_opener func in inference mode
-        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
-    if gan is True:
-        # map partial arg to padding func in gan mode
-        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
+    # map partial arg to padding func
+    data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
     for func in data_pipeline:
         dataset = Processor(dataset, func, mode=mode)
     return dataset
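Net effect of the dataset.py changes: the inference-only plumbing (tts_file, prompt_utt2data, the read_json_lists filtering) is removed, and a single new dpo flag is threaded from Dataset() into the final padding stage via functools.partial. A minimal usage sketch under assumptions: the two-stage data_pipeline below is hypothetical (real pipelines chain more processor stages between opening and padding), and it assumes the vendored package root is importable as cosyvoice, matching the imports in this diff.

from functools import partial

from cosyvoice.dataset.dataset import Dataset
from cosyvoice.dataset.processor import parquet_opener, padding

# Hypothetical two-stage pipeline; Dataset() re-binds the last stage with
# gan/dpo itself, so only use_spk_embedding is bound here.
data_pipeline = [parquet_opener, partial(padding, use_spk_embedding=False)]

train_set = Dataset('data.list',
                    data_pipeline=data_pipeline,
                    mode='train',
                    gan=False,
                    dpo=True,   # new flag: batches gain reject_speech_token fields
                    shuffle=True,
                    partition=True)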
xinference/thirdparty/cosyvoice/dataset/processor.py

@@ -43,8 +43,6 @@ def parquet_opener(data, mode='train', tts_data={}):
         for df in pq.ParquetFile(url).iter_batches(batch_size=64):
             df = df.to_pandas()
             for i in range(len(df)):
-                if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
-                    continue
                 sample.update(dict(df.loc[i]))
                 if mode == 'train':
                     # NOTE do not return sample directly, must initialize a new dict
@@ -100,6 +98,8 @@ def filter(data,
             continue
         if len(sample['speech_token']) == 0:
             continue
+        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+            continue
         if num_frames != 0:
             if len(sample['text_token']) / num_frames < min_output_input_ratio:
                 continue
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
 
 def compute_fbank(data,
                   feat_extractor,
+                  token_mel_ratio=0,
                   mode='train'):
     """ Extract fbank
 
@@ -174,8 +175,13 @@ def compute_fbank(data,
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
-        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        sample['speech_feat'] = feat
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        if token_mel_ratio != 0:
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            feat = feat[:token_mel_ratio * token_len]
+            sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_feat'] = feat
         yield sample
 
 
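The new token_mel_ratio branch enforces an exact frames-per-token alignment between speech_feat and speech_token: token_len is the smaller of feat_frames / token_mel_ratio and the current token count, the mel features are cut to token_mel_ratio * token_len frames, and the tokens to token_len. A standalone sketch of that arithmetic (the shapes are illustrative, not taken from the diff):

import torch

token_mel_ratio = 2                    # e.g. two mel frames per speech token
feat = torch.randn(101, 80)            # (frames, mel_bins) from the feat_extractor
speech_token = torch.zeros(49)         # token sequence slightly shorter than 101 / 2

# Same trimming rule as the new compute_fbank branch above.
token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))  # -> 49
feat = feat[:token_mel_ratio * token_len]                                     # -> 98 frames
speech_token = speech_token[:token_len]                                       # -> 49 tokens
assert feat.shape[0] == token_mel_ratio * speech_token.shape[0]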
@@ -236,8 +242,6 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     for sample in data:
         assert 'text' in sample
         sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
-        if mode == 'inference':
-            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
         yield sample
 
 
@@ -345,18 +349,15 @@ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
 def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
     """ Wrapper for static/dynamic batch
     """
-    if mode == 'inference':
-        return static_batch(data, 1)
+    if batch_type == 'static':
+        return static_batch(data, batch_size)
+    elif batch_type == 'dynamic':
+        return dynamic_batch(data, max_frames_in_batch)
     else:
-        if batch_type == 'static':
-            return static_batch(data, batch_size)
-        elif batch_type == 'dynamic':
-            return dynamic_batch(data, max_frames_in_batch)
-        else:
-            logging.fatal('Unsupported batch type {}'.format(batch_type))
+        logging.fatal('Unsupported batch type {}'.format(batch_type))
 
 
-def padding(data, use_spk_embedding, mode='train', gan=False):
+def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
     """ Padding the data into training data
 
     Args:
@@ -418,16 +419,14 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
             # only gan train needs speech, delete it to save memory
             del batch["speech"]
             del batch["speech_len"]
-        if mode == 'inference':
-            tts_text = [sample[i]['tts_text'] for i in order]
-            tts_index = torch.tensor([sample[i]['tts_index'] for i in order], dtype=torch.int32)
-            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
-            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
-            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
-            batch.update({'tts_text': tts_text,
-                          'tts_index': tts_index,
-                          'tts_text_token': tts_text_token,
-                          'tts_text_token_len': tts_text_token_len})
+        if dpo is True:
+            reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
+            reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
+            reject_speech_token = pad_sequence(reject_speech_token,
+                                               batch_first=True,
+                                               padding_value=0)
+            batch['reject_speech_token'] = reject_speech_token
+            batch['reject_speech_token_len'] = reject_speech_token_len
         if use_spk_embedding is True:
            batch["embedding"] = batch["spk_embedding"]
        else:
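The new dpo branch batches the per-sample rejected token sequences the same way the accepted ones are batched: right-padded with zeros, with the true lengths recorded alongside. A minimal sketch of just that step (the token values are made up):

import torch
from torch.nn.utils.rnn import pad_sequence

# Hypothetical per-sample rejected token sequences of unequal length.
reject_speech_token = [torch.tensor([5, 9, 3]), torch.tensor([7, 2])]
reject_speech_token_len = torch.tensor([t.size(0) for t in reject_speech_token], dtype=torch.int32)

padded = pad_sequence(reject_speech_token, batch_first=True, padding_value=0)
# padded  -> tensor([[5, 9, 3],
#                    [7, 2, 0]])
# lengths -> tensor([3, 2], dtype=torch.int32)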