xinference 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (92)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +415 -1
  3. xinference/constants.py +2 -0
  4. xinference/core/model.py +3 -4
  5. xinference/core/supervisor.py +29 -1
  6. xinference/core/worker.py +4 -1
  7. xinference/deploy/cmdline.py +2 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/model/audio/core.py +5 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/kokoro.py +1 -1
  12. xinference/model/audio/kokoro_zh.py +124 -0
  13. xinference/model/audio/model_spec.json +64 -20
  14. xinference/model/embedding/flag/core.py +5 -0
  15. xinference/model/embedding/llama_cpp/core.py +22 -19
  16. xinference/model/embedding/sentence_transformers/core.py +19 -4
  17. xinference/model/embedding/vllm/core.py +40 -8
  18. xinference/model/image/cache_manager.py +56 -0
  19. xinference/model/image/core.py +9 -0
  20. xinference/model/image/model_spec.json +116 -9
  21. xinference/model/image/stable_diffusion/core.py +141 -31
  22. xinference/model/llm/core.py +10 -0
  23. xinference/model/llm/llama_cpp/core.py +42 -40
  24. xinference/model/llm/llm_family.json +435 -23
  25. xinference/model/llm/llm_family.py +1 -0
  26. xinference/model/llm/mlx/core.py +52 -33
  27. xinference/model/llm/sglang/core.py +2 -44
  28. xinference/model/llm/tool_parsers/__init__.py +58 -0
  29. xinference/model/llm/tool_parsers/abstract_tool_parser.py +33 -0
  30. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +128 -0
  31. xinference/model/llm/tool_parsers/deepseek_v3_tool_parser.py +145 -0
  32. xinference/model/llm/tool_parsers/glm4_tool_parser.py +123 -0
  33. xinference/model/llm/tool_parsers/llama3_tool_parser.py +77 -0
  34. xinference/model/llm/tool_parsers/qwen_tool_parser.py +320 -0
  35. xinference/model/llm/transformers/core.py +6 -12
  36. xinference/model/llm/utils.py +128 -46
  37. xinference/model/llm/vllm/core.py +8 -61
  38. xinference/model/rerank/core.py +3 -0
  39. xinference/model/rerank/sentence_transformers/core.py +1 -1
  40. xinference/model/rerank/vllm/core.py +56 -6
  41. xinference/model/utils.py +1 -2
  42. xinference/model/video/model_spec.json +95 -1
  43. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  44. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  45. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  46. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  47. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  48. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  49. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  50. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  51. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  52. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  53. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  54. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  55. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  56. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  57. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  58. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  59. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  60. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  61. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  62. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  63. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  64. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  65. xinference/types.py +105 -2
  66. xinference/ui/gradio/chat_interface.py +2 -0
  67. xinference/ui/gradio/media_interface.py +353 -7
  68. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  69. xinference/ui/web/ui/build/index.html +1 -1
  70. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  71. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  72. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  73. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  74. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  75. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  76. xinference/ui/web/ui/src/locales/en.json +2 -0
  77. xinference/ui/web/ui/src/locales/ja.json +2 -0
  78. xinference/ui/web/ui/src/locales/ko.json +2 -0
  79. xinference/ui/web/ui/src/locales/zh.json +2 -0
  80. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/METADATA +16 -12
  81. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/RECORD +86 -77
  82. xinference/ui/web/ui/build/static/js/main.4918643a.js +0 -3
  83. xinference/ui/web/ui/build/static/js/main.4918643a.js.map +0 -1
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/89179f8f51887b9167721860a12412549ff04f78162e921a7b6aa6532646deb2.json +0 -1
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/9dc5cfc67dd0617b0272aeef8651f1589b2155a4ff1fd72ad3166b217089b619.json +0 -1
  88. /xinference/ui/web/ui/build/static/js/{main.4918643a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  89. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/WHEEL +0 -0
  90. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/entry_points.txt +0 -0
  91. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/licenses/LICENSE +0 -0
  92. {xinference-1.9.0.dist-info → xinference-1.10.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/cosyvoice/dataset/dataset.py

@@ -14,14 +14,13 @@
 # limitations under the License.
 
 import random
-import json
 import math
 from functools import partial
 
 import torch
 import torch.distributed as dist
 from torch.utils.data import IterableDataset
-from cosyvoice.utils.file_utils import read_lists, read_json_lists
+from cosyvoice.utils.file_utils import read_lists
 
 
 class Processor(IterableDataset):
@@ -127,10 +126,9 @@ def Dataset(data_list_file,
             data_pipeline,
             mode='train',
             gan=False,
+            dpo=False,
             shuffle=True,
-            partition=True,
-            tts_file='',
-            prompt_utt2data=''):
+            partition=True):
     """ Construct dataset from arguments
 
     We have two shuffle stage in the Dataset. The first is global
@@ -142,23 +140,12 @@ def Dataset(data_list_file,
         tokenizer (BaseTokenizer): tokenizer to tokenize
         partition(bool): whether to do data partition in terms of rank
     """
-    assert mode in ['train', 'inference']
     lists = read_lists(data_list_file)
-    if mode == 'inference':
-        with open(tts_file) as f:
-            tts_data = json.load(f)
-        utt2lists = read_json_lists(prompt_utt2data)
-        # filter unnecessary file in inference mode
-        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
     dataset = DataList(lists,
                        shuffle=shuffle,
                        partition=partition)
-    if mode == 'inference':
-        # map partial arg to parquet_opener func in inference mode
-        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
-    if gan is True:
-        # map partial arg to padding func in gan mode
-        data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
+    # map partial arg to padding func
+    data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo)
     for func in data_pipeline:
         dataset = Processor(dataset, func, mode=mode)
     return dataset
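
With inference-mode handling removed, Dataset now always builds a training pipeline and simply forwards gan/dpo into the final padding stage via functools.partial. A minimal sketch of the new call shape follows; the pipeline composition and the get_tokenizer/feat_extractor stand-ins are illustrative assumptions (real runs assemble these from the training config), not the project's actual setup:

from functools import partial

from cosyvoice.dataset.dataset import Dataset
from cosyvoice.dataset.processor import (
    batch, compute_fbank, padding, parquet_opener, tokenize)

# Hypothetical stand-ins; real values come from the training config.
get_tokenizer = ...   # factory returning a tokenizer
feat_extractor = ...  # mel-spectrogram extractor

# The last stage must be `padding`, because Dataset() rebinds
# data_pipeline[-1] = partial(data_pipeline[-1], gan=gan, dpo=dpo).
data_pipeline = [
    parquet_opener,
    partial(tokenize, get_tokenizer=get_tokenizer, allowed_special='all'),
    partial(compute_fbank, feat_extractor=feat_extractor, token_mel_ratio=2),
    partial(batch, batch_type='dynamic', max_frames_in_batch=12000),
    partial(padding, use_spk_embedding=False),
]

# dpo=True makes padding() also collate 'reject_speech_token' fields.
train_set = Dataset('data.list', data_pipeline=data_pipeline,
                    mode='train', gan=False, dpo=True)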
xinference/thirdparty/cosyvoice/dataset/processor.py

@@ -43,8 +43,6 @@ def parquet_opener(data, mode='train', tts_data={}):
             for df in pq.ParquetFile(url).iter_batches(batch_size=64):
                 df = df.to_pandas()
                 for i in range(len(df)):
-                    if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
-                        continue
                     sample.update(dict(df.loc[i]))
                     if mode == 'train':
                         # NOTE do not return sample directly, must initialize a new dict
@@ -100,6 +98,8 @@ def filter(data,
             continue
         if len(sample['speech_token']) == 0:
             continue
+        if 'reject_speech_token' in sample and len(sample['reject_speech_token']) == 0:
+            continue
         if num_frames != 0:
             if len(sample['text_token']) / num_frames < min_output_input_ratio:
                 continue
@@ -159,6 +159,7 @@ def truncate(data, truncate_length=24576, mode='train'):
 
 def compute_fbank(data,
                   feat_extractor,
+                  token_mel_ratio=0,
                   mode='train'):
     """ Extract fbank
 
@@ -174,8 +175,13 @@ def compute_fbank(data,
         assert 'utt' in sample
         assert 'text_token' in sample
         waveform = sample['speech']
-        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
-        sample['speech_feat'] = mat
+        feat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+        if token_mel_ratio != 0:
+            # trim to align speech_token and speech_feat
+            token_len = int(min(feat.shape[0] / token_mel_ratio, sample["speech_token"].shape[0]))
+            feat = feat[:token_mel_ratio * token_len]
+            sample["speech_token"] = sample["speech_token"][:token_len]
+        sample['speech_feat'] = feat
         yield sample
 
 
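
The new token_mel_ratio path keeps speech_token and speech_feat length-aligned: with token_mel_ratio mel frames per speech token, the token sequence is cut to the number of whole tokens the mel features can cover, and the features are cut to exactly token_mel_ratio frames per kept token. A worked example with toy shapes (illustrative values, not from the source):

import torch

token_mel_ratio = 2          # assume 2 mel frames per speech token
feat = torch.randn(101, 80)  # 101 mel frames, 80 mel bins
speech_token = torch.zeros(53, dtype=torch.long)

# token_len = int(min(101 / 2, 53)) = int(50.5) = 50
token_len = int(min(feat.shape[0] / token_mel_ratio, speech_token.shape[0]))
feat = feat[:token_mel_ratio * token_len]  # -> shape (100, 80)
speech_token = speech_token[:token_len]    # -> shape (50,)
assert feat.shape[0] == token_mel_ratio * speech_token.shape[0]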
@@ -236,8 +242,6 @@ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
     for sample in data:
         assert 'text' in sample
         sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
-        if mode == 'inference':
-            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
         yield sample
 
 
@@ -345,18 +349,15 @@ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
 def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
     """ Wrapper for static/dynamic batch
     """
-    if mode == 'inference':
-        return static_batch(data, 1)
+    if batch_type == 'static':
+        return static_batch(data, batch_size)
+    elif batch_type == 'dynamic':
+        return dynamic_batch(data, max_frames_in_batch)
     else:
-        if batch_type == 'static':
-            return static_batch(data, batch_size)
-        elif batch_type == 'dynamic':
-            return dynamic_batch(data, max_frames_in_batch)
-        else:
-            logging.fatal('Unsupported batch type {}'.format(batch_type))
+        logging.fatal('Unsupported batch type {}'.format(batch_type))
 
 
-def padding(data, use_spk_embedding, mode='train', gan=False):
+def padding(data, use_spk_embedding, mode='train', gan=False, dpo=False):
     """ Padding the data into training data
 
     Args:
@@ -418,16 +419,14 @@ def padding(data, use_spk_embedding, mode='train', gan=False):
             # only gan train needs speech, delete it to save memory
             del batch["speech"]
             del batch["speech_len"]
-        if mode == 'inference':
-            tts_text = [sample[i]['tts_text'] for i in order]
-            tts_index = [sample[i]['tts_index'] for i in order]
-            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
-            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
-            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
-            batch.update({'tts_text': tts_text,
-                          'tts_index': tts_index,
-                          'tts_text_token': tts_text_token,
-                          'tts_text_token_len': tts_text_token_len})
+        if dpo is True:
+            reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
+            reject_speech_token_len = torch.tensor([i.size(0) for i in reject_speech_token], dtype=torch.int32)
+            reject_speech_token = pad_sequence(reject_speech_token,
+                                               batch_first=True,
+                                               padding_value=0)
+            batch['reject_speech_token'] = reject_speech_token
+            batch['reject_speech_token_len'] = reject_speech_token_len
         if use_spk_embedding is True:
             batch["embedding"] = batch["spk_embedding"]
         else:
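
For DPO training, padding collates the per-sample rejected token sequences just like the other token fields, using padding_value=0 (the removed tts_text_token collation used -1). A standalone sketch of only that collation step, with toy tensors and order playing the role of the length-sorted index list from the surrounding function:

import torch
from torch.nn.utils.rnn import pad_sequence

# Toy batch: two rejected speech-token sequences of unequal length.
sample = [{'reject_speech_token': [5, 6, 7]},
          {'reject_speech_token': [8, 9]}]
order = [0, 1]  # in padding(), indices sorted by descending feature length

reject_speech_token = [torch.tensor(sample[i]['reject_speech_token']) for i in order]
reject_speech_token_len = torch.tensor([t.size(0) for t in reject_speech_token],
                                       dtype=torch.int32)
reject_speech_token = pad_sequence(reject_speech_token,
                                   batch_first=True,
                                   padding_value=0)
print(reject_speech_token)      # tensor([[5, 6, 7], [8, 9, 0]])
print(reject_speech_token_len)  # tensor([3, 2], dtype=torch.int32)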