xinference 0.14.2__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_version.py +3 -3
- xinference/core/chat_interface.py +1 -1
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +4 -1
- xinference/core/worker.py +48 -41
- xinference/model/audio/chattts.py +24 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +23 -1
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +49 -1
- xinference/model/llm/__init__.py +6 -0
- xinference/model/llm/llm_family.json +54 -9
- xinference/model/llm/llm_family.py +2 -0
- xinference/model/llm/llm_family_modelscope.json +56 -10
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/transformers/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/transformers/core.py +1 -0
- xinference/model/llm/transformers/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +94 -11
- xinference/model/llm/transformers/minicpmv25.py +2 -23
- xinference/model/llm/transformers/minicpmv26.py +2 -22
- xinference/model/llm/transformers/yi_vl.py +2 -24
- xinference/model/llm/utils.py +10 -1
- xinference/model/llm/vllm/core.py +1 -1
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.ffc26121.js → main.661c7b0a.js} +3 -3
- xinference/web/ui/build/static/js/main.661c7b0a.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/METADATA +18 -6
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/RECORD +135 -37
- xinference/web/ui/build/static/js/main.ffc26121.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- /xinference/web/ui/build/static/js/{main.ffc26121.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.2.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Used to transcribe all audio files in one folder into another folder.
|
|
3
|
+
e.g.
|
|
4
|
+
Directory structure:
|
|
5
|
+
--pre_data_root
|
|
6
|
+
----SP_1
|
|
7
|
+
------01.wav
|
|
8
|
+
------02.wav
|
|
9
|
+
------......
|
|
10
|
+
----SP_2
|
|
11
|
+
------01.wav
|
|
12
|
+
------02.wav
|
|
13
|
+
------......
|
|
14
|
+
Use
|
|
15
|
+
python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
|
|
16
|
+
to transcribe the first speaker.
|
|
17
|
+
|
|
18
|
+
Use
|
|
19
|
+
python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
|
|
20
|
+
to transcribe the second speaker.
|
|
21
|
+
|
|
22
|
+
Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
import click
|
|
29
|
+
import soundfile as sf
|
|
30
|
+
from faster_whisper import WhisperModel
|
|
31
|
+
from loguru import logger
|
|
32
|
+
from pydub import AudioSegment
|
|
33
|
+
from tqdm import tqdm
|
|
34
|
+
|
|
35
|
+
from tools.file import AUDIO_EXTENSIONS, list_files
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@click.command()
@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
@click.option(
    "--compute-type",
    default="float16",
    help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
)
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
    "--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
    "--sample-rate",
    default=44100,
    type=int,
    help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
@click.option("--language", default="auto", help="Language of the transcription")
@click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
def main(
    model_size,
    compute_type,
    audio_dir,
    save_dir,
    sample_rate,
    device,
    language,
    initial_prompt,
):
    """Transcribe every audio file under ``audio_dir`` into ``save_dir``.

    For each audio file found (recursively) the command:

    * transcribes it with a Faster Whisper model,
    * re-exports the audio to the same relative path under ``save_dir``,
    * writes the transcript next to it as ``<stem>.lab`` (UTF-8), with the
      segment texts joined by ``", "`` and terminated by ``"."``.

    Files for which the model yields no segments are skipped with a warning
    instead of aborting the whole run.

    Args:
        model_size: Whisper model size or name passed to ``WhisperModel``.
        compute_type: Computation precision passed to ``WhisperModel``.
        audio_dir: Root directory that is scanned for audio files.
        save_dir: Root directory that receives audio copies and transcripts.
        sample_rate: Accepted for CLI compatibility; currently unused here.
        device: Device string passed to ``WhisperModel``.
        language: Language code, or ``"auto"`` to let the model detect it.
        initial_prompt: Optional initial prompt passed to ``transcribe``.
    """
    # NOTE(review): ``sample_rate`` is never applied to the exported audio,
    # although the option's help text suggests resampling - confirm intent.
    logger.info("Loading / Downloading Faster Whisper model...")

    model = WhisperModel(
        model_size,
        device=device,
        compute_type=compute_type,
        download_root="faster_whisper",
    )

    logger.info("Model loaded.")

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )

    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        # Mirror the input directory layout below ``save_dir``.
        rel_path = Path(file_path).relative_to(audio_dir)
        output_dir = save_path / rel_path.parent
        output_dir.mkdir(parents=True, exist_ok=True)

        audio = AudioSegment.from_file(file_path)

        segments, info = model.transcribe(
            file_path,
            beam_size=5,
            language=None if language == "auto" else language,
            initial_prompt=initial_prompt,
        )

        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        print("Total len(ms): ", len(audio))

        whole_text = None
        for segment in segments:
            text = segment.text
            print(
                "Segment %03d [%.2fs -> %.2fs] %s"
                % (segment.id, segment.start, segment.end, text)
            )
            if not whole_text:
                whole_text = text
            else:
                whole_text += ", " + text

        if whole_text is None:
            # No segment was produced (e.g. silent audio). Previously this
            # raised TypeError on ``None += "."`` and aborted the whole batch.
            logger.warning("No segments transcribed for {}; skipping", file_path)
            continue

        whole_text += "."

        audio_save_path = output_dir / f"{file_stem}{file_suffix}"
        # The export format is the original extension without the leading dot.
        audio.export(audio_save_path, format=file_suffix[1:])
        print(f"Exported {audio_save_path}")

        transcript_save_path = output_dir / f"{file_stem}.lab"
        with open(
            transcript_save_path,
            "w",
            encoding="utf-8",
        ) as f:
            f.write(whole_text)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
if __name__ == "__main__":
    # ``main`` is a click command: in standalone mode it terminates the
    # process itself once it finishes, so nothing placed after this call
    # ever runs. The leftover debugging snippet that followed (it loaded a
    # hard-coded local WAV file and exported per-segment clips) was
    # unreachable and has been removed.
    main()
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"files": {
|
|
3
3
|
"main.css": "./static/css/main.4bafd904.css",
|
|
4
|
-
"main.js": "./static/js/main.ffc26121.js",
|
|
4
|
+
"main.js": "./static/js/main.661c7b0a.js",
|
|
5
5
|
"static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
|
|
6
6
|
"index.html": "./index.html",
|
|
7
7
|
"main.4bafd904.css.map": "./static/css/main.4bafd904.css.map",
|
|
8
|
-
"main.ffc26121.js.map": "./static/js/main.ffc26121.js.map"
|
|
8
|
+
"main.661c7b0a.js.map": "./static/js/main.661c7b0a.js.map"
|
|
9
9
|
},
|
|
10
10
|
"entrypoints": [
|
|
11
11
|
"static/css/main.4bafd904.css",
|
|
12
|
-
"static/js/main.ffc26121.js"
|
|
12
|
+
"static/js/main.661c7b0a.js"
|
|
13
13
|
]
|
|
14
14
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.ffc26121.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|
|
1
|
+
<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.661c7b0a.js"></script><link href="./static/css/main.4bafd904.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|