xinference 0.14.1.post1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +15 -34
- xinference/client/restful/restful_client.py +2 -2
- xinference/core/chat_interface.py +45 -10
- xinference/core/image_interface.py +9 -0
- xinference/core/model.py +8 -5
- xinference/core/scheduler.py +1 -2
- xinference/core/worker.py +49 -42
- xinference/deploy/cmdline.py +2 -2
- xinference/deploy/test/test_cmdline.py +7 -7
- xinference/model/audio/chattts.py +24 -9
- xinference/model/audio/core.py +8 -2
- xinference/model/audio/fish_speech.py +228 -0
- xinference/model/audio/model_spec.json +8 -0
- xinference/model/embedding/core.py +23 -1
- xinference/model/image/model_spec.json +2 -1
- xinference/model/image/model_spec_modelscope.json +2 -1
- xinference/model/image/stable_diffusion/core.py +49 -1
- xinference/model/llm/__init__.py +26 -27
- xinference/model/llm/{ggml/llamacpp.py → llama_cpp/core.py} +2 -35
- xinference/model/llm/llm_family.json +606 -1266
- xinference/model/llm/llm_family.py +16 -139
- xinference/model/llm/llm_family_modelscope.json +276 -313
- xinference/model/llm/lmdeploy/__init__.py +0 -0
- xinference/model/llm/lmdeploy/core.py +557 -0
- xinference/model/llm/memory.py +9 -9
- xinference/model/llm/sglang/core.py +2 -2
- xinference/model/llm/{pytorch → transformers}/chatglm.py +6 -13
- xinference/model/llm/{pytorch → transformers}/cogvlm2.py +4 -45
- xinference/model/llm/transformers/cogvlm2_video.py +524 -0
- xinference/model/llm/{pytorch → transformers}/core.py +3 -10
- xinference/model/llm/{pytorch → transformers}/glm4v.py +2 -23
- xinference/model/llm/transformers/intern_vl.py +540 -0
- xinference/model/llm/{pytorch → transformers}/internlm2.py +4 -8
- xinference/model/llm/{pytorch → transformers}/minicpmv25.py +2 -23
- xinference/model/llm/{pytorch → transformers}/minicpmv26.py +66 -41
- xinference/model/llm/{pytorch → transformers}/utils.py +1 -2
- xinference/model/llm/{pytorch → transformers}/yi_vl.py +2 -24
- xinference/model/llm/utils.py +85 -70
- xinference/model/llm/vllm/core.py +110 -11
- xinference/model/utils.py +1 -95
- xinference/thirdparty/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/callbacks/grad_norm.py +113 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/concat_repeat.py +53 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_pb2.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text_data_stream.py +36 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/semantic.py +496 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/vqgan.py +147 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/core.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +133 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +122 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py +202 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +779 -0
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lora.py +92 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +3 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +442 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +44 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +625 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +115 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +225 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/utils.py +94 -0
- xinference/thirdparty/fish_speech/fish_speech/scheduler.py +40 -0
- xinference/thirdparty/fish_speech/fish_speech/text/__init__.py +4 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_class.py +172 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_constant.py +30 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/basic_util.py +342 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/cardinal.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/date.py +75 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/digit.py +32 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/fraction.py +35 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/money.py +43 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/percentage.py +33 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/telephone.py +51 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +177 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +69 -0
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +130 -0
- xinference/thirdparty/fish_speech/fish_speech/train.py +139 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +23 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/braceexpand.py +217 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/context.py +13 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/file.py +16 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/instantiators.py +50 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logger.py +55 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/logging_utils.py +48 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/rich_utils.py +100 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/spectrogram.py +122 -0
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +114 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +120 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1237 -0
- xinference/thirdparty/fish_speech/tools/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/api.py +495 -0
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +159 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +55 -0
- xinference/thirdparty/fish_speech/tools/extract_model.py +21 -0
- xinference/thirdparty/fish_speech/tools/file.py +108 -0
- xinference/thirdparty/fish_speech/tools/gen_ref.py +36 -0
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +169 -0
- xinference/thirdparty/fish_speech/tools/llama/eval_in_context.py +171 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +698 -0
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +95 -0
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +497 -0
- xinference/thirdparty/fish_speech/tools/llama/rebuild_tokenizer.py +57 -0
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +55 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +164 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/auto_model.py +573 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +332 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/vad_utils.py +61 -0
- xinference/thirdparty/fish_speech/tools/smart_pad.py +47 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/create_train_split.py +83 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +227 -0
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +120 -0
- xinference/thirdparty/fish_speech/tools/webui.py +619 -0
- xinference/thirdparty/fish_speech/tools/whisper_asr.py +176 -0
- xinference/thirdparty/internvl/__init__.py +0 -0
- xinference/thirdparty/internvl/conversation.py +393 -0
- xinference/thirdparty/omnilmm/model/utils.py +16 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/main.661c7b0a.js +3 -0
- xinference/web/ui/build/static/js/{main.17ca0398.js.map → main.661c7b0a.js.map} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/070d8c6b3b0f3485c6d3885f0b6bbfdf9643e088a468acbd5d596f2396071c16.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/4de9a6942c5f1749d6cbfdd54279699975f16016b182848bc253886f52ec2ec3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5391543180fead1eeef5364300301498d58a7d91d62de3841a32768b67f4552f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/714c37ce0ec5b5c591033f02be2f3f491fdd70da3ef568ee4a4f94689a3d5ca2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/822586ed1077201b64b954f12f25e3f9b45678c1acbabe53d8af3ca82ca71f33.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a797831de0dc74897f4b50b3426555d748f328b4c2cc391de709eadaf6a5f3e3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/bd6ad8159341315a1764c397621a560809f7eb7219ab5174c801fca7e969d943.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e64b7e8cedcf43d4c95deba60ec1341855c887705805bb62431693118b870c69.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e91938976f229ce986b2907e51e1f00540b584ced0a315d498c172d13220739d.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f72f011744c4649fabddca6f7a9327861ac0a315a89b1a2e62a39774e7863845.json +1 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/METADATA +22 -13
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/RECORD +170 -79
- xinference/locale/utils.py +0 -39
- xinference/locale/zh_CN.json +0 -26
- xinference/model/llm/ggml/tools/__init__.py +0 -15
- xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +0 -498
- xinference/model/llm/ggml/tools/gguf.py +0 -884
- xinference/model/llm/pytorch/__init__.py +0 -13
- xinference/model/llm/pytorch/baichuan.py +0 -81
- xinference/model/llm/pytorch/falcon.py +0 -138
- xinference/model/llm/pytorch/intern_vl.py +0 -352
- xinference/model/llm/pytorch/vicuna.py +0 -69
- xinference/web/ui/build/static/js/main.17ca0398.js +0 -3
- xinference/web/ui/node_modules/.cache/babel-loader/1444c41a4d04494f1cbc2d8c1537df107b451cb569cb2c1fbf5159f3a4841a5f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2f40209b32e7e46a2eab6b8c8a355eb42c3caa8bc3228dd929f32fd2b3940294.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5262556baf9207738bf6a8ba141ec6599d0a636345c245d61fdf88d3171998cb.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/6450605fac003812485f6251b9f0caafbf2e5bfc3bbe2f000050d9e2fdb8dcd3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/71684495d995c7e266eecc6a0ad8ea0284cc785f80abddf863789c57a6134969.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/80acd1edf31542ab1dcccfad02cb4b38f3325cff847a781fcce97500cfd6f878.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8a9742ddd8ba8546ef42dc14caca443f2b4524fabed7bf269e0eff3b7b64ee7d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/d93730e2b5d7e8c957b4d0965d2ed1dac9045a649adbd47c220d11f255d4b1e0.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e656dc00b4d8b387f0a81ba8fc558767df1601c66369e2eb86a5ef27cf080572.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f3e02274cb1964e99b1fe69cbb6db233d3d8d7dd05d50ebcdb8e66d50b224b7b.json +0 -1
- /xinference/{locale → model/llm/llama_cpp}/__init__.py +0 -0
- /xinference/model/llm/{ggml → transformers}/__init__.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/compression.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/deepseek_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/llama_2.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/omnilmm.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/qwen_vl.py +0 -0
- /xinference/model/llm/{pytorch → transformers}/tensorizer_utils.py +0 -0
- /xinference/web/ui/build/static/js/{main.17ca0398.js.LICENSE.txt → main.661c7b0a.js.LICENSE.txt} +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/LICENSE +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/WHEEL +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.1.post1.dist-info → xinference-0.14.3.dist-info}/top_level.txt +0 -0
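
Most of the LLM backend changes above are mechanical package moves: the internal `pytorch` package becomes `transformers`, and the llama.cpp backend moves from `ggml/llamacpp.py` to `llama_cpp/core.py`. For code that imported these internal modules directly, a minimal compatibility sketch follows; the imported name `PytorchModel` is an illustrative assumption, since these paths are not a documented public API.

```python
# Hypothetical shim across the module rename visible in the file moves above.
try:
    # 0.14.3 layout: xinference/model/llm/transformers/core.py
    from xinference.model.llm.transformers.core import PytorchModel
except ImportError:
    # 0.14.1.post1 layout: xinference/model/llm/pytorch/core.py
    from xinference.model.llm.pytorch.core import PytorchModel
```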
xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json

@@ -0,0 +1,122 @@
+{
+    "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
+    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
+    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
+    "Accumulate Gradient Batches": "梯度累积批次",
+    "Add to Processing Area": "加入处理区",
+    "Added path successfully!": "添加路径成功!",
+    "Advanced Config": "高级参数",
+    "Base LLAMA Model": "基础 LLAMA 模型",
+    "Batch Inference": "批量推理",
+    "Batch Size": "批次大小",
+    "Changing with the Model Path": "随模型路径变化",
+    "Chinese": "中文",
+    "Compile Model": "编译模型",
+    "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
+    "Copy": "复制",
+    "Data Preprocessing": "数据预处理",
+    "Data Preprocessing Path": "数据预处理路径",
+    "Data Source": "数据源",
+    "Decoder Model Config": "解码器模型配置",
+    "Decoder Model Path": "解码器模型路径",
+    "Disabled": "禁用",
+    "Enable Reference Audio": "启用参考音频",
+    "English": "英文",
+    "Error Message": "错误信息",
+    "File Preprocessing": "文件预处理",
+    "Generate": "生成",
+    "Generated Audio": "音频",
+    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
+    "Infer interface is closed": "推理界面已关闭",
+    "Inference Configuration": "推理配置",
+    "Inference Server Configuration": "推理服务器配置",
+    "Inference Server Error": "推理服务器错误",
+    "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
+    "Initial Learning Rate": "初始学习率",
+    "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
+    "Input Text": "输入文本",
+    "Invalid path: {}": "无效路径: {}",
+    "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
+    "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
+    "Japanese": "日文",
+    "LLAMA Configuration": "LLAMA 配置",
+    "LLAMA Model Config": "LLAMA 模型配置",
+    "LLAMA Model Path": "LLAMA 模型路径",
+    "Labeling Device": "标注加速设备",
+    "LoRA Model to be merged": "要合并的 LoRA 模型",
+    "Maximum Audio Duration": "最大音频时长",
+    "Maximum Length per Sample": "每个样本的最大长度",
+    "Maximum Training Steps": "最大训练步数",
+    "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
+    "Merge": "合并",
+    "Merge LoRA": "合并 LoRA",
+    "Merge successfully": "合并成功",
+    "Minimum Audio Duration": "最小音频时长",
+    "Model Output Path": "模型输出路径",
+    "Model Size": "模型规模",
+    "Move": "移动",
+    "Move files successfully": "移动文件成功",
+    "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
+    "No selected options": "没有选择的选项",
+    "Number of Workers": "数据加载进程数",
+    "Open Inference Server": "打开推理服务器",
+    "Open Labeler WebUI": "打开标注工具",
+    "Open Tensorboard": "打开 Tensorboard",
+    "Opened labeler in browser": "在浏览器中打开标注工具",
+    "Optional Label Language": "[可选] 标注语言",
+    "Optional online ver": "[可选] 使用在线版",
+    "Output Path": "输出路径",
+    "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
+    "Precision": "精度",
+    "Probability of applying Speaker Condition": "应用说话人条件的概率",
+    "Put your text here.": "在此处输入文本.",
+    "Reference Audio": "参考音频",
+    "Reference Text": "参考文本",
+    "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.",
+    "Remove Selected Data": "移除选中数据",
+    "Removed path successfully!": "移除路径成功!",
+    "Repetition Penalty": "重复惩罚",
+    "Save model every n steps": "每 n 步保存模型",
+    "Select LLAMA ckpt": "选择 LLAMA 检查点",
+    "Select VITS ckpt": "选择 VITS 检查点",
+    "Select VQGAN ckpt": "选择 VQGAN 检查点",
+    "Select source file processing method": "选择源文件处理方法",
+    "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
+    "Selected: {}": "已选择: {}",
+    "Speaker": "说话人",
+    "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
+    "Start Training": "开始训练",
+    "Streaming Audio": "流式音频",
+    "Streaming Generate": "流式合成",
+    "Tensorboard Host": "Tensorboard 监听地址",
+    "Tensorboard Log Path": "Tensorboard 日志路径",
+    "Tensorboard Port": "Tensorboard 端口",
+    "Tensorboard interface is closed": "Tensorboard 界面已关闭",
+    "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
+    "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
+    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
+    "Training Configuration": "训练配置",
+    "Training Error": "训练错误",
+    "Training stopped": "训练已停止",
+    "Type name of the speaker": "输入说话人的名称",
+    "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
+    "Use LoRA": "使用 LoRA",
+    "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
+    "Use filelist": "使用文件列表",
+    "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
+    "VITS Configuration": "VITS 配置",
+    "VQGAN Configuration": "VQGAN 配置",
+    "Validation Batch Size": "验证批次大小",
+    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
+    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
+    "WebUI Host": "WebUI 监听地址",
+    "WebUI Port": "WebUI 端口",
+    "Whisper Model": "Whisper 模型",
+    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
+    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
+    "latest": "最近的检查点",
+    "new": "创建新的检查点",
+    "Realtime Transform Text": "实时规范化文本",
+    "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
+    "Text Normalization": "文本规范化"
+}
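
The locale file above maps English source strings to their zh_CN translations; the scan script that follows keeps these files in sync with `i18n(...)` call sites in the code. A minimal sketch of the lookup side, assuming only the JSON layout shown here; the path constant and helper name are illustrative, not fish_speech's exact API.

```python
# Minimal sketch: resolving UI strings against a locale file like zh_CN.json above.
import json
from pathlib import Path

LOCALE_DIR = Path("fish_speech/i18n/locale")  # assumed location, per the file list
LANGUAGE = "zh_CN"

with open(LOCALE_DIR / f"{LANGUAGE}.json", encoding="utf-8") as fp:
    _table = json.load(fp)

def i18n(key: str) -> str:
    # Fall back to the English source string when no translation exists.
    return _table.get(key, key)

print(i18n("Start Training"))  # -> 开始训练
```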
xinference/thirdparty/fish_speech/fish_speech/i18n/scan.py

@@ -0,0 +1,122 @@
+import ast
+import glob
+import json
+from collections import OrderedDict
+from pathlib import Path
+
+from loguru import logger
+
+from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
+
+
+def extract_i18n_strings(node):
+    i18n_strings = []
+
+    if (
+        isinstance(node, ast.Call)
+        and isinstance(node.func, ast.Name)
+        and node.func.id == "i18n"
+    ):
+        for arg in node.args:
+            if isinstance(arg, ast.Str):
+                i18n_strings.append(arg.s)
+
+    for child_node in ast.iter_child_nodes(node):
+        i18n_strings.extend(extract_i18n_strings(child_node))
+
+    return i18n_strings
+
+
+# scan the directory for all .py files (recursively)
+# for each file, parse the code into an AST
+# for each AST, extract the i18n strings
+
+strings = []
+folders = ["fish_speech", "tools"]
+# for filename in glob.iglob("**/*.py", recursive=True):
+for folder in folders:
+    for f in Path(folder).rglob("*.py"):
+        code = f.read_text(encoding="utf-8")
+        if "i18n(" in code:
+            tree = ast.parse(code)
+            i18n_strings = extract_i18n_strings(tree)
+            logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
+            strings.extend(i18n_strings)
+
+code_keys = set(strings)
+logger.info(f"Total unique: {len(code_keys)}")
+
+
+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
+with open(standard_file, "r", encoding="utf-8") as f:
+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
+standard_keys = set(standard_data.keys())
+
+# Define the standard file name
+unused_keys = standard_keys - code_keys
+logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
+for unused_key in unused_keys:
+    logger.info(f"\t{unused_key}")
+
+missing_keys = code_keys - standard_keys
+logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
+for missing_key in missing_keys:
+    logger.info(f"\t{missing_key}")
+
+code_keys_dict = OrderedDict()
+for s in strings:
+    code_keys_dict[s] = s
+
+# write back
+with open(standard_file, "w", encoding="utf-8") as f:
+    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
+    f.write("\n")
+
+logger.info(f"Updated {standard_file}")
+
+
+# Define the standard file name
+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
+
+# Find all JSON files in the directory
+dir_path = I18N_FILE_PATH
+languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
+
+# Load the standard file
+with open(standard_file, "r", encoding="utf-8") as f:
+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
+
+# Loop through each language file
+for lang_file in languages:
+    # Load the language file
+    with open(lang_file, "r", encoding="utf-8") as f:
+        lang_data = json.load(f, object_pairs_hook=OrderedDict)
+
+    # Find the difference between the language file and the standard file
+    diff = set(standard_data.keys()) - set(lang_data.keys())
+
+    miss = set(lang_data.keys()) - set(standard_data.keys())
+
+    # Add any missing keys to the language file
+    for key in diff:
+        lang_data[key] = "#!" + key
+        logger.info(f"Added missing key: {key} to {lang_file}")
+
+    # Del any extra keys to the language file
+    for key in miss:
+        del lang_data[key]
+        logger.info(f"Del extra key: {key} from {lang_file}")
+
+    # Sort the keys of the language file to match the order of the standard file
+    lang_data = OrderedDict(
+        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
+    )
+
+    # Save the updated language file
+    with open(lang_file, "w", encoding="utf-8") as f:
+        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
+        f.write("\n")
+
+    logger.info(f"Updated {lang_file}")
+
+logger.info("Done")
xinference/thirdparty/fish_speech/fish_speech/models/__init__.py: File without changes

xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/__init__.py: File without changes
xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/lit_module.py

@@ -0,0 +1,202 @@
+from typing import Any, Optional
+
+import lightning as L
+import torch
+import torch.nn.functional as F
+from lightning.pytorch.utilities.types import OptimizerLRScheduler
+
+import fish_speech.utils as utils
+from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
+from fish_speech.models.text2semantic.llama import NaiveTransformer
+
+log = utils.RankedLogger(__name__, rank_zero_only=True)
+
+
+class TextToSemantic(L.LightningModule):
+    def __init__(
+        self,
+        model: NaiveTransformer,
+        optimizer: Any,
+        lr_scheduler: Any,
+    ):
+        super().__init__()
+
+        self.model = model
+        self.optimizer_builder = optimizer
+        self.lr_scheduler_builder = lr_scheduler
+
+    def forward(self, x):
+        return self.model(x)
+
+    def on_save_checkpoint(self, checkpoint):
+        # Save only LoRA parameters
+        state_dict = checkpoint["state_dict"]
+        use_lora = any("lora" in name for name in state_dict.keys())
+        if not use_lora:
+            return
+
+        for name in list(state_dict.keys()):
+            if "lora" not in name:
+                state_dict.pop(name)
+
+    def configure_optimizers(self) -> OptimizerLRScheduler:
+        # Get weight decay parameters
+        weight_decay_parameters, other_parameters = [], []
+        for name, param in self.named_parameters():
+            if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
+                other_parameters.append(param)
+            else:
+                weight_decay_parameters.append(param)
+
+        optimizer = self.optimizer_builder(
+            [
+                {"params": weight_decay_parameters},
+                {"params": other_parameters, "weight_decay": 0.0},
+            ]
+        )
+
+        # Print the parameters and their weight decay
+        for i in optimizer.param_groups:
+            log.info(
+                f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
+            )
+
+        lr_scheduler = self.lr_scheduler_builder(optimizer)
+
+        return {
+            "optimizer": optimizer,
+            "lr_scheduler": {
+                "scheduler": lr_scheduler,
+                "interval": "step",
+            },
+        }
+
+    # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
+    def get_batch_logps(
+        self,
+        logits: torch.FloatTensor,
+        labels: torch.LongTensor,
+        average_log_prob: bool = False,
+    ) -> torch.FloatTensor:
+        """Compute the log probabilities of the given labels under the given logits.
+
+        Args:
+            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
+            labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
+            average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
+
+        Returns:
+            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
+        """
+        assert logits.shape[:-1] == labels.shape
+
+        labels = labels.clone()
+        loss_mask = labels != -100
+
+        # dummy token; we'll ignore the losses on these tokens later
+        labels[labels == -100] = 0
+
+        per_token_logps = torch.gather(
+            logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
+        ).squeeze(-1)
+
+        if average_log_prob:
+            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        else:
+            return (per_token_logps * loss_mask).sum(-1)
+
+    def _step(self, batch, batch_idx, stage: str):
+        is_train = stage == "train"
+
+        if is_train:
+            # Key part to make lora work
+            # Otherwise the parameters are merged, which lead to incorrect gradients
+            self.model.train()
+
+        # Do positive and negative samples in the same batch to speed up training
+        labels = batch["labels"]
+        outputs = self.model(
+            inp=batch["inputs"],
+            key_padding_mask=batch["attention_masks"],
+        )
+        token_logits = outputs.token_logits
+        codebook_logits = outputs.codebook_logits
+
+        # Generate labels
+        base_loss = F.cross_entropy(
+            token_logits.view(-1, token_logits.size(-1)),
+            labels[:, 0].reshape(-1),
+            ignore_index=-100,
+        )
+
+        codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
+        semantic_loss = F.cross_entropy(
+            codebook_logits.view(-1, codebook_logits.size(-1)),
+            codebook_labels.reshape(-1),
+            ignore_index=-100,
+        )
+
+        loss = base_loss + semantic_loss
+
+        self.log(
+            f"{stage}/loss",
+            loss,
+            on_step=is_train,
+            on_epoch=not is_train,
+            prog_bar=True,
+            logger=True,
+            sync_dist=not is_train,
+        )
+
+        self.log(
+            f"{stage}/base_loss",
+            base_loss,
+            on_step=is_train,
+            on_epoch=not is_train,
+            prog_bar=False,
+            logger=True,
+            sync_dist=not is_train,
+        )
+
+        self.log(
+            f"{stage}/semantic_loss",
+            semantic_loss,
+            on_step=is_train,
+            on_epoch=not is_train,
+            prog_bar=False,
+            logger=True,
+            sync_dist=not is_train,
+        )
+
+        # Top-5 accuracy
+        accuracy = self.get_accuracy(codebook_logits, codebook_labels)
+        self.log(
+            f"{stage}/top_5_accuracy",
+            accuracy,
+            on_step=is_train,
+            on_epoch=not is_train,
+            prog_bar=True,
+            logger=True,
+            sync_dist=not is_train,
+        )
+
+        return loss
+
+    def get_accuracy(self, logits, labels):
+        mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
+        if mask.sum() == 0:
+            return torch.tensor(0.0, device=logits.device)
+
+        _, indices = logits.topk(5, dim=-1)
+        correct = indices.eq(labels.unsqueeze(-1))
+        correct[~mask] = 0
+        correct = correct.sum()
+        accuracy = correct / mask.sum()
+
+        return accuracy
+
+    def training_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx, "train")
+
+    def validation_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx, "val")