qwen-tts 0.0.4__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/PKG-INFO +13 -8
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/README.md +12 -7
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/pyproject.toml +1 -1
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/modeling_qwen3_tts.py +8 -3
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py +4 -3
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/inference/qwen3_tts_model.py +2 -2
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/LICENSE +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/MANIFEST.in +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/__init__.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/__main__.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/cli/demo.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/__init__.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/__init__.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/configuration_qwen3_tts.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/processing_qwen3_tts.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/core_vq.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/speech_vq.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/inference/qwen3_tts_tokenizer.py +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts.egg-info/SOURCES.txt +0 -0
- {qwen_tts-0.0.4 → qwen_tts-0.1.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: qwen-tts
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.1.0
|
|
4
4
|
Summary: Qwen-TTS python package
|
|
5
5
|
Author: Alibaba Qwen Team
|
|
6
6
|
License: Apache-2.0
|
|
@@ -35,7 +35,7 @@ Dynamic: license-file
|
|
|
35
35
|
<p>
|
|
36
36
|
|
|
37
37
|
<p align="center">
|
|
38
|
-
  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://
|
|
38
|
+
  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>  
|
|
39
39
|
<br>
|
|
40
40
|
🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>   |    🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>   |   💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>   |   🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>   |   📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>
|
|
41
41
|
|
|
@@ -66,7 +66,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
|
|
|
66
66
|
- [vLLM Usage](#vllm-usage)
|
|
67
67
|
- [Fine Tuning](#fine-tuning)
|
|
68
68
|
- [Evaluation](#evaluation)
|
|
69
|
-
|
|
69
|
+
- [Citation](#citation)
|
|
70
70
|
|
|
71
71
|
## Overview
|
|
72
72
|
### Introduction
|
|
@@ -1367,18 +1367,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
|
|
|
1367
1367
|
</details>
|
|
1368
1368
|
|
|
1369
1369
|
|
|
1370
|
-
|
|
1370
|
+
## Citation
|
|
1371
1371
|
|
|
1372
1372
|
If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)
|
|
1373
1373
|
|
|
1374
|
-
|
|
1375
1374
|
```BibTeX
|
|
1376
1375
|
@article{Qwen3-TTS,
|
|
1377
1376
|
title={Qwen3-TTS Technical Report},
|
|
1378
|
-
author={},
|
|
1379
|
-
journal={arXiv preprint arXiv:},
|
|
1377
|
+
author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
|
|
1378
|
+
journal={arXiv preprint arXiv:2601.15621},
|
|
1380
1379
|
year={2026}
|
|
1381
1380
|
}
|
|
1382
|
-
```
|
|
1381
|
+
```
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
## Star History
|
|
1385
|
+
|
|
1386
|
+
[![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
|
|
1387
|
+
|
|
1383
1388
|
|
|
1384
1389
|
<br>
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
<p>
|
|
8
8
|
|
|
9
9
|
<p align="center">
|
|
10
|
-
  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://
|
|
10
|
+
  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>  
|
|
11
11
|
<br>
|
|
12
12
|
🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>   |    🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>   |   💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>   |   🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>   |   📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>
|
|
13
13
|
|
|
@@ -38,7 +38,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
|
|
|
38
38
|
- [vLLM Usage](#vllm-usage)
|
|
39
39
|
- [Fine Tuning](#fine-tuning)
|
|
40
40
|
- [Evaluation](#evaluation)
|
|
41
|
-
|
|
41
|
+
- [Citation](#citation)
|
|
42
42
|
|
|
43
43
|
## Overview
|
|
44
44
|
### Introduction
|
|
@@ -1339,18 +1339,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
|
|
|
1339
1339
|
</details>
|
|
1340
1340
|
|
|
1341
1341
|
|
|
1342
|
-
|
|
1342
|
+
## Citation
|
|
1343
1343
|
|
|
1344
1344
|
If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)
|
|
1345
1345
|
|
|
1346
|
-
|
|
1347
1346
|
```BibTeX
|
|
1348
1347
|
@article{Qwen3-TTS,
|
|
1349
1348
|
title={Qwen3-TTS Technical Report},
|
|
1350
|
-
author={},
|
|
1351
|
-
journal={arXiv preprint arXiv:},
|
|
1349
|
+
author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
|
|
1350
|
+
journal={arXiv preprint arXiv:2601.15621},
|
|
1352
1351
|
year={2026}
|
|
1353
1352
|
}
|
|
1354
|
-
```
|
|
1353
|
+
```
|
|
1354
|
+
|
|
1355
|
+
|
|
1356
|
+
## Star History
|
|
1357
|
+
|
|
1358
|
+
[![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
|
|
1359
|
+
|
|
1355
1360
|
|
|
1356
1361
|
<br>
|
|
@@ -470,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
|
|
|
470
470
|
supports_gradient_checkpointing = True
|
|
471
471
|
_no_split_modules = ["Qwen3TTSDecoderLayer"]
|
|
472
472
|
_skip_keys_device_placement = "past_key_values"
|
|
473
|
-
|
|
473
|
+
_supports_flash_attn = True
|
|
474
474
|
_supports_sdpa = True
|
|
475
475
|
_supports_cache_class = True
|
|
476
476
|
_supports_static_cache = False
|
|
@@ -501,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
|
|
|
501
501
|
supports_gradient_checkpointing = True
|
|
502
502
|
_no_split_modules = []
|
|
503
503
|
_skip_keys_device_placement = ["past_key_values"]
|
|
504
|
-
|
|
505
|
-
_supports_flash_attn_2 = True
|
|
504
|
+
_supports_flash_attn = True
|
|
506
505
|
_supports_sdpa = True
|
|
507
506
|
_supports_flex_attn = True
|
|
508
507
|
_supports_cache_class = True
|
|
@@ -1869,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
|
|
|
1869
1868
|
weights_only=True,
|
|
1870
1869
|
**kwargs,
|
|
1871
1870
|
):
|
|
1871
|
+
# Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
|
|
1872
|
+
requested_attn_implementation = kwargs.pop("attn_implementation", None)
|
|
1873
|
+
if requested_attn_implementation is None and config and config._attn_implementation:
|
|
1874
|
+
requested_attn_implementation = config._attn_implementation
|
|
1875
|
+
|
|
1872
1876
|
model = super().from_pretrained(
|
|
1873
1877
|
pretrained_model_name_or_path,
|
|
1874
1878
|
*model_args,
|
|
@@ -1881,6 +1885,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
|
|
|
1881
1885
|
revision=revision,
|
|
1882
1886
|
use_safetensors=use_safetensors,
|
|
1883
1887
|
weights_only=weights_only,
|
|
1888
|
+
attn_implementation=requested_attn_implementation,
|
|
1884
1889
|
**kwargs,
|
|
1885
1890
|
)
|
|
1886
1891
|
if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):
|
{qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py
RENAMED
|
@@ -198,12 +198,13 @@ class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
|
|
|
198
198
|
self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)
|
|
199
199
|
|
|
200
200
|
pad = kernel_size - stride
|
|
201
|
-
self.left_pad =
|
|
202
|
-
self.right_pad = pad
|
|
201
|
+
self.left_pad = 0
|
|
202
|
+
self.right_pad = int(pad)
|
|
203
203
|
|
|
204
204
|
def forward(self, hidden_state):
|
|
205
205
|
hidden_state = self.conv(hidden_state)
|
|
206
|
-
|
|
206
|
+
if self.right_pad > 0:
|
|
207
|
+
hidden_state = hidden_state[..., : hidden_state.shape[-1] - self.right_pad]
|
|
207
208
|
return hidden_state.contiguous()
|
|
208
209
|
|
|
209
210
|
|
|
@@ -639,7 +639,7 @@ class Qwen3TTSModel:
|
|
|
639
639
|
text: Union[str, List[str]],
|
|
640
640
|
instruct: Union[str, List[str]],
|
|
641
641
|
language: Union[str, List[str]] = None,
|
|
642
|
-
non_streaming_mode: bool = False,
|
|
642
|
+
non_streaming_mode: bool = True,
|
|
643
643
|
**kwargs,
|
|
644
644
|
) -> Tuple[List[np.ndarray], int]:
|
|
645
645
|
"""
|
|
@@ -735,7 +735,7 @@ class Qwen3TTSModel:
|
|
|
735
735
|
speaker: Union[str, List[str]],
|
|
736
736
|
language: Union[str, List[str]] = None,
|
|
737
737
|
instruct: Optional[Union[str, List[str]]] = None,
|
|
738
|
-
non_streaming_mode: bool = False,
|
|
738
|
+
non_streaming_mode: bool = True,
|
|
739
739
|
**kwargs,
|
|
740
740
|
) -> Tuple[List[np.ndarray], int]:
|
|
741
741
|
"""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|