qwen-tts 0.0.4__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qwen_tts/core/models/modeling_qwen3_tts.py +8 -3
- qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py +4 -3
- qwen_tts/inference/qwen3_tts_model.py +2 -2
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/METADATA +13 -8
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/RECORD +9 -9
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/WHEEL +1 -1
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/entry_points.txt +0 -0
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/licenses/LICENSE +0 -0
- {qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/top_level.txt +0 -0
qwen_tts/core/models/modeling_qwen3_tts.py

@@ -470,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3TTSDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
-
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_static_cache = False

@@ -501,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []
     _skip_keys_device_placement = ["past_key_values"]
-
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
    _supports_cache_class = True
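Both pretrained-model classes now declare the `_supports_flash_attn` class flag, which recent `transformers` versions use in place of the older `_supports_flash_attn_2`. A minimal sketch of the new-style declaration (the subclass name here is illustrative, not part of the package):

```python
from transformers import PreTrainedModel

class MyTTSBackbone(PreTrainedModel):  # illustrative subclass, not from qwen-tts
    supports_gradient_checkpointing = True
    _supports_flash_attn = True  # new-style flag, replaces _supports_flash_attn_2
    _supports_sdpa = True
```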
@@ -1869,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
         weights_only=True,
         **kwargs,
     ):
+        # Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
+        requested_attn_implementation = kwargs.pop("attn_implementation", None)
+        if requested_attn_implementation is None and config and config._attn_implementation:
+            requested_attn_implementation = config._attn_implementation
+
         model = super().from_pretrained(
             pretrained_model_name_or_path,
             *model_args,

@@ -1881,6 +1885,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
             revision=revision,
             use_safetensors=use_safetensors,
             weights_only=weights_only,
+            attn_implementation=requested_attn_implementation,
             **kwargs,
         )
         if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):
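The net effect of this hotfix is that an `attn_implementation` stored in the model config, or passed by the caller, now reaches the underlying `transformers` loader instead of being dropped. A minimal sketch of how a caller could rely on it; the checkpoint path is a placeholder and the import path is assumed from the RECORD listing below:

```python
from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration

# "path/to/qwen3-tts-checkpoint" is a placeholder, not a real model id.
model = Qwen3TTSForConditionalGeneration.from_pretrained(
    "path/to/qwen3-tts-checkpoint",
    # Explicit request; if omitted, the value stored in config._attn_implementation
    # is now forwarded to the underlying transformers loader instead of being lost.
    attn_implementation="flash_attention_2",
)
```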
qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py

@@ -198,12 +198,13 @@ class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
         self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)

         pad = kernel_size - stride
-        self.left_pad =
-        self.right_pad = pad
+        self.left_pad = 0
+        self.right_pad = int(pad)

     def forward(self, hidden_state):
         hidden_state = self.conv(hidden_state)
-
+        if self.right_pad > 0:
+            hidden_state = hidden_state[..., : hidden_state.shape[-1] - self.right_pad]
         return hidden_state.contiguous()
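The updated module trims `kernel_size - stride` samples from the right of the transposed-convolution output, so the layer stays causal and produces exactly `stride * input_length` output frames. A self-contained sketch of the same trimming logic outside the package (class and variable names here are illustrative):

```python
import torch
import torch.nn as nn

class CausalTransConv1d(nn.Module):
    """Illustrative stand-in for the right-trim logic shown in the diff above."""

    def __init__(self, in_channels, out_channels, kernel_size, stride):
        super().__init__()
        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)
        self.right_pad = int(kernel_size - stride)  # extra samples produced on the right

    def forward(self, x):
        y = self.conv(x)
        if self.right_pad > 0:
            y = y[..., : y.shape[-1] - self.right_pad]  # drop the non-causal tail
        return y.contiguous()

x = torch.randn(1, 4, 10)                        # (batch, channels, time)
layer = CausalTransConv1d(4, 8, kernel_size=8, stride=4)
print(layer(x).shape)                            # torch.Size([1, 8, 40]); time = stride * 10
```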
qwen_tts/inference/qwen3_tts_model.py

@@ -639,7 +639,7 @@ class Qwen3TTSModel:
         text: Union[str, List[str]],
         instruct: Union[str, List[str]],
         language: Union[str, List[str]] = None,
-        non_streaming_mode: bool =
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """

@@ -735,7 +735,7 @@ class Qwen3TTSModel:
         speaker: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         instruct: Optional[Union[str, List[str]]] = None,
-        non_streaming_mode: bool =
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
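Both generation entry points of `Qwen3TTSModel` now default to `non_streaming_mode=True` and, per the signatures above, return a list of waveforms together with a sample rate. A hedged usage sketch; the method name `generate`, the constructor arguments, and the speaker/language values are placeholders inferred from the signatures, not confirmed API:

```python
import soundfile as sf  # assumed helper for writing audio, not a qwen-tts dependency claim
from qwen_tts.inference.qwen3_tts_model import Qwen3TTSModel

model = Qwen3TTSModel("path/to/qwen3-tts-checkpoint")  # constructor args are assumed
wavs, sample_rate = model.generate(                    # hypothetical method name
    text="Hello from Qwen3-TTS.",
    speaker="example-speaker",                         # placeholder speaker id
    language="en",
    non_streaming_mode=True,                           # now the default in 0.1.0
)
sf.write("out.wav", wavs[0], sample_rate)
```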
{qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: qwen-tts
-Version: 0.0.4
+Version: 0.1.0
 Summary: Qwen-TTS python package
 Author: Alibaba Qwen Team
 License: Apache-2.0

@@ -35,7 +35,7 @@ Dynamic: license-file
 <p>

 <p align="center">
-  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://
+  🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>   |   🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>   |   📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>   |   📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>  
 <br>
 🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>   |    🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>   |   💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>   |   🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>   |   📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>

@@ -66,7 +66,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
 - [vLLM Usage](#vllm-usage)
 - [Fine Tuning](#fine-tuning)
 - [Evaluation](#evaluation)
-
+- [Citation](#citation)

 ## Overview
 ### Introduction

@@ -1367,18 +1367,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
 </details>


-
+## Citation

 If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

-
 ```BibTeX
 @article{Qwen3-TTS,
 title={Qwen3-TTS Technical Report},
-author={},
-journal={arXiv preprint arXiv:},
+author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+journal={arXiv preprint arXiv:2601.15621},
 year={2026}
 }
-```
+```
+
+
+## Star History
+
+[](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
+

 <br>
{qwen_tts-0.0.4.dist-info → qwen_tts-0.1.0.dist-info}/RECORD

@@ -4,21 +4,21 @@ qwen_tts/cli/demo.py,sha256=6ijgkwdT4Fy91Tq3vZC3voGrhZVJkCdx2xXws6v81es,29160
 qwen_tts/core/__init__.py,sha256=GzgNnehDttWF2TjDOdBmE2VnynElQSZ0I0IEr0OGZ54,990
 qwen_tts/core/models/__init__.py,sha256=kX042P1-2E3nNwP9I5TVNcpBbhVpTR5QMk5KBtQCLII,807
 qwen_tts/core/models/configuration_qwen3_tts.py,sha256=9Shn8U_eBqQW3RSGTVA85tE9CgjV9dowGR4cgME_XRg,26428
-qwen_tts/core/models/modeling_qwen3_tts.py,sha256=
+qwen_tts/core/models/modeling_qwen3_tts.py,sha256=JcQmVrz4EPBu9rwYOb1wg_PIz-2sOhR8QGC0JisclqA,100211
 qwen_tts/core/models/processing_qwen3_tts.py,sha256=YUciAxiORu2mjXQMJfDyKOziSmHs-ULlfW5J54tNa80,4022
 qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py,sha256=njDCQ5SwDLA2bX2jSCt0NkaKzBzT2hpv5hSh00ZTpeM,7946
-qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py,sha256=
+qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py,sha256=OT_C6mD12rlnjYMJqHyT_t1idNDrFieXoEttbamaSJs,40464
 qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py,sha256=KPlPcV332W02XJzqACHZzKjGPS9I9IQjxDo9PK4o7wI,14494
 qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py,sha256=SdXRMvkR2uW8Fa_FzvRicu3nw4FzTHnCSLkUmfxBPF0,56472
 qwen_tts/core/tokenizer_25hz/vq/core_vq.py,sha256=01-p8A70hjNhST4QL04o8JJIhJcRffWAgZ6Ttd_zuN4,20074
 qwen_tts/core/tokenizer_25hz/vq/speech_vq.py,sha256=fGj8uoxQjYoY4iQbVjMC0b9G-OyxUVxZIiJj2FJLspg,14833
 qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py,sha256=oXSLNJaLqO_-v5AsSkOZlBs-Sbyj9sASD8Zz47p9dn0,14351
 qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz,sha256=dFCucHI6XvnTQePO5ijHywF382zkLES37SvzMl8PbUw,4271
-qwen_tts/inference/qwen3_tts_model.py,sha256=
+qwen_tts/inference/qwen3_tts_model.py,sha256=4dpFBzKFfB9f4-NuurhdsvbcakjKr_aXP0YzhOMCdeQ,37121
 qwen_tts/inference/qwen3_tts_tokenizer.py,sha256=vX1-6_rJIGQ7QtKd932ngHJYvEBjBUSos1tCEtOyFaw,15698
-qwen_tts-0.0.
-qwen_tts-0.0.
-qwen_tts-0.0.
-qwen_tts-0.0.
-qwen_tts-0.0.
-qwen_tts-0.0.
+qwen_tts-0.1.0.dist-info/licenses/LICENSE,sha256=pEpggcc6118CVbsrtcq3TvGClWWolaJOU6TxEpCrdlU,11343
+qwen_tts-0.1.0.dist-info/METADATA,sha256=qIoqgk0JBnvOKTWCh0ijBaECdDTAJHdkf5DhQwaRTPg,61393
+qwen_tts-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+qwen_tts-0.1.0.dist-info/entry_points.txt,sha256=hcoVetKUabLdCmu2ST4jGo8jilnoslpFzV-bxlHf0E0,57
+qwen_tts-0.1.0.dist-info/top_level.txt,sha256=1o-44WiYkUtYVTiL9eexzyNQXK6YWCOGZltO81PUirA,9
+qwen_tts-0.1.0.dist-info/RECORD,,
The remaining dist-info files (entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged between 0.0.4 and 0.1.0.