qwen-tts 0.0.4__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -470,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3TTSDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_static_cache = False
@@ -501,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []
     _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn_3 = True
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
     _supports_cache_class = True
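
The two hunks above track a rename in recent transformers releases, where the per-version flags `_supports_flash_attn_2` and `_supports_flash_attn_3` were consolidated into a single `_supports_flash_attn` capability flag. A minimal sketch of how a custom model class declares the consolidated flag, assuming a recent transformers version; the class name below is illustrative and not part of this package:

```python
from transformers import PreTrainedModel


class MyCausalTTSPreTrainedModel(PreTrainedModel):  # illustrative name, not in qwen-tts
    # Capability flags read by transformers when a caller requests an attention
    # backend via `attn_implementation=...` at load time. Older releases checked
    # `_supports_flash_attn_2`; newer releases check the consolidated flag below.
    _supports_flash_attn = True
    _supports_sdpa = True
    supports_gradient_checkpointing = True
```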
@@ -1869,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
         weights_only=True,
         **kwargs,
     ):
+        # Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
+        requested_attn_implementation = kwargs.pop("attn_implementation", None)
+        if requested_attn_implementation is None and config and config._attn_implementation:
+            requested_attn_implementation = config._attn_implementation
+
         model = super().from_pretrained(
             pretrained_model_name_or_path,
             *model_args,
@@ -1881,6 +1885,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
             revision=revision,
             use_safetensors=use_safetensors,
             weights_only=weights_only,
+            attn_implementation=requested_attn_implementation,
             **kwargs,
         )
         if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):
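
With this hotfix, an attention backend recorded only on `config._attn_implementation` is forwarded into `super().from_pretrained(...)` instead of being dropped, while an explicit `attn_implementation` kwarg still wins because it is popped from `kwargs` first. A hedged usage sketch; the checkpoint path is a placeholder, and `config_class.from_pretrained` is assumed to behave as it does for standard `PreTrainedModel` subclasses:

```python
from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration

ckpt = "path/to/qwen3-tts-checkpoint"  # placeholder path

# Explicit kwarg: unchanged behaviour, takes precedence over the config value.
model = Qwen3TTSForConditionalGeneration.from_pretrained(ckpt, attn_implementation="sdpa")

# Config-only setting: previously lost on reload, now forwarded by the hotfix.
config = Qwen3TTSForConditionalGeneration.config_class.from_pretrained(ckpt)
config._attn_implementation = "flash_attention_2"  # the attribute the hotfix reads
model = Qwen3TTSForConditionalGeneration.from_pretrained(ckpt, config=config)
```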
@@ -198,12 +198,13 @@ class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
         self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)

         pad = kernel_size - stride
-        self.left_pad = math.ceil(pad)
-        self.right_pad = pad - self.left_pad
+        self.left_pad = 0
+        self.right_pad = int(pad)

     def forward(self, hidden_state):
         hidden_state = self.conv(hidden_state)
-        hidden_state = hidden_state[..., self.left_pad : hidden_state.shape[-1] - self.right_pad]
+        if self.right_pad > 0:
+            hidden_state = hidden_state[..., : hidden_state.shape[-1] - self.right_pad]
         return hidden_state.contiguous()

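The padding fix trims the `kernel_size - stride` surplus samples from the right of the transposed convolution's output rather than the left, so the upsampling stays causal and the output length equals the input length times the stride. A standalone sketch of that bookkeeping in plain PyTorch (shapes chosen arbitrarily for illustration):

```python
import torch
from torch import nn

# Minimal sketch of the fixed behaviour: trim the (kernel_size - stride) surplus
# frames from the right so each output frame depends only on current and past
# input frames (causal upsampling).
in_channels, out_channels, kernel_size, stride = 8, 8, 7, 4
conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)

x = torch.randn(1, in_channels, 10)      # 10 input frames
y = conv(x)                              # (10 - 1) * 4 + 7 = 43 output frames
right_pad = kernel_size - stride         # 3 surplus frames
y = y[..., : y.shape[-1] - right_pad]    # 40 frames = 10 * stride

assert y.shape[-1] == x.shape[-1] * stride
```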
@@ -639,7 +639,7 @@ class Qwen3TTSModel:
         text: Union[str, List[str]],
         instruct: Union[str, List[str]],
         language: Union[str, List[str]] = None,
-        non_streaming_mode: bool = False,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -735,7 +735,7 @@ class Qwen3TTSModel:
         speaker: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         instruct: Optional[Union[str, List[str]]] = None,
-        non_streaming_mode: bool = False,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: qwen-tts
-Version: 0.0.4
+Version: 0.1.0
 Summary: Qwen-TTS python package
 Author: Alibaba Qwen Team
 License: Apache-2.0
@@ -35,7 +35,7 @@ Dynamic: license-file
 <p>

 <p align="center">
-&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-TTS/blob/main/assets/Qwen3_TTS.pdf">Paper</a>&nbsp&nbsp
+&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>&nbsp&nbsp
 <br>
 🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>

@@ -66,7 +66,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
   - [vLLM Usage](#vllm-usage)
   - [Fine Tuning](#fine-tuning)
   - [Evaluation](#evaluation)
-  <!-- - [Citation](#citation) -->
+  - [Citation](#citation)

 ## Overview
 ### Introduction
@@ -1367,18 +1367,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
 </details>


-<!-- ## Citation
+## Citation

 If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

-
 ```BibTeX
 @article{Qwen3-TTS,
   title={Qwen3-TTS Technical Report},
-  author={},
-  journal={arXiv preprint arXiv:},
+  author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+  journal={arXiv preprint arXiv:2601.15621},
   year={2026}
 }
-``` -->
+```
+
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
+

 <br>
@@ -4,21 +4,21 @@ qwen_tts/cli/demo.py,sha256=6ijgkwdT4Fy91Tq3vZC3voGrhZVJkCdx2xXws6v81es,29160
 qwen_tts/core/__init__.py,sha256=GzgNnehDttWF2TjDOdBmE2VnynElQSZ0I0IEr0OGZ54,990
 qwen_tts/core/models/__init__.py,sha256=kX042P1-2E3nNwP9I5TVNcpBbhVpTR5QMk5KBtQCLII,807
 qwen_tts/core/models/configuration_qwen3_tts.py,sha256=9Shn8U_eBqQW3RSGTVA85tE9CgjV9dowGR4cgME_XRg,26428
-qwen_tts/core/models/modeling_qwen3_tts.py,sha256=nFtU2UNWBMbL_aD7uut31MlZsPlVkrAPQMUH9TZO3jg,99825
+qwen_tts/core/models/modeling_qwen3_tts.py,sha256=JcQmVrz4EPBu9rwYOb1wg_PIz-2sOhR8QGC0JisclqA,100211
 qwen_tts/core/models/processing_qwen3_tts.py,sha256=YUciAxiORu2mjXQMJfDyKOziSmHs-ULlfW5J54tNa80,4022
 qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py,sha256=njDCQ5SwDLA2bX2jSCt0NkaKzBzT2hpv5hSh00ZTpeM,7946
-qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py,sha256=ZZEFydnpx1TH3cgomRFAWA6R-uA11_EGtpVidUc6s78,40467
+qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py,sha256=OT_C6mD12rlnjYMJqHyT_t1idNDrFieXoEttbamaSJs,40464
 qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py,sha256=KPlPcV332W02XJzqACHZzKjGPS9I9IQjxDo9PK4o7wI,14494
 qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py,sha256=SdXRMvkR2uW8Fa_FzvRicu3nw4FzTHnCSLkUmfxBPF0,56472
 qwen_tts/core/tokenizer_25hz/vq/core_vq.py,sha256=01-p8A70hjNhST4QL04o8JJIhJcRffWAgZ6Ttd_zuN4,20074
 qwen_tts/core/tokenizer_25hz/vq/speech_vq.py,sha256=fGj8uoxQjYoY4iQbVjMC0b9G-OyxUVxZIiJj2FJLspg,14833
 qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py,sha256=oXSLNJaLqO_-v5AsSkOZlBs-Sbyj9sASD8Zz47p9dn0,14351
 qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz,sha256=dFCucHI6XvnTQePO5ijHywF382zkLES37SvzMl8PbUw,4271
-qwen_tts/inference/qwen3_tts_model.py,sha256=lPt0DAv8pHDpUyXQpVmTnmjtok6j0ww0TFuWniAxghA,37123
+qwen_tts/inference/qwen3_tts_model.py,sha256=4dpFBzKFfB9f4-NuurhdsvbcakjKr_aXP0YzhOMCdeQ,37121
 qwen_tts/inference/qwen3_tts_tokenizer.py,sha256=vX1-6_rJIGQ7QtKd932ngHJYvEBjBUSos1tCEtOyFaw,15698
-qwen_tts-0.0.4.dist-info/licenses/LICENSE,sha256=pEpggcc6118CVbsrtcq3TvGClWWolaJOU6TxEpCrdlU,11343
-qwen_tts-0.0.4.dist-info/METADATA,sha256=C8_PgKFWjwp4YpUzWGTbDgfVjczbTEaPIEbJSAtkJFM,61044
-qwen_tts-0.0.4.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-qwen_tts-0.0.4.dist-info/entry_points.txt,sha256=hcoVetKUabLdCmu2ST4jGo8jilnoslpFzV-bxlHf0E0,57
-qwen_tts-0.0.4.dist-info/top_level.txt,sha256=1o-44WiYkUtYVTiL9eexzyNQXK6YWCOGZltO81PUirA,9
-qwen_tts-0.0.4.dist-info/RECORD,,
+qwen_tts-0.1.0.dist-info/licenses/LICENSE,sha256=pEpggcc6118CVbsrtcq3TvGClWWolaJOU6TxEpCrdlU,11343
+qwen_tts-0.1.0.dist-info/METADATA,sha256=qIoqgk0JBnvOKTWCh0ijBaECdDTAJHdkf5DhQwaRTPg,61393
+qwen_tts-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+qwen_tts-0.1.0.dist-info/entry_points.txt,sha256=hcoVetKUabLdCmu2ST4jGo8jilnoslpFzV-bxlHf0E0,57
+qwen_tts-0.1.0.dist-info/top_level.txt,sha256=1o-44WiYkUtYVTiL9eexzyNQXK6YWCOGZltO81PUirA,9
+qwen_tts-0.1.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.10.1)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
