qwen-tts 0.0.4__tar.gz → 0.1.0__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (25)
  1. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/PKG-INFO +13 -8
  2. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/README.md +12 -7
  3. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/pyproject.toml +1 -1
  4. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/modeling_qwen3_tts.py +8 -3
  5. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py +4 -3
  6. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/inference/qwen3_tts_model.py +2 -2
  7. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/LICENSE +0 -0
  8. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/MANIFEST.in +0 -0
  9. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/__init__.py +0 -0
  10. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/__main__.py +0 -0
  11. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/cli/demo.py +0 -0
  12. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/__init__.py +0 -0
  13. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/__init__.py +0 -0
  14. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/configuration_qwen3_tts.py +0 -0
  15. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/processing_qwen3_tts.py +0 -0
  16. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/configuration_qwen3_tts_tokenizer_v2.py +0 -0
  17. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/configuration_qwen3_tts_tokenizer_v1.py +0 -0
  18. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/modeling_qwen3_tts_tokenizer_v1.py +0 -0
  19. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/assets/mel_filters.npz +0 -0
  20. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/core_vq.py +0 -0
  21. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/speech_vq.py +0 -0
  22. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_25hz/vq/whisper_encoder.py +0 -0
  23. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/inference/qwen3_tts_tokenizer.py +0 -0
  24. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts.egg-info/SOURCES.txt +0 -0
  25. {qwen_tts-0.0.4 → qwen_tts-0.1.0}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: qwen-tts
- Version: 0.0.4
+ Version: 0.1.0
  Summary: Qwen-TTS python package
  Author: Alibaba Qwen Team
  License: Apache-2.0
@@ -35,7 +35,7 @@ Dynamic: license-file
  <p>

  <p align="center">
- &nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-TTS/blob/main/assets/Qwen3_TTS.pdf">Paper</a>&nbsp&nbsp
+ &nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>&nbsp&nbsp
  <br>
  🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>

@@ -66,7 +66,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
  - [vLLM Usage](#vllm-usage)
  - [Fine Tuning](#fine-tuning)
  - [Evaluation](#evaluation)
- <!-- - [Citation](#citation) -->
+ - [Citation](#citation)

  ## Overview
  ### Introduction
@@ -1367,18 +1367,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
  </details>


- <!-- ## Citation
+ ## Citation

  If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

-
  ```BibTeX
  @article{Qwen3-TTS,
  title={Qwen3-TTS Technical Report},
- author={},
- journal={arXiv preprint arXiv:},
+ author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+ journal={arXiv preprint arXiv:2601.15621},
  year={2026}
  }
- ``` -->
+ ```
+
+
+ ## Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
+

  <br>
README.md
@@ -7,7 +7,7 @@
  <p>

  <p align="center">
- &nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-TTS/blob/main/assets/Qwen3_TTS.pdf">Paper</a>&nbsp&nbsp
+ &nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>&nbsp&nbsp
  <br>
  🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>

@@ -38,7 +38,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
  - [vLLM Usage](#vllm-usage)
  - [Fine Tuning](#fine-tuning)
  - [Evaluation](#evaluation)
- <!-- - [Citation](#citation) -->
+ - [Citation](#citation)

  ## Overview
  ### Introduction
@@ -1339,18 +1339,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
  </details>


- <!-- ## Citation
+ ## Citation

  If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)

-
  ```BibTeX
  @article{Qwen3-TTS,
  title={Qwen3-TTS Technical Report},
- author={},
- journal={arXiv preprint arXiv:},
+ author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+ journal={arXiv preprint arXiv:2601.15621},
  year={2026}
  }
- ``` -->
+ ```
+
+
+ ## Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
+

  <br>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "qwen-tts"
- version = "0.0.4"
+ version = "0.1.0"
  description = "Qwen-TTS python package"
  readme = "README.md"
  requires-python = ">=3.9"
qwen_tts/core/models/modeling_qwen3_tts.py
@@ -470,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
  supports_gradient_checkpointing = True
  _no_split_modules = ["Qwen3TTSDecoderLayer"]
  _skip_keys_device_placement = "past_key_values"
- _supports_flash_attn_2 = True
+ _supports_flash_attn = True
  _supports_sdpa = True
  _supports_cache_class = True
  _supports_static_cache = False
@@ -501,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
  supports_gradient_checkpointing = True
  _no_split_modules = []
  _skip_keys_device_placement = ["past_key_values"]
- _supports_flash_attn_3 = True
- _supports_flash_attn_2 = True
+ _supports_flash_attn = True
  _supports_sdpa = True
  _supports_flex_attn = True
  _supports_cache_class = True
@@ -1869,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
  weights_only=True,
  **kwargs,
  ):
+ # Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
+ requested_attn_implementation = kwargs.pop("attn_implementation", None)
+ if requested_attn_implementation is None and config and config._attn_implementation:
+     requested_attn_implementation = config._attn_implementation
+
  model = super().from_pretrained(
  pretrained_model_name_or_path,
  *model_args,
@@ -1881,6 +1885,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
  revision=revision,
  use_safetensors=use_safetensors,
  weights_only=weights_only,
+ attn_implementation=requested_attn_implementation,
  **kwargs,
  )
  if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):
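These two hunks add a workaround in the `from_pretrained` override: an `attn_implementation` stored on the config is forwarded explicitly so it is not silently dropped when the caller omits the kwarg. A minimal, self-contained sketch of that forwarding pattern follows; `DummyConfig`, `Base`, and `Model` are stand-ins for illustration, not the package's real classes:

```python
from typing import Optional


class DummyConfig:
    """Stand-in for a transformers config carrying _attn_implementation."""
    def __init__(self, attn_implementation: Optional[str] = "sdpa"):
        self._attn_implementation = attn_implementation


class Base:
    @classmethod
    def from_pretrained(cls, path: str, *, config=None, **kwargs) -> Optional[str]:
        # A real PreTrainedModel would build the model here; this stub only
        # reports which attention backend actually reached the loader.
        return kwargs.get("attn_implementation")


class Model(Base):
    @classmethod
    def from_pretrained(cls, path: str, *, config=None, **kwargs) -> Optional[str]:
        # Prefer an explicit kwarg, otherwise fall back to the config value,
        # mirroring the hotfix in the diff above.
        requested = kwargs.pop("attn_implementation", None)
        if requested is None and config and config._attn_implementation:
            requested = config._attn_implementation
        return super().from_pretrained(path, config=config, attn_implementation=requested, **kwargs)


print(Model.from_pretrained("dummy", config=DummyConfig()))                               # sdpa
print(Model.from_pretrained("dummy", config=DummyConfig(), attn_implementation="eager"))  # eager
```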
qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py
@@ -198,12 +198,13 @@ class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
  self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)

  pad = kernel_size - stride
- self.left_pad = math.ceil(pad)
- self.right_pad = pad = self.left_pad
+ self.left_pad = 0
+ self.right_pad = int(pad)

  def forward(self, hidden_state):
  hidden_state = self.conv(hidden_state)
- hidden_state = hidden_state[..., self.left_pad : hidden_state.shape[-1] - self.right_pad]
+ if self.right_pad > 0:
+     hidden_state = hidden_state[..., : hidden_state.shape[-1] - self.right_pad]
  return hidden_state.contiguous()


qwen_tts/inference/qwen3_tts_model.py
@@ -639,7 +639,7 @@ class Qwen3TTSModel:
  text: Union[str, List[str]],
  instruct: Union[str, List[str]],
  language: Union[str, List[str]] = None,
- non_streaming_mode: bool = False,
+ non_streaming_mode: bool = True,
  **kwargs,
  ) -> Tuple[List[np.ndarray], int]:
  """
@@ -735,7 +735,7 @@ class Qwen3TTSModel:
  speaker: Union[str, List[str]],
  language: Union[str, List[str]] = None,
  instruct: Optional[Union[str, List[str]]] = None,
- non_streaming_mode: bool = False,
+ non_streaming_mode: bool = True,
  **kwargs,
  ) -> Tuple[List[np.ndarray], int]:
  """