PyPI - qwen-tts - Versions diffs - 0.0.4__tar.gz → 0.1.0__tar.gz - Mend

qwen-tts 0.0.4 (tar.gz) → 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: qwen-tts
-Version: 0.0.4
+Version: 0.1.0
 Summary: Qwen-TTS python package
 Author: Alibaba Qwen Team
 License: Apache-2.0
@@ -35,7 +35,7 @@ Dynamic: license-file
 <p>
 <p align="center">
-&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-TTS/blob/main/assets/Qwen3_TTS.pdf">Paper</a>&nbsp&nbsp
+&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>&nbsp&nbsp
 <br>
 🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>
@@ -66,7 +66,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
 - [vLLM Usage](#vllm-usage)
 - [Fine Tuning](#fine-tuning)
 - [Evaluation](#evaluation)
-<!-- - [Citation](#citation) -->
+- [Citation](#citation)
 ## Overview
 ### Introduction
@@ -1367,18 +1367,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
 </details>
-<!-- ## Citation
+## Citation
 If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)
 ```BibTeX
 @article{Qwen3-TTS,
   title={Qwen3-TTS Technical Report},
-  author={},
-  journal={arXiv preprint arXiv:},
+  author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+  journal={arXiv preprint arXiv:2601.15621},
   year={2026}
 }
-``` -->
+```
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
 <br>

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/README.md RENAMED Viewed

@@ -7,7 +7,7 @@
 <p>
 <p align="center">
-&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://github.com/QwenLM/Qwen3-TTS/blob/main/assets/Qwen3_TTS.pdf">Paper</a>&nbsp&nbsp
+&nbsp&nbsp🤗 <a href="https://huggingface.co/collections/Qwen/qwen3-tts">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://modelscope.cn/collections/Qwen/Qwen3-TTS">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://qwen.ai/blog?id=qwen3tts-0115">Blog</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://arxiv.org/abs/2601.15621">Paper</a>&nbsp&nbsp
 <br>
 🖥️ <a href="https://huggingface.co/spaces/Qwen/Qwen3-TTS">Hugging Face Demo</a>&nbsp&nbsp | &nbsp&nbsp 🖥️ <a href="https://modelscope.cn/studios/Qwen/Qwen3-TTS">ModelScope Demo</a>&nbsp&nbsp | &nbsp&nbsp💬 <a href="https://github.com/QwenLM/Qwen/blob/main/assets/wechat.png">WeChat (微信)</a>&nbsp&nbsp | &nbsp&nbsp🫨 <a href="https://discord.gg/CV4E9rpNSD">Discord</a>&nbsp&nbsp | &nbsp&nbsp📑 <a href="https://help.aliyun.com/zh/model-studio/qwen-tts-realtime">API</a>
@@ -38,7 +38,7 @@ We release **Qwen3-TTS**, a series of powerful speech generation capabilities de
 - [vLLM Usage](#vllm-usage)
 - [Fine Tuning](#fine-tuning)
 - [Evaluation](#evaluation)
-<!-- - [Citation](#citation) -->
+- [Citation](#citation)
 ## Overview
 ### Introduction
@@ -1339,18 +1339,23 @@ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` a
 </details>
-<!-- ## Citation
+## Citation
 If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)
 ```BibTeX
 @article{Qwen3-TTS,
   title={Qwen3-TTS Technical Report},
-  author={},
-  journal={arXiv preprint arXiv:},
+  author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
+  journal={arXiv preprint arXiv:2601.15621},
   year={2026}
 }
-``` -->
+```
+## Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=QwenLM/Qwen3-TTS&type=Date)](https://star-history.com/#QwenLM/Qwen3-TTS&Date)
 <br>

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "qwen-tts"
-version = "0.0.4"
+version = "0.1.0"
 description = "Qwen-TTS python package"
 readme = "README.md"
 requires-python = ">=3.9"

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/models/modeling_qwen3_tts.py RENAMED Viewed

@@ -470,7 +470,7 @@ class Qwen3TTSPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["Qwen3TTSDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_cache_class = True
     _supports_static_cache = False
@@ -501,8 +501,7 @@ class Qwen3TTSTalkerTextPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = []
     _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn_3 = True
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
     _supports_cache_class = True
@@ -1869,6 +1868,11 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
         weights_only=True,
         **kwargs,
     ):
+        # Hotfix to enable passing the correct attn implementation which is stored in the config but not in kwargs
+        requested_attn_implementation = kwargs.pop("attn_implementation", None)
+        if requested_attn_implementation is None and config and config._attn_implementation:
+            requested_attn_implementation = config._attn_implementation
         model = super().from_pretrained(
             pretrained_model_name_or_path,
             *model_args,
@@ -1881,6 +1885,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
             revision=revision,
             use_safetensors=use_safetensors,
             weights_only=weights_only,
+            attn_implementation=requested_attn_implementation,
             **kwargs,
         )
         if not local_files_only and not os.path.isdir(pretrained_model_name_or_path):

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/core/tokenizer_12hz/modeling_qwen3_tts_tokenizer_v2.py RENAMED Viewed

@@ -198,12 +198,13 @@ class Qwen3TTSTokenizerV2CausalTransConvNet(nn.Module):
         self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride)
         pad = kernel_size - stride
-        self.left_pad = math.ceil(pad)
-        self.right_pad = pad = self.left_pad
+        self.left_pad = 0
+        self.right_pad = int(pad)
     def forward(self, hidden_state):
         hidden_state = self.conv(hidden_state)
-        hidden_state = hidden_state[..., self.left_pad : hidden_state.shape[-1] - self.right_pad]
+        if self.right_pad > 0:
+            hidden_state = hidden_state[..., : hidden_state.shape[-1] - self.right_pad]
         return hidden_state.contiguous()

{qwen_tts-0.0.4 → qwen_tts-0.1.0}/qwen_tts/inference/qwen3_tts_model.py RENAMED Viewed

@@ -639,7 +639,7 @@ class Qwen3TTSModel:
         text: Union[str, List[str]],
         instruct: Union[str, List[str]],
         language: Union[str, List[str]] = None,
-        non_streaming_mode: bool = False,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """
@@ -735,7 +735,7 @@ class Qwen3TTSModel:
         speaker: Union[str, List[str]],
         language: Union[str, List[str]] = None,
         instruct: Optional[Union[str, List[str]]] = None,
-        non_streaming_mode: bool = False,
+        non_streaming_mode: bool = True,
         **kwargs,
     ) -> Tuple[List[np.ndarray], int]:
         """