mazinger-1.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. mazinger-1.7.0/LICENSE +21 -0
  2. mazinger-1.7.0/PKG-INFO +189 -0
  3. mazinger-1.7.0/README.md +145 -0
  4. mazinger-1.7.0/mazinger/__init__.py +14 -0
  5. mazinger-1.7.0/mazinger/__main__.py +25 -0
  6. mazinger-1.7.0/mazinger/assemble.py +358 -0
  7. mazinger-1.7.0/mazinger/cli/__init__.py +72 -0
  8. mazinger-1.7.0/mazinger/cli/_describe.py +63 -0
  9. mazinger-1.7.0/mazinger/cli/_download.py +19 -0
  10. mazinger-1.7.0/mazinger/cli/_dub.py +91 -0
  11. mazinger-1.7.0/mazinger/cli/_groups.py +327 -0
  12. mazinger-1.7.0/mazinger/cli/_resegment.py +40 -0
  13. mazinger-1.7.0/mazinger/cli/_slice.py +32 -0
  14. mazinger-1.7.0/mazinger/cli/_speak.py +112 -0
  15. mazinger-1.7.0/mazinger/cli/_subtitle.py +79 -0
  16. mazinger-1.7.0/mazinger/cli/_thumbnails.py +57 -0
  17. mazinger-1.7.0/mazinger/cli/_transcribe.py +61 -0
  18. mazinger-1.7.0/mazinger/cli/_translate.py +90 -0
  19. mazinger-1.7.0/mazinger/describe.py +83 -0
  20. mazinger-1.7.0/mazinger/download.py +452 -0
  21. mazinger-1.7.0/mazinger/paths.py +81 -0
  22. mazinger-1.7.0/mazinger/pipeline.py +366 -0
  23. mazinger-1.7.0/mazinger/profiles.py +112 -0
  24. mazinger-1.7.0/mazinger/resegment.py +554 -0
  25. mazinger-1.7.0/mazinger/srt.py +94 -0
  26. mazinger-1.7.0/mazinger/subtitle.py +561 -0
  27. mazinger-1.7.0/mazinger/thumbnails.py +190 -0
  28. mazinger-1.7.0/mazinger/transcribe.py +579 -0
  29. mazinger-1.7.0/mazinger/translate.py +403 -0
  30. mazinger-1.7.0/mazinger/tts.py +352 -0
  31. mazinger-1.7.0/mazinger/utils.py +182 -0
  32. mazinger-1.7.0/mazinger.egg-info/PKG-INFO +189 -0
  33. mazinger-1.7.0/mazinger.egg-info/SOURCES.txt +37 -0
  34. mazinger-1.7.0/mazinger.egg-info/dependency_links.txt +1 -0
  35. mazinger-1.7.0/mazinger.egg-info/entry_points.txt +2 -0
  36. mazinger-1.7.0/mazinger.egg-info/requires.txt +43 -0
  37. mazinger-1.7.0/mazinger.egg-info/top_level.txt +4 -0
  38. mazinger-1.7.0/pyproject.toml +99 -0
  39. mazinger-1.7.0/setup.cfg +4 -0
mazinger-1.7.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Mazinger Dubber Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
mazinger-1.7.0/PKG-INFO ADDED
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: mazinger
3
+ Version: 1.7.0
4
+ Summary: End-to-end video dubbing pipeline: transcribe, translate, and voice-clone.
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: yt-dlp>=2024.0
10
+ Requires-Dist: openai>=1.0
11
+ Requires-Dist: json-repair>=0.28
12
+ Requires-Dist: Pillow>=10.0
13
+ Requires-Dist: soundfile>=0.12
14
+ Requires-Dist: numpy>=1.24
15
+ Requires-Dist: tqdm>=4.60
16
+ Provides-Extra: transcribe-faster
17
+ Requires-Dist: faster-whisper>=1.0; extra == "transcribe-faster"
18
+ Provides-Extra: transcribe-whisperx
19
+ Requires-Dist: whisperx>=3.0; extra == "transcribe-whisperx"
20
+ Requires-Dist: torch>=2.0; extra == "transcribe-whisperx"
21
+ Provides-Extra: transcribe
22
+ Requires-Dist: mazinger[transcribe-whisperx]; extra == "transcribe"
23
+ Provides-Extra: tts
24
+ Requires-Dist: qwen-tts; extra == "tts"
25
+ Requires-Dist: torch>=2.0; extra == "tts"
26
+ Requires-Dist: soundfile>=0.12; extra == "tts"
27
+ Provides-Extra: tts-chatterbox
28
+ Requires-Dist: chatterbox-tts; extra == "tts-chatterbox"
29
+ Requires-Dist: resemble-perth>=1.0.1; extra == "tts-chatterbox"
30
+ Requires-Dist: torch>=2.0; extra == "tts-chatterbox"
31
+ Requires-Dist: torchaudio>=2.0; extra == "tts-chatterbox"
32
+ Requires-Dist: soundfile>=0.12; extra == "tts-chatterbox"
33
+ Requires-Dist: numpy>=1.26; extra == "tts-chatterbox"
34
+ Requires-Dist: pandas>=2.2; extra == "tts-chatterbox"
35
+ Provides-Extra: flash-attn
36
+ Requires-Dist: flash-attn>=2.0; extra == "flash-attn"
37
+ Provides-Extra: audio-enhance
38
+ Requires-Dist: demucs>=4.0.1; extra == "audio-enhance"
39
+ Provides-Extra: all-qwen
40
+ Requires-Dist: mazinger[audio-enhance,transcribe-whisperx,tts]; extra == "all-qwen"
41
+ Provides-Extra: all-chatterbox
42
+ Requires-Dist: mazinger[audio-enhance,transcribe-faster,tts-chatterbox]; extra == "all-chatterbox"
43
+ Dynamic: license-file
44
+
45
+ <p align="center">
46
+ <img src="docs/assets/main-logo-refined.png" alt="Mazinger Dubber" width="320" height="320" />
47
+ </p>
48
+
49
+ <h1 align="center">Mazinger Dubber</h1>
50
+
51
+ <p align="center">
52
+ End-to-end video dubbing pipeline. Download a video, transcribe it, translate the subtitles, clone a voice, and produce a fully dubbed audio or video file — in one command.
53
+ </p>
54
+
55
+ ## What It Does
56
+
57
+ Mazinger chains nine stages into a single pipeline:
58
+
59
+ 1. **Download** — fetch a video from a URL or ingest a local file, extract the audio track
60
+ 2. **Transcribe** — convert speech to SRT subtitles (OpenAI Whisper API, faster-whisper, or WhisperX)
61
+ 3. **Thumbnails** — use an LLM to pick key frames from the video for visual context
62
+ 4. **Describe** — analyze the transcript and thumbnails to produce a structured summary (title, key points, keywords)
63
+ 5. **Translate** — translate the SRT into another language with duration-aware word budgets
64
+ 6. **Re-segment** — merge fragments and split oversized subtitles for readability
65
+ 7. **Speak** — synthesize voice-cloned speech for every subtitle entry (Qwen3-TTS or Chatterbox)
66
+ 8. **Assemble** — place each audio segment on the original timeline with optional tempo adjustment, loudness matching, and background audio mixing
67
+ 9. **Subtitle** — burn styled subtitles into the video and/or mux the new audio track
68
+
69
+ Every stage can run independently or as part of the full pipeline. Interrupted runs resume automatically — completed stages and individual TTS segments are cached and skipped.
70
+
71
+ ## Prerequisites
72
+
73
+ - Python 3.10 or later
74
+ - ffmpeg installed and on `PATH` (`apt install ffmpeg` / `brew install ffmpeg`)
75
+ - An OpenAI API key for LLM-powered stages (transcription, translation, thumbnails, description)
76
+ - A CUDA GPU for local transcription and TTS (not needed for cloud-only workflows)
77
+
78
+ ## Installation
79
+
80
+ The base install covers download, transcription (cloud), thumbnails, description, translation, re-segmentation, and subtitle embedding. No GPU needed.
81
+
82
+ ```bash
83
+ pip install .
84
+ ```
85
+
86
+ Add local transcription or TTS as optional extras:
87
+
88
+ ```bash
89
+ # Local transcription
90
+ pip install ".[transcribe-faster]" # faster-whisper (Chatterbox-compatible)
91
+ pip install ".[transcribe-whisperx]" # WhisperX (best word-level alignment)
92
+
93
+ # Voice synthesis
94
+ pip install ".[tts]" # Qwen3-TTS (voice sample + transcript)
95
+ pip install ".[tts-chatterbox]" # Chatterbox (voice sample only, emotion control)
96
+
97
+ # Full bundles
98
+ pip install ".[all-qwen]" # WhisperX + Qwen3-TTS
99
+ pip install ".[all-chatterbox]" # faster-whisper + Chatterbox
100
+ ```
101
+
102
+ > Qwen and Chatterbox require different `transformers` versions and cannot share an environment.
103
+ > WhisperX also conflicts with Chatterbox — pair it with Qwen, or use faster-whisper with Chatterbox.
104
+
105
+ See the [Installation Guide](docs/installation.md) for venv recipes, Colab setup, and uv overrides.
106
+
107
+ ## Quick Start
108
+
109
+ ### Dub a video in one command
110
+
111
+ ```bash
112
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
113
+ --voice-sample speaker.m4a \
114
+ --voice-script speaker_transcript.txt \
115
+ --target-language Spanish \
116
+ --base-dir ./output
117
+ ```
118
+
119
+ ### Use a voice profile instead of local files
120
+
121
+ Voice profiles are hosted on HuggingFace and downloaded automatically:
122
+
123
+ ```bash
124
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
125
+ --clone-profile abubakr \
126
+ --target-language Arabic
127
+ ```
128
+
129
+ ### Produce a video with burned subtitles
130
+
131
+ ```bash
132
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
133
+ --clone-profile abubakr \
134
+ --output-type video \
135
+ --embed-subtitles \
136
+ --subtitle-google-font "Noto Sans Arabic" \
137
+ --subtitle-font-size 24
138
+ ```
139
+
140
+ ### Run a single stage
141
+
142
+ Every stage has its own sub-command:
143
+
144
+ ```bash
145
+ mazinger download "https://youtube.com/watch?v=VIDEO_ID" --base-dir ./output
146
+ mazinger slice "https://youtube.com/watch?v=VIDEO_ID" --start 00:01:00 --end 00:04:00
147
+ mazinger transcribe ./output/projects/my-video/source/audio.mp3 -o subs.srt
148
+ mazinger translate --srt subs.srt --target-language French -o translated.srt
149
+ mazinger subtitle video.mp4 --srt translated.srt -o output.mp4
150
+ ```
151
+
152
+ ### Python API
153
+
154
+ ```python
155
+ from mazinger import MazingerDubber
156
+
157
+ dubber = MazingerDubber(openai_api_key="sk-...", base_dir="./output")
158
+
159
+ proj = dubber.dub(
160
+ source="https://youtube.com/watch?v=VIDEO_ID",
161
+ voice_sample="speaker.m4a",
162
+ voice_script="speaker_transcript.txt",
163
+ target_language="Spanish",
164
+ output_type="video",
165
+ embed_subtitles=True,
166
+ )
167
+
168
+ print(proj.final_video) # ./output/projects/<slug>/tts/dubbed.mp4
169
+ ```
170
+
171
+ ## Documentation
172
+
173
+ Full documentation lives in the [`docs/`](docs/) directory:
174
+
175
+ | Chapter | Contents |
176
+ |---------|----------|
177
+ | [Installation](docs/installation.md) | All install methods, extras, compatibility matrix, Colab and venv recipes |
178
+ | [Quick Start](docs/quick-start.md) | Common workflows with copy-paste examples |
179
+ | [Pipeline Overview](docs/pipeline.md) | How the nine stages connect, data flow, and resume behavior |
180
+ | [CLI Reference](docs/cli-reference.md) | Every command, flag, and default value |
181
+ | [Python API](docs/python-api.md) | Classes, functions, and parameters for programmatic use |
182
+ | [Voice Profiles](docs/voice-profiles.md) | Using, creating, and uploading voice profiles |
183
+ | [Subtitle Styling](docs/subtitle-styling.md) | Fonts, colors, positioning, RTL support, Google Fonts |
184
+ | [Configuration](docs/configuration.md) | Environment variables, caching, tempo control, LLM usage tracking |
185
+ | [Project Structure](docs/project-structure.md) | Output directory layout and file naming conventions |
186
+
187
+ ## License
188
+
189
+ MIT
mazinger-1.7.0/README.md ADDED
@@ -0,0 +1,145 @@
1
+ <p align="center">
2
+ <img src="docs/assets/main-logo-refined.png" alt="Mazinger Dubber" width="320" height="320" />
3
+ </p>
4
+
5
+ <h1 align="center">Mazinger Dubber</h1>
6
+
7
+ <p align="center">
8
+ End-to-end video dubbing pipeline. Download a video, transcribe it, translate the subtitles, clone a voice, and produce a fully dubbed audio or video file — in one command.
9
+ </p>
10
+
11
+ ## What It Does
12
+
13
+ Mazinger chains nine stages into a single pipeline:
14
+
15
+ 1. **Download** — fetch a video from a URL or ingest a local file, extract the audio track
16
+ 2. **Transcribe** — convert speech to SRT subtitles (OpenAI Whisper API, faster-whisper, or WhisperX)
17
+ 3. **Thumbnails** — use an LLM to pick key frames from the video for visual context
18
+ 4. **Describe** — analyze the transcript and thumbnails to produce a structured summary (title, key points, keywords)
19
+ 5. **Translate** — translate the SRT into another language with duration-aware word budgets
20
+ 6. **Re-segment** — merge fragments and split oversized subtitles for readability
21
+ 7. **Speak** — synthesize voice-cloned speech for every subtitle entry (Qwen3-TTS or Chatterbox)
22
+ 8. **Assemble** — place each audio segment on the original timeline with optional tempo adjustment, loudness matching, and background audio mixing
23
+ 9. **Subtitle** — burn styled subtitles into the video and/or mux the new audio track
24
+
25
+ Every stage can run independently or as part of the full pipeline. Interrupted runs resume automatically — completed stages and individual TTS segments are cached and skipped.
26
+
27
+ ## Prerequisites
28
+
29
+ - Python 3.10 or later
30
+ - ffmpeg installed and on `PATH` (`apt install ffmpeg` / `brew install ffmpeg`)
31
+ - An OpenAI API key for LLM-powered stages (transcription, translation, thumbnails, description)
32
+ - A CUDA GPU for local transcription and TTS (not needed for cloud-only workflows)
33
+
34
+ ## Installation
35
+
36
+ The base install covers download, transcription (cloud), thumbnails, description, translation, re-segmentation, and subtitle embedding. No GPU needed.
37
+
38
+ ```bash
39
+ pip install .
40
+ ```
41
+
42
+ Add local transcription or TTS as optional extras:
43
+
44
+ ```bash
45
+ # Local transcription
46
+ pip install ".[transcribe-faster]" # faster-whisper (Chatterbox-compatible)
47
+ pip install ".[transcribe-whisperx]" # WhisperX (best word-level alignment)
48
+
49
+ # Voice synthesis
50
+ pip install ".[tts]" # Qwen3-TTS (voice sample + transcript)
51
+ pip install ".[tts-chatterbox]" # Chatterbox (voice sample only, emotion control)
52
+
53
+ # Full bundles
54
+ pip install ".[all-qwen]" # WhisperX + Qwen3-TTS
55
+ pip install ".[all-chatterbox]" # faster-whisper + Chatterbox
56
+ ```
57
+
58
+ > Qwen and Chatterbox require different `transformers` versions and cannot share an environment.
59
+ > WhisperX also conflicts with Chatterbox — pair it with Qwen, or use faster-whisper with Chatterbox.
60
+
61
+ See the [Installation Guide](docs/installation.md) for venv recipes, Colab setup, and uv overrides.
62
+
63
+ ## Quick Start
64
+
65
+ ### Dub a video in one command
66
+
67
+ ```bash
68
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
69
+ --voice-sample speaker.m4a \
70
+ --voice-script speaker_transcript.txt \
71
+ --target-language Spanish \
72
+ --base-dir ./output
73
+ ```
74
+
75
+ ### Use a voice profile instead of local files
76
+
77
+ Voice profiles are hosted on HuggingFace and downloaded automatically:
78
+
79
+ ```bash
80
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
81
+ --clone-profile abubakr \
82
+ --target-language Arabic
83
+ ```
84
+
85
+ ### Produce a video with burned subtitles
86
+
87
+ ```bash
88
+ mazinger dub "https://youtube.com/watch?v=VIDEO_ID" \
89
+ --clone-profile abubakr \
90
+ --output-type video \
91
+ --embed-subtitles \
92
+ --subtitle-google-font "Noto Sans Arabic" \
93
+ --subtitle-font-size 24
94
+ ```
95
+
96
+ ### Run a single stage
97
+
98
+ Every stage has its own sub-command:
99
+
100
+ ```bash
101
+ mazinger download "https://youtube.com/watch?v=VIDEO_ID" --base-dir ./output
102
+ mazinger slice "https://youtube.com/watch?v=VIDEO_ID" --start 00:01:00 --end 00:04:00
103
+ mazinger transcribe ./output/projects/my-video/source/audio.mp3 -o subs.srt
104
+ mazinger translate --srt subs.srt --target-language French -o translated.srt
105
+ mazinger subtitle video.mp4 --srt translated.srt -o output.mp4
106
+ ```
107
+
108
+ ### Python API
109
+
110
+ ```python
111
+ from mazinger import MazingerDubber
112
+
113
+ dubber = MazingerDubber(openai_api_key="sk-...", base_dir="./output")
114
+
115
+ proj = dubber.dub(
116
+ source="https://youtube.com/watch?v=VIDEO_ID",
117
+ voice_sample="speaker.m4a",
118
+ voice_script="speaker_transcript.txt",
119
+ target_language="Spanish",
120
+ output_type="video",
121
+ embed_subtitles=True,
122
+ )
123
+
124
+ print(proj.final_video) # ./output/projects/<slug>/tts/dubbed.mp4
125
+ ```
126
+
127
+ ## Documentation
128
+
129
+ Full documentation lives in the [`docs/`](docs/) directory:
130
+
131
+ | Chapter | Contents |
132
+ |---------|----------|
133
+ | [Installation](docs/installation.md) | All install methods, extras, compatibility matrix, Colab and venv recipes |
134
+ | [Quick Start](docs/quick-start.md) | Common workflows with copy-paste examples |
135
+ | [Pipeline Overview](docs/pipeline.md) | How the nine stages connect, data flow, and resume behavior |
136
+ | [CLI Reference](docs/cli-reference.md) | Every command, flag, and default value |
137
+ | [Python API](docs/python-api.md) | Classes, functions, and parameters for programmatic use |
138
+ | [Voice Profiles](docs/voice-profiles.md) | Using, creating, and uploading voice profiles |
139
+ | [Subtitle Styling](docs/subtitle-styling.md) | Fonts, colors, positioning, RTL support, Google Fonts |
140
+ | [Configuration](docs/configuration.md) | Environment variables, caching, tempo control, LLM usage tracking |
141
+ | [Project Structure](docs/project-structure.md) | Output directory layout and file naming conventions |
142
+
143
+ ## License
144
+
145
+ MIT
mazinger-1.7.0/mazinger/__init__.py ADDED
@@ -0,0 +1,14 @@
"""
Mazinger Dubber -- End-to-end video dubbing pipeline.

Transcribe, translate, and voice-clone audio from any video URL.
Each stage can be used independently or chained through the unified
``MazingerDubber`` pipeline class.
"""

from mazinger.pipeline import MazingerDubber
from mazinger.paths import ProjectPaths
from mazinger.utils import LLMUsageTracker

# Names re-exported as the package's public API.
__all__ = ["MazingerDubber", "ProjectPaths", "LLMUsageTracker"]

# Must match the distribution version declared in the package metadata
# (this release is published as 1.7.0).
__version__ = "1.7.0"
mazinger-1.7.0/mazinger/__main__.py ADDED
@@ -0,0 +1,25 @@
"""Entry point that makes ``python -m mazinger`` work."""

import logging
import os
import warnings

# A Jupyter/Colab host may export MPLBACKEND naming a matplotlib backend that
# is not available in this virtual environment; drop it so downstream
# libraries do not hit an import error.
os.environ.pop("MPLBACKEND", None)

# -- Quiet known-noisy third-party warnings ---------------------------------
# pyannote: torchcodec / FFmpeg compatibility warning.
warnings.filterwarnings(
    action="ignore",
    message=".*torchcodec is not installed correctly.*",
)
# pyannote: TF32 reproducibility warning (any UserWarning from that package).
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="pyannote",
)
# Lightning: checkpoint auto-upgrade notice.
warnings.filterwarnings(
    action="ignore",
    message=".*Lightning automatically upgraded.*",
)

# The Lightning upgrade notice is also emitted through ``logging``, so raise
# that logger's threshold to WARNING as well.
logging.getLogger("lightning.pytorch.utilities.migration").setLevel(
    logging.WARNING
)

# Kept below the setup above -- presumably so the filters are already in
# place while the CLI and its dependencies are imported.
from mazinger.cli import main

if __name__ == "__main__":
    main()