narractive 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- narractive-2.0.0.dist-info/METADATA +218 -0
- narractive-2.0.0.dist-info/RECORD +28 -0
- narractive-2.0.0.dist-info/WHEEL +5 -0
- narractive-2.0.0.dist-info/entry_points.txt +3 -0
- narractive-2.0.0.dist-info/licenses/LICENSE +21 -0
- narractive-2.0.0.dist-info/top_level.txt +1 -0
- video_automation/__init__.py +25 -0
- video_automation/__main__.py +4 -0
- video_automation/bridges/__init__.py +1 -0
- video_automation/bridges/f5_tts_bridge.py +140 -0
- video_automation/cli.py +550 -0
- video_automation/core/__init__.py +6 -0
- video_automation/core/app_automator.py +499 -0
- video_automation/core/diagram_generator.py +378 -0
- video_automation/core/frame_capturer.py +533 -0
- video_automation/core/narrator.py +287 -0
- video_automation/core/obs_controller.py +327 -0
- video_automation/core/qgis_automator.py +9 -0
- video_automation/core/timeline.py +339 -0
- video_automation/core/video_assembler.py +603 -0
- video_automation/diagrams/__init__.py +0 -0
- video_automation/diagrams/template.html +228 -0
- video_automation/scripts/__init__.py +0 -0
- video_automation/scripts/_recal_sidebar.py +34 -0
- video_automation/scripts/calibrate.py +1392 -0
- video_automation/scripts/setup_obs.py +196 -0
- video_automation/sequences/__init__.py +15 -0
- video_automation/sequences/base.py +255 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: narractive
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Modular framework for automating desktop application demo video production
|
|
5
|
+
Author: Simon Ducorneau
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/imagodata/narractive
|
|
8
|
+
Project-URL: Repository, https://github.com/imagodata/narractive
|
|
9
|
+
Keywords: video,automation,pyautogui,obs,tts,ffmpeg,demo,screencast,narractive
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Multimedia :: Video
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pyautogui>=0.9.54
|
|
23
|
+
Requires-Dist: Pillow>=10.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Requires-Dist: click>=8.1
|
|
26
|
+
Requires-Dist: edge-tts>=6.1.0
|
|
27
|
+
Requires-Dist: mutagen>=1.47.0
|
|
28
|
+
Provides-Extra: obs
|
|
29
|
+
Requires-Dist: obsws-python>=1.7.0; extra == "obs"
|
|
30
|
+
Provides-Extra: windows
|
|
31
|
+
Requires-Dist: pywin32>=306; extra == "windows"
|
|
32
|
+
Provides-Extra: png
|
|
33
|
+
Requires-Dist: playwright>=1.40; extra == "png"
|
|
34
|
+
Provides-Extra: elevenlabs
|
|
35
|
+
Requires-Dist: elevenlabs; extra == "elevenlabs"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: obsws-python>=1.7.0; extra == "all"
|
|
38
|
+
Requires-Dist: pywin32>=306; extra == "all"
|
|
39
|
+
Requires-Dist: playwright>=1.40; extra == "all"
|
|
40
|
+
Provides-Extra: headless
|
|
41
|
+
Requires-Dist: python-xlib>=0.33; extra == "headless"
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
45
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# Narractive
|
|
49
|
+
|
|
50
|
+
A modular Python framework for automated video production — from narration to final cut.
|
|
51
|
+
|
|
52
|
+
Narractive orchestrates the full pipeline: UI interaction (PyAutoGUI), screen recording (OBS or headless), text-to-speech narration, Mermaid diagram generation, and FFmpeg assembly. Script your sequences, define narration cues, and let the framework produce polished demo videos hands-free.
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
- **Dual recording backends**: OBS WebSocket (desktop) or headless frame capture (Docker/Xvfb)
|
|
57
|
+
- **Multi-engine TTS narration**: edge-tts (free), ElevenLabs (premium), F5-TTS (voice cloning)
|
|
58
|
+
- **Timeline-synchronized sequences**: Narration cues paired with UI actions
|
|
59
|
+
- **Mermaid diagram slides**: HTML + PNG generation with dark theme
|
|
60
|
+
- **FFmpeg post-production**: Clip concatenation, narration mixing, timecode-based assembly
|
|
61
|
+
- **Interactive calibration**: Record UI element positions for pixel-perfect automation
|
|
62
|
+
- **Docker support**: Reproducible headless production in CI/CD
|
|
63
|
+
|
|
64
|
+
## Quick Start
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Install
|
|
68
|
+
pip install -e .
|
|
69
|
+
|
|
70
|
+
# Copy and configure
|
|
71
|
+
cp config.template.yaml config.yaml
|
|
72
|
+
|
|
73
|
+
# Calibrate UI positions (interactive)
|
|
74
|
+
video-automation --calibrate --config config.yaml
|
|
75
|
+
|
|
76
|
+
# Generate narration
|
|
77
|
+
video-automation --narration --narrations-file narrations.yaml
|
|
78
|
+
|
|
79
|
+
# Generate diagrams
|
|
80
|
+
video-automation --diagrams --diagrams-module my_project.diagrams.mermaid_definitions
|
|
81
|
+
|
|
82
|
+
# Record all sequences
|
|
83
|
+
video-automation --all --sequences-package my_project.sequences --config config.yaml
|
|
84
|
+
|
|
85
|
+
# Or headless (Docker)
|
|
86
|
+
docker compose run --rm video --all --sequences-package my_project.sequences
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Architecture
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
narractive/
|
|
93
|
+
├── video_automation/ # Framework (pip-installable)
|
|
94
|
+
│ ├── core/ # Generic modules
|
|
95
|
+
│ │ ├── app_automator.py # PyAutoGUI + window control
|
|
96
|
+
│ │ ├── obs_controller.py # OBS WebSocket 5.x
|
|
97
|
+
│ │ ├── frame_capturer.py # Headless Xvfb capture
|
|
98
|
+
│ │ ├── narrator.py # TTS (edge-tts/ElevenLabs/F5-TTS)
|
|
99
|
+
│ │ ├── timeline.py # Narration-synchronized cues
|
|
100
|
+
│ │ ├── diagram_generator.py # Mermaid → HTML/PNG
|
|
101
|
+
│ │ └── video_assembler.py # FFmpeg post-production
|
|
102
|
+
│ ├── sequences/
|
|
103
|
+
│ │ └── base.py # VideoSequence + TimelineSequence
|
|
104
|
+
│ ├── scripts/
|
|
105
|
+
│ │ ├── calibrate.py # Interactive UI calibration
|
|
106
|
+
│ │ └── setup_obs.py # OBS auto-configuration
|
|
107
|
+
│ └── cli.py # Click-based CLI
|
|
108
|
+
│
|
|
109
|
+
├── examples/
|
|
110
|
+
│ └── filtermate/ # Example project (QGIS plugin demo)
|
|
111
|
+
│ ├── sequences/ # 11 original + 7 v01 sequences
|
|
112
|
+
│ ├── diagrams/ # 20 Mermaid diagram definitions
|
|
113
|
+
│ ├── narrations.yaml # French narration scripts
|
|
114
|
+
│ └── config.yaml # Calibrated UI positions
|
|
115
|
+
│
|
|
116
|
+
├── config.template.yaml # Configuration template
|
|
117
|
+
├── Dockerfile # Headless Docker image
|
|
118
|
+
├── docker-compose.yml
|
|
119
|
+
└── pyproject.toml
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Creating Sequences for Your App
|
|
123
|
+
|
|
124
|
+
### 1. Simple sequence (manual timing)
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from video_automation.sequences.base import VideoSequence
|
|
128
|
+
|
|
129
|
+
class MyIntro(VideoSequence):
|
|
130
|
+
name = "Introduction"
|
|
131
|
+
sequence_id = "seq00"
|
|
132
|
+
duration_estimate = 30.0
|
|
133
|
+
obs_scene = "Main"
|
|
134
|
+
|
|
135
|
+
def execute(self, obs, app, config):
|
|
136
|
+
app.focus_app()
|
|
137
|
+
app.click_at("my_button")
|
|
138
|
+
app.wait(2.0)
|
|
139
|
+
app.scroll_down(3)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 2. Timeline sequence (narration-synchronized)
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from video_automation.sequences.base import TimelineSequence
|
|
146
|
+
from video_automation.core.timeline import NarrationCue
|
|
147
|
+
|
|
148
|
+
class MyDemo(TimelineSequence):
|
|
149
|
+
name = "Live Demo"
|
|
150
|
+
sequence_id = "seq01"
|
|
151
|
+
duration_estimate = 60.0
|
|
152
|
+
|
|
153
|
+
def build_timeline(self, obs, app, config):
|
|
154
|
+
return [
|
|
155
|
+
NarrationCue(
|
|
156
|
+
text="Welcome to the demo.",
|
|
157
|
+
actions=lambda: app.wait(1.0),
|
|
158
|
+
sync="during",
|
|
159
|
+
),
|
|
160
|
+
NarrationCue(
|
|
161
|
+
text="Let's open the settings.",
|
|
162
|
+
actions=lambda: app.click_at("settings_button"),
|
|
163
|
+
sync="after",
|
|
164
|
+
),
|
|
165
|
+
]
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### 3. Register sequences
|
|
169
|
+
|
|
170
|
+
Create `my_project/sequences/__init__.py`:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from video_automation.sequences.base import VideoSequence
|
|
174
|
+
# Import your sequence modules here to register them
|
|
175
|
+
from my_project.sequences.seq00_intro import MyIntro
|
|
176
|
+
from my_project.sequences.seq01_demo import MyDemo
|
|
177
|
+
|
|
178
|
+
SEQUENCES = [MyIntro, MyDemo]
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Then run:
|
|
182
|
+
```bash
|
|
183
|
+
video-automation --list --sequences-package my_project.sequences
|
|
184
|
+
video-automation --all --sequences-package my_project.sequences
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Configuration
|
|
188
|
+
|
|
189
|
+
See `config.template.yaml` for all available options. Key sections:
|
|
190
|
+
|
|
191
|
+
| Section | Purpose |
|
|
192
|
+
|---------|---------|
|
|
193
|
+
| `obs` | OBS WebSocket connection, scenes, output directory |
|
|
194
|
+
| `app` | Window title, panel name, calibrated UI positions |
|
|
195
|
+
| `timing` | Click/type/scroll delays, transition pauses |
|
|
196
|
+
| `diagrams` | Mermaid rendering (resolution, theme, colors) |
|
|
197
|
+
| `narration` | TTS engine, voice, speed, F5-TTS options |
|
|
198
|
+
| `capture` | Headless frame capture (FPS, resolution, display) |
|
|
199
|
+
| `output` | Final video encoding (resolution, fps, codec) |
|
|
200
|
+
|
|
201
|
+
## TTS Engines
|
|
202
|
+
|
|
203
|
+
| Engine | Cost | Quality | Setup |
|
|
204
|
+
|--------|------|---------|-------|
|
|
205
|
+
| edge-tts | Free | Good | Installed by default (core dependency) |
|
|
206
|
+
| ElevenLabs | Paid | Excellent | `pip install "narractive[elevenlabs]"` + API key |
|
|
207
|
+
| F5-TTS | Free | Excellent | Conda env + GPU recommended |
|
|
208
|
+
|
|
209
|
+
## Requirements
|
|
210
|
+
|
|
211
|
+
- Python 3.10+
|
|
212
|
+
- FFmpeg (for video assembly)
|
|
213
|
+
- OBS Studio (desktop mode) or Docker (headless mode)
|
|
214
|
+
- Your target application installed and running
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
MIT
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
narractive-2.0.0.dist-info/licenses/LICENSE,sha256=tAmFdl5fPmXgRYL0Ini_u5tmNFzYTaG0sz4qvVtY-Ac,1072
|
|
2
|
+
video_automation/__init__.py,sha256=B3X06kMUwnJKZLzWOlUMbbqgJfHVwrLe9twYDPj7VAU,868
|
|
3
|
+
video_automation/__main__.py,sha256=SBDIyIOM65-ulv7-Qd8upXmKM-NBszG2Ph3FPO9ioe8,97
|
|
4
|
+
video_automation/cli.py,sha256=X29d1B1YUP3NGA1GerA-Kd493uEFDFN7BGOK4CAhEL0,21337
|
|
5
|
+
video_automation/bridges/__init__.py,sha256=TViMqxCAOIgg5lrSIW0-xRsD7ZXwUxWPLOkj0SYbGCE,78
|
|
6
|
+
video_automation/bridges/f5_tts_bridge.py,sha256=_P5cvbcLVRj7PajF932THyjnwfxcC1iKtOSFsH36Wnw,5773
|
|
7
|
+
video_automation/core/__init__.py,sha256=7kqaoS-BatVNqvhXPDw7k44Z3K9KW6jkfWT9XqlCZvA,179
|
|
8
|
+
video_automation/core/app_automator.py,sha256=pvC_GO0nHrBZBd__bCMYTuwoN8BhsuHIwkFECfshCug,19568
|
|
9
|
+
video_automation/core/diagram_generator.py,sha256=AswQTeoXNzPnFASLxnkZQEh56rNfIMn0kgvI0IrgYzc,11657
|
|
10
|
+
video_automation/core/frame_capturer.py,sha256=MKLCepsmIO4cYumUIMYJF2mFxWYR0G19jn_lwL9Hm9M,18521
|
|
11
|
+
video_automation/core/narrator.py,sha256=tAX4okIbQpJ8AugD_1gDxtZjq6d4F4cSz7DLn3CgqFo,11391
|
|
12
|
+
video_automation/core/obs_controller.py,sha256=KiNr7tSQd0swvOItaLAd1mbLGJTtAsqGuC_AEPoK-Zk,11451
|
|
13
|
+
video_automation/core/qgis_automator.py,sha256=bbmaNBXt9wLPorSB007SogqIgaH2RXSdUVlZYcEx6sY,254
|
|
14
|
+
video_automation/core/timeline.py,sha256=zSpOl16t6A9wX0Wjif-uT-SHGqv3ZKDdtzZrsYoWHeU,12373
|
|
15
|
+
video_automation/core/video_assembler.py,sha256=p_T4WoXhMXKK9uo8hl4j-7-8NzB0sxq9fr8tYEN83xQ,21069
|
|
16
|
+
video_automation/diagrams/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
+
video_automation/diagrams/template.html,sha256=PDt4JZHFXzgTcKosh6-vy0p9zpv9WUtFArnudd4wxFY,6886
|
|
18
|
+
video_automation/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
video_automation/scripts/_recal_sidebar.py,sha256=4wzolHK1PgXrS_CQQIHB-RGDFa-oNrfwKL1_T1BQ7JI,1033
|
|
20
|
+
video_automation/scripts/calibrate.py,sha256=xls8ERZUcLIu8UmrGzQXIYoduwEt7zkWSaLuPERDNuI,55747
|
|
21
|
+
video_automation/scripts/setup_obs.py,sha256=QnPeF3VEQThZ_VStdxeyA85aLWgFf4Zx2e_nR6OMCvw,7825
|
|
22
|
+
video_automation/sequences/__init__.py,sha256=CC_7YHzsjYrMpT_HbJRQ0gPhW-ieL6Od6a3v2j2f1tE,487
|
|
23
|
+
video_automation/sequences/base.py,sha256=TC5x5tPWioH6FDF_sgmqLRKuvp2FMHmpA6xw60Wzmu8,8991
|
|
24
|
+
narractive-2.0.0.dist-info/METADATA,sha256=vpl3e6HBhccSUuv78qLnXIvPzcFScF2unkHvnhP4s_0,7603
|
|
25
|
+
narractive-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
26
|
+
narractive-2.0.0.dist-info/entry_points.txt,sha256=LpW9nHnQw7OoExd5Z7x6H2mhx_EXSmvBnL7F4TR5VmM,102
|
|
27
|
+
narractive-2.0.0.dist-info/top_level.txt,sha256=2kpQ9LGATNN5HoEsM7uZa00vCryKB4kgVbokaq3FIsA,17
|
|
28
|
+
narractive-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Simon Ducorneau
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
video_automation
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Video Automation — Desktop Application Video Production Framework
|
|
3
|
+
================================================================
|
|
4
|
+
A modular, reusable framework for automating demo videos of any
|
|
5
|
+
desktop application.
|
|
6
|
+
|
|
7
|
+
Core modules:
|
|
8
|
+
- app_automator: PyAutoGUI-based UI control
|
|
9
|
+
- obs_controller: OBS WebSocket 5.x recording
|
|
10
|
+
- frame_capturer: Headless Xvfb frame capture (Docker)
|
|
11
|
+
- narrator: Multi-engine TTS (edge-tts, ElevenLabs, F5-TTS)
|
|
12
|
+
- timeline: Narration-synchronized cue execution
|
|
13
|
+
- diagram_generator: Mermaid → HTML/PNG slides
|
|
14
|
+
- video_assembler: FFmpeg post-production pipeline
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
18
|
+
|
|
19
|
+
# Resolve the installed distribution version. The package has been published
# under two distribution names, so try each in order of preference and keep
# the hard-coded value only when neither is installed (e.g. a source checkout).
__version__ = "2.0.0"
for _dist in ("narractive", "video-automation"):
    try:
        __version__ = version(_dist)
        break
    except PackageNotFoundError:
        # Not installed under this name; try the next candidate.
        pass
del _dist
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Bridge scripts for external TTS engines (run in separate environments)."""
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""
|
|
2
|
+
F5-TTS bridge script — runs inside the f5-tts conda env.
|
|
3
|
+
|
|
4
|
+
Called as:
|
|
5
|
+
python f5_tts_bridge.py --ref_audio REF.wav --ref_text "..." \
|
|
6
|
+
--gen_text "..." --output_file OUT.wav [--speed 1.0] [--model F5TTS_v1_Base] \
|
|
7
|
+
[--remove_silence]
|
|
8
|
+
|
|
9
|
+
This avoids importing the full infer_cli module (which has heavy
|
|
10
|
+
module-level imports that can crash on some Windows setups) and uses
|
|
11
|
+
the Python API directly.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
import types
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Pre-import mocks for modules that are NOT needed for F5-TTS inference
|
|
20
|
+
# but whose C-extensions crash on some Windows 11 setups (DLL policy / GIL).
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
def _make_mock(name, attrs=()):
|
|
24
|
+
"""Create a stub module with no-op callables and a valid __spec__."""
|
|
25
|
+
mod = types.ModuleType(name)
|
|
26
|
+
# torch._dynamo.trace_rules checks __spec__ via importlib.util.find_spec
|
|
27
|
+
from importlib.machinery import ModuleSpec
|
|
28
|
+
mod.__spec__ = ModuleSpec(name, None)
|
|
29
|
+
for a in attrs:
|
|
30
|
+
setattr(mod, a, lambda *_a, **_kw: None)
|
|
31
|
+
return mod
|
|
32
|
+
|
|
33
|
+
# matplotlib — used only for spectrogram visualization, so replace it (and
# the two plotting submodules) with stubs whose plotting calls do nothing.
_noop_attrs = [
    "figure", "show", "savefig", "plot", "subplot", "close",
    "clf", "title", "xlabel", "ylabel", "use",
]
_mpl = _make_mock("matplotlib", _noop_attrs)
_mpl.__path__ = []  # make it look like a package
sys.modules.update({
    "matplotlib": _mpl,
    "matplotlib.pylab": _make_mock("matplotlib.pylab", _noop_attrs),
    "matplotlib.pyplot": _make_mock("matplotlib.pyplot", _noop_attrs),
})

# sklearn — pulled in by transformers but not used for TTS inference.
# On Windows, scipy/sklearn compiled DLLs can be incompatible with numpy
# versions. Only the sklearn modules listed below are stubbed here.
_SKLEARN_STUBS = (
    "sklearn",
    "sklearn.base",
    "sklearn.utils",
    "sklearn.utils._chunking",
    "sklearn.utils._param_validation",
    "sklearn.utils.validation",
    "sklearn.utils._array_api",
    "sklearn.utils.fixes",
    "sklearn.utils._metadata_requests",
    "sklearn.utils._estimator_html_repr",
    "sklearn.metrics",
    "sklearn.metrics._ranking",
)
for _mod_name in _SKLEARN_STUBS:
    sys.modules[_mod_name] = _make_mock(_mod_name)

# Also mock sklearn.metrics.roc_curve since transformers imports it directly;
# the stub returns three empty lists.
_mock_metrics = sys.modules["sklearn.metrics"]
_mock_metrics.roc_curve = lambda *a, **kw: ([], [], [])
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
# Patch torchaudio to use soundfile backend instead of torchcodec
|
|
60
|
+
# (torchcodec requires ffmpeg DLLs which are problematic on Windows)
|
|
61
|
+
import torchaudio
|
|
62
|
+
import soundfile as sf
|
|
63
|
+
import torch
|
|
64
|
+
|
|
65
|
+
def _load_soundfile(filepath, **kwargs):
    """Load audio with soundfile as a drop-in replacement for torchaudio.load.

    Args:
        filepath: Path (or file-like object) accepted by ``soundfile.read``.
        **kwargs: Keyword arguments named after ``torchaudio.load`` parameters.
            ``frame_offset`` (default 0), ``num_frames`` (default -1, meaning
            read to the end) and ``channels_first`` (default True) are
            honoured; any other keyword is accepted and ignored.

    Returns:
        A ``(tensor, samplerate)`` tuple. The tensor is float32 and shaped
        ``(channels, samples)`` when ``channels_first`` is true, otherwise
        ``(samples, channels)``.
    """
    frame_offset = kwargs.get("frame_offset", 0)
    num_frames = kwargs.get("num_frames", -1)
    channels_first = kwargs.get("channels_first", True)

    # always_2d=True yields (samples, channels) even for mono input, so no
    # special case is needed for one-dimensional data.
    data, samplerate = sf.read(
        filepath,
        frames=num_frames,
        start=frame_offset,
        dtype="float32",
        always_2d=True,
    )
    tensor = torch.from_numpy(data)
    if channels_first:
        # soundfile returns (samples, channels), torch wants (channels,
        # samples); make the transposed view contiguous so downstream
        # .view() calls do not fail.
        tensor = tensor.T.contiguous()
    return tensor, samplerate
|
|
75
|
+
|
|
76
|
+
torchaudio.load = _load_soundfile
|
|
77
|
+
|
|
78
|
+
import argparse
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _read_text_override(inline_text, file_path, option_name):
    """Return the stripped contents of *file_path*, or *inline_text* as fallback.

    The file wins whenever it exists. If a path was given but does not exist,
    a warning naming *option_name* is written to stderr and *inline_text* is
    returned, so a mistyped path is no longer ignored silently.
    """
    if not file_path:
        return inline_text
    if not os.path.exists(file_path):
        print(
            f"Warning: {option_name} file not found, using inline text instead: {file_path}",
            file=sys.stderr,
        )
        return inline_text
    with open(file_path, encoding="utf-8") as f:
        return f.read().strip()


def main():
    """Parse the command line, synthesize speech with F5-TTS and report the result.

    Text to synthesize and the reference transcription can be passed inline
    or through UTF-8 text files (the files take precedence when they exist).
    The process exits with status 1 when no text to synthesize is available
    or when the output file does not exist after inference.
    """
    parser = argparse.ArgumentParser(description="F5-TTS bridge for narration generation")
    parser.add_argument("--ref_audio", required=True, help="Path to reference audio WAV")
    parser.add_argument("--ref_text", default="", help="Transcription of reference audio")
    parser.add_argument("--gen_text", default="", help="Text to synthesize")
    parser.add_argument("--gen_text_file", default="", help="UTF-8 text file with text to synthesize (overrides --gen_text)")
    parser.add_argument("--ref_text_file", default="", help="UTF-8 text file with ref transcription (overrides --ref_text)")
    parser.add_argument("--output_file", required=True, help="Output WAV path")
    parser.add_argument("--speed", type=float, default=1.0, help="Speech speed multiplier")
    parser.add_argument("--model", default="F5TTS_v1_Base", help="Model type")
    parser.add_argument("--remove_silence", action="store_true", help="Remove silence")
    args = parser.parse_args()

    # Read text from files if provided (avoids Windows CLI encoding issues)
    gen_text = _read_text_override(args.gen_text, args.gen_text_file, "--gen_text_file")
    ref_text = _read_text_override(args.ref_text, args.ref_text_file, "--ref_text_file")

    if not gen_text:
        print("ERROR: no gen_text provided (use --gen_text or --gen_text_file)", file=sys.stderr)
        sys.exit(1)

    # Make sure the destination directory exists before the (slow) model load,
    # so a missing folder does not waste an inference run.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Imported late: this is the heavy import, and the cheap argument checks
    # above should be able to fail first.
    from f5_tts.api import F5TTS

    print(f"Loading F5-TTS model: {args.model}", flush=True)
    tts = F5TTS(model=args.model)

    print(f"Generating: {os.path.basename(args.output_file)}", flush=True)
    # The return value is not needed; file_wave is passed so the audio is
    # written to the requested output path.
    tts.infer(
        ref_file=args.ref_audio,
        ref_text=ref_text,
        gen_text=gen_text,
        file_wave=args.output_file,
        speed=args.speed,
    )

    if args.remove_silence and os.path.exists(args.output_file):
        # Best effort: a failure here leaves the untrimmed audio in place.
        try:
            from f5_tts.infer.utils_infer import remove_silence_for_generated_wav
            remove_silence_for_generated_wav(args.output_file)
            print("Silence removed.", flush=True)
        except Exception as e:
            print(f"Warning: could not remove silence: {e}", file=sys.stderr)

    if os.path.exists(args.output_file):
        size = os.path.getsize(args.output_file)
        print(f"OK: {args.output_file} ({size} bytes)", flush=True)
    else:
        print("ERROR: output file not created", file=sys.stderr)
        sys.exit(1)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Run the bridge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|