scholium 2026.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scholium-2026.1/LICENSE +21 -0
- scholium-2026.1/PKG-INFO +703 -0
- scholium-2026.1/README.md +630 -0
- scholium-2026.1/pyproject.toml +169 -0
- scholium-2026.1/setup.cfg +4 -0
- scholium-2026.1/src/scholium/__init__.py +44 -0
- scholium-2026.1/src/scholium/config.py +127 -0
- scholium-2026.1/src/scholium/main.py +1165 -0
- scholium-2026.1/src/scholium/slide_processor.py +128 -0
- scholium-2026.1/src/scholium/tts_engine.py +349 -0
- scholium-2026.1/src/scholium/unified_parser.py +585 -0
- scholium-2026.1/src/scholium/video_generator.py +294 -0
- scholium-2026.1/src/scholium/voice_manager.py +193 -0
- scholium-2026.1/src/scholium.egg-info/PKG-INFO +703 -0
- scholium-2026.1/src/scholium.egg-info/SOURCES.txt +31 -0
- scholium-2026.1/src/scholium.egg-info/dependency_links.txt +1 -0
- scholium-2026.1/src/scholium.egg-info/entry_points.txt +3 -0
- scholium-2026.1/src/scholium.egg-info/requires.txt +63 -0
- scholium-2026.1/src/scholium.egg-info/top_level.txt +2 -0
- scholium-2026.1/src/tts_providers/__init__.py +119 -0
- scholium-2026.1/src/tts_providers/bark.py +117 -0
- scholium-2026.1/src/tts_providers/base.py +61 -0
- scholium-2026.1/src/tts_providers/coqui.py +277 -0
- scholium-2026.1/src/tts_providers/el.py +161 -0
- scholium-2026.1/src/tts_providers/f5tts.py +162 -0
- scholium-2026.1/src/tts_providers/openai.py +112 -0
- scholium-2026.1/src/tts_providers/piper.py +187 -0
- scholium-2026.1/src/tts_providers/styletts2.py +206 -0
- scholium-2026.1/src/tts_providers/tortoise.py +204 -0
- scholium-2026.1/tests/test_core.py +623 -0
- scholium-2026.1/tests/test_integration.py +254 -0
- scholium-2026.1/tests/test_providers.py +599 -0
- scholium-2026.1/tests/test_tts_engine.py +233 -0
scholium-2026.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Scholium Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scholium-2026.1/PKG-INFO
ADDED
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scholium
|
|
3
|
+
Version: 2026.1
|
|
4
|
+
Summary: Generate educational videos from markdown slides with AI voice synthesis
|
|
5
|
+
Author-email: Colin Caprani <colin.caprani@monash.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ccaprani/scholium
|
|
8
|
+
Project-URL: Issues, https://github.com/ccaprani/scholium/issues
|
|
9
|
+
Keywords: tts,education,video,slides,lecture,pandoc,beamer
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Education
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Requires-Dist: click>=8.0
|
|
21
|
+
Requires-Dist: pdf2image>=1.16.0
|
|
22
|
+
Requires-Dist: Pillow>=10.0.0
|
|
23
|
+
Requires-Dist: ffmpeg-python>=0.2.0
|
|
24
|
+
Requires-Dist: pydub>=0.25.0
|
|
25
|
+
Requires-Dist: tqdm>=4.65.0
|
|
26
|
+
Provides-Extra: piper
|
|
27
|
+
Requires-Dist: piper-tts>=1.2.0; extra == "piper"
|
|
28
|
+
Requires-Dist: pathvalidate>=3.0.0; extra == "piper"
|
|
29
|
+
Provides-Extra: elevenlabs
|
|
30
|
+
Requires-Dist: elevenlabs<3.0.0,>=1.0.0; extra == "elevenlabs"
|
|
31
|
+
Provides-Extra: coqui
|
|
32
|
+
Requires-Dist: TTS>=0.22.0; extra == "coqui"
|
|
33
|
+
Requires-Dist: transformers; extra == "coqui"
|
|
34
|
+
Requires-Dist: torch>=2.0.0; extra == "coqui"
|
|
35
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "coqui"
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
38
|
+
Provides-Extra: bark
|
|
39
|
+
Requires-Dist: bark>=0.1.0; extra == "bark"
|
|
40
|
+
Requires-Dist: transformers>=4.30.0; extra == "bark"
|
|
41
|
+
Requires-Dist: torch>=2.0.0; extra == "bark"
|
|
42
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "bark"
|
|
43
|
+
Provides-Extra: f5tts
|
|
44
|
+
Requires-Dist: f5-tts>=0.3.0; extra == "f5tts"
|
|
45
|
+
Requires-Dist: torch>=2.0.0; extra == "f5tts"
|
|
46
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "f5tts"
|
|
47
|
+
Requires-Dist: soundfile>=0.12.0; extra == "f5tts"
|
|
48
|
+
Provides-Extra: styletts2
|
|
49
|
+
Requires-Dist: styletts2>=0.1.0; extra == "styletts2"
|
|
50
|
+
Requires-Dist: torch>=2.0.0; extra == "styletts2"
|
|
51
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "styletts2"
|
|
52
|
+
Requires-Dist: soundfile>=0.12.0; extra == "styletts2"
|
|
53
|
+
Provides-Extra: tortoise
|
|
54
|
+
Requires-Dist: tortoise-tts>=2.1.0; extra == "tortoise"
|
|
55
|
+
Requires-Dist: torch>=2.0.0; extra == "tortoise"
|
|
56
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "tortoise"
|
|
57
|
+
Provides-Extra: all
|
|
58
|
+
Requires-Dist: piper-tts>=1.2.0; extra == "all"
|
|
59
|
+
Requires-Dist: pathvalidate>=3.0.0; extra == "all"
|
|
60
|
+
Requires-Dist: elevenlabs<3.0.0,>=1.0.0; extra == "all"
|
|
61
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
62
|
+
Requires-Dist: f5-tts>=0.3.0; extra == "all"
|
|
63
|
+
Requires-Dist: torch>=2.0.0; extra == "all"
|
|
64
|
+
Requires-Dist: torchaudio>=2.0.0; extra == "all"
|
|
65
|
+
Requires-Dist: soundfile>=0.12.0; extra == "all"
|
|
66
|
+
Provides-Extra: dev
|
|
67
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
68
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
69
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
70
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
71
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
72
|
+
Dynamic: license-file
|
|
73
|
+
|
|
74
|
+
<p align="center">
|
|
75
|
+
<picture>
|
|
76
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/brand/logo-horizontal-dark-navbar.svg">
|
|
77
|
+
<source media="(prefers-color-scheme: light)" srcset="docs/brand/logo-horizontal.svg">
|
|
78
|
+
<img alt="Scholium" src="docs/brand/logo-horizontal.svg" width="65%">
|
|
79
|
+
</picture>
|
|
80
|
+
</p>
|
|
81
|
+
|
|
82
|
+
<p align="center">
|
|
83
|
+
<a href="https://www.python.org/downloads/"><img alt="Python 3.11+" src="https://img.shields.io/badge/python-3.11+-blue.svg"></a>
|
|
84
|
+
<a href="https://opensource.org/licenses/MIT"><img alt="License: MIT" src="https://img.shields.io/badge/License-MIT-yellow.svg"></a>
|
|
85
|
+
<a href="https://ccaprani.github.io/scholium"><img alt="Docs" src="https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg"></a>
|
|
86
|
+
</p>
|
|
87
|
+
|
|
88
|
+
**Automated instructional video generation from markdown.**
|
|
89
|
+
|
|
90
|
+
> *Scholium* (Greek: σχόλιον) — An explanatory note or commentary. Your digital scholium for the modern classroom.
|
|
91
|
+
|
|
92
|
+
Convert markdown slides with embedded narration into professional videos. Perfect for flipped classroom content, lecture recordings, and maintaining course libraries.
|
|
93
|
+
|
|
94
|
+
<p align="center">
|
|
95
|
+
<img src="https://ccaprani.github.io/scholium/demo.gif"
|
|
96
|
+
alt="Scholium terminal demo — generating a narrated video from markdown"
|
|
97
|
+
width="100%">
|
|
98
|
+
</p>
|
|
99
|
+
|
|
100
|
+
<p align="center">
|
|
101
|
+
<a href="https://ccaprani.github.io/scholium/demo.mp4">▶ Watch the output video</a>
|
|
102
|
+
</p>
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Quick Start
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# 1. Install (requires Python 3.11+, pandoc, ffmpeg)
|
|
110
|
+
pip install scholium[piper]
|
|
111
|
+
|
|
112
|
+
# 2. Create a markdown file with embedded narration
|
|
113
|
+
cat > lecture.md << 'EOF'
|
|
114
|
+
---
|
|
115
|
+
title: "Newton's Laws"
|
|
116
|
+
author: "Physics 101"
|
|
117
|
+
title_notes: |
|
|
118
|
+
Welcome to today's lecture on Newton's Laws of Motion.
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
# What Are Newton's Laws?
|
|
122
|
+
|
|
123
|
+
Three fundamental principles governing motion.
|
|
124
|
+
|
|
125
|
+
::: notes
|
|
126
|
+
Newton's three laws form the foundation of classical mechanics.
|
|
127
|
+
Every object in the universe obeys these rules.
|
|
128
|
+
:::
|
|
129
|
+
|
|
130
|
+
# The First Law
|
|
131
|
+
|
|
132
|
+
An object in motion stays in motion unless acted upon by a force.
|
|
133
|
+
|
|
134
|
+
::: notes
|
|
135
|
+
This is the law of inertia.
|
|
136
|
+
Objects resist any change to their state of motion.
|
|
137
|
+
:::
|
|
138
|
+
EOF
|
|
139
|
+
|
|
140
|
+
# 3. Generate video
|
|
141
|
+
scholium generate lecture.md lecture.mp4
|
|
142
|
+
|
|
143
|
+
# 4. That's it! Your video is ready.
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Key Features
|
|
149
|
+
|
|
150
|
+
- 📝 **Unified Markdown Format**: Slides and narration in one file with `::: notes :::` blocks
|
|
151
|
+
- 🎯 **Pandoc Integration**: Full Beamer support with `slide-level` for section-based lectures
|
|
152
|
+
- 🎤 **Multiple TTS Providers**: Piper (local), ElevenLabs (cloud), Coqui, F5-TTS, StyleTTS2, Tortoise (local voice cloning), OpenAI, Bark
|
|
153
|
+
- ⏱️ **Flexible Timing**: Control pauses, slide duration, and pacing with simple directives
|
|
154
|
+
- 🔧 **Production Ready**: Batch processing, validation, verbose output
|
|
155
|
+
- 🎨 **Professional Output**: 1080p video with synchronized audio and slides
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Installation
|
|
160
|
+
|
|
161
|
+
### System Requirements
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Ubuntu/Debian
|
|
165
|
+
sudo apt-get install pandoc texlive-latex-base texlive-latex-extra ffmpeg
|
|
166
|
+
|
|
167
|
+
# macOS
|
|
168
|
+
brew install pandoc mactex ffmpeg
|
|
169
|
+
|
|
170
|
+
# Windows
|
|
171
|
+
choco install pandoc miktex ffmpeg
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Install Scholium
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Recommended: Piper (fast, local, no API key needed)
|
|
178
|
+
pip install scholium[piper]
|
|
179
|
+
|
|
180
|
+
# Or other providers:
|
|
181
|
+
pip install scholium[elevenlabs] # High quality cloud API
|
|
182
|
+
pip install scholium[coqui] # Local voice cloning
|
|
183
|
+
pip install scholium[openai] # OpenAI TTS
|
|
184
|
+
pip install scholium[bark] # Highest quality, slowest
|
|
185
|
+
pip install scholium[f5tts] # Fast local voice cloning (zero-shot)
|
|
186
|
+
pip install scholium[styletts2] # Expressive local voice cloning
|
|
187
|
+
pip install scholium[tortoise] # Very high quality local voice cloning
|
|
188
|
+
|
|
189
|
+
# All providers:
|
|
190
|
+
pip install scholium[all]
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Usage
|
|
196
|
+
|
|
197
|
+
### Basic Command
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
scholium generate slides.md output.mp4 [options]
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Common Options
|
|
204
|
+
|
|
205
|
+
- `--voice NAME`: Voice ID to use (e.g., `en_US-lessac-medium` for Piper, an ElevenLabs voice ID, or a registered local voice name)
|
|
206
|
+
- `--provider NAME`: TTS provider (`piper`, `elevenlabs`, `coqui`, `openai`, `bark`, `f5tts`, `styletts2`, `tortoise`)
|
|
207
|
+
- `--section-duration SECONDS`: Duration for silent section/TOC slides (default: 3.0)
|
|
208
|
+
- `--verbose`: Show detailed progress
|
|
209
|
+
- `--keep-temp`: Keep temporary files for debugging
|
|
210
|
+
|
|
211
|
+
### Example
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
# With Piper (local)
|
|
215
|
+
scholium generate lecture.md lecture.mp4 \
|
|
216
|
+
--provider piper \
|
|
217
|
+
--voice en_US-lessac-medium \
|
|
218
|
+
--section-duration 2.0 \
|
|
219
|
+
--verbose
|
|
220
|
+
|
|
221
|
+
# With ElevenLabs (cloud)
|
|
222
|
+
export ELEVENLABS_API_KEY="your_key"
|
|
223
|
+
scholium generate lecture.md lecture.mp4 \
|
|
224
|
+
--provider elevenlabs \
|
|
225
|
+
--voice Xb7hH8MSUJpSbSDYk0k2 # Alice - Clear, Engaging Educator
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Markdown Format
|
|
231
|
+
|
|
232
|
+
### Structure
|
|
233
|
+
|
|
234
|
+
Scholium uses standard Pandoc markdown with embedded `::: notes :::` blocks for narration:
|
|
235
|
+
|
|
236
|
+
```markdown
|
|
237
|
+
---
|
|
238
|
+
title: "My Lecture"
|
|
239
|
+
author: "Your Name"
|
|
240
|
+
slide-level: 2 # Use ## for slides, # for sections
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
# Section Title
|
|
244
|
+
|
|
245
|
+
<!-- This creates a table-of-contents slide (no narration needed) -->
|
|
246
|
+
|
|
247
|
+
## First Slide
|
|
248
|
+
|
|
249
|
+
Your slide content here.
|
|
250
|
+
|
|
251
|
+
::: notes
|
|
252
|
+
This narration will be spoken over the slide.
|
|
253
|
+
You can use multiple paragraphs.
|
|
254
|
+
:::
|
|
255
|
+
|
|
256
|
+
## Second Slide
|
|
257
|
+
|
|
258
|
+
More content.
|
|
259
|
+
|
|
260
|
+
::: notes
|
|
261
|
+
:: Reference: See textbook page 47
|
|
262
|
+
:: Author note: Double-check this calculation
|
|
263
|
+
|
|
264
|
+
This narration will be spoken.
|
|
265
|
+
Lines starting with :: are metadata - not narrated.
|
|
266
|
+
<!-- HTML comments are also ignored -->
|
|
267
|
+
|
|
268
|
+
More spoken narration here.
|
|
269
|
+
:::
|
|
270
|
+
|
|
271
|
+
# Another Section
|
|
272
|
+
|
|
273
|
+
## Third Slide
|
|
274
|
+
|
|
275
|
+
Content continues.
|
|
276
|
+
|
|
277
|
+
::: notes
|
|
278
|
+
And so does the narration.
|
|
279
|
+
:::
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
**Notes blocks can contain:**
|
|
283
|
+
- **Narration text**: Regular text is converted to speech
|
|
284
|
+
- **Metadata** (`::` prefix): Author notes, references, reminders - not narrated
|
|
285
|
+
- **HTML comments** (`<!-- -->`): Also ignored during narration
|
|
286
|
+
- **Timing directives**: `[MIN 10s]`, `[PRE 2s]`, etc. - control timing, not spoken
|
|
287
|
+
|
|
288
|
+
### Slide Levels (Pandoc Integration)
|
|
289
|
+
|
|
290
|
+
Set `slide-level` in the YAML frontmatter to control slide structure:
|
|
291
|
+
|
|
292
|
+
**`slide-level: 1` (default)**: Each `#` heading creates a slide
|
|
293
|
+
```markdown
|
|
294
|
+
---
|
|
295
|
+
slide-level: 1
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
# Slide One
|
|
299
|
+
Content
|
|
300
|
+
|
|
301
|
+
::: notes
|
|
302
|
+
Narration
|
|
303
|
+
:::
|
|
304
|
+
|
|
305
|
+
# Slide Two
|
|
306
|
+
Content
|
|
307
|
+
|
|
308
|
+
## Just a subheading within Slide Two
|
|
309
|
+
|
|
310
|
+
::: notes
|
|
311
|
+
More narration
|
|
312
|
+
:::
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
**`slide-level: 2` (for section-based lectures)**: `#` creates sections with TOC slides, `##` creates content slides
|
|
316
|
+
```markdown
|
|
317
|
+
---
|
|
318
|
+
slide-level: 2
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
# Section Title
|
|
322
|
+
<!-- Auto-generates TOC slide, no narration needed -->
|
|
323
|
+
|
|
324
|
+
## Actual Slide One
|
|
325
|
+
Content
|
|
326
|
+
|
|
327
|
+
::: notes
|
|
328
|
+
Narration for slide one
|
|
329
|
+
:::
|
|
330
|
+
|
|
331
|
+
## Actual Slide Two
|
|
332
|
+
Content
|
|
333
|
+
|
|
334
|
+
::: notes
|
|
335
|
+
Narration for slide two
|
|
336
|
+
:::
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Timing Control
|
|
340
|
+
|
|
341
|
+
Add timing directives inside `::: notes :::` blocks:
|
|
342
|
+
|
|
343
|
+
```markdown
|
|
344
|
+
## Complex Diagram
|
|
345
|
+
|
|
346
|
+
[Large diagram image]
|
|
347
|
+
|
|
348
|
+
::: notes
|
|
349
|
+
:: Reference: Figure adapted from Smith et al. (2023)
|
|
350
|
+
:: TODO: Update with latest data next semester
|
|
351
|
+
|
|
352
|
+
[MIN 15s] [PRE 2s] [POST 3s]
|
|
353
|
+
|
|
354
|
+
Take a moment to examine this diagram.
|
|
355
|
+
[PAUSE 2s]
|
|
356
|
+
Notice the three main components...
|
|
357
|
+
:::
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
**Available directives:**
|
|
361
|
+
- `[MIN 10s]` - Minimum slide duration (even if narration is shorter)
|
|
362
|
+
- `[PRE 2s]` - Pause 2 seconds before speaking
|
|
363
|
+
- `[POST 3s]` - Pause 3 seconds after speaking
|
|
364
|
+
- `[PAUSE 2s]` - 2-second mid-narration pause
|
|
365
|
+
- `[DUR 5s]` - Fixed duration (overrides everything)
|
|
366
|
+
|
|
367
|
+
**Metadata in notes** (prefixed with `::`):
|
|
368
|
+
- Not converted to speech
|
|
369
|
+
- Useful for references, author notes, TODOs
|
|
370
|
+
- Helps maintain context when editing lectures
|
|
371
|
+
|
|
372
|
+
### Incremental Bullets
|
|
373
|
+
|
|
374
|
+
Use `>-` for incremental bullet reveals (Pandoc/Beamer syntax):
|
|
375
|
+
|
|
376
|
+
```markdown
|
|
377
|
+
## Key Points
|
|
378
|
+
|
|
379
|
+
>- First point appears
|
|
380
|
+
>- Then second point
|
|
381
|
+
>- Finally third point
|
|
382
|
+
|
|
383
|
+
::: notes
|
|
384
|
+
Let's look at three key points.
|
|
385
|
+
|
|
386
|
+
First, we have the foundation concept.
|
|
387
|
+
|
|
388
|
+
Second, the application of that concept.
|
|
389
|
+
|
|
390
|
+
And third, the implications for our work.
|
|
391
|
+
:::
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
Each bullet creates a new slide page. Split your narration into paragraphs (separated by blank lines) to match.
|
|
395
|
+
|
|
396
|
+
---
|
|
397
|
+
|
|
398
|
+
## TTS Providers
|
|
399
|
+
|
|
400
|
+
| Provider | Type | Quality | Speed | Voice Cloning | API Key | Cost | `[all]` |
|
|
401
|
+
|----------|------|---------|-------|---------------|---------|------|---------|
|
|
402
|
+
| **Piper** | Local | ⭐⭐⭐⭐ | Fast | ❌ | ❌ | Free | ✅ |
|
|
403
|
+
| **ElevenLabs** | Cloud | ⭐⭐⭐⭐⭐ | Fast | ✅ | ✅ | Paid | ✅ |
|
|
404
|
+
| **Coqui** | Local | ⭐⭐⭐⭐ | Medium | ✅ | ❌ | Free | ❌ |
|
|
405
|
+
| **OpenAI** | Cloud | ⭐⭐⭐⭐ | Fast | ❌ | ✅ | Paid | ✅ |
|
|
406
|
+
| **Bark** | Local | ⭐⭐⭐⭐⭐ | Slow | ⚠️ | ❌ | Free | ❌ |
|
|
407
|
+
| **F5-TTS** | Local | ⭐⭐⭐⭐⭐ | Fast | ✅ | ❌ | Free | ✅ |
|
|
408
|
+
| **StyleTTS2** | Local | ⭐⭐⭐⭐⭐ | Medium | ✅ | ❌ | Free | ❌ |
|
|
409
|
+
| **Tortoise** | Local | ⭐⭐⭐⭐⭐ | Slow | ✅ | ❌ | Free | ❌ |
|
|
410
|
+
|
|
411
|
+
> `pip install scholium[all]` installs only the four ✅ providers (Piper, ElevenLabs, OpenAI, F5-TTS).
|
|
412
|
+
> Coqui, Bark, StyleTTS2, and Tortoise have transitive dependency conflicts on Python 3.11+ — install individually.
|
|
413
|
+
|
|
414
|
+
### Piper (Recommended)
|
|
415
|
+
|
|
416
|
+
```bash
|
|
417
|
+
pip install scholium[piper]
|
|
418
|
+
scholium generate lecture.md output.mp4 --provider piper
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
Available voices: `en_US-lessac-medium`, `en_US-amy-medium`, `en_GB-alan-medium`, etc.
|
|
422
|
+
|
|
423
|
+
### ElevenLabs (Highest Quality)
|
|
424
|
+
|
|
425
|
+
ElevenLabs voices are identified by a **Voice ID**, not their display name. Use `list-voices` to find the ID for the voice you want:
|
|
426
|
+
|
|
427
|
+
```bash
|
|
428
|
+
pip install scholium[elevenlabs]
|
|
429
|
+
export ELEVENLABS_API_KEY="your_key"
|
|
430
|
+
|
|
431
|
+
# List voices — shows Name and Voice ID side by side
|
|
432
|
+
scholium list-voices --provider elevenlabs
|
|
433
|
+
|
|
434
|
+
# Use the Voice ID with --voice (not the display name)
|
|
435
|
+
scholium generate lecture.md output.mp4 --provider elevenlabs --voice Xb7hH8MSUJpSbSDYk0k2
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
### Coqui (Local Voice Cloning)
|
|
439
|
+
|
|
440
|
+
```bash
|
|
441
|
+
pip install scholium[coqui]
|
|
442
|
+
scholium train-voice --name my_voice --provider coqui --sample recording.wav
|
|
443
|
+
scholium generate lecture.md output.mp4 --provider coqui --voice my_voice
|
|
444
|
+
```
|
|
445
|
+
|
|
446
|
+
### F5-TTS (Fast Local Voice Cloning)
|
|
447
|
+
|
|
448
|
+
Zero-shot cloning from a 5-15 second reference clip — no training step required.
|
|
449
|
+
|
|
450
|
+
```bash
|
|
451
|
+
pip install scholium[f5tts]
|
|
452
|
+
|
|
453
|
+
# Option A: register a voice in the library
|
|
454
|
+
scholium train-voice --name my_voice --provider f5tts --sample recording.wav
|
|
455
|
+
scholium generate lecture.md output.mp4 --provider f5tts --voice my_voice
|
|
456
|
+
|
|
457
|
+
# Option B: point directly to a reference file in config.yaml
|
|
458
|
+
# f5tts:
|
|
459
|
+
# model_path: "f5tts/my_voice/sample.wav" # relative to voices_dir
|
|
460
|
+
# ref_text: "Words spoken in the recording."
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
### StyleTTS2 (Expressive Local Voice Cloning)
|
|
464
|
+
|
|
465
|
+
```bash
|
|
466
|
+
pip install scholium[styletts2]
|
|
467
|
+
scholium train-voice --name my_voice --provider styletts2 --sample recording.wav
|
|
468
|
+
scholium generate lecture.md output.mp4 --provider styletts2 --voice my_voice
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
Or set `styletts2.model_path` in `config.yaml` to skip voice registration.
|
|
472
|
+
|
|
473
|
+
### Tortoise TTS (Highest-Quality Local Cloning)
|
|
474
|
+
|
|
475
|
+
```bash
|
|
476
|
+
pip install scholium[tortoise]
|
|
477
|
+
scholium train-voice --name my_voice --provider tortoise --sample recording.wav
|
|
478
|
+
# Add extra clips for better quality:
|
|
479
|
+
cp clip2.wav ~/.local/share/scholium/voices/tortoise/my_voice/sample_2.wav
|
|
480
|
+
scholium generate lecture.md output.mp4 --provider tortoise --voice my_voice
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
Or set `tortoise.model_path` in `config.yaml` to skip voice registration.
|
|
484
|
+
|
|
485
|
+
---
|
|
486
|
+
|
|
487
|
+
## Configuration
|
|
488
|
+
|
|
489
|
+
Create `config.yaml` in your project:
|
|
490
|
+
|
|
491
|
+
```yaml
|
|
492
|
+
# Slide settings
|
|
493
|
+
pandoc_template: beamer
|
|
494
|
+
|
|
495
|
+
# TTS settings
|
|
496
|
+
tts_provider: piper
|
|
497
|
+
voice: en_US-lessac-medium
|
|
498
|
+
|
|
499
|
+
# Timing defaults
|
|
500
|
+
timing:
|
|
501
|
+
default_pre_delay: 0.5 # Pause before speaking
|
|
502
|
+
default_post_delay: 1.0 # Pause after speaking
|
|
503
|
+
min_slide_duration: 3.0 # Minimum for any slide
|
|
504
|
+
silent_slide_duration: 2.0 # Duration for TOC/section slides
|
|
505
|
+
|
|
506
|
+
# Video settings
|
|
507
|
+
resolution: [1920, 1080]
|
|
508
|
+
fps: 30
|
|
509
|
+
|
|
510
|
+
# Paths
|
|
511
|
+
voices_dir: ~/.local/share/scholium/voices
|
|
512
|
+
temp_dir: ./temp
|
|
513
|
+
keep_temp_files: false
|
|
514
|
+
verbose: true
|
|
515
|
+
|
|
516
|
+
# Provider-specific settings
|
|
517
|
+
piper:
|
|
518
|
+
quality: medium
|
|
519
|
+
|
|
520
|
+
elevenlabs:
|
|
521
|
+
model: eleven_multilingual_v2
|
|
522
|
+
|
|
523
|
+
coqui:
|
|
524
|
+
model: tts_models/multilingual/multi-dataset/xtts_v2
|
|
525
|
+
|
|
526
|
+
# Zero-shot local providers: set model_path to use a reference audio file
|
|
527
|
+
# directly without registering a voice via scholium train-voice.
|
|
528
|
+
# Paths are relative to voices_dir (or absolute).
|
|
529
|
+
f5tts:
|
|
530
|
+
model: "F5-TTS"
|
|
531
|
+
# model_path: "f5tts/my_voice/sample.wav"
|
|
532
|
+
# ref_text: "Exact words spoken in the reference clip."
|
|
533
|
+
|
|
534
|
+
styletts2:
|
|
535
|
+
alpha: 0.3
|
|
536
|
+
beta: 0.7
|
|
537
|
+
diffusion_steps: 5
|
|
538
|
+
# model_path: "styletts2/my_voice/sample.wav"
|
|
539
|
+
|
|
540
|
+
tortoise:
|
|
541
|
+
preset: "fast"
|
|
542
|
+
# model_path: "tortoise/my_voice/sample.wav"
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
---
|
|
546
|
+
|
|
547
|
+
## Voice Management
|
|
548
|
+
|
|
549
|
+
### List Voices
|
|
550
|
+
|
|
551
|
+
```bash
|
|
552
|
+
# Local voice library (Coqui, F5-TTS, StyleTTS2, Tortoise)
|
|
553
|
+
scholium list-voices
|
|
554
|
+
|
|
555
|
+
# ElevenLabs cloud voices — shows Name and Voice ID
|
|
556
|
+
scholium list-voices --provider elevenlabs
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
### Register a Voice
|
|
560
|
+
|
|
561
|
+
All zero-shot local providers (Coqui, F5-TTS, StyleTTS2, Tortoise) use the same command:
|
|
562
|
+
|
|
563
|
+
```bash
|
|
564
|
+
scholium train-voice \
|
|
565
|
+
--name my_lecture_voice \
|
|
566
|
+
--provider f5tts \
|
|
567
|
+
--sample my_recording.wav \
|
|
568
|
+
--description "My natural teaching voice"  # --provider may also be coqui, styletts2, or tortoise
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
### Skip Registration with `model_path`
|
|
572
|
+
|
|
573
|
+
For F5-TTS, StyleTTS2, and Tortoise, you can point directly to a reference file in `config.yaml` without registering a voice:
|
|
574
|
+
|
|
575
|
+
```yaml
|
|
576
|
+
f5tts:
|
|
577
|
+
model_path: "f5tts/my_voice/sample.wav" # relative to voices_dir, or absolute
|
|
578
|
+
ref_text: "The words spoken in the clip." # optional but improves accuracy
|
|
579
|
+
```
|
|
580
|
+
|
|
581
|
+
### Regenerate Embeddings (Coqui)
|
|
582
|
+
|
|
583
|
+
```bash
|
|
584
|
+
# Pre-compute speaker embeddings to speed up Coqui generation
|
|
585
|
+
scholium regenerate-embeddings --voice my_lecture_voice
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
---
|
|
589
|
+
|
|
590
|
+
## Batch Processing
|
|
591
|
+
|
|
592
|
+
Process multiple lectures with a simple script:
|
|
593
|
+
|
|
594
|
+
```bash
|
|
595
|
+
#!/bin/bash
|
|
596
|
+
for lecture in lectures/*.md; do
|
|
597
|
+
output="${lecture%.md}.mp4"
|
|
598
|
+
scholium generate "$lecture" "$output" --verbose
|
|
599
|
+
done
|
|
600
|
+
```
|
|
601
|
+
|
|
602
|
+
Or use Python:
|
|
603
|
+
|
|
604
|
+
```python
|
|
605
|
+
from pathlib import Path
|
|
606
|
+
import subprocess
|
|
607
|
+
|
|
608
|
+
for lecture in Path("lectures").glob("*.md"):
|
|
609
|
+
output = lecture.with_suffix(".mp4")
|
|
610
|
+
subprocess.run([
|
|
611
|
+
"scholium", "generate",
|
|
612
|
+
str(lecture), str(output),
|
|
613
|
+
"--verbose"
|
|
614
|
+
])
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
---
|
|
618
|
+
|
|
619
|
+
## Examples
|
|
620
|
+
|
|
621
|
+
See the `examples/` directory for:
|
|
622
|
+
- Basic lecture with sections (`example_level2.md`)
|
|
623
|
+
- Incremental bullets and timing
|
|
624
|
+
- Voice cloning workflow
|
|
625
|
+
- Batch processing scripts
|
|
626
|
+
|
|
627
|
+
---
|
|
628
|
+
|
|
629
|
+
## Performance
|
|
630
|
+
|
|
631
|
+
**Generation time** (per 10-minute lecture):
|
|
632
|
+
- NVIDIA GPU: 5-10 minutes
|
|
633
|
+
- Apple Silicon: 10-15 minutes
|
|
634
|
+
- Modern CPU: 30-60 minutes
|
|
635
|
+
|
|
636
|
+
**First run**: Models download automatically (~500MB-1.5GB) and are cached for future use.
|
|
637
|
+
|
|
638
|
+
---
|
|
639
|
+
|
|
640
|
+
## Troubleshooting
|
|
641
|
+
|
|
642
|
+
**"Pandoc not found"**: Install pandoc and LaTeX (see Installation)
|
|
643
|
+
|
|
644
|
+
**"Narration bleeding over section slides"**: Make sure you have `slide-level: 2` in your YAML frontmatter
|
|
645
|
+
|
|
646
|
+
**"Slide count mismatch"**: Don't add `::: notes :::` after `#` section headings when using `slide-level: 2`
|
|
647
|
+
|
|
648
|
+
**"Voice not found"**:
|
|
649
|
+
- Piper: Use voice name like `en_US-lessac-medium`
|
|
650
|
+
- ElevenLabs: Use the voice ID, not the display name (run `scholium list-voices --provider elevenlabs` to find it)
|
|
651
|
+
- Coqui / F5-TTS / StyleTTS2 / Tortoise: Use a registered voice name from `scholium list-voices`, or set `model_path` under the provider section in `config.yaml`
|
|
652
|
+
|
|
653
|
+
**"Out of memory"**:
|
|
654
|
+
- Close other applications
|
|
655
|
+
- Use `export CUDA_VISIBLE_DEVICES=""` to force CPU
|
|
656
|
+
- Process one lecture at a time
|
|
657
|
+
|
|
658
|
+
---
|
|
659
|
+
|
|
660
|
+
## Documentation
|
|
661
|
+
|
|
662
|
+
- **Full docs**: <https://ccaprani.github.io/scholium>
|
|
663
|
+
- [Getting Started](https://ccaprani.github.io/scholium/user/installation.html)
|
|
664
|
+
- [Markdown Format](https://ccaprani.github.io/scholium/user/markdown-format.html)
|
|
665
|
+
- [TTS Providers](https://ccaprani.github.io/scholium/user/tts-providers.html)
|
|
666
|
+
- [CLI Reference](https://ccaprani.github.io/scholium/user/cli.html)
|
|
667
|
+
- **Examples**: `examples/` directory in this repo
|
|
668
|
+
- **Issues**: [GitHub Issues](https://github.com/ccaprani/scholium/issues)
|
|
669
|
+
- **Command-line help**: `scholium --help`
|
|
670
|
+
|
|
671
|
+
---
|
|
672
|
+
|
|
673
|
+
## Project Philosophy
|
|
674
|
+
|
|
675
|
+
**Simple tool, not a framework**. Scholium does one thing well: converts markdown+narration into video. It integrates with your existing workflow rather than replacing it.
|
|
676
|
+
|
|
677
|
+
**Text-first**. Everything is plain text (markdown + YAML), so it's:
|
|
678
|
+
- Version controllable (Git)
|
|
679
|
+
- Searchable and editable
|
|
680
|
+
- Reproducible across systems
|
|
681
|
+
- Easy to maintain
|
|
682
|
+
|
|
683
|
+
**Pandoc-native**. Uses standard Beamer slide syntax, so your slides work in LaTeX/Beamer too.
|
|
684
|
+
|
|
685
|
+
---
|
|
686
|
+
|
|
687
|
+
## License
|
|
688
|
+
|
|
689
|
+
MIT License - see LICENSE file
|
|
690
|
+
|
|
691
|
+
---
|
|
692
|
+
|
|
693
|
+
## Contributing
|
|
694
|
+
|
|
695
|
+
Contributions welcome! Focus areas:
|
|
696
|
+
- New TTS provider integrations
|
|
697
|
+
- Performance improvements
|
|
698
|
+
- Documentation and examples
|
|
699
|
+
- Bug fixes
|
|
700
|
+
|
|
701
|
+
---
|
|
702
|
+
|
|
703
|
+
**Scholium: Your digital scholium for the modern classroom.** 📖
|