muvid 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
muvid/__init__.py ADDED
@@ -0,0 +1,73 @@
1
+ """mtv — tools to make music videos.
2
+
3
+ Public surface (also the CLI verbs):
4
+
5
+ init_project, transcribe_song, align_lyrics,
6
+ add_character, add_character_images, generate_character_images,
7
+ curate_character,
8
+ add_environment, render_environment,
9
+ write_script, parse_script,
10
+ render_shot, render, compose, status,
11
+
12
+ Project model:
13
+
14
+ MusicVideoProject(root) — folder-backed project facade
15
+ ProjectSpec, SongInfo, SectionSpec, ShotSpec,
16
+ CharacterRef, EnvironmentRef
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from mtv.facade import (
22
+ add_character,
23
+ add_character_images,
24
+ add_environment,
25
+ align_lyrics,
26
+ compose,
27
+ curate_character,
28
+ generate_character_images,
29
+ init_project,
30
+ parse_script,
31
+ render,
32
+ render_environment,
33
+ render_shot,
34
+ status,
35
+ transcribe_song,
36
+ write_script,
37
+ )
38
+ from mtv.project import MusicVideoProject
39
+ from mtv.schema import (
40
+ CharacterRef,
41
+ EnvironmentRef,
42
+ ProjectSpec,
43
+ SectionSpec,
44
+ ShotSpec,
45
+ SongInfo,
46
+ )
47
+
48
+ __all__ = [
49
+ # high-level facade
50
+ "add_character",
51
+ "add_character_images",
52
+ "add_environment",
53
+ "align_lyrics",
54
+ "compose",
55
+ "curate_character",
56
+ "generate_character_images",
57
+ "init_project",
58
+ "parse_script",
59
+ "render",
60
+ "render_environment",
61
+ "render_shot",
62
+ "status",
63
+ "transcribe_song",
64
+ "write_script",
65
+ # data model
66
+ "CharacterRef",
67
+ "EnvironmentRef",
68
+ "MusicVideoProject",
69
+ "ProjectSpec",
70
+ "SectionSpec",
71
+ "ShotSpec",
72
+ "SongInfo",
73
+ ]
muvid/__main__.py ADDED
@@ -0,0 +1,160 @@
1
+ """mtv CLI — argh dispatch over the top-level facade.
2
+
3
+ Run ``mtv --help`` after install. Every verb is the same Python
4
+ function the skill and UI call.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json as _json
10
+
11
+ from mtv import facade
12
+
13
+
14
def _print_json(obj):
    """Write ``obj`` to stdout as JSON indented by two spaces.

    Values the JSON encoder cannot serialize natively are converted
    with ``str`` instead of raising.
    """
    rendered = _json.dumps(obj, default=str, indent=2)
    print(rendered)
16
+
17
+
18
def init(root: str, *, title: str = "", song: str = "") -> None:
    """Create a new music video project at ROOT (optionally with a song)."""
    # An empty --song is forwarded to the facade as None, not "".
    song_arg = song if song else None
    created = facade.init_project(root, title=title, song=song_arg)
    print(created)
22
+
23
+
24
def transcribe(root: str, *, api_key: str = "") -> None:
    """Run ElevenLabs Scribe on the project's song; writes lyrics/transcript.json."""
    # An empty --api-key is forwarded to the facade as None, not "".
    key = api_key if api_key else None
    result = facade.transcribe_song(root, api_key=key)
    print(result)
27
+
28
+
29
def align(root: str) -> None:
    """Build lyrics/alignment.annot from transcript + lyrics.md."""
    # Print whatever the facade returns for the alignment step.
    outcome = facade.align_lyrics(root)
    print(outcome)
32
+
33
+
34
def character(
    root: str,
    name: str,
    *,
    description: str = "",
    voice_id: str = "",
    reference_audio_url: str = "",
) -> None:
    """Create or update a character card."""
    # All options are passed through unchanged; the facade's return value
    # is echoed as JSON.
    card = facade.add_character(
        root,
        name,
        description=description,
        voice_id=voice_id,
        reference_audio_url=reference_audio_url,
    )
    _print_json(card)
49
+
50
+
51
def character_images(root: str, name: str, *paths: str) -> None:
    """Drop existing image files into characters/<name>/refs/."""
    # The variadic positional paths are handed to the facade as a list.
    image_paths = [*paths]
    _print_json(facade.add_character_images(root, name, image_paths))
54
+
55
+
56
def character_generate(
    root: str, name: str, *, n: int = 6, quality: str = "balanced"
) -> None:
    """Generate N reference images for a character via fal."""
    generated = facade.generate_character_images(
        root, name, n=n, quality=quality
    )
    _print_json(generated)
61
+
62
+
63
def character_curate(
    root: str, name: str, *, k: int = 8, recipe: str = "person_mock"
) -> None:
    """Run lookbook to select K best reference images."""
    selection = facade.curate_character(root, name, k=k, recipe=recipe)
    _print_json(selection)
68
+
69
+
70
def environment(
    root: str,
    name: str,
    *,
    description: str = "",
    time_of_day: str = "",
    lighting: str = "",
) -> None:
    """Create or update an environment card."""
    # All options are passed through unchanged; the facade's return value
    # is echoed as JSON.
    card = facade.add_environment(
        root,
        name,
        description=description,
        time_of_day=time_of_day,
        lighting=lighting,
    )
    _print_json(card)
85
+
86
+
87
def environment_render(root: str, name: str, *, quality: str = "high") -> None:
    """Generate the canonical establishing image for an environment."""
    rendered = facade.render_environment(root, name, quality=quality)
    print(rendered)
90
+
91
+
92
def script(root: str) -> None:
    """Render the project's sections+shots to script/script.md."""
    written = facade.write_script(root)
    print(written)
95
+
96
+
97
def script_apply(root: str) -> None:
    """Parse script/script.md and upsert sections+shots into project.json."""
    # The facade's return value is not shown; a fixed acknowledgement is
    # printed once the call returns without raising.
    facade.parse_script(root)
    acknowledgement = "ok"
    print(acknowledgement)
101
+
102
+
103
def render(
    root: str, *, shot: str = "", quality: str = "balanced", force: bool = False
) -> None:
    """Render one shot (--shot ID) or all shots."""
    if not shot:
        # No shot id given: render everything and print each result on
        # its own line.
        for produced in facade.render(root, quality=quality, force=force):
            print(produced)
        return
    single = facade.render_shot(root, shot, quality=quality, force=force)
    print(single)
112
+
113
+
114
def compose(
    root: str, *, out_name: str = "final.mp4", song_audio: bool = True
) -> None:
    """Concatenate rendered shots and (optionally) overlay song audio."""
    # The CLI-facing ``song_audio`` option maps onto the facade's
    # ``use_song_audio`` keyword.
    # NOTE(review): argh derives the flag action from a bool default; with
    # a True default the generated flag may act as a switch-off — confirm
    # against the installed argh version.
    result = facade.compose(
        root, out_name=out_name, use_song_audio=song_audio
    )
    print(result)
119
+
120
+
121
def status(root: str) -> None:
    """Print a summary of the project's current state."""
    summary = facade.status(root)
    _print_json(summary)
124
+
125
+
126
def serve(root: str = ".", *, host: str = "127.0.0.1", port: int = 7800) -> None:
    """Launch the local web UI for managing a project."""
    # Imported lazily so the UI module is only loaded when this verb runs.
    from mtv.ui import app as _ui_app

    _ui_app.serve(root=root, host=host, port=port)
131
+
132
+
133
def main() -> None:
    """CLI entry point: dispatch the command-line verbs through argh.

    Raises:
        SystemExit: if ``argh`` is not installed (chained from the
            original ``ImportError``).
    """
    try:
        import argh  # type: ignore
    except ImportError as e:
        message = "mtv CLI requires `argh`. pip install argh."
        raise SystemExit(message) from e

    # Order is preserved from the original registration list.
    commands = [
        init,
        transcribe,
        align,
        character,
        character_images,
        character_generate,
        character_curate,
        environment,
        environment_render,
        script,
        script_apply,
        render,
        compose,
        status,
        serve,
    ]
    argh.dispatch_commands(commands)
157
+
158
+
159
+ if __name__ == "__main__":
160
+ main()
muvid/align.py ADDED
@@ -0,0 +1,415 @@
1
+ """Lyric → audio alignment.
2
+
3
+ We have:
4
+ - a transcript (Scribe / faster-whisper) with word-level (text, start, end)
5
+ - a user-edited ``LyricsDoc`` with section labels + line text + optional
6
+ manual line-start anchors
7
+
8
+ We want a ``lacing`` store with three tiers (sections, lines, words) so
9
+ the rest of the system can ask "which lines fall in shot X" without
10
+ re-implementing interval math.
11
+
12
+ Strategy (greedy token-match):
13
+
14
+ 1. Tokenize each lyric line into normalized words.
15
+ 2. Walk the transcript word stream once, assigning each transcript word
16
+ to the next unmatched lyric word that matches (case- and
17
+ punctuation-insensitive). Tolerate small mismatches (transcript
18
+ word missing in lyrics, vice versa) with a small lookahead window.
19
+ 3. From the matched words, derive line ``[start, end]`` as
20
+ ``(first_matched_word.start, last_matched_word.end)``. If a line has
21
+ *no* matched words, fall back to the user's manual anchor (if any),
22
+ then to times borrowed from neighboring lines (the previous line's end, the next line's start).
23
+ 4. Sections inherit ``[start, end]`` from the union of their lines;
24
+ if the user provided explicit ``start_s`` / ``end_s`` on a section,
25
+ those win.
26
+
27
+ The result is written as a ``lacing.SqliteStore`` so it round-trips and
28
+ can be edited by other tools.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import re
34
+ from dataclasses import dataclass
35
+ from pathlib import Path
36
+ from typing import Iterable, Optional
37
+ from uuid import uuid4
38
+
39
+ from mtv.lyrics import LyricsDoc, words_from_transcript
40
+
41
+
42
# A token is a run of letters/digits (in any script) and apostrophes.
# ``[^\W_]`` is "word character except underscore", so for ASCII text this
# matches exactly what ``[a-z0-9']`` matched on lowercased input, while
# also keeping accented words such as "café" in one piece.  The
# typographic apostrophe (U+2019) is accepted so "don’t" stays one token.
_WORD_TOKEN_RE = re.compile(r"(?:[^\W_]|['\u2019])+")


def _normalize(token: str) -> str:
    """Return ``token`` in canonical matching form.

    Lowercases, folds the typographic apostrophe (U+2019) to ``'`` so
    lyric and transcript spellings of contractions compare equal, and
    trims leading/trailing apostrophes and hyphens.
    """
    return token.lower().replace("\u2019", "'").strip("'-")


def _tokenize(text: str) -> list[str]:
    """Split ``text`` into normalized word tokens.

    Punctuation and whitespace act as separators.  A token may come out
    as an empty string when it consisted only of apostrophes; callers
    are expected to skip empty tokens.
    """
    return [_normalize(t) for t in _WORD_TOKEN_RE.findall(text.lower())]
52
+
53
+
54
+ @dataclass(frozen=True, slots=True, kw_only=True)
55
+ class WordAlignment:
56
+ """One alignment between a lyric token and a transcript word."""
57
+
58
+ line_index: int
59
+ token_index: int # within the line
60
+ text: str
61
+ start_s: float
62
+ end_s: float
63
+ confidence: float = 1.0
64
+
65
+
66
+ @dataclass(frozen=True, slots=True, kw_only=True)
67
+ class LineAlignment:
68
+ line_index: int
69
+ section_label: str
70
+ text: str
71
+ start_s: float | None
72
+ end_s: float | None
73
+ word_alignments: tuple[WordAlignment, ...]
74
+
75
+
76
+ @dataclass(frozen=True, slots=True, kw_only=True)
77
+ class SectionAlignment:
78
+ label: str
79
+ title: str
80
+ start_s: float | None
81
+ end_s: float | None
82
+ lines: tuple[LineAlignment, ...]
83
+
84
+
85
+ @dataclass(frozen=True, slots=True, kw_only=True)
86
+ class AlignmentResult:
87
+ sections: tuple[SectionAlignment, ...]
88
+
89
+ @property
90
+ def lines(self) -> tuple[LineAlignment, ...]:
91
+ return tuple(L for s in self.sections for L in s.lines)
92
+
93
+ @property
94
+ def words(self) -> tuple[WordAlignment, ...]:
95
+ return tuple(w for L in self.lines for w in L.word_alignments)
96
+
97
+ def lines_in(self, start_s: float, end_s: float) -> list[LineAlignment]:
98
+ """Lines that fall (at least partially) inside ``[start_s, end_s]``."""
99
+ out: list[LineAlignment] = []
100
+ for L in self.lines:
101
+ if L.start_s is None or L.end_s is None:
102
+ continue
103
+ if L.end_s > start_s and L.start_s < end_s:
104
+ out.append(L)
105
+ return out
106
+
107
+ def section_for(self, t: float) -> SectionAlignment | None:
108
+ for s in self.sections:
109
+ if s.start_s is None or s.end_s is None:
110
+ continue
111
+ if s.start_s <= t < s.end_s:
112
+ return s
113
+ return None
114
+
115
+
116
+ # --- core alignment -------------------------------------------------------
117
+
118
+
119
def align_lyrics(
    lyrics: LyricsDoc,
    transcript: dict,
    *,
    duration_s: float = 0.0,
    lookahead: int = 6,
) -> AlignmentResult:
    """Greedy token-match alignment of lyric lines against a transcript.

    Args:
        lyrics: The lyrics document; its ``lines`` and ``sections`` are read.
        transcript: Raw transcript, converted to a word list by
            ``words_from_transcript``.
        duration_s: Used only when extrapolating end times for lines
            that have no matched words and no later anchor.
        lookahead: How many transcript words past the cursor may be
            skipped while searching for a lyric token.

    Returns:
        An ``AlignmentResult`` with one ``SectionAlignment`` per lyric
        section, in document order.
    """
    transcript_words = words_from_transcript(transcript)
    transcript_tokens = [
        _normalize(w["text"].strip("()[],.?!\"")) for w in transcript_words
    ]
    n_tokens = len(transcript_tokens)

    # Greedy walk: for each lyric token, find the next matching transcript
    # word within ``lookahead`` of the cursor.  The cursor only moves
    # forward, so matched words are monotonic in transcript order.
    word_alignments_per_line: dict[int, list[WordAlignment]] = {}
    cursor = 0
    for lyric_line in lyrics.lines:
        for tok_idx, lyric_tok in enumerate(_tokenize(lyric_line.text)):
            if not lyric_tok:
                continue
            window = range(cursor, min(cursor + lookahead + 1, n_tokens))
            # Exact matches anywhere in the window win over fuzzy ones.
            match_at = next(
                (j for j in window if transcript_tokens[j] == lyric_tok), None
            )
            exact = match_at is not None
            if not exact:
                # Tolerate a 1-character difference (sung mishears).
                match_at = next(
                    (
                        j
                        for j in window
                        if _close_enough(transcript_tokens[j], lyric_tok)
                    ),
                    None,
                )
            if match_at is None:
                # Unmatched lyric token: leave the cursor where it is.
                continue
            w = transcript_words[match_at]
            word_alignments_per_line.setdefault(
                lyric_line.line_index, []
            ).append(
                WordAlignment(
                    line_index=lyric_line.line_index,
                    token_index=tok_idx,
                    text=w["text"],
                    start_s=float(w["start"]),
                    end_s=float(w["end"]),
                    confidence=0.9 if exact else 0.6,
                )
            )
            cursor = match_at + 1

    # Reduce to per-line times.  Lines without any matched word fall back
    # to the manual start anchor (possibly None) and have no end yet.
    line_alignments: list[LineAlignment] = []
    for lyric_line in lyrics.lines:
        wal = tuple(word_alignments_per_line.get(lyric_line.line_index, ()))
        if wal:
            start, end = wal[0].start_s, wal[-1].end_s
        else:
            start, end = lyric_line.start_s, None
        line_alignments.append(
            LineAlignment(
                line_index=lyric_line.line_index,
                section_label=lyric_line.section_label,
                text=lyric_line.text,
                start_s=start,
                end_s=end,
                word_alignments=wal,
            )
        )

    line_alignments = _interpolate_line_times(line_alignments, duration_s)

    # Group lines back into their sections by label.
    by_label: dict[str, list[LineAlignment]] = {}
    for la in line_alignments:
        by_label.setdefault(la.section_label, []).append(la)

    section_alignments: list[SectionAlignment] = []
    for s in lyrics.sections:
        lines_for = tuple(by_label.get(s.label, ()))
        # Explicit section bounds win; otherwise use the span of the lines.
        if s.start_s is not None:
            s_start = s.start_s
        else:
            s_start = min(
                (ln.start_s for ln in lines_for if ln.start_s is not None),
                default=None,
            )
        if s.end_s is not None:
            s_end = s.end_s
        else:
            s_end = max(
                (ln.end_s for ln in lines_for if ln.end_s is not None),
                default=None,
            )
        section_alignments.append(
            SectionAlignment(
                label=s.label,
                title=s.title,
                start_s=s_start,
                end_s=s_end,
                lines=lines_for,
            )
        )
    return AlignmentResult(sections=tuple(section_alignments))
224
+
225
+
226
def _close_enough(a: str, b: str) -> bool:
    """Return True when ``a`` and ``b`` are trivial sung-vs-said variants.

    Both strings must be at least three characters long, share their
    first and last character, differ in length by at most one, and be
    within one edit of each other.
    """
    # The length floor also rules out empty strings, so the index
    # accesses below are safe.
    if min(len(a), len(b)) < 3:
        return False
    if abs(len(a) - len(b)) > 1:
        return False
    if (a[0], a[-1]) != (b[0], b[-1]):
        return False
    return _lev1(a, b)
239
+
240
+
241
def _lev1(a: str, b: str) -> bool:
    """Return True when ``a`` and ``b`` are at most one edit apart.

    An edit is a single-character substitution, insertion or deletion.
    """
    if a == b:
        return True
    if len(a) == len(b):
        # Same length: only a substitution can explain the difference.
        mismatches = 0
        for x, y in zip(a, b):
            if x != y:
                mismatches += 1
        return mismatches <= 1
    # Different lengths: try dropping each character of the longer string.
    shorter, longer = sorted((a, b), key=len)
    return any(
        longer[:i] + longer[i + 1:] == shorter for i in range(len(longer))
    )
253
+
254
+
255
def _interpolate_line_times(
    lines: list[LineAlignment], duration_s: float
) -> list[LineAlignment]:
    """Fill in missing line start/end times from neighbouring lines.

    Despite the name this does not interpolate; it borrows boundaries:

    * A missing start becomes the end of the nearest earlier line that
      has any time (or that line's start if it has no end).  If no
      earlier line has a time, the start defaults to ``0.0``.
    * A missing end becomes the start of the nearest later line that has
      one.  If there is none, it becomes ``duration_s`` when that is
      non-zero and greater than the start, otherwise ``start + 0.5``.

    Neighbour lookups read the *input* lines, not the filled-in output.

    Args:
        lines: Line alignments in document order.
        duration_s: Total audio duration in seconds; ``0.0`` if unknown.

    Returns:
        A new list in which every line has both a start and an end.  An
        empty input is returned unchanged.
    """
    from dataclasses import replace

    if not lines:
        return lines

    out: list[LineAlignment] = []
    for i, line in enumerate(lines):
        start = line.start_s
        end = line.end_s
        if start is None:
            # Walk backwards to the closest line carrying any timestamp,
            # preferring its end over its start.
            for j in range(i - 1, -1, -1):
                if lines[j].end_s is not None:
                    start = lines[j].end_s
                    break
                if lines[j].start_s is not None:
                    start = lines[j].start_s
                    break
            if start is None:
                start = 0.0
        if end is None:
            # Walk forwards to the closest line that has a start.
            for j in range(i + 1, len(lines)):
                if lines[j].start_s is not None:
                    end = lines[j].start_s
                    break
            if end is None:
                end = duration_s if duration_s and duration_s > start else start + 0.5
        out.append(replace(line, start_s=start, end_s=end))
    return out
302
+
303
+
304
+ # --- lacing serialization -------------------------------------------------
305
+
306
+
307
def write_alignment_store(
    alignment: AlignmentResult,
    *,
    path: str | Path,
    asset_id: str = "song:audio",
    rate: int = 1000,
) -> None:
    """Write alignment to a ``lacing.SqliteStore`` file (.annot).

    Three tiers: ``sections``, ``lines``, ``words``. Body schemas use
    custom URIs registered locally; we don't enforce them here (the
    store still validates structure).

    Any existing file at ``path`` is removed first, so the result always
    reflects only ``alignment``.

    Args:
        alignment: The alignment to serialize.
        path: Destination file.
        asset_id: Asset identifier stored on every annotation's media
            reference.
        rate: Ticks per second used to express times as integers.
    """
    from lacing import (
        Annotation,
        MediaRef,
        Provenance,
        RationalTime,
        SqliteStore,
        Tier,
        TimeInterval,
    )

    path = Path(path)
    # Remove any previous store without a separate exists() check, which
    # would leave a window between the check and the unlink.
    path.unlink(missing_ok=True)
    store = SqliteStore(str(path))
    try:
        for tier_name in ("sections", "lines", "words"):
            store.add_tier(Tier(name=tier_name))

        prov = Provenance(
            was_generated_by="mtv:align",
            was_attributed_to="mtv",
            generated_at_time=RationalTime.zero(rate),
        )

        def interval(start_s: float, end_s: float) -> TimeInterval:
            if end_s <= start_s:
                end_s = start_s + 1.0 / rate
            # Round to the integer rate-tick so we never trip lacing's
            # strict "lossy conversion" guard for decimals like 14.2 at
            # rate=1000 (14.2 → 14199.999... ticks).
            start_t = int(round(start_s * rate))
            end_t = int(round(end_s * rate))
            if end_t <= start_t:
                # Rounding collapsed the span; keep it one tick wide.
                end_t = start_t + 1
            return TimeInterval(
                RationalTime(start_t, rate),
                RationalTime(end_t, rate),
            )

        def add(
            tier: str, start_s: float, end_s: float, body: dict, schema_uri: str
        ) -> None:
            # One annotation on ``tier`` covering [start_s, end_s].
            store.add(
                Annotation(
                    id=uuid4(),
                    tier=tier,
                    reference=MediaRef(
                        asset_id=asset_id,
                        interval=interval(start_s, end_s),
                    ),
                    body=body,
                    body_schema_uri=schema_uri,
                    provenance=prov,
                )
            )

        # Sections and lines without both bounds cannot be placed on the
        # timeline and are skipped.
        for s in alignment.sections:
            if s.start_s is None or s.end_s is None:
                continue
            add(
                "sections",
                s.start_s,
                s.end_s,
                {"label": s.label, "title": s.title},
                "annot://schema/song-section/v1",
            )
        for line in alignment.lines:
            if line.start_s is None or line.end_s is None:
                continue
            add(
                "lines",
                line.start_s,
                line.end_s,
                {
                    "text": line.text,
                    "line_index": line.line_index,
                    "section": line.section_label,
                },
                "annot://schema/lyric-line/v1",
            )
        for w in alignment.words:
            add(
                "words",
                w.start_s,
                w.end_s,
                {
                    "text": w.text,
                    "line_index": w.line_index,
                    "confidence": w.confidence,
                },
                "annot://schema/word/v1",
            )
    finally:
        store.close()