muvid 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- muvid/__init__.py +73 -0
- muvid/__main__.py +160 -0
- muvid/align.py +415 -0
- muvid/characters.py +205 -0
- muvid/compose.py +104 -0
- muvid/environments.py +73 -0
- muvid/facade.py +211 -0
- muvid/lyrics.py +265 -0
- muvid/project.py +300 -0
- muvid/render/__init__.py +202 -0
- muvid/render/_common.py +92 -0
- muvid/render/animation.py +97 -0
- muvid/render/image_to_video.py +51 -0
- muvid/render/lipsync.py +48 -0
- muvid/render/still.py +46 -0
- muvid/render/text_to_video.py +25 -0
- muvid/schema.py +202 -0
- muvid/script.py +241 -0
- muvid/ui/__init__.py +4 -0
- muvid/ui/app.py +267 -0
- muvid/ui/static/index.html +266 -0
- muvid-0.0.2.dist-info/METADATA +165 -0
- muvid-0.0.2.dist-info/RECORD +26 -0
- muvid-0.0.2.dist-info/WHEEL +4 -0
- muvid-0.0.2.dist-info/entry_points.txt +2 -0
- muvid-0.0.2.dist-info/licenses/LICENSE +21 -0
muvid/__init__.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""mtv — tools to make music videos.
|
|
2
|
+
|
|
3
|
+
Public surface (also the CLI verbs):
|
|
4
|
+
|
|
5
|
+
init_project, transcribe_song, align_lyrics,
|
|
6
|
+
add_character, add_character_images, generate_character_images,
|
|
7
|
+
curate_character,
|
|
8
|
+
add_environment, render_environment,
|
|
9
|
+
write_script, parse_script,
|
|
10
|
+
render_shot, render, compose, status,
|
|
11
|
+
|
|
12
|
+
Project model:
|
|
13
|
+
|
|
14
|
+
MusicVideoProject(root) — folder-backed project facade
|
|
15
|
+
ProjectSpec, SongInfo, SectionSpec, ShotSpec,
|
|
16
|
+
CharacterRef, EnvironmentRef
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations

# Relative imports: this package is shipped as ``muvid``, so the previous
# absolute ``mtv.*`` imports only resolved when a separate ``mtv`` package
# happened to be installed.  Relative imports work under either name.
from .facade import (
    add_character,
    add_character_images,
    add_environment,
    align_lyrics,
    compose,
    curate_character,
    generate_character_images,
    init_project,
    parse_script,
    render,
    render_environment,
    render_shot,
    status,
    transcribe_song,
    write_script,
)
from .project import MusicVideoProject
from .schema import (
    CharacterRef,
    EnvironmentRef,
    ProjectSpec,
    SectionSpec,
    ShotSpec,
    SongInfo,
)

__all__ = [
    # high-level facade
    "add_character",
    "add_character_images",
    "add_environment",
    "align_lyrics",
    "compose",
    "curate_character",
    "generate_character_images",
    "init_project",
    "parse_script",
    "render",
    "render_environment",
    "render_shot",
    "status",
    "transcribe_song",
    "write_script",
    # data model
    "CharacterRef",
    "EnvironmentRef",
    "MusicVideoProject",
    "ProjectSpec",
    "SectionSpec",
    "ShotSpec",
    "SongInfo",
]
|
muvid/__main__.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""mtv CLI — argh dispatch over the top-level facade.
|
|
2
|
+
|
|
3
|
+
Run ``mtv --help`` after install. Every verb is the same Python
|
|
4
|
+
function the skill and UI call.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations

import json as _json

# Relative import: the package is distributed as ``muvid``; the former
# absolute ``from mtv import facade`` fails unless a separate ``mtv``
# package is installed.  This resolves when run as ``python -m <package>``
# or through a console-script entry point.
from . import facade
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _print_json(obj):
    """Print *obj* to stdout as 2-space-indented JSON.

    Values the ``json`` module cannot encode natively are converted with
    ``str`` (``default=str``) instead of raising.
    """
    rendered = _json.dumps(obj, indent=2, default=str)
    print(rendered)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def init(root: str, *, title: str = "", song: str = "") -> None:
    """Create a new music video project at ROOT (optionally with a song)."""
    # An empty ``song`` (the CLI default) is forwarded to the facade as None.
    song_arg = song if song else None
    print(facade.init_project(root, title=title, song=song_arg))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def transcribe(root: str, *, api_key: str = "") -> None:
    """Run ElevenLabs Scribe on the project's song; writes lyrics/transcript.json."""
    # An empty ``api_key`` (the CLI default) is forwarded to the facade as None.
    key = api_key if api_key else None
    result = facade.transcribe_song(root, api_key=key)
    print(result)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def align(root: str) -> None:
    """Build lyrics/alignment.annot from transcript + lyrics.md."""
    # Print whatever the facade returns for the project at ``root``.
    result = facade.align_lyrics(root)
    print(result)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def character(
    root: str,
    name: str,
    *,
    description: str = "",
    voice_id: str = "",
    reference_audio_url: str = "",
) -> None:
    """Create or update a character card."""
    # All options are passed through unchanged; the facade's return value
    # is printed as JSON.
    card = facade.add_character(
        root,
        name,
        description=description,
        voice_id=voice_id,
        reference_audio_url=reference_audio_url,
    )
    _print_json(card)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def character_images(root: str, name: str, *paths: str) -> None:
    """Drop existing image files into characters/<name>/refs/."""
    # The facade receives a list, not the varargs tuple.
    image_paths = [*paths]
    _print_json(facade.add_character_images(root, name, image_paths))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def character_generate(
    root: str, name: str, *, n: int = 6, quality: str = "balanced"
) -> None:
    """Generate N reference images for a character via fal."""
    # Print the facade's return value as JSON.
    generated = facade.generate_character_images(
        root, name, n=n, quality=quality
    )
    _print_json(generated)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def character_curate(
    root: str, name: str, *, k: int = 8, recipe: str = "person_mock"
) -> None:
    """Run lookbook to select K best reference images."""
    # Print the facade's return value as JSON.
    selection = facade.curate_character(root, name, k=k, recipe=recipe)
    _print_json(selection)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def environment(
    root: str,
    name: str,
    *,
    description: str = "",
    time_of_day: str = "",
    lighting: str = "",
) -> None:
    """Create or update an environment card."""
    # All options are passed through unchanged; the facade's return value
    # is printed as JSON.
    card = facade.add_environment(
        root,
        name,
        description=description,
        time_of_day=time_of_day,
        lighting=lighting,
    )
    _print_json(card)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def environment_render(root: str, name: str, *, quality: str = "high") -> None:
    """Generate the canonical establishing image for an environment."""
    # Print whatever the facade returns for this render.
    rendered = facade.render_environment(root, name, quality=quality)
    print(rendered)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def script(root: str) -> None:
    """Render the project's sections+shots to script/script.md."""
    # Print whatever the facade returns after writing the script.
    written = facade.write_script(root)
    print(written)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def script_apply(root: str) -> None:
    """Parse script/script.md and upsert sections+shots into project.json."""
    # The facade's return value is deliberately discarded; a fixed "ok" is
    # printed once the call returns without raising.
    facade.parse_script(root)
    print("ok")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def render(
    root: str, *, shot: str = "", quality: str = "balanced", force: bool = False
) -> None:
    """Render one shot (--shot ID) or all shots."""
    if not shot:
        # No shot id given: render everything and print one result per line.
        for rendered in facade.render(root, quality=quality, force=force):
            print(rendered)
        return
    # A single shot id was given: render just that shot.
    print(facade.render_shot(root, shot, quality=quality, force=force))
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def compose(
    root: str, *, out_name: str = "final.mp4", song_audio: bool = True
) -> None:
    """Concatenate rendered shots and (optionally) overlay song audio."""
    # The CLI option ``song_audio`` maps onto the facade's ``use_song_audio``.
    output = facade.compose(root, out_name=out_name, use_song_audio=song_audio)
    print(output)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def status(root: str) -> None:
    """Print a summary of the project's current state."""
    # Print the facade's status value as JSON.
    summary = facade.status(root)
    _print_json(summary)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def serve(root: str = ".", *, host: str = "127.0.0.1", port: int = 7800) -> None:
    """Launch the local web UI for managing a project."""
    # Imported inside the function so the other verbs never load the UI
    # module.  Relative import: the package ships as ``muvid``, so the
    # former ``mtv.ui.app`` target does not resolve from this wheel alone.
    from .ui.app import serve as _serve

    _serve(root=root, host=host, port=port)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main() -> None:
    """CLI entry point: expose every verb in this module through ``argh``.

    Raises:
        SystemExit: if ``argh`` is not installed.
    """
    try:
        import argh  # type: ignore
    except ImportError as exc:
        raise SystemExit("mtv CLI requires `argh`. pip install argh.") from exc

    # Each function becomes one sub-command, in this order.
    commands = [
        init,
        transcribe,
        align,
        character,
        character_images,
        character_generate,
        character_curate,
        environment,
        environment_render,
        script,
        script_apply,
        render,
        compose,
        status,
        serve,
    ]
    argh.dispatch_commands(commands)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
if __name__ == "__main__":
|
|
160
|
+
main()
|
muvid/align.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""Lyric → audio alignment.
|
|
2
|
+
|
|
3
|
+
We have:
|
|
4
|
+
- a transcript (Scribe / faster-whisper) with word-level (text, start, end)
|
|
5
|
+
- a user-edited ``LyricsDoc`` with section labels + line text + optional
|
|
6
|
+
manual line-start anchors
|
|
7
|
+
|
|
8
|
+
We want a ``lacing`` store with three tiers (sections, lines, words) so
|
|
9
|
+
the rest of the system can ask "which lines fall in shot X" without
|
|
10
|
+
re-implementing interval math.
|
|
11
|
+
|
|
12
|
+
Strategy (greedy token-match):
|
|
13
|
+
|
|
14
|
+
1. Tokenize each lyric line into normalized words.
|
|
15
|
+
2. Walk the transcript word stream once, assigning each transcript word
|
|
16
|
+
to the next unmatched lyric word that matches (case- and
|
|
17
|
+
punctuation-insensitive). Tolerate small mismatches (transcript
|
|
18
|
+
word missing in lyrics, vice versa) with a small lookahead window.
|
|
19
|
+
3. From the matched words, derive line ``[start, end]`` as
|
|
20
|
+
``(first_matched_word.start, last_matched_word.end)``. If a line has
|
|
21
|
+
*no* matched words, fall back to the user's manual anchor (if any),
|
|
22
|
+
then to a linear interpolation between neighboring anchored lines.
|
|
23
|
+
4. Sections inherit ``[start, end]`` from the union of their lines;
|
|
24
|
+
if the user provided explicit ``start_s`` / ``end_s`` on a section,
|
|
25
|
+
those win.
|
|
26
|
+
|
|
27
|
+
The result is written as a ``lacing.SqliteStore`` so it round-trips and
|
|
28
|
+
can be edited by other tools.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import re
|
|
34
|
+
from dataclasses import dataclass
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Iterable, Optional
|
|
37
|
+
from uuid import uuid4
|
|
38
|
+
|
|
39
|
+
from mtv.lyrics import LyricsDoc, words_from_transcript
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
_WORD_TOKEN_RE = re.compile(r"[a-z0-9']+")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _normalize(token: str) -> str:
    """Lowercase *token* and trim leading/trailing ``'`` and ``-`` characters.

    Apostrophes and hyphens inside the token are kept.
    """
    lowered = token.lower()
    return lowered.strip("'-")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _tokenize(text: str) -> list[str]:
    """Split *text* into normalized word tokens.

    The text is lowercased, every ``_WORD_TOKEN_RE`` match is extracted and
    each match is passed through ``_normalize``.  A match made only of
    apostrophes normalizes to ``""`` and is kept, so positions stay stable.
    """
    raw_tokens = _WORD_TOKEN_RE.findall(text.lower())
    return list(map(_normalize, raw_tokens))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
|
|
55
|
+
class WordAlignment:
|
|
56
|
+
"""One alignment between a lyric token and a transcript word."""
|
|
57
|
+
|
|
58
|
+
line_index: int
|
|
59
|
+
token_index: int # within the line
|
|
60
|
+
text: str
|
|
61
|
+
start_s: float
|
|
62
|
+
end_s: float
|
|
63
|
+
confidence: float = 1.0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
|
|
67
|
+
class LineAlignment:
|
|
68
|
+
line_index: int
|
|
69
|
+
section_label: str
|
|
70
|
+
text: str
|
|
71
|
+
start_s: float | None
|
|
72
|
+
end_s: float | None
|
|
73
|
+
word_alignments: tuple[WordAlignment, ...]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
|
|
77
|
+
class SectionAlignment:
|
|
78
|
+
label: str
|
|
79
|
+
title: str
|
|
80
|
+
start_s: float | None
|
|
81
|
+
end_s: float | None
|
|
82
|
+
lines: tuple[LineAlignment, ...]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass(frozen=True, slots=True, kw_only=True)
|
|
86
|
+
class AlignmentResult:
|
|
87
|
+
sections: tuple[SectionAlignment, ...]
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def lines(self) -> tuple[LineAlignment, ...]:
|
|
91
|
+
return tuple(L for s in self.sections for L in s.lines)
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def words(self) -> tuple[WordAlignment, ...]:
|
|
95
|
+
return tuple(w for L in self.lines for w in L.word_alignments)
|
|
96
|
+
|
|
97
|
+
def lines_in(self, start_s: float, end_s: float) -> list[LineAlignment]:
|
|
98
|
+
"""Lines that fall (at least partially) inside ``[start_s, end_s]``."""
|
|
99
|
+
out: list[LineAlignment] = []
|
|
100
|
+
for L in self.lines:
|
|
101
|
+
if L.start_s is None or L.end_s is None:
|
|
102
|
+
continue
|
|
103
|
+
if L.end_s > start_s and L.start_s < end_s:
|
|
104
|
+
out.append(L)
|
|
105
|
+
return out
|
|
106
|
+
|
|
107
|
+
def section_for(self, t: float) -> SectionAlignment | None:
|
|
108
|
+
for s in self.sections:
|
|
109
|
+
if s.start_s is None or s.end_s is None:
|
|
110
|
+
continue
|
|
111
|
+
if s.start_s <= t < s.end_s:
|
|
112
|
+
return s
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# --- core alignment -------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def align_lyrics(
    lyrics: LyricsDoc,
    transcript: dict,
    *,
    duration_s: float = 0.0,
    lookahead: int = 6,
) -> AlignmentResult:
    """Greedy token-match alignment of lyric lines against a transcript.

    Lyric tokens are consumed in order.  For each one, the transcript is
    scanned from the current cursor up to ``lookahead`` words ahead: first
    for an exact match (confidence 0.9), then for a ``_close_enough`` match
    (confidence 0.6).  A match advances the cursor past the matched word;
    a miss leaves the cursor where it is.

    Args:
        lyrics: Document providing ``lines`` (``line_index``, ``text``,
            ``section_label``, ``start_s``) and ``sections`` (``label``,
            ``title``, ``start_s``, ``end_s``).
        transcript: Raw transcript, turned into word dicts (``text``,
            ``start``, ``end``) by ``words_from_transcript``.
        duration_s: Used only when extrapolating end times for lines that
            have no matched words and no later anchor.
        lookahead: How many transcript words beyond the cursor may be
            skipped while searching for a match.

    Returns:
        An ``AlignmentResult`` with one ``SectionAlignment`` per entry of
        ``lyrics.sections``, in the same order.
    """
    transcript_words = words_from_transcript(transcript)
    transcript_tokens = [
        _normalize(w["text"].strip("()[],.?!\"")) for w in transcript_words
    ]

    def find_match(start: int, lyric_tok: str, *, fuzzy: bool) -> int:
        """Index of the first match in the lookahead window, or -1."""
        stop = min(start + lookahead + 1, len(transcript_tokens))
        for j in range(start, stop):
            candidate = transcript_tokens[j]
            hit = _close_enough(candidate, lyric_tok) if fuzzy else candidate == lyric_tok
            if hit:
                return j
        return -1

    # Greedy walk over every lyric token of every line, in document order.
    word_alignments_per_line: dict[int, list[WordAlignment]] = {}
    cursor = 0
    for L in lyrics.lines:
        for tok_idx, lyric_tok in enumerate(_tokenize(L.text)):
            if not lyric_tok:
                # Token normalized to "" (e.g. a lone apostrophe).
                continue
            match_at = find_match(cursor, lyric_tok, fuzzy=False)
            exact = match_at >= 0
            if not exact:
                # tolerate a 1-character substitution (sung mishears)
                match_at = find_match(cursor, lyric_tok, fuzzy=True)
            if match_at < 0:
                continue
            w = transcript_words[match_at]
            word_alignments_per_line.setdefault(L.line_index, []).append(
                WordAlignment(
                    line_index=L.line_index,
                    token_index=tok_idx,
                    text=w["text"],
                    start_s=float(w["start"]),
                    end_s=float(w["end"]),
                    confidence=0.9 if exact else 0.6,
                )
            )
            cursor = match_at + 1

    # Reduce to line alignments.  Lines without any matched word keep the
    # user's manual start anchor (possibly None) and get their missing
    # times filled in by ``_interpolate_line_times``.
    line_alignments: list[LineAlignment] = []
    for L in lyrics.lines:
        wal = tuple(word_alignments_per_line.get(L.line_index, ()))
        if wal:
            start = wal[0].start_s
            end = wal[-1].end_s
        else:
            start = L.start_s
            end = None
        line_alignments.append(
            LineAlignment(
                line_index=L.line_index,
                section_label=L.section_label,
                text=L.text,
                start_s=start,
                end_s=end,
                word_alignments=wal,
            )
        )

    line_alignments = _interpolate_line_times(line_alignments, duration_s)

    # Group lines back into sections by label.
    # NOTE(review): lines are keyed by label only, so two sections sharing
    # a label would each receive the lines of both — confirm labels are
    # unique in LyricsDoc.
    by_label: dict[str, list[LineAlignment]] = {}
    for la in line_alignments:
        by_label.setdefault(la.section_label, []).append(la)

    section_alignments: list[SectionAlignment] = []
    for s in lyrics.sections:
        lines_for = tuple(by_label.get(s.label, ()))
        starts = [ln.start_s for ln in lines_for if ln.start_s is not None]
        ends = [ln.end_s for ln in lines_for if ln.end_s is not None]
        # Explicit section times win; otherwise use the span of the lines.
        s_start = s.start_s if s.start_s is not None else (min(starts) if starts else None)
        s_end = s.end_s if s.end_s is not None else (max(ends) if ends else None)
        section_alignments.append(
            SectionAlignment(
                label=s.label,
                title=s.title,
                start_s=s_start,
                end_s=s_end,
                lines=lines_for,
            )
        )
    return AlignmentResult(sections=tuple(section_alignments))
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _close_enough(a: str, b: str) -> bool:
    """Tolerate trivial sung-vs-said variants.

    Two tokens are close enough when both are at least 3 characters long,
    their lengths differ by at most one, they share the same first and
    last character, and ``_lev1`` accepts them.  Empty tokens never match.
    """
    if not a or not b:
        return False
    length_gap_too_big = abs(len(a) - len(b)) > 1
    ends_differ = a[0] != b[0] or a[-1] != b[-1]
    too_short = min(len(a), len(b)) < 3
    if length_gap_too_big or ends_differ or too_short:
        return False
    # Levenshtein ≤ 1
    return _lev1(a, b)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _lev1(a: str, b: str) -> bool:
    """Return True when *a* and *b* are within one single-character edit.

    Equal strings pass.  Same-length strings pass with at most one
    differing position.  Otherwise the strings pass only if deleting one
    character from the longer one yields the shorter one.
    """
    if a == b:
        return True
    if len(a) == len(b):
        mismatches = 0
        for x, y in zip(a, b):
            if x != y:
                mismatches += 1
        return mismatches <= 1
    # one-char insert/delete (lengths are known to differ here)
    short, long = sorted((a, b), key=len)
    return any(long[:i] + long[i + 1:] == short for i in range(len(long)))
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _interpolate_line_times(
    lines: list[LineAlignment], duration_s: float
) -> list[LineAlignment]:
    """Fill missing line start/end times from neighbouring lines.

    Neighbours are always read from the *input* list, never from lines
    already filled in by this call.

    * Missing start: walk backwards to the nearest earlier line that has an
      end (use it) or, failing that, a start (use it); with no such line
      the start defaults to ``0.0``.
    * Missing end: use the start of the nearest later line that has one;
      with no such line use ``duration_s`` when it is set and lies after
      the start, else ``start + 0.5``.

    Args:
        lines: Line alignments in document order.
        duration_s: Total duration in seconds; ``0`` means unknown.

    Returns:
        A new list of copies with ``start_s`` and ``end_s`` both set.  An
        empty input is returned unchanged.
    """
    # Local import keeps this block self-contained; ``replace`` copies every
    # other field of the frozen dataclass unchanged.
    from dataclasses import replace

    if not lines:
        return lines
    out: list[LineAlignment] = []
    for i, L in enumerate(lines):
        start = L.start_s
        end = L.end_s
        if start is None:
            # Look back for a previous end/start; if none, default to 0.0.
            for j in range(i - 1, -1, -1):
                if lines[j].end_s is not None:
                    start = lines[j].end_s
                    break
                if lines[j].start_s is not None:
                    start = lines[j].start_s
                    break
            if start is None:
                start = 0.0
        if end is None:
            # Look ahead for the next known start.
            for j in range(i + 1, len(lines)):
                if lines[j].start_s is not None:
                    end = lines[j].start_s
                    break
            if end is None:
                end = duration_s if duration_s and duration_s > start else start + 0.5
        out.append(replace(L, start_s=start, end_s=end))
    return out
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
# --- lacing serialization -------------------------------------------------
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def write_alignment_store(
    alignment: AlignmentResult,
    *,
    path: str | Path,
    asset_id: str = "song:audio",
    rate: int = 1000,
) -> None:
    """Write alignment to a ``lacing.SqliteStore`` file (.annot).

    Any existing file at *path* is deleted first.  Three tiers are created
    (``sections``, ``lines``, ``words``) and filled in that order.
    Sections and lines lacking a start or an end are skipped.  Body
    schemas use custom URIs registered locally; we don't enforce them here
    (the store still validates structure).

    Args:
        alignment: The alignment to persist.
        path: Destination file.
        asset_id: Asset identifier stored on every annotation's media
            reference.
        rate: Ticks per second used to express times as rational values.
    """
    from lacing import (
        Annotation,
        MediaRef,
        Provenance,
        RationalTime,
        SqliteStore,
        Tier,
        TimeInterval,
    )

    target = Path(path)
    if target.exists():
        target.unlink()
    store = SqliteStore(str(target))
    try:
        for tier_name in ("sections", "lines", "words"):
            store.add_tier(Tier(name=tier_name))

        prov = Provenance(
            was_generated_by="mtv:align",
            was_attributed_to="mtv",
            generated_at_time=RationalTime.zero(rate),
        )

        def interval(start_s: float, end_s: float) -> TimeInterval:
            """Build a non-empty tick interval from seconds."""
            if end_s <= start_s:
                end_s = start_s + 1.0 / rate
            # Round to the integer rate-tick so we never trip lacing's
            # strict "lossy conversion" guard for decimals like 14.2 at
            # rate=1000 (14.2 → 14199.999... ticks).
            start_t = int(round(start_s * rate))
            end_t = int(round(end_s * rate))
            if end_t <= start_t:
                # Rounding collapsed the interval; keep it one tick wide.
                end_t = start_t + 1
            return TimeInterval(
                RationalTime(start_t, rate),
                RationalTime(end_t, rate),
            )

        def emit(tier: str, start_s: float, end_s: float, body: dict, schema_uri: str) -> None:
            """Add one annotation with a fresh id to *tier*."""
            store.add(
                Annotation(
                    id=uuid4(),
                    tier=tier,
                    reference=MediaRef(
                        asset_id=asset_id,
                        interval=interval(start_s, end_s),
                    ),
                    body=body,
                    body_schema_uri=schema_uri,
                    provenance=prov,
                )
            )

        for section in alignment.sections:
            if section.start_s is None or section.end_s is None:
                continue
            emit(
                "sections",
                section.start_s,
                section.end_s,
                {"label": section.label, "title": section.title},
                "annot://schema/song-section/v1",
            )

        for line in alignment.lines:
            if line.start_s is None or line.end_s is None:
                continue
            emit(
                "lines",
                line.start_s,
                line.end_s,
                {
                    "text": line.text,
                    "line_index": line.line_index,
                    "section": line.section_label,
                },
                "annot://schema/lyric-line/v1",
            )

        for word in alignment.words:
            emit(
                "words",
                word.start_s,
                word.end_s,
                {
                    "text": word.text,
                    "line_index": word.line_index,
                    "confidence": word.confidence,
                },
                "annot://schema/word/v1",
            )
    finally:
        store.close()
|