polysync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polysync/__init__.py +19 -0
- polysync/audio.py +130 -0
- polysync/cli.py +79 -0
- polysync/edit/__init__.py +9 -0
- polysync/edit/autoedit.py +321 -0
- polysync/edit/render_cuts.py +72 -0
- polysync/edit/render_pip.py +141 -0
- polysync/sidecar.py +88 -0
- polysync/sync.py +206 -0
- polysync/verify.py +118 -0
- polysync-0.1.0.dist-info/METADATA +115 -0
- polysync-0.1.0.dist-info/RECORD +16 -0
- polysync-0.1.0.dist-info/WHEEL +5 -0
- polysync-0.1.0.dist-info/entry_points.txt +2 -0
- polysync-0.1.0.dist-info/licenses/LICENSE +21 -0
- polysync-0.1.0.dist-info/top_level.txt +1 -0
polysync/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""polysync — multicam audio sync + director-style auto-edit.
|
|
2
|
+
|
|
3
|
+
Align N recordings of one event by audio cross-correlation (envelope-based,
|
|
4
|
+
robust at low SNR), emit reversible `.sync.json` sidecars (originals are never
|
|
5
|
+
re-encoded), then auto-cut / picture-in-picture them into a single MP4.
|
|
6
|
+
|
|
7
|
+
Public API:
|
|
8
|
+
from polysync import compute_sync, SyncResult, SyncError
|
|
9
|
+
from polysync.sidecar import read_sidecar, write_sidecar
|
|
10
|
+
"""
|
|
11
|
+
from .sync import compute_sync, SyncResult, SyncError
|
|
12
|
+
from .sidecar import read_sidecar, write_sidecar, sidecar_path, SCHEMA_VERSION
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__all__ = [
|
|
16
|
+
"compute_sync", "SyncResult", "SyncError",
|
|
17
|
+
"read_sidecar", "write_sidecar", "sidecar_path", "SCHEMA_VERSION",
|
|
18
|
+
"__version__",
|
|
19
|
+
]
|
polysync/audio.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Shared audio primitives — the pieces sync, verify, and edit all need.
|
|
2
|
+
|
|
3
|
+
Everything here is either pure numpy/scipy (unit-testable without media) or a
|
|
4
|
+
thin ffmpeg/ffprobe wrapper. Keeping these in one place is the whole reason
|
|
5
|
+
polysync is a package and not three copy-pasted scripts.
|
|
6
|
+
"""
|
|
7
|
+
import subprocess
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from scipy import signal
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def loudest_audio_stream(video_path):
    """Return the index N of the audio stream (`0:a:N`) with the highest mean
    volume, probed with ffmpeg's `volumedetect` over a window of up to 60 s.

    Why this matters: pro cameras often record multiple audio tracks where the
    first one is dead. Sony FX6 MXF clips carry 4 mono PCM tracks and commonly
    leave a:0 / a:1 silent (~-90 dB) with the real room mic on a:2 / a:3.
    Hard-coding `0:a:0` would cross-correlate silence and fail to sync, so pick
    the loudest track instead. Single-stream files (most MP4 cams) short-circuit
    to a:0.

    The probe window starts at 300 s when the file is long enough to hold the
    whole window there. For shorter files the window is centred in the file
    instead, because seeking past the end leaves nothing to measure and the
    function would silently fall back to a:0.

    Raises `subprocess.CalledProcessError` if the initial ffprobe call fails.
    """
    video_path = Path(video_path)
    streams = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", "a",
         "-show_entries", "stream=index", "-of", "csv=p=0", str(video_path)],
        check=True, capture_output=True, text=True,
    ).stdout.strip().splitlines()
    if len(streams) <= 1:
        return 0

    probe_start, probe_len = 300.0, 60.0
    try:
        duration = media_duration(video_path)
    except (subprocess.CalledProcessError, ValueError):
        # Unknown length: keep the fixed seek rather than guessing.
        duration = None
    if duration is not None and duration < probe_start + probe_len:
        probe_start = max(0.0, (duration - probe_len) / 2.0)

    best_idx, best_db = 0, -1e9
    for ch in range(len(streams)):
        # volumedetect reports on stderr; the exit status is deliberately not
        # checked so one unreadable track does not abort the whole probe.
        err = subprocess.run(
            ["ffmpeg", "-nostdin", "-hide_banner",
             "-ss", "%g" % probe_start, "-t", "%g" % probe_len,
             "-i", str(video_path), "-map", "0:a:%d" % ch,
             "-af", "volumedetect", "-f", "null", "-"],
            capture_output=True, text=True,
        ).stderr
        for line in err.splitlines():
            if "mean_volume" in line:
                try:
                    db = float(line.split("mean_volume:")[1].strip().split()[0])
                except (IndexError, ValueError):
                    db = -1e9
                if db > best_db:
                    best_db, best_idx = db, ch
                break  # only the first mean_volume line per track is used
    print("  [%s] loudest audio stream: a:%d (%.1f dB)"
          % (video_path.name, best_idx, best_db))
    return best_idx
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def extract_pcm(video_path, dst, sr, stream=None):
    """Write one audio track of `video_path` to `dst` as raw mono s16le PCM.

    `sr` is the output sample rate in Hz. `stream` is the `0:a:N` index to
    extract; when it is None the track is chosen by `loudest_audio_stream`.
    No `-itsoffset` is passed here, so the samples are written without any
    time shift. ffmpeg's stderr is discarded; a non-zero exit raises
    `subprocess.CalledProcessError`.
    """
    src = Path(video_path)
    if stream is None:
        track = loudest_audio_stream(src)
    else:
        track = stream
    command = ["ffmpeg", "-nostdin", "-y", "-i", str(src)]
    command += ["-map", "0:a:%d" % track]
    command += ["-ac", "1", "-ar", str(sr)]
    command += ["-f", "s16le", str(dst)]
    subprocess.run(command, check=True, stderr=subprocess.DEVNULL)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def read_pcm(path):
    """Load a raw int16 sample file and return the samples as float32.

    The values are converted, not rescaled: they keep the int16 range.
    """
    samples = np.fromfile(str(path), dtype=np.int16)
    return samples.astype(np.float32)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def media_duration(path):
    """Return the `format=duration` value ffprobe reports for `path`, as a float.

    Raises `subprocess.CalledProcessError` when ffprobe exits non-zero and
    `ValueError` when its output is not a number.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=nw=1:nk=1",
        str(path),
    ]
    probe = subprocess.run(probe_cmd, check=True, capture_output=True, text=True)
    return float(probe.stdout.strip())
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def frame_rms(x, sr, hop_ms=10, win_ms=50):
    """Sliding-window RMS of `x`. Returns (rms_per_frame, frame_sr_hz).

    `x` is a 1-D sample sequence at `sr` Hz. Frames are `win_ms` long and
    start every `hop_ms`; only frames that fit entirely inside `x` are
    produced, so an input shorter than one window yields an empty array.
    The result is float32 and `frame_sr_hz` is `sr / hop`.

    Uses a cumulative-sum trick so the cost does not grow with the window
    size, and reads all window sums in one vectorised lookup instead of a
    Python loop. This is the shared primitive behind both the sync envelope
    (log of this, high-passed) and the edit per-second loudness.

    Raises ValueError when `sr`, `hop_ms` or `win_ms` are so small that the
    hop or window rounds down to zero samples.
    """
    hop = int(sr * hop_ms / 1000)
    win = int(sr * win_ms / 1000)
    if hop <= 0 or win <= 0:
        raise ValueError(
            "hop/window round to zero samples (sr=%r, hop_ms=%r, win_ms=%r)"
            % (sr, hop_ms, win_ms))
    n = (len(x) - win) // hop + 1
    if n <= 0:
        return np.zeros(0, dtype=np.float32), sr / hop
    # float64 keeps the running sum of squares accurate over long inputs.
    sq = np.asarray(x, dtype=np.float64) ** 2
    csq = np.concatenate([[0.0], np.cumsum(sq)])
    starts = np.arange(n) * hop
    mean_sq = (csq[starts + win] - csq[starts]) / win
    # The 1e-9 floor keeps sqrt away from zero or slightly negative values
    # left by cancellation in the cumulative sum.
    rms = np.sqrt(np.maximum(1e-9, mean_sq)).astype(np.float32)
    return rms, sr / hop
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def log_envelope(x, sr, hop_ms=10, win_ms=50, highpass_hz=0.05):
    """Return (envelope, frame_rate_hz): the log of the framed RMS of `x`.

    The RMS comes from `frame_rms(x, sr, hop_ms, win_ms)`; 1e-3 is added
    before the log. When `highpass_hz` is truthy the envelope is then passed
    through `highpass` at that cutoff to remove slow gain/drift offsets;
    pass 0 or None to skip the filter.

    This is what sync cross-correlates: it captures dialogue/music dynamics
    that BOTH mics hear regardless of their frequency response.
    """
    frame_energy, frame_rate = frame_rms(x, sr, hop_ms, win_ms)
    envelope = np.log(frame_energy + 1e-3)
    if not highpass_hz:
        return envelope, frame_rate
    return highpass(envelope, frame_rate, highpass_hz), frame_rate
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def highpass(x, fs, cut_hz=0.05):
    """High-pass `x` (sampled at `fs` Hz) at `cut_hz` and return float32.

    Uses a 2nd-order Butterworth design in second-order-section form, applied
    with `sosfiltfilt`.
    """
    sections = signal.butter(2, cut_hz, btype="high", fs=fs, output="sos")
    filtered = signal.sosfiltfilt(sections, x)
    return filtered.astype(np.float32)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def normalize(x):
    """Return `x` with its mean removed and scaled to unit standard deviation.

    A constant input (zero standard deviation) is only mean-centred, which
    avoids a division by zero.
    """
    centered = x - x.mean()
    scale = centered.std()
    if scale > 0:
        return centered / scale
    return centered
|
polysync/cli.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""`polysync` command-line entry point.
|
|
2
|
+
|
|
3
|
+
polysync sync REFERENCE SOURCE [--partial]
|
|
4
|
+
polysync verify REFERENCE SOURCE SIDECAR [--apply-drift]
|
|
5
|
+
polysync edit IN1 IN2 ... --out edl.json [--mode rotation|greedy]
|
|
6
|
+
polysync render-cuts EDL --out out.mp4
|
|
7
|
+
polysync render-pip EDL --out out.mp4 [--pip bottom-right]
|
|
8
|
+
"""
|
|
9
|
+
import argparse
|
|
10
|
+
import sys
|
|
11
|
+
|
|
12
|
+
from . import __version__
|
|
13
|
+
from .sync import sync_files, SyncError
|
|
14
|
+
from .verify import verify_files
|
|
15
|
+
from .edit import autoedit, render_cuts, render_pip
|
|
16
|
+
|
|
17
|
+
USAGE = __doc__
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _cmd_sync(argv):
    """Handle `polysync sync`: parse `argv` and call `sync_files`.

    Returns 0 on success, or 1 after printing the message to stderr when
    `sync_files` raises `SyncError`.
    """
    parser = argparse.ArgumentParser(prog="polysync sync")
    parser.add_argument(
        "reference", help="Reference recording (defines the timeline)")
    parser.add_argument("source", help="Source to align to the reference")
    partial_help = ("Lenient mode for a source covering only part of the "
                    "reference's span; degrades gracefully, writes only the "
                    "source sidecar.")
    parser.add_argument("--partial", action="store_true", help=partial_help)
    opts = parser.parse_args(argv)

    try:
        sync_files(opts.reference, opts.source, partial=opts.partial)
    except SyncError as exc:
        print("ERROR: %s" % exc, file=sys.stderr)
        return 1
    else:
        return 0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _cmd_verify(argv):
    """Handle `polysync verify`: parse `argv` and call `verify_files`.

    Returns 0 when the first element of `verify_files`' result is truthy,
    1 when it is not, and 2 after printing the message to stderr when
    `verify_files` raises `ValueError`.
    """
    parser = argparse.ArgumentParser(prog="polysync verify")
    for positional in ("reference", "source"):
        parser.add_argument(positional)
    parser.add_argument("sidecar", help="The source's <source>.sync.json")
    parser.add_argument("--apply-drift", action="store_true")
    parser.add_argument("--step", type=float, default=600.0,
                        help="Probe spacing in seconds (default 10 min)")
    opts = parser.parse_args(argv)

    try:
        outcome = verify_files(opts.reference, opts.source, opts.sidecar,
                               step=opts.step, apply_drift=opts.apply_drift)
        passed, _ = outcome
    except ValueError as exc:
        print("ERROR: %s" % exc, file=sys.stderr)
        return 2
    if passed:
        return 0
    return 1
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def main(argv=None):
    """Entry point for the `polysync` command; returns the process exit code.

    `argv` defaults to `sys.argv[1:]`. With no arguments, or with `-h`,
    `--help` or `help` first, the usage text is printed. `-V` / `--version`
    prints the version. Otherwise the first argument selects a subcommand and
    the remaining arguments are passed to it; an unknown subcommand prints
    usage to stderr and returns 2.
    """
    args = list(sys.argv[1:] if argv is None else argv)
    if not args:
        print(USAGE)
        return 0

    name, rest = args[0], args[1:]
    if name in ("-h", "--help", "help"):
        print(USAGE)
        return 0
    if name in ("-V", "--version"):
        print("polysync %s" % __version__)
        return 0

    def _via(module):
        # The edit/render mains may return None; map that to exit code 0.
        return lambda a: module.main(a) or 0

    handlers = {
        "sync": _cmd_sync,
        "verify": _cmd_verify,
        "edit": _via(autoedit),
        "render-cuts": _via(render_cuts),
        "render-pip": _via(render_pip),
    }
    handler = handlers.get(name)
    if handler is None:
        print("Unknown command %r.\n%s" % (name, USAGE), file=sys.stderr)
        return 2
    return handler(rest)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
if __name__ == "__main__":
|
|
79
|
+
sys.exit(main())
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Director-style multicam auto-edit on top of polysync sidecars.
|
|
2
|
+
|
|
3
|
+
autoedit — build an EDL (which cam is on screen each second) from synced inputs
|
|
4
|
+
render_cuts — render the EDL to one MP4 (hard cuts)
|
|
5
|
+
render_pip — render the EDL with a picture-in-picture inset
|
|
6
|
+
"""
|
|
7
|
+
from .autoedit import build_edl
|
|
8
|
+
|
|
9
|
+
__all__ = ["build_edl"]
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""Build a director-style EDL from N synced camera angles.
|
|
2
|
+
|
|
3
|
+
Inputs are ORIGINAL untouched media; each should have a `<input>.sync.json`
|
|
4
|
+
sidecar (from `polysync sync`). Sidecars give per-cam `delta_seconds` and
|
|
5
|
+
`overlap_in_reference`. Missing sidecar => cam assumed at delta=0, full coverage.
|
|
6
|
+
|
|
7
|
+
Decisions are audio-energy-driven only: per second, the cam whose mic is
|
|
8
|
+
loudest relative to the others (active-speaker proxy) wins, subject to dwell
|
|
9
|
+
hysteresis and coverage. No face/framing detection.
|
|
10
|
+
"""
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import tempfile
|
|
14
|
+
import warnings
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
from .. import audio
|
|
20
|
+
from ..sidecar import read_sidecar, SCHEMA_VERSION
|
|
21
|
+
|
|
22
|
+
SR = 16000
|
|
23
|
+
FRAME_HZ = 1.0
|
|
24
|
+
ENV_HOP_MS = 100
|
|
25
|
+
ENV_WIN_MS = 200
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _per_sec_envelope(x):
    """Return (log-RMS envelope of `x`, envelope frame rate in Hz).

    The envelope is framed with ENV_HOP_MS / ENV_WIN_MS at sample rate SR, so
    it still has several frames per second; `_lift_to_reference` is what
    averages it down to one value per second.
    """
    frame_energy, frame_rate = audio.frame_rms(
        x, SR, hop_ms=ENV_HOP_MS, win_ms=ENV_WIN_MS)
    return np.log(frame_energy + 1e-3), frame_rate
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _lift_to_reference(env, env_sr, delta_sec, total_ref_sec):
|
|
36
|
+
"""Lift a cam-local per-frame envelope into the reference timeline at 1 Hz.
|
|
37
|
+
|
|
38
|
+
Reference second t reads the cam's local second (t - delta_sec). Seconds
|
|
39
|
+
outside the cam's recorded range become -inf so the editor never picks them.
|
|
40
|
+
"""
|
|
41
|
+
n_per = int(env_sr / FRAME_HZ)
|
|
42
|
+
take = (len(env) // n_per) * n_per
|
|
43
|
+
local = env[:take].reshape(-1, n_per).mean(axis=1) if take else np.zeros(0)
|
|
44
|
+
out = np.full(total_ref_sec, -np.inf, dtype=np.float32)
|
|
45
|
+
for t in range(total_ref_sec):
|
|
46
|
+
tl = int(t - delta_sec)
|
|
47
|
+
if 0 <= tl < len(local):
|
|
48
|
+
out[t] = local[tl]
|
|
49
|
+
return out
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _coverage_from_sidecar(input_path, total):
    """Return the (start, end) coverage of `input_path`, clamped to [0, total].

    The overlap is the second value returned by `read_sidecar`. When it is
    None the cam is treated as covering the whole `[0, total]` span.
    """
    _delta, overlap, _found = read_sidecar(input_path)
    if overlap is not None:
        start = max(0.0, overlap[0])
        end = min(float(total), overlap[1])
        return (start, end)
    return (0.0, float(total))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _parse_coverage_flag(values, k_total, total):
|
|
60
|
+
cov = [(0.0, float(total))] * k_total
|
|
61
|
+
for v in (values or []):
|
|
62
|
+
parts = v.split(":")
|
|
63
|
+
if len(parts) != 3:
|
|
64
|
+
raise SystemExit("--coverage expects CAM:START:END, got %r" % v)
|
|
65
|
+
k = int(parts[0])
|
|
66
|
+
if not (0 <= k < k_total):
|
|
67
|
+
raise SystemExit("--coverage cam %d out of range" % k)
|
|
68
|
+
cov[k] = (float(parts[1]), float(parts[2]))
|
|
69
|
+
return cov
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _covered_at(cov, t):
|
|
73
|
+
return [k for k, (s, e) in enumerate(cov) if s <= t < e]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def rotation_edit(scores, coverage, min_dwell=8, max_dwell=15,
                  opening_dwell=10, seed=42):
    """Alternate among covered cams with varying dwell; force a switch when the
    active cam leaves coverage.

    `scores` is a (T, K) array of per-second scores for K cams and `coverage`
    is a list of K `(start, end)` spans. Each shot lasts a random number of
    seconds drawn from `min_dwell .. max_dwell` (inclusive) using a generator
    seeded with `seed`, so the result is reproducible. The opening cam is the
    covered cam with the best mean score over the first `opening_dwell`
    seconds; each later cut goes to the covered cam, other than the current
    one when possible, with the best mean score over the next 6 seconds.

    Returns an int32 array of length T holding the chosen cam per second.
    Raises SystemExit if no cam is covered at any second.
    """
    T, _ = scores.shape
    rng = np.random.default_rng(seed)
    seq = np.full(T, -1, dtype=np.int32)

    def best_at(t, candidates, win=opening_dwell):
        end = min(T, t + win)
        return max(candidates,
                   key=lambda k: scores[t:end, k].mean() if end > t else scores[t, k])

    # The overlap window often starts a few seconds in (no cam covers t=0).
    # Open at the first covered second; unfilled seconds are backfilled below.
    cur_set = _covered_at(coverage, 0)
    t_open = 0
    if not cur_set:
        t_open = next((t for t in range(T) if _covered_at(coverage, t)), -1)
        if t_open < 0:
            raise SystemExit("No camera is covered at any time")
        cur_set = _covered_at(coverage, t_open)
    cur = best_at(t_open, cur_set)
    t = t_open
    while t < T:
        # Clamp to >= 1: a zero or negative dwell places no frame, so `t`
        # would never advance and the loop would spin forever.
        dwell = max(1, int(rng.integers(min_dwell, max_dwell + 1)))
        end = t
        while end < t + dwell and end < T:
            if cur not in _covered_at(coverage, end):
                break
            seq[end] = cur
            end += 1
        if end >= T:
            break
        covered = _covered_at(coverage, end)
        # Prefer a different cam; fall back to any covered cam (possibly the
        # current one) when it is the only choice.
        cands = [k for k in covered if k != cur] or covered
        if not cands:
            # Nobody covers this second: hold the current cam and move on.
            seq[end] = cur
            t = end + 1
            continue
        cur = best_at(end, cands, win=6)
        t = end
    for t in range(T):
        if seq[t] == -1:
            cands = _covered_at(coverage, t)
            seq[t] = cands[0] if cands else 0
    return seq
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def greedy_edit(scores, coverage, min_dwell=4, max_dwell=18, lookahead=4,
                switch_threshold=0.0, opening_dwell=8):
    """Greedy hard-cut editor with min/max dwell hysteresis.

    `scores` is a (T, K) array of per-second scores and `coverage` a list of K
    `(start, end)` spans. Starting from the first covered second, each second
    either holds the current cam or cuts to the covered cam with the best mean
    score over the next `lookahead` seconds. A cut is forced when the current
    cam leaves coverage or has been held for `max_dwell` seconds, is forbidden
    before `min_dwell` seconds, and otherwise happens only when the best other
    cam beats the current one by more than `switch_threshold`.

    Returns an int32 array of length T. Raises SystemExit if no cam is covered
    at any second.
    """
    T, _ = scores.shape

    def win_mean(t, k, w):
        stop = min(T, t + w)
        if stop > t:
            return scores[t:stop, k].mean()
        return scores[t, k]

    seq = np.full(T, -1, dtype=np.int32)

    opening = _covered_at(coverage, 0)
    t_open = 0
    if not opening:
        t_open = next((t for t in range(T) if _covered_at(coverage, t)), -1)
        if t_open < 0:
            raise SystemExit("No camera is covered at any time")
        opening = _covered_at(coverage, t_open)
    seq[t_open] = max(opening, key=lambda k: win_mean(t_open, k, opening_dwell))

    streak = 1
    for t in range(t_open + 1, T):
        cur = seq[t - 1]
        covered = _covered_at(coverage, t)
        others = [k for k in covered if k != cur]

        if cur not in covered:
            # Current cam dropped out: cut to the best covered cam, or hold
            # the current one when nothing is covered.
            pool = others or covered
            if pool:
                seq[t] = max(pool, key=lambda k: win_mean(t, k, lookahead))
                streak = 1
            else:
                seq[t] = cur
                streak += 1
            continue

        if streak < min_dwell or not others:
            seq[t] = cur
            streak += 1
            continue

        best_k = max(others, key=lambda k: win_mean(t, k, lookahead))
        if streak >= max_dwell or (
                win_mean(t, best_k, lookahead)
                > win_mean(t, cur, lookahead) + switch_threshold):
            seq[t] = best_k
            streak = 1
        else:
            seq[t] = cur
            streak += 1

    # Fill any seconds still unset (those before t_open) with a covered cam,
    # or cam 0 when none is covered.
    for t in range(T):
        if seq[t] == -1:
            fallback = _covered_at(coverage, t)
            seq[t] = fallback[0] if fallback else 0
    return seq
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def edl_from_seq(seq):
    """Run-length encode a per-second cam sequence into EDL rows.

    Each run of equal values becomes one dict
    `{"start": float, "end": float, "cam": int}`, where `start`/`end` are the
    half-open index range of the run. An empty `seq` gives an empty list.
    """
    segments = []
    total = len(seq)
    run_start = 0
    for pos in range(1, total + 1):
        if pos == total or seq[pos] != seq[run_start]:
            segments.append({
                "start": float(run_start),
                "end": float(pos),
                "cam": int(seq[run_start]),
            })
            run_start = pos
    return segments
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def build_edl(inputs, mode="rotation", audio_source=None, min_dwell=8,
              max_dwell=15, switch_threshold=0.0, seed=42, coverage_flags=None,
              verbose=True):
    """Compute the EDL plan dict for a list of input paths.

    For every input this reads its sidecar (delta and overlap), extracts the
    audio to a temporary PCM file and builds a log-RMS envelope. Envelopes are
    lifted into the reference timeline at one value per second, each cam is
    scored against the mean of the other cams, and the per-second cam sequence
    is chosen by `rotation_edit` (when `mode == "rotation"`) or `greedy_edit`
    (any other `mode`). A single input always yields one cam-0 sequence.

    `audio_source` is the cam index whose audio the renderers use; when None
    it is picked as the cam with the largest 10th-to-90th percentile envelope
    spread plus half its coverage fraction. `coverage_flags` is an optional
    list of `CAM:START:END` strings overriding the sidecar coverage.

    Returns a dict with keys `_about`, `schema_version`, `inputs`, `deltas`,
    `duration_sec`, `audio_source`, `coverage` and `edl`.

    Raises ValueError when `inputs` is empty or `audio_source` is not a valid
    cam index.
    """
    inputs = [Path(p) for p in inputs]
    K = len(inputs)
    # Validate before any ffmpeg work: an empty list used to fail later with
    # an opaque min() error, and a bad audio_source was written to the EDL.
    if K == 0:
        raise ValueError("build_edl needs at least one input")
    if audio_source is not None and not (0 <= audio_source < K):
        raise ValueError("audio_source %r out of range for %d input(s)"
                         % (audio_source, K))

    deltas, cov_from_sc, has_sc = [], [], []
    for p in inputs:
        d, ovl, has = read_sidecar(p)
        deltas.append(d)
        cov_from_sc.append(ovl)
        has_sc.append(has)
    missing = [p.name for p, h in zip(inputs, has_sc) if not h]
    if missing and verbose:
        print("WARN: no sidecar for %s; assuming delta=0, full coverage. "
              "Run `polysync sync` first if these should be offset." % missing)

    durations, envs = [], []
    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        for i, p in enumerate(inputs):
            out = td / ("%d.pcm" % i)
            audio.extract_pcm(p, out, SR)
            x = audio.read_pcm(out)
            durations.append(len(x) / SR)
            envs.append(_per_sec_envelope(x))

    # Timeline length: the latest sidecar overlap end if any sidecar has one,
    # otherwise the shortest input.
    cov_ends = [ovl[1] for ovl in cov_from_sc if ovl is not None]
    total = int(max(cov_ends)) if cov_ends else int(min(durations))

    per_sec = np.full((total, K), -np.inf, dtype=np.float32)
    for k, (env, esr) in enumerate(envs):
        per_sec[:, k] = _lift_to_reference(env, esr, deltas[k], total)

    coverage = [_coverage_from_sidecar(p, total) for p in inputs]
    if coverage_flags:
        overrides = _parse_coverage_flag(coverage_flags, K, total)
        # Only cams explicitly named on the command line are overridden.
        for v in coverage_flags:
            k = int(v.split(":")[0])
            coverage[k] = overrides[k]

    if verbose:
        print("Cameras (%d):" % K)
        for k, p in enumerate(inputs):
            s, e = coverage[k]
            print("  cam%d: %s  coverage [%.1f .. %.1f]s" % (k, p.name, s, e))

    # -inf marks "cam not recording"; use NaN so nanmean can skip those cams.
    finite = np.where(np.isfinite(per_sec), per_sec, np.nan)
    if K > 1:
        scores = np.full_like(per_sec, -np.inf)
        with warnings.catch_warnings():  # all-nan seconds -> nan, handled below
            warnings.simplefilter("ignore", RuntimeWarning)
            for k in range(K):
                others = np.nanmean(np.delete(finite, k, axis=1), axis=1)
                diff = finite[:, k] - others
                scores[:, k] = np.where(np.isfinite(diff), diff, -np.inf)
    else:
        scores = per_sec.copy()

    if audio_source is None:
        spread = []
        for k in range(K):
            v = finite[:, k]
            v = v[np.isfinite(v)]
            spread.append(0.0 if len(v) == 0 else
                          float(np.percentile(v, 90) - np.percentile(v, 10)))
        cov_pct = np.array([(coverage[k][1] - coverage[k][0]) / max(1, total)
                            for k in range(K)])
        audio_src = int(np.argmax(np.array(spread) + 0.5 * cov_pct))
    else:
        audio_src = audio_source

    if K == 1:
        seq = np.zeros(total, dtype=np.int32)
    elif mode == "rotation":
        seq = rotation_edit(scores, coverage, min_dwell=min_dwell,
                            max_dwell=max_dwell, seed=seed)
    else:
        seq = greedy_edit(scores, coverage, min_dwell=min_dwell,
                          max_dwell=max_dwell, switch_threshold=switch_threshold)
    edl = edl_from_seq(seq)

    return {
        "_about": ("EDL produced by polysync.edit.autoedit. Times are in the "
                   "reference timeline. deltas[k] is the per-input offset; "
                   "render scripts apply ffmpeg -itsoffset deltas[k] so they "
                   "read original (un-trimmed) files."),
        "schema_version": SCHEMA_VERSION,
        "inputs": [str(p) for p in inputs],
        "deltas": [float(d) for d in deltas],
        "duration_sec": total,
        "audio_source": audio_src,
        "coverage": [list(c) for c in coverage],
        "edl": edl,
    }
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def main(argv=None):
    """Handle `polysync edit`: build an EDL, write it as JSON, print a summary.

    Parses `argv` (argparse falls back to the process arguments when it is
    None), calls `build_edl`, writes the plan to `--out` with 2-space
    indentation, then prints the segment count and each cam's on-screen time.
    """
    parser = argparse.ArgumentParser(
        prog="polysync edit",
        description="Build a multicam auto-edit EDL.")
    parser.add_argument("inputs", type=Path, nargs="+",
                        help="Synced video files (camera 0, 1, ...)")
    audio_help = ("Cam index to use as master audio (default: highest "
                  "dynamic-range covered cam)")
    parser.add_argument("--audio-source", type=int, default=None,
                        help=audio_help)
    parser.add_argument("--mode", choices=["rotation", "greedy"],
                        default="rotation")
    parser.add_argument("--min-dwell", type=int, default=8)
    parser.add_argument("--max-dwell", type=int, default=15)
    parser.add_argument("--switch-threshold", type=float, default=0.0)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument(
        "--coverage", action="append", default=None,
        help="Override per-cam coverage CAM:START:END (repeatable)")
    parser.add_argument("--out", type=Path, required=True,
                        help="output EDL json")
    opts = parser.parse_args(argv)

    plan = build_edl(
        opts.inputs,
        mode=opts.mode,
        audio_source=opts.audio_source,
        min_dwell=opts.min_dwell,
        max_dwell=opts.max_dwell,
        switch_threshold=opts.switch_threshold,
        seed=opts.seed,
        coverage_flags=opts.coverage,
    )
    opts.out.write_text(json.dumps(plan, indent=2))

    segments = plan["edl"]
    total = plan["duration_sec"]
    print("\nEDL: %d segments; audio_source=cam%d; saved %s"
          % (len(segments), plan["audio_source"], opts.out))

    # Total seconds on screen per cam, reported in cam order.
    screen_time = {}
    for seg in segments:
        cam = seg["cam"]
        screen_time[cam] = screen_time.get(cam, 0) + (seg["end"] - seg["start"])
    for cam in sorted(screen_time):
        dur = screen_time[cam]
        print("  cam%d: %.0fs on screen (%.0f%%)" % (cam, dur, 100 * dur / total))
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
if __name__ == "__main__":
|
|
321
|
+
main()
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Render an autoedit EDL into one MP4 with hard cuts (no transitions / PiP).
|
|
2
|
+
|
|
3
|
+
Applies each input's `delta` via `ffmpeg -itsoffset` so EDL times (reference
|
|
4
|
+
timeline) work directly inside the filter graph — originals are read untouched.
|
|
5
|
+
"""
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import subprocess
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def render_cuts(edl_path, out, encoder="hevc_videotoolbox", bitrate="12M",
                width=1920, height=1080, fps=30, run=True):
    """Build (and optionally run) the ffmpeg command that renders an EDL.

    `edl_path` is the JSON plan; it must contain `inputs`, `edl`,
    `audio_source` and `duration_sec`, and may contain `deltas` (defaults to
    all zeros). Each input with a non-zero delta is opened with
    `-itsoffset delta`. Every EDL row becomes a trimmed segment that is
    scaled and padded to `width` x `height` at `fps`; the segments are
    concatenated, and the audio is taken from stream `a:0` of the
    `audio_source` input.

    The `hvc1` video tag is added only when `encoder` names an HEVC encoder;
    it is not valid for other codecs.

    When `run` is true the command is printed and executed
    (`subprocess.CalledProcessError` on failure). The argument list is
    returned either way. Raises ValueError if the EDL has no segments.
    """
    plan = json.loads(Path(edl_path).read_text())
    inputs = plan["inputs"]
    deltas = plan.get("deltas", [0.0] * len(inputs))
    edl = plan["edl"]
    audio_src = plan["audio_source"]
    if not edl:
        # concat=n=0 is not a valid filter graph; fail here with a clear
        # message instead of inside ffmpeg.
        raise ValueError("EDL %s contains no segments" % edl_path)
    W, H = width, height

    cmd = ["ffmpeg", "-nostdin", "-y"]
    for src, dlt in zip(inputs, deltas):
        if abs(dlt) > 1e-9:
            cmd.extend(["-itsoffset", "%.6f" % dlt])
        cmd.extend(["-i", src])

    filters = []
    for i, row in enumerate(edl):
        filters.append(
            "[%d:v]trim=start=%s:end=%s,setpts=PTS-STARTPTS,"
            "scale=%d:%d:force_original_aspect_ratio=decrease,"
            "pad=%d:%d:(ow-iw)/2:(oh-ih)/2,setsar=1,fps=%d[v%d]"
            % (row["cam"], row["start"], row["end"], W, H, W, H, fps, i)
        )
    concat = "".join("[v%d]" % i for i in range(len(edl)))
    filters.append("%sconcat=n=%d:v=1:a=0[vout]" % (concat, len(edl)))
    fc = ";".join(filters)

    # Audio starts where the first segment starts so it lines up with video.
    audio_offset = edl[0]["start"]
    duration = plan["duration_sec"]
    fc += (";[%d:a:0]atrim=start=%s:duration=%s,asetpts=PTS-STARTPTS[aout]"
           % (audio_src, audio_offset, duration))

    video_opts = ["-c:v", encoder, "-b:v", bitrate]
    enc_name = str(encoder).lower()
    if any(marker in enc_name for marker in ("hevc", "h265", "x265")):
        video_opts.extend(["-tag:v", "hvc1"])

    cmd.extend(["-filter_complex", fc,
                "-map", "[vout]", "-map", "[aout]",
                "-t", str(duration)])
    cmd.extend(video_opts)
    cmd.extend(["-c:a", "aac", "-b:a", "192k",
                "-movflags", "+faststart", str(out)])
    if run:
        print(" ".join(cmd))
        subprocess.run(cmd, check=True)
    return cmd
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def main(argv=None):
    """Handle `polysync render-cuts`: parse `argv` and call `render_cuts`.

    The positional argument is the EDL JSON path; `--out` is required. The
    remaining options are forwarded to `render_cuts` unchanged.
    """
    parser = argparse.ArgumentParser(prog="polysync render-cuts")
    parser.add_argument("edl", type=Path)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--encoder", default="hevc_videotoolbox")
    parser.add_argument("--bitrate", default="12M")
    parser.add_argument("--width", type=int, default=1920)
    parser.add_argument("--height", type=int, default=1080)
    parser.add_argument("--fps", type=int, default=30)
    opts = parser.parse_args(argv)

    render_options = {
        "encoder": opts.encoder,
        "bitrate": opts.bitrate,
        "width": opts.width,
        "height": opts.height,
        "fps": opts.fps,
    }
    render_cuts(opts.edl, opts.out, **render_options)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
if __name__ == "__main__":
|
|
72
|
+
main()
|