polysync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polysync/__init__.py +19 -0
- polysync/audio.py +130 -0
- polysync/cli.py +79 -0
- polysync/edit/__init__.py +9 -0
- polysync/edit/autoedit.py +321 -0
- polysync/edit/render_cuts.py +72 -0
- polysync/edit/render_pip.py +141 -0
- polysync/sidecar.py +88 -0
- polysync/sync.py +206 -0
- polysync/verify.py +118 -0
- polysync-0.1.0.dist-info/METADATA +115 -0
- polysync-0.1.0.dist-info/RECORD +16 -0
- polysync-0.1.0.dist-info/WHEEL +5 -0
- polysync-0.1.0.dist-info/entry_points.txt +2 -0
- polysync-0.1.0.dist-info/licenses/LICENSE +21 -0
- polysync-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Render an autoedit EDL with picture-in-picture for 1, 2, or N cameras.
|
|
2
|
+
|
|
3
|
+
- 1 cam: pass-through (no inset).
|
|
4
|
+
- 2 cam: main = active cam; PiP = the other cam at the same time range.
|
|
5
|
+
- N cam: main = active; PiP = a covered non-active cam (round-robin or first).
|
|
6
|
+
|
|
7
|
+
Per-segment EDL rows may carry a `pip` field (cam index) to override the picker.
|
|
8
|
+
"""
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import subprocess
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
# ffmpeg `overlay` x/y expression templates, keyed by inset corner.
# In overlay expressions W/H are the main frame's size and w/h the inset's;
# `{m}` is a placeholder the renderer fills with the margin via str.format.
_PIP_NEAR_EDGE = "{m}"
_PIP_FAR_X = "W-w-{m}"
_PIP_FAR_Y = "H-h-{m}"

POSITIONS = {
    "bottom-right": (_PIP_FAR_X, _PIP_FAR_Y),
    "top-right": (_PIP_FAR_X, _PIP_NEAR_EDGE),
    "bottom-left": (_PIP_NEAR_EDGE, _PIP_FAR_Y),
    "top-left": (_PIP_NEAR_EDGE, _PIP_NEAR_EDGE),
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def pick_pip(row, K, coverage, mode="next"):
    """Return the camera index to show in the inset for one EDL segment.

    An explicit ``row["pip"]`` wins and is returned as an int without further
    checks. Otherwise only cameras other than ``row["cam"]`` whose
    ``coverage[k]`` interval spans the whole ``[start, end]`` of the segment
    are eligible. With ``mode == "next"`` the first eligible camera found by
    walking round-robin from the active one is chosen; any other mode (or a
    round-robin walk that finds nothing) yields the lowest-index eligible
    camera. Returns ``None`` when no camera is eligible.
    """
    forced = row.get("pip")
    if forced is not None:
        return int(forced)

    active = row["cam"]
    seg_start, seg_end = row["start"], row["end"]

    eligible = []
    for idx in range(K):
        if idx == active:
            continue
        span = coverage[idx]
        if span[0] <= seg_start and span[1] >= seg_end:
            eligible.append(idx)

    if not eligible:
        return None

    if mode == "next":
        for step in range(1, K):
            neighbour = (active + step) % K
            if neighbour in eligible:
                return neighbour
    return eligible[0]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def render_pip(edl_path, out, encoder="hevc_videotoolbox", bitrate="12M",
               width=1920, height=1080, fps=30, pip="bottom-right",
               pip_width=480, pip_margin=24, border_px=4, pip_pick="next",
               run=True):
    """Build (and optionally run) the ffmpeg command rendering an EDL with PiP.

    The plan JSON at `edl_path` must contain `inputs`, `edl`, `audio_source`
    and `duration_sec`. `deltas` (one `-itsoffset` per input, default 0) and
    `coverage` (one [start, end] per input, default the full duration) are
    optional. Every EDL row becomes one trimmed, scaled and padded main
    segment; when there is more than one input, a second camera chosen by
    `pick_pip` is overlaid at the `pip` corner. Segments are concatenated and
    muxed with audio stream 0 of input `audio_source`.

    Args:
        edl_path: path of the plan JSON.
        out: output file path.
        encoder, bitrate: passed to ffmpeg as `-c:v` / `-b:v`.
        width, height, fps: geometry and frame rate of the output video.
        pip: key of POSITIONS selecting the inset corner.
        pip_width: inset width in pixels (height is derived as 16:9).
        pip_margin: gap in pixels between the inset and the frame edge.
        border_px: white border around the inset; 0 disables it.
        pip_pick: picker mode forwarded to `pick_pip`.
        run: when false, only build the command without executing ffmpeg.

    Returns:
        The ffmpeg argv list.

    Raises:
        KeyError: a required plan key is missing or `pip` is unknown.
        subprocess.CalledProcessError: ffmpeg exited non-zero (run=True).
    """
    plan = json.loads(Path(edl_path).read_text())
    inputs = plan["inputs"]
    deltas = plan.get("deltas", [0.0] * len(inputs))
    edl = plan["edl"]
    audio_src = plan["audio_source"]
    dur = plan["duration_sec"]
    K = len(inputs)
    coverage = plan.get("coverage", [[0.0, dur]] * K)

    W, H = width, height
    pw = pip_width
    ph = round(pw * 9 / 16)
    bw = border_px
    x_expr, y_expr = POSITIONS[pip]
    x_expr = x_expr.format(m=pip_margin)
    y_expr = y_expr.format(m=pip_margin)

    cmd = ["ffmpeg", "-nostdin", "-y"]
    for src, dlt in zip(inputs, deltas):
        # Only emit -itsoffset for inputs that actually need shifting.
        if abs(dlt) > 1e-9:
            cmd.extend(["-itsoffset", "%.6f" % dlt])
        cmd.extend(["-i", src])

    filters = []
    for i, row in enumerate(edl):
        cam = row["cam"]
        s, e = row["start"], row["end"]
        # With a single camera the main chain is already the final segment.
        main_label = "m%d" % i if K > 1 else "v%d" % i
        filters.append(
            "[%d:v]trim=start=%s:end=%s,setpts=PTS-STARTPTS,"
            "scale=%d:%d:force_original_aspect_ratio=decrease,"
            "pad=%d:%d:(ow-iw)/2:(oh-ih)/2,setsar=1,fps=%d[%s]"
            % (cam, s, e, W, H, W, H, fps, main_label)
        )
        if K == 1:
            continue
        pip_cam = pick_pip(row, K, coverage, mode=pip_pick)
        if pip_cam is None:
            # No other camera covers this segment: pass the main through.
            filters.append("[m%d]copy[v%d]" % (i, i))
            continue
        chain = (
            "[%d:v]trim=start=%s:end=%s,setpts=PTS-STARTPTS,"
            "scale=%d:%d:force_original_aspect_ratio=decrease,"
            "pad=%d:%d:(ow-iw)/2:(oh-ih)/2,"
            % (pip_cam, s, e, pw, ph, pw, ph)
        )
        if bw > 0:
            # A second, larger pad in white draws the border around the inset.
            chain += "pad=%d:%d:%d:%d:white," % (pw + 2 * bw, ph + 2 * bw, bw, bw)
        chain += "setsar=1,fps=%d[p%d]" % (fps, i)
        filters.append(chain)
        filters.append("[m%d][p%d]overlay=%s:%s:eof_action=pass[v%d]"
                       % (i, i, x_expr, y_expr, i))

    concat = "".join("[v%d]" % i for i in range(len(edl)))
    filters.append("%sconcat=n=%d:v=1:a=0[vout]" % (concat, len(edl)))
    audio_offset = edl[0]["start"] if edl else 0.0
    fc = ";".join(filters)
    fc += (";[%d:a:0]atrim=start=%s:duration=%s,asetpts=PTS-STARTPTS[aout]"
           % (audio_src, audio_offset, dur))

    video_codec = ["-c:v", encoder, "-b:v", bitrate]
    # The hvc1 sample-entry tag is only valid for HEVC streams; forcing it on
    # another codec (e.g. libx264) makes the MP4 muxer reject the output, so
    # add it only when the chosen encoder produces HEVC.
    encoder_name = str(encoder).lower()
    if "hevc" in encoder_name or "265" in encoder_name:
        video_codec.extend(["-tag:v", "hvc1"])

    cmd.extend(["-filter_complex", fc,
                "-map", "[vout]", "-map", "[aout]",
                "-t", str(dur)])
    cmd.extend(video_codec)
    cmd.extend(["-c:a", "aac", "-b:a", "192k",
                "-movflags", "+faststart", str(out)])
    if run:
        print("PiP %dx%d, inset %dx%d (+%dpx) at %s; %d cams; %d segments"
              % (W, H, pw, ph, bw, pip, K, len(edl)))
        subprocess.run(cmd, check=True)
    return cmd
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def main(argv=None):
    """Command-line entry point for `polysync render-pip`.

    Parses `argv` (defaults to the process arguments) and forwards every
    option to `render_pip`; the option destinations match its keyword names.
    """
    parser = argparse.ArgumentParser(prog="polysync render-pip")
    parser.add_argument("edl", type=Path)
    parser.add_argument("--out", type=Path, required=True)

    # Plain string options.
    for flag, default in (("--encoder", "hevc_videotoolbox"),
                          ("--bitrate", "12M")):
        parser.add_argument(flag, default=default)

    # Integer options describing the output frame.
    for flag, default in (("--width", 1920), ("--height", 1080), ("--fps", 30)):
        parser.add_argument(flag, type=int, default=default)

    parser.add_argument("--pip", choices=list(POSITIONS), default="bottom-right")

    # Integer options describing the inset.
    for flag, default in (("--pip-width", 480), ("--pip-margin", 24),
                          ("--border-px", 4)):
        parser.add_argument(flag, type=int, default=default)

    parser.add_argument("--pip-pick", choices=["next", "second-best"],
                        default="next")

    options = vars(parser.parse_args(argv))
    edl_path = options.pop("edl")
    out_path = options.pop("out")
    render_pip(edl_path, out_path, **options)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
if __name__ == "__main__":
|
|
141
|
+
main()
|
polysync/sidecar.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""`.sync.json` sidecar — the reversible alignment record polysync emits.
|
|
2
|
+
|
|
3
|
+
One sidecar per original input, written next to it as `<input>.sync.json`.
|
|
4
|
+
Originals are never modified. Downstream aligns the source to the reference
|
|
5
|
+
timeline with `ffmpeg -itsoffset delta_seconds` (optionally
|
|
6
|
+
`atempo=1+drift_slope` for long-form lip-sync).
|
|
7
|
+
"""
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
SCHEMA_VERSION = 1
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def sidecar_path(media_path):
    """Return the sidecar location for a media file: `<original>.sync.json`.

    The original suffix is kept, so `clip.mp4` maps to `clip.mp4.sync.json`
    in the same directory.
    """
    original = Path(media_path)
    extended_suffix = "%s.sync.json" % original.suffix
    return original.with_suffix(extended_suffix)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def write_sidecar(media_path, *, source, reference, delta_seconds, drift_slope,
                  overlap_in_reference, overlap_in_source, verification=None):
    """Write the canonical sidecar next to `media_path` and return its Path.

    Args:
        media_path: the original media file; the sidecar path is derived
            from it with `sidecar_path`.
        source, reference: names recorded in the sidecar (and in `_about`).
        delta_seconds: source t=0 expressed in the reference timeline.
        drift_slope: residual clock drift (dimensionless).
        overlap_in_reference, overlap_in_source: two-element [start, end]
            windows; both bounds are stored as floats.
        verification: optional verification stats, stored as given.

    The file is always written as UTF-8: the JSON is dumped with
    `ensure_ascii=False`, so file names containing non-ASCII characters end
    up verbatim in the text and must not depend on the locale encoding.
    """
    sc = {
        "_about": (
            "Sync metadata for %s (aligned to %s). Generated by polysync. "
            "Originals are not modified; downstream uses "
            "ffmpeg -itsoffset delta_seconds to align." % (source, reference)
        ),
        "_help": {
            "delta_seconds": (
                "Source's t=0 expressed in reference's timeline. "
                "Positive => source starts after reference. "
                "Apply via `ffmpeg -itsoffset <delta_seconds> -i <source>`."
            ),
            "drift_slope": (
                "Residual clock drift between source and reference clocks "
                "(dimensionless, ~1e-5 typical). For camera-cut editing, "
                "ignore. For sync-sound / long-form lip-sync apply "
                "atempo=1+drift_slope to the source."
            ),
            "overlap_in_reference": (
                "[start, end] window in reference's timeline where BOTH source "
                "and reference have valid content. Use to constrain trims/EDLs."
            ),
            "overlap_in_source": (
                "Same window in the source's local timeline "
                "(= overlap_in_reference shifted by -delta_seconds)."
            ),
            "verification": (
                "Filled in by `polysync verify`: median_residual_ms, "
                "residual_spread_ms, probe_count. None until verify runs."
            ),
        },
        "schema_version": SCHEMA_VERSION,
        "source": source,
        "reference": reference,
        "delta_seconds": float(delta_seconds),
        "drift_slope": float(drift_slope),
        "overlap_in_reference": [float(overlap_in_reference[0]),
                                 float(overlap_in_reference[1])],
        "overlap_in_source": [float(overlap_in_source[0]),
                              float(overlap_in_source[1])],
        "verification": verification,
    }
    p = sidecar_path(media_path)
    # Explicit UTF-8: JSON's interchange encoding, independent of the locale.
    p.write_text(json.dumps(sc, indent=2, ensure_ascii=False), encoding="utf-8")
    return p
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def read_sidecar(input_path):
    """Read `<input>.sync.json` next to `input_path`.

    Returns:
        (delta_seconds, overlap_in_reference, has_sidecar), where the overlap
        is a (start, end) tuple of floats or None when the field is absent
        or empty.

    Falls back to (0.0, None, False) when the sidecar is absent or cannot be
    parsed; callers treat that as "this input is at reference t=0, full
    coverage". A schema version mismatch only produces a warning.
    """
    sc = sidecar_path(input_path)
    if not sc.exists():
        return (0.0, None, False)
    try:
        raw = sc.read_bytes()
        try:
            # JSON is UTF-8 by convention; utf-8-sig also tolerates a BOM.
            text = raw.decode("utf-8-sig")
        except UnicodeDecodeError:
            # Legacy sidecar written with the platform's locale encoding.
            text = sc.read_text()
        d = json.loads(text)
        if d.get("schema_version") != SCHEMA_VERSION:
            print("WARN: %s schema_version != %d; reading anyway"
                  % (sc.name, SCHEMA_VERSION))
        delta = float(d["delta_seconds"])
        ovl = d.get("overlap_in_reference")
        ovl = (float(ovl[0]), float(ovl[1])) if ovl else None
        return (delta, ovl, True)
    except Exception as e:  # noqa: BLE001 — best-effort, never crash the edit
        print("WARN: failed to parse %s: %s; using delta=0" % (sc.name, e))
        return (0.0, None, False)
|
polysync/sync.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Compute the time offset between two recordings of the same event.
|
|
2
|
+
|
|
3
|
+
Algorithm (envelope cross-correlation + multi-probe drift fit):
|
|
4
|
+
1. Log-energy envelope of each signal (audio.log_envelope), high-passed.
|
|
5
|
+
2. FFT cross-correlate envelopes end-to-end -> coarse offset (~10 ms).
|
|
6
|
+
3. Refine at sample level with 60 s probes near the coarse position,
|
|
7
|
+
parabolic peak interpolation.
|
|
8
|
+
4. Linear-fit delta(t) across probes -> clock drift; report the
|
|
9
|
+
midpoint-canonical offset so residual error is symmetric around zero.
|
|
10
|
+
|
|
11
|
+
Two failure philosophies, selected by `partial`:
|
|
12
|
+
- partial=False (full-overlap multicam): demand >=3 good probes or raise
|
|
13
|
+
SyncError. Too few good matches almost always means the wrong files.
|
|
14
|
+
- partial=True (a source covering only part of the reference): degrade
|
|
15
|
+
gracefully — median delta on few probes, coarse delta if none.
|
|
16
|
+
|
|
17
|
+
`compute_sync` works on numpy PCM arrays (unit-testable, no ffmpeg). `sync_files`
|
|
18
|
+
is the file/CLI layer that extracts audio and writes sidecars.
|
|
19
|
+
"""
|
|
20
|
+
import sys
|
|
21
|
+
import tempfile
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
from scipy import signal
|
|
26
|
+
|
|
27
|
+
from . import audio
|
|
28
|
+
from .sidecar import write_sidecar
|
|
29
|
+
|
|
30
|
+
SR = 8000 # sync works fine at 8 kHz; the envelope is what matters, not HF
|
|
31
|
+
GOOD_NCOEF = 0.05 # a probe counts as "good" above this normalized correlation
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SyncError(Exception):
    """Sync result cannot be trusted.

    Raised by `compute_sync` in full-overlap mode when too few probes are
    good, and by `sync_files` when the two recordings overlap by under 1 s.
    """

    pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SyncResult:
    """Value object returned by compute_sync.

    `delta_seconds` is the source's t=0 expressed in the reference timeline
    (positive means the source starts after the reference). `fallback` is
    None for a full drift fit, or "median" / "coarse" when a degraded
    estimate was used.
    """

    _REPR_TEMPLATE = (
        "SyncResult(delta={:.6f}, drift={:.3e}, coarse_corr={:.3f}, "
        "probes={:d}, good={:d}, fallback={!r})"
    )

    def __init__(self, delta_seconds, drift_slope, coarse_corr,
                 n_probes, n_good, fallback):
        # Coerce eagerly so numpy scalars never leak into the result.
        self.delta_seconds = float(delta_seconds)
        self.drift_slope = float(drift_slope)
        self.coarse_corr = float(coarse_corr)
        self.n_probes = int(n_probes)
        self.n_good = int(n_good)
        self.fallback = fallback  # None | "median" | "coarse"

    def __repr__(self):
        return self._REPR_TEMPLATE.format(
            self.delta_seconds,
            self.drift_slope,
            self.coarse_corr,
            self.n_probes,
            self.n_good,
            self.fallback,
        )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def coarse_offset(env_a, env_b, env_sr):
    """Estimate the envelope-level offset between two signals.

    Cross-correlates the normalized envelopes over all lags and takes the
    highest (signed) peak. Returns (delta, normalized_corr) with
    delta = tA - tB in seconds, i.e. A_t = B_t + delta; the correlation
    value is the peak divided by len(env_b).
    """
    norm_a = audio.normalize(env_a)
    norm_b = audio.normalize(env_b)
    xcorr = signal.correlate(norm_a, norm_b, mode="full", method="fft")

    peak = int(np.argmax(xcorr))
    # In "full" mode, output index len(b) - 1 corresponds to zero lag.
    lag_samples = peak - (len(norm_b) - 1)

    delta = float(lag_samples / env_sr)
    strength = float(xcorr[peak] / len(env_b))
    return delta, strength
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _refine(a, b, b_start_s, expected_delta, sr, probe_len_s=60.0, pad_s=1.5):
    """Locate one probe of `b` inside `a` near the expected position.

    A `probe_len_s` slice of `b` starting at `b_start_s` is correlated
    against the part of `a` within `pad_s` seconds of
    `b_start_s + expected_delta`. The strongest peak by absolute value is
    refined with a three-point parabolic fit.

    Returns (delta_seconds, normalized_corr), or None when the probe or the
    search window does not fit inside the arrays.
    """
    n_probe = int(probe_len_s * sr)
    probe_begin = int(b_start_s * sr)
    if probe_begin + n_probe > len(b):
        return None
    template = b[probe_begin:probe_begin + n_probe].astype(np.float32)

    center_s = b_start_s + expected_delta
    win_lo = max(0, int((center_s - pad_s) * sr))
    win_hi = min(len(a), int((center_s + pad_s + probe_len_s) * sr))
    if win_hi - win_lo < n_probe:
        return None
    window = a[win_lo:win_hi].astype(np.float32)

    corr = signal.correlate(audio.normalize(window), audio.normalize(template),
                            mode="valid", method="fft")
    peak = int(np.argmax(np.abs(corr)))
    strength = corr[peak] / len(template)

    # Sub-sample refinement; skipped at the edges and on a flat peak.
    frac = 0.0
    if 0 < peak < len(corr) - 1:
        left, mid, right = corr[peak - 1], corr[peak], corr[peak + 1]
        curvature = (left - 2 * mid + right)
        if abs(curvature) > 1e-9:
            frac = 0.5 * (left - right) / curvature

    matched_s = (win_lo + peak + frac) / sr
    return float(matched_s - b_start_s), float(strength)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _multi_probe(a, b, expected_delta, b_dur, a_dur, sr, step_s=180.0):
    """Run `_refine` at regular positions along `b`.

    Probe starts run from 60 s to `b_dur - 60` s in `step_s` increments.
    Positions whose search window (1.5 s pad plus a 60 s probe) would fall
    outside `a` are skipped. Returns a list of
    (b_start_s, delta_seconds, normalized_corr) tuples.
    """
    hits = []
    starts = np.arange(60.0, b_dur - 60.0, step_s)
    for start_s in starts:
        ref_center = start_s + expected_delta
        outside = ref_center < 1.5 or ref_center + 60.0 + 1.5 > a_dur
        if not outside:
            found = _refine(a, b, start_s, expected_delta, sr)
            if found:
                hits.append((start_s, found[0], found[1]))
    return hits
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def compute_sync(a, b, a_dur, b_dur, sr=SR, partial=False, verbose=False):
    """Align PCM array `b` (source) to `a` (reference). Returns SyncResult.

    `a`, `b` are mono float arrays at `sr` Hz. `a_dur`/`b_dur` are their
    durations in seconds (usually len/sr, passed explicitly so callers can use
    container duration).

    Outcomes:
      * >= 3 good probes: linear fit of delta over source time; the reported
        delta is the fit evaluated at the source midpoint (`b_dur / 2`) and
        the slope is reported as drift (`fallback=None`).
      * fewer, `partial=False`: raises SyncError.
      * fewer, `partial=True`: median of the probe deltas
        (`fallback="median"`), or the coarse envelope delta when there are
        no probes at all (`fallback="coarse"`). The median is taken over the
        good probes when any exist, so one or two trustworthy matches are
        not outvoted by noise peaks from probes below GOOD_NCOEF.
    """
    env_a, esr = audio.log_envelope(a, sr)
    env_b, _ = audio.log_envelope(b, sr)
    coarse_d, coarse_v = coarse_offset(env_a, env_b, esr)
    if verbose:
        print(" coarse delta = %+.4fs (xc/N=%.3f)" % (coarse_d, coarse_v))

    probes = _multi_probe(a, b, coarse_d, b_dur, a_dur, sr)
    good = np.array([abs(p[2]) > GOOD_NCOEF for p in probes], dtype=bool)
    n_good = int(good.sum())
    if verbose:
        print(" good probes: %d / %d" % (n_good, len(probes)))

    if n_good >= 3:
        bs_arr = np.array([p[0] for p in probes])
        d_arr = np.array([p[1] for p in probes])
        slope, intercept = np.polyfit(bs_arr[good], d_arr[good], 1)
        # Midpoint-canonical offset: residual drift error is symmetric.
        delta = float(slope * (b_dur / 2) + intercept)
        return SyncResult(delta, float(slope), coarse_v,
                          len(probes), n_good, None)
    if not partial:
        raise SyncError(
            "too few good probes (%d < 3); sync unreliable. If this is a "
            "short partial-coverage clip, use partial=True / --partial."
            % n_good)
    if probes:
        d_arr = np.array([p[1] for p in probes])
        # Prefer probes that actually correlated; fall back to all of them
        # only when none passed the threshold.
        trusted = d_arr[good] if n_good else d_arr
        delta = float(np.median(trusted))
        return SyncResult(delta, 0.0, coarse_v, len(probes),
                          n_good, "median")
    return SyncResult(float(coarse_d), 0.0, coarse_v, 0, 0, "coarse")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _overlap(delta, a_dur, b_dur):
|
|
145
|
+
ref_start = max(0.0, delta)
|
|
146
|
+
ref_end = min(a_dur, delta + b_dur)
|
|
147
|
+
return (ref_start, ref_end), (ref_start - delta, ref_end - delta)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def sync_files(reference, source, partial=False, verbose=True):
    """Sync `source` to `reference` and persist the result as sidecar(s).

    Audio of both files is extracted to temporary mono PCM at SR Hz and fed
    to `compute_sync`. In full-overlap mode a sidecar is written for BOTH
    inputs (the reference gets delta=0); in partial mode only the source
    sidecar is written. Raises SyncError when the resulting overlap window
    is shorter than one second. Returns the source's sidecar Path.
    """
    ref_path, src_path = Path(reference), Path(source)

    def say(text):
        # Progress output goes to stdout and only when requested.
        if verbose:
            print(text)

    ref_dur = audio.media_duration(ref_path)
    src_dur = audio.media_duration(src_path)
    say("Mode: %s" % ("partial-coverage" if partial else "full-overlap"))
    say("A (reference): %s duration=%.3fs" % (ref_path.name, ref_dur))
    say("B (source): %s duration=%.3fs" % (src_path.name, src_dur))

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        ref_pcm = workdir / "a.pcm"
        src_pcm = workdir / "b.pcm"
        say("Extracting mono PCM @ %d Hz..." % SR)
        audio.extract_pcm(ref_path, ref_pcm, SR)
        audio.extract_pcm(src_path, src_pcm, SR)
        ref_samples = audio.read_pcm(ref_pcm)
        src_samples = audio.read_pcm(src_pcm)
        res = compute_sync(ref_samples, src_samples, ref_dur, src_dur, sr=SR,
                           partial=partial, verbose=verbose)

    if verbose:
        if res.coarse_corr < 0.3:
            print(" WARNING: low coarse correlation; sync may be unreliable.",
                  file=sys.stderr)
        summary = " delta=%+.6fs drift=%+.3e" % (res.delta_seconds,
                                                  res.drift_slope)
        if res.fallback:
            summary += " (fallback: %s)" % res.fallback
        print(summary)

    ref_ovl, src_ovl = _overlap(res.delta_seconds, ref_dur, src_dur)
    shared_seconds = ref_ovl[1] - ref_ovl[0]
    if shared_seconds < 1.0:
        raise SyncError("overlap window <1s; the two recordings barely share "
                        "content (delta=%.3fs)" % res.delta_seconds)

    src_sc = write_sidecar(
        src_path,
        source=src_path.name,
        reference=ref_path.name,
        delta_seconds=res.delta_seconds,
        drift_slope=res.drift_slope,
        overlap_in_reference=ref_ovl,
        overlap_in_source=src_ovl,
    )
    say("Wrote %s" % src_sc)

    if not partial:
        # The reference is its own anchor: zero offset, zero drift.
        ref_sc = write_sidecar(
            ref_path,
            source=ref_path.name,
            reference=ref_path.name,
            delta_seconds=0.0,
            drift_slope=0.0,
            overlap_in_reference=ref_ovl,
            overlap_in_source=ref_ovl,
        )
        say("Wrote %s" % ref_sc)
    return src_sc
|
polysync/verify.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Independent residual check for a (reference, source, sidecar) triple.
|
|
2
|
+
|
|
3
|
+
Re-extracts audio from BOTH originals NATIVELY (loudest stream, no ffmpeg
|
|
4
|
+
offset) and runs multi-probe cross-correlation inside the overlap window,
|
|
5
|
+
applying the sidecar's `delta_seconds` (and, with `apply_drift`, the slope) as
|
|
6
|
+
index arithmetic in numpy.
|
|
7
|
+
|
|
8
|
+
Why index arithmetic and not `ffmpeg -itsoffset`: `-itsoffset` shifts input
|
|
9
|
+
timestamps, but a headerless raw stream (`-f s16le`) has no timestamps to carry
|
|
10
|
+
the offset — ffmpeg silently drops it and inserts NO leading silence. Relying
|
|
11
|
+
on it lines the source's t=0 up with the reference's t=0 regardless of delta,
|
|
12
|
+
so every probe correlates the wrong region, peaks land in noise (ncoef ~0), and
|
|
13
|
+
verification falsely FAILs. Shifting indices ourselves matches exactly how the
|
|
14
|
+
offset was computed.
|
|
15
|
+
|
|
16
|
+
PASS = |median_residual_ms| <= 15 AND residual_spread_ms <= 1 frame at target fps.
|
|
17
|
+
A spread-only fail with a near-zero median is usually far-field-mic noise on a
|
|
18
|
+
wide/B-roll camera, not real desync — for camera-cut editing, trust the median.
|
|
19
|
+
"""
|
|
20
|
+
import json
|
|
21
|
+
import sys
|
|
22
|
+
import tempfile
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
from scipy import signal
|
|
27
|
+
|
|
28
|
+
from . import audio
|
|
29
|
+
|
|
30
|
+
SR = 8000
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def verify_files(reference, source, sidecar, probe_len=60.0, step=600.0,
                 max_frame_ms=33.33, apply_drift=False, verbose=True):
    """Run verification and write results into the sidecar's `verification`
    field. Returns (passed: bool, stats: dict).

    Args:
        reference, source: the original media files.
        sidecar: path of the source's `.sync.json`.
        probe_len: probe length in seconds.
        step: spacing between probe positions in reference seconds.
        max_frame_ms: largest residual spread that still passes.
        apply_drift: also apply the sidecar's `drift_slope` when mapping
            reference time to source time.
        verbose: print per-probe and summary lines.

    PASS = |median_residual_ms| <= 15 AND residual_spread_ms <= max_frame_ms.

    Raises:
        ValueError: the overlap window is too short, or no probe was usable.
    """
    reference, source, sidecar = Path(reference), Path(source), Path(sidecar)
    raw = sidecar.read_bytes()
    try:
        # JSON is UTF-8 by convention; utf-8-sig also tolerates a BOM.
        sc_text = raw.decode("utf-8-sig")
        sc_encoding = "utf-8"
    except UnicodeDecodeError:
        # Legacy sidecar in the locale encoding: keep that on write-back.
        sc_text = sidecar.read_text()
        sc_encoding = None
    sc = json.loads(sc_text)
    delta = float(sc["delta_seconds"])
    drift_slope = float(sc.get("drift_slope", 0.0))
    overlap_ref = sc["overlap_in_reference"]

    if verbose:
        print("Reference: %s" % reference.name)
        print("Source: %s" % source.name)
        print("delta_seconds = %+.6f drift_slope = %+.3e (%s)"
              % (delta, drift_slope,
                 "applied" if apply_drift else "not applied"))

    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        ref_pcm, src_pcm = td / "ref.pcm", td / "src.pcm"
        audio.extract_pcm(reference, ref_pcm, SR)
        audio.extract_pcm(source, src_pcm, SR)
        ref = audio.read_pcm(ref_pcm)
        src = audio.read_pcm(src_pcm)

    pl = int(probe_len * SR)
    pad = int(0.5 * SR)
    ovl_start = max(60.0, float(overlap_ref[0]) + 1.0)
    ovl_end = float(overlap_ref[1]) - probe_len - 1.0
    if ovl_end <= ovl_start:
        raise ValueError("overlap window too short to verify")

    # The sync step reports delta at the source MIDPOINT of its drift fit
    # (delta = slope * dur/2 + intercept). The drift-corrected mapping
    # ref = src * (1 + slope) + intercept therefore needs the midpoint term
    # added back; without it every residual is biased by ~ -slope * dur/2.
    src_mid_s = (len(src) / SR) / 2.0

    rs = []
    for bs in np.arange(ovl_start, ovl_end, step):
        # Source-local time corresponding to this reference time.
        src_t = bs - delta
        if apply_drift:
            src_t = (src_t + drift_slope * src_mid_s) / (1.0 + drift_slope)
        si = int(src_t * SR)
        bsi = int(bs * SR)
        if si < 0 or si + pl > len(src):
            continue
        probe = src[si:si + pl]
        if np.abs(probe).mean() < 1.0:
            continue  # silence — nothing to correlate
        lo = max(0, bsi - pad)
        hi = min(len(ref), bsi + pl + pad)
        if hi - lo < pl:
            continue
        seg = ref[lo:hi].astype(np.float32)
        xc = signal.correlate(audio.normalize(seg),
                              audio.normalize(probe.astype(np.float32)),
                              mode="valid", method="fft")
        pk = int(np.argmax(np.abs(xc)))
        ncoef = float(xc[pk] / len(probe))
        # Where the probe matched in the reference vs. where it should be.
        residual_ms = ((lo + pk) / SR - bs) * 1000
        rs.append((bs, residual_ms, ncoef))
        if verbose:
            print("t=%7.1fs residual=%+7.2f ms ncoef=%+.3f"
                  % (bs, residual_ms, ncoef))

    if not rs:
        raise ValueError("no usable probes (all silence or out of overlap)")

    arr = np.array([r[1] for r in rs])
    median_residual_ms = float(np.median(arr))
    # Spread is reported as the full width: twice the largest deviation.
    residual_spread_ms = float(np.max(np.abs(arr - median_residual_ms)) * 2)
    passed = (abs(median_residual_ms) <= 15.0
              and residual_spread_ms <= max_frame_ms)

    stats = {
        "median_residual_ms": round(median_residual_ms, 3),
        "residual_spread_ms": round(residual_spread_ms, 3),
        "probe_count": len(rs),
        "drift_applied": bool(apply_drift),
    }
    sc["verification"] = stats
    sidecar.write_text(json.dumps(sc, indent=2, ensure_ascii=False),
                       encoding=sc_encoding)
    if verbose:
        print("\nResidual: median=%+.2f ms spread=+-%.2f ms -> %s"
              % (median_residual_ms, residual_spread_ms / 2,
                 "PASS" if passed else "FAIL"))
        if not passed and abs(median_residual_ms) <= 15.0:
            print(" (spread-only fail with near-zero median is usually "
                  "far-field-mic noise, not real desync)", file=sys.stderr)
    return passed, stats
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polysync
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multicam audio sync and director-style auto-edit — align N angles of one event by audio cross-correlation, then cut/PiP them into one MP4. Reversible sidecars, never re-encodes the originals.
|
|
5
|
+
Author: 王建硕 (Jian Shuo Wang)
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jianshuo/polysync
|
|
8
|
+
Project-URL: Issues, https://github.com/jianshuo/polysync/issues
|
|
9
|
+
Keywords: multicam,audio-sync,video-editing,cross-correlation,ffmpeg,picture-in-picture,podcast,interview
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Topic :: Multimedia :: Video
|
|
17
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy>=1.21
|
|
22
|
+
Requires-Dist: scipy>=1.7
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# polysync
|
|
28
|
+
|
|
29
|
+
**Multicam audio sync + director-style auto-edit.** Align N recordings of the
|
|
30
|
+
same event by audio cross-correlation, then cut or picture-in-picture them into
|
|
31
|
+
a single MP4 — driven entirely by who's talking.
|
|
32
|
+
|
|
33
|
+
What makes it different from "yet another sync tool":
|
|
34
|
+
|
|
35
|
+
- **Reversible sidecars, never re-encodes the originals.** Sync writes a tiny
|
|
36
|
+
`<input>.sync.json` next to each file holding a single offset. A 75-min 4K
|
|
37
|
+
3-camera shoot is 250+ GB; baking offsets into re-encoded copies would double
|
|
38
|
+
that and lose quality. Downstream applies the offset with `ffmpeg -itsoffset`
|
|
39
|
+
at consume time. Originals are touched read-only, always.
|
|
40
|
+
- **Envelope cross-correlation, not raw waveform.** Matches the log-energy
|
|
41
|
+
envelope, which both mics hear regardless of their frequency response — robust
|
|
42
|
+
even when a second camera's on-board mic sounds nothing like the main one.
|
|
43
|
+
- **Clock-drift aware.** Cheap recorders drift 5–50 ppm; polysync fits the drift
|
|
44
|
+
across the recording and reports it separately, so long-form lip-sync can
|
|
45
|
+
correct it while camera-cut editing can ignore it.
|
|
46
|
+
- **Handles the messy real cases.** Auto-picks the loudest audio track (pro
|
|
47
|
+
cameras often leave track 1 dead), partial-coverage clips that only span part
|
|
48
|
+
of the session, and independent verification of the result.
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install polysync # once published
|
|
54
|
+
# or, from a checkout:
|
|
55
|
+
pip install -e ".[dev]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Requires **Python ≥ 3.9** and **ffmpeg / ffprobe** on your `PATH`
|
|
59
|
+
(`brew install ffmpeg`, `apt install ffmpeg`, …). Python deps: `numpy`, `scipy`.
|
|
60
|
+
|
|
61
|
+
## Quickstart
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# 1. Sync each angle to a reference camera (writes <file>.sync.json sidecars)
|
|
65
|
+
polysync sync CAM_A.mp4 CAM_B.mxf
|
|
66
|
+
polysync sync CAM_A.mp4 CAM_C.mxf
|
|
67
|
+
|
|
68
|
+
# 2. (optional) Verify the alignment — re-checks residual independently
|
|
69
|
+
polysync verify CAM_A.mp4 CAM_B.mxf CAM_B.mxf.sync.json
|
|
70
|
+
|
|
71
|
+
# 3. Build an auto-edit decision list (who's on screen each second)
|
|
72
|
+
polysync edit CAM_A.mp4 CAM_B.mxf CAM_C.mxf --out edl.json
|
|
73
|
+
|
|
74
|
+
# 4. Render — hard cuts, or with a picture-in-picture inset
|
|
75
|
+
polysync render-cuts edl.json --out final.mp4
|
|
76
|
+
polysync render-pip edl.json --out final.mp4 --pip bottom-right
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
A clip that only covers **part** of the session (a Riverside / phone / lavalier
|
|
80
|
+
recording that started mid-way):
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
polysync sync REFERENCE.mp4 PARTIAL.m4a --partial
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## How it consumes the sidecar
|
|
87
|
+
|
|
88
|
+
`delta_seconds` is the source's `t=0` in the reference's timeline (positive =
|
|
89
|
+
source starts later). To align by hand:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
ffmpeg -itsoffset $(jq -r .delta_seconds CAM_B.mxf.sync.json) -i CAM_B.mxf \
|
|
93
|
+
-i CAM_A.mp4 -filter_complex "[0:v][1:v]hstack" out.mp4
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
The `edit` / `render-*` commands read every sidecar automatically.
|
|
97
|
+
|
|
98
|
+
## Python API
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from polysync import compute_sync # pure-numpy core, unit-testable
|
|
102
|
+
from polysync.sync import sync_files # file → sidecar
|
|
103
|
+
from polysync.verify import verify_files
|
|
104
|
+
from polysync.edit import build_edl
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Status
|
|
108
|
+
|
|
109
|
+
Beta (0.1). Sync + verify are battle-tested on real Sony FX3/FX6 multicam
|
|
110
|
+
interview footage; the auto-edit is audio-energy-driven (no face detection).
|
|
111
|
+
Issues and PRs welcome.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT © 王建硕 (Jian Shuo Wang)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
polysync/__init__.py,sha256=RgITjxkGmBXYPCZf1HB8k51BhzuAlJZ18_fdArxOQs4,738
|
|
2
|
+
polysync/audio.py,sha256=ynWo_o_EyOt5oeOV3DlWbmYGKSs18ccd4Kqnffe9qqY,4838
|
|
3
|
+
polysync/cli.py,sha256=BlkblUjiS47Fv75JK09yRJpsjqBRLeX2jMmH7LaK40Q,2725
|
|
4
|
+
polysync/sidecar.py,sha256=q8S0-NndFC3cTHJu-jWEUeIStwfpnTtBejMMSmuZYbg,3833
|
|
5
|
+
polysync/sync.py,sha256=rftcvHKNjWCvW9ajxGDiwaQ9rb-Our0wErkI6Iz5BqE,8301
|
|
6
|
+
polysync/verify.py,sha256=RMCzLsX37idtOj3M0kHrc4Vuy5fkQrUKCz7YmNIuGhU,4798
|
|
7
|
+
polysync/edit/__init__.py,sha256=DbaXJlHTigztesgHv1Drq5qnOVInbVqaJk-pqvXntBQ,328
|
|
8
|
+
polysync/edit/autoedit.py,sha256=CpwaQNNgw8OeZs71DJ-se5SF4e412UPbCi0Go-dSgIc,12230
|
|
9
|
+
polysync/edit/render_cuts.py,sha256=a_r4YrF0sd16Q1nfPJe5d6V-T7dby2WnHwZl9cvFPJ4,2617
|
|
10
|
+
polysync/edit/render_pip.py,sha256=1HSDq-aNCPC1l98iBh8DOZCS-G02TtJukzunvsmXCmA,5372
|
|
11
|
+
polysync-0.1.0.dist-info/licenses/LICENSE,sha256=hcmoQh1_1s1uNIhPR-r0iRSaW5podyrLeiRSN6aQETY,1083
|
|
12
|
+
polysync-0.1.0.dist-info/METADATA,sha256=cpCI34EHIzrUlFWd_uMOdv_KuDZ_lR8YQl9S6WM1ddU,4363
|
|
13
|
+
polysync-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
14
|
+
polysync-0.1.0.dist-info/entry_points.txt,sha256=NmYIEaQREjCdPPfVab4LZfuC8AlMMS1JqaTokiyXpGo,47
|
|
15
|
+
polysync-0.1.0.dist-info/top_level.txt,sha256=GQB7IrzkxyfeEJZAvijNYqf43U6cksUB8_qhV7OfvFE,9
|
|
16
|
+
polysync-0.1.0.dist-info/RECORD,,
|