polysync 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ """Render an autoedit EDL with picture-in-picture for 1, 2, or N cameras.
2
+
3
+ - 1 cam: pass-through (no inset).
4
+ - 2 cam: main = active cam; PiP = the other cam at the same time range.
5
+ - N cam: main = active; PiP = a covered non-active cam (round-robin or first).
6
+
7
+ Per-segment EDL rows may carry a `pip` field (cam index) to override the picker.
8
+ """
9
+ import argparse
10
+ import json
11
+ import subprocess
12
+ from pathlib import Path
13
+
14
# Overlay coordinate templates for each supported PiP corner, as
# (x expression, y expression). `{m}` is filled in with the pixel margin
# before the expressions are handed to ffmpeg's overlay filter.
POSITIONS = {
    "bottom-right": ("W-w-{m}", "H-h-{m}"),
    "top-right":    ("W-w-{m}", "{m}"),
    "bottom-left":  ("{m}", "H-h-{m}"),
    "top-left":     ("{m}", "{m}"),
}
20
+
21
+
22
def pick_pip(row, K, coverage, mode="next"):
    """Choose the PiP cam for one EDL segment.

    Args:
        row: EDL row with `cam`, `start`, `end` and optionally `pip`.
        K: number of cameras (valid cam indices are 0..K-1).
        coverage: per-cam `[start, end]` windows, indexed by cam.
        mode: "next" walks the cams round-robin starting after the active
            one; any other value picks the first covered candidate.

    Returns:
        The cam index to inset, or None when no other cam covers the WHOLE
        segment. An explicit `row['pip']` always wins over the picker.

    Raises:
        ValueError: if an explicit `row['pip']` is not a valid cam index.
    """
    override = row.get("pip")
    if override is not None:
        pip_cam = int(override)
        # Fail here with a clear message instead of letting ffmpeg choke on
        # a filtergraph that references a non-existent input.
        if not 0 <= pip_cam < K:
            raise ValueError(
                "row pip=%r is not a valid cam index (expected 0..%d)"
                % (override, K - 1))
        return pip_cam

    cam = row["cam"]
    s, e = row["start"], row["end"]
    # Only cams whose coverage spans the entire segment are eligible.
    candidates = [k for k in range(K)
                  if k != cam and coverage[k][0] <= s and coverage[k][1] >= e]
    if not candidates:
        return None
    if mode == "next":
        for off in range(1, K):
            cand = (cam + off) % K
            if cand in candidates:
                return cand
    return candidates[0]
39
+
40
+
41
def render_pip(edl_path, out, encoder="hevc_videotoolbox", bitrate="12M",
               width=1920, height=1080, fps=30, pip="bottom-right",
               pip_width=480, pip_margin=24, border_px=4, pip_pick="next",
               run=True):
    """Build the ffmpeg command that renders an EDL with a PiP inset; run it.

    The plan JSON at `edl_path` must provide `inputs`, `edl`, `audio_source`
    and `duration_sec`. `deltas` (per-input `-itsoffset`, default 0) and
    `coverage` (per-input [start, end], default the full duration) are
    optional.

    Every EDL row becomes one full-frame main segment. With more than one
    input, a second cam chosen by `pick_pip` is scaled to `pip_width` wide
    (16:9), framed with a white border of `border_px` pixels when that is
    positive, and overlaid at the `pip` corner with `pip_margin` pixels of
    margin. Segments are concatenated; audio comes from input `audio_source`.

    Returns:
        The ffmpeg argv list. ffmpeg is executed only when `run` is true.

    Raises:
        KeyError: on an unknown `pip` position or a missing plan key.
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    plan = json.loads(Path(edl_path).read_text())
    inputs = plan["inputs"]
    deltas = plan.get("deltas", [0.0] * len(inputs))
    edl = plan["edl"]
    audio_src = plan["audio_source"]
    K = len(inputs)
    coverage = plan.get("coverage", [[0.0, plan["duration_sec"]]] * K)

    W, H = width, height
    pw = pip_width
    ph = round(pw * 9 / 16)
    bw = border_px
    x_expr, y_expr = POSITIONS[pip]
    x_expr = x_expr.format(m=pip_margin)
    y_expr = y_expr.format(m=pip_margin)

    cmd = ["ffmpeg", "-nostdin", "-y"]
    for src, dlt in zip(inputs, deltas):
        # -itsoffset applies to the next -i, so it must precede its input.
        if abs(dlt) > 1e-9:
            cmd.extend(["-itsoffset", "%.6f" % dlt])
        cmd.extend(["-i", src])

    filters = []
    for i, row in enumerate(edl):
        cam = row["cam"]
        s, e = row["start"], row["end"]
        # With a single cam the main chain is already the final segment.
        main_label = "m%d" % i if K > 1 else "v%d" % i
        filters.append(
            "[%d:v]trim=start=%s:end=%s,setpts=PTS-STARTPTS,"
            "scale=%d:%d:force_original_aspect_ratio=decrease,"
            "pad=%d:%d:(ow-iw)/2:(oh-ih)/2,setsar=1,fps=%d[%s]"
            % (cam, s, e, W, H, W, H, fps, main_label)
        )
        if K == 1:
            continue
        pip_cam = pick_pip(row, K, coverage, mode=pip_pick)
        if pip_cam is None:
            # No other cam covers this segment: pass the main frame through.
            filters.append("[m%d]copy[v%d]" % (i, i))
            continue
        chain = (
            "[%d:v]trim=start=%s:end=%s,setpts=PTS-STARTPTS,"
            "scale=%d:%d:force_original_aspect_ratio=decrease,"
            "pad=%d:%d:(ow-iw)/2:(oh-ih)/2,"
            % (pip_cam, s, e, pw, ph, pw, ph)
        )
        if bw > 0:
            chain += "pad=%d:%d:%d:%d:white," % (pw + 2 * bw, ph + 2 * bw, bw, bw)
        chain += "setsar=1,fps=%d[p%d]" % (fps, i)
        filters.append(chain)
        filters.append("[m%d][p%d]overlay=%s:%s:eof_action=pass[v%d]"
                       % (i, i, x_expr, y_expr, i))

    concat = "".join("[v%d]" % i for i in range(len(edl)))
    filters.append("%sconcat=n=%d:v=1:a=0[vout]" % (concat, len(edl)))
    audio_offset = edl[0]["start"] if edl else 0.0
    dur = plan["duration_sec"]
    fc = ";".join(filters)
    fc += (";[%d:a:0]atrim=start=%s:duration=%s,asetpts=PTS-STARTPTS[aout]"
           % (audio_src, audio_offset, dur))

    video_opts = ["-c:v", encoder, "-b:v", bitrate]
    # `hvc1` is an HEVC sample-entry tag. Forcing it onto a non-HEVC encoder
    # (e.g. libx264) mislabels the stream, so only tag HEVC output.
    enc_name = str(encoder).lower()
    if "hevc" in enc_name or "265" in enc_name:
        video_opts.extend(["-tag:v", "hvc1"])

    cmd.extend(["-filter_complex", fc,
                "-map", "[vout]", "-map", "[aout]",
                "-t", str(dur)])
    cmd.extend(video_opts)
    cmd.extend(["-c:a", "aac", "-b:a", "192k",
                "-movflags", "+faststart", str(out)])
    if run:
        print("PiP %dx%d, inset %dx%d (+%dpx) at %s; %d cams; %d segments"
              % (W, H, pw, ph, bw, pip, K, len(edl)))
        subprocess.run(cmd, check=True)
    return cmd
117
+
118
+
119
def main(argv=None):
    """Command-line entry point: parse `render-pip` options and render.

    `argv` defaults to the process arguments when None. Every option maps
    one-to-one onto a keyword argument of `render_pip`.
    """
    parser = argparse.ArgumentParser(prog="polysync render-pip")
    parser.add_argument("edl", type=Path)
    parser.add_argument("--out", type=Path, required=True)
    for flag, default in (("--encoder", "hevc_videotoolbox"),
                          ("--bitrate", "12M")):
        parser.add_argument(flag, default=default)
    for flag, default in (("--width", 1920), ("--height", 1080),
                          ("--fps", 30)):
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--pip", choices=list(POSITIONS),
                        default="bottom-right")
    for flag, default in (("--pip-width", 480), ("--pip-margin", 24),
                          ("--border-px", 4)):
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--pip-pick", choices=["next", "second-best"],
                        default="next")

    # argparse dest names (dashes -> underscores) match render_pip's keywords.
    options = vars(parser.parse_args(argv))
    edl_file = options.pop("edl")
    out_file = options.pop("out")
    render_pip(edl_file, out_file, **options)
138
+
139
+
140
# Script entry: run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()
polysync/sidecar.py ADDED
@@ -0,0 +1,88 @@
1
+ """`.sync.json` sidecar — the reversible alignment record polysync emits.
2
+
3
+ One sidecar per original input, written next to it as `<input>.sync.json`.
4
+ Originals are never modified. Downstream aligns the source to the reference
5
+ timeline with `ffmpeg -itsoffset delta_seconds` (optionally
6
+ `atempo=1+drift_slope` for long-form lip-sync).
7
+ """
8
+ import json
9
+ from pathlib import Path
10
+
11
# Version stamped into every sidecar as `schema_version`; readers warn when
# a file carries a different value.
SCHEMA_VERSION = 1
12
+
13
+
14
def sidecar_path(media_path):
    """Return where a media file's sidecar lives: `<original>.sync.json`,
    in the same directory as the original."""
    original = Path(media_path)
    return original.with_name(original.name + ".sync.json")
18
+
19
+
20
def write_sidecar(media_path, *, source, reference, delta_seconds, drift_slope,
                  overlap_in_reference, overlap_in_source, verification=None):
    """Serialize the alignment record for `media_path` and return its Path.

    The sidecar is written to `sidecar_path(media_path)` as indented JSON.
    Numeric fields are coerced to float; both overlap arguments are read as
    two-element sequences. `_about` and `_help` are human-readable
    explanations embedded in the file itself.
    """
    # NOTE(review): write_text uses the platform default encoding while the
    # JSON is dumped with ensure_ascii=False; consider pinning UTF-8 here
    # together with every reader of the sidecar.
    about = (
        "Sync metadata for %s (aligned to %s). Generated by polysync. "
        "Originals are not modified; downstream uses "
        "ffmpeg -itsoffset delta_seconds to align." % (source, reference)
    )
    field_help = {
        "delta_seconds": (
            "Source's t=0 expressed in reference's timeline. "
            "Positive => source starts after reference. "
            "Apply via `ffmpeg -itsoffset <delta_seconds> -i <source>`."
        ),
        "drift_slope": (
            "Residual clock drift between source and reference clocks "
            "(dimensionless, ~1e-5 typical). For camera-cut editing, "
            "ignore. For sync-sound / long-form lip-sync apply "
            "atempo=1+drift_slope to the source."
        ),
        "overlap_in_reference": (
            "[start, end] window in reference's timeline where BOTH source "
            "and reference have valid content. Use to constrain trims/EDLs."
        ),
        "overlap_in_source": (
            "Same window in the source's local timeline "
            "(= overlap_in_reference shifted by -delta_seconds)."
        ),
        "verification": (
            "Filled in by `polysync verify`: median_residual_ms, "
            "residual_spread_ms, probe_count. None until verify runs."
        ),
    }

    # Key order is the order the fields appear in the written file.
    record = {"_about": about, "_help": field_help}
    record["schema_version"] = SCHEMA_VERSION
    record["source"] = source
    record["reference"] = reference
    record["delta_seconds"] = float(delta_seconds)
    record["drift_slope"] = float(drift_slope)
    record["overlap_in_reference"] = [float(overlap_in_reference[0]),
                                      float(overlap_in_reference[1])]
    record["overlap_in_source"] = [float(overlap_in_source[0]),
                                   float(overlap_in_source[1])]
    record["verification"] = verification

    target = sidecar_path(media_path)
    target.write_text(json.dumps(record, indent=2, ensure_ascii=False))
    return target
68
+
69
+
70
def read_sidecar(input_path):
    """Load `<input>.sync.json` for `input_path`.

    Returns `(delta_seconds, overlap_in_reference, has_sidecar)`. When the
    sidecar is missing or cannot be parsed the result is `(0.0, None, False)`,
    which callers treat as 'this input is at reference t=0, full coverage'.
    A schema-version mismatch only prints a warning; the file is still read.
    """
    missing = (0.0, None, False)
    location = sidecar_path(input_path)
    if not location.exists():
        return missing
    try:
        record = json.loads(location.read_text())
        if record.get("schema_version") != SCHEMA_VERSION:
            print("WARN: %s schema_version != %d; reading anyway"
                  % (location.name, SCHEMA_VERSION))
        delta = float(record["delta_seconds"])
        window = record.get("overlap_in_reference")
        if window:
            window = (float(window[0]), float(window[1]))
        else:
            window = None
        return (delta, window, True)
    except Exception as err:  # noqa: BLE001 — best-effort, never crash the edit
        print("WARN: failed to parse %s: %s; using delta=0"
              % (location.name, err))
        return missing
polysync/sync.py ADDED
@@ -0,0 +1,206 @@
1
+ """Compute the time offset between two recordings of the same event.
2
+
3
+ Algorithm (envelope cross-correlation + multi-probe drift fit):
4
+ 1. Log-energy envelope of each signal (audio.log_envelope), high-passed.
5
+ 2. FFT cross-correlate envelopes end-to-end -> coarse offset (~10 ms).
6
+ 3. Refine at sample level with 60 s probes near the coarse position,
7
+ parabolic peak interpolation.
8
+ 4. Linear-fit delta(t) across probes -> clock drift; report the
9
+ midpoint-canonical offset so residual error is symmetric around zero.
10
+
11
+ Two failure philosophies, selected by `partial`:
12
+ - partial=False (full-overlap multicam): demand >=3 good probes or raise
13
+ SyncError. Too few good matches almost always means the wrong files.
14
+ - partial=True (a source covering only part of the reference): degrade
15
+ gracefully — median delta on few probes, coarse delta if none.
16
+
17
+ `compute_sync` works on numpy PCM arrays (unit-testable, no ffmpeg). `sync_files`
18
+ is the file/CLI layer that extracts audio and writes sidecars.
19
+ """
20
+ import sys
21
+ import tempfile
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ from scipy import signal
26
+
27
+ from . import audio
28
+ from .sidecar import write_sidecar
29
+
30
# Sample rate used for sync. 8 kHz is enough: the envelope is what matters,
# not high-frequency content.
SR = 8000
# A probe counts as "good" when its normalized correlation exceeds this.
GOOD_NCOEF = 0.05
32
+
33
+
34
class SyncError(Exception):
    """Signals that sync could not find enough evidence to trust a result
    (e.g. too few good probes in full-overlap mode)."""
36
+
37
+
38
class SyncResult(object):
    """Result record returned by compute_sync.

    `delta_seconds` places the source's t=0 on the reference timeline
    (positive => source starts after reference). `fallback` is None for a
    full drift fit, or "median" / "coarse" when a degraded estimate was used.
    """

    def __init__(self, delta_seconds, drift_slope, coarse_corr,
                 n_probes, n_good, fallback):
        # Coerce numeric inputs (e.g. numpy scalars) to plain Python numbers.
        self.delta_seconds, self.drift_slope, self.coarse_corr = (
            float(delta_seconds), float(drift_slope), float(coarse_corr))
        self.n_probes, self.n_good = int(n_probes), int(n_good)
        self.fallback = fallback  # None | "median" | "coarse"

    def __repr__(self):
        template = ("SyncResult(delta=%(delta).6f, drift=%(drift).3e, "
                    "coarse_corr=%(corr).3f, probes=%(probes)d, "
                    "good=%(good)d, fallback=%(fallback)r)")
        return template % {
            "delta": self.delta_seconds,
            "drift": self.drift_slope,
            "corr": self.coarse_corr,
            "probes": self.n_probes,
            "good": self.n_good,
            "fallback": self.fallback,
        }
56
+
57
+
58
def coarse_offset(env_a, env_b, env_sr):
    """Cross-correlate two envelopes and return `(delta, normalized_corr)`.

    `delta` is tA - tB in seconds (so A_t = B_t + delta), taken at the peak
    of the full FFT cross-correlation. `normalized_corr` is the peak value
    divided by `len(env_b)`.
    """
    norm_a = audio.normalize(env_a)
    norm_b = audio.normalize(env_b)
    corr = signal.correlate(norm_a, norm_b, mode="full", method="fft")
    peak = int(np.argmax(corr))
    # Index 0 of a "full" correlation corresponds to lag -(len(b) - 1).
    lag = peak - (len(norm_b) - 1)
    return float(lag / env_sr), float(corr[peak] / len(env_b))
66
+
67
+
68
def _refine(a, b, b_start_s, expected_delta, sr, probe_len_s=60.0, pad_s=1.5):
    """Refine the offset with one probe cut from `b` at `b_start_s`.

    The probe (`probe_len_s` seconds of `b`) is correlated against a window
    of `a` centred on the expected position and widened by `pad_s` on each
    side. Returns `(delta, normalized_corr)` with a parabolic sub-sample
    correction around the peak, or None when the probe or the search window
    does not fit inside the arrays.
    """
    probe_n = int(probe_len_s * sr)
    probe_lo = int(b_start_s * sr)
    if probe_lo + probe_n > len(b):
        return None
    probe = b[probe_lo:probe_lo + probe_n].astype(np.float32)

    a_center = b_start_s + expected_delta
    win_lo = max(0, int((a_center - pad_s) * sr))
    win_hi = min(len(a), int((a_center + pad_s + probe_len_s) * sr))
    if win_hi - win_lo < probe_n:
        return None
    window = a[win_lo:win_hi].astype(np.float32)

    corr = signal.correlate(audio.normalize(window), audio.normalize(probe),
                            mode="valid", method="fft")
    peak = int(np.argmax(np.abs(corr)))
    strength = corr[peak] / len(probe)

    # Parabola through the peak and its two neighbours gives the
    # sub-sample shift; skipped at the edges or when the curve is flat.
    frac = 0.0
    if 0 < peak < len(corr) - 1:
        left, mid, right = corr[peak - 1], corr[peak], corr[peak + 1]
        curvature = (left - 2 * mid + right)
        if abs(curvature) > 1e-9:
            frac = 0.5 * (left - right) / curvature

    a_pos = (win_lo + peak + frac) / sr
    return float(a_pos - b_start_s), float(strength)
92
+
93
+
94
def _multi_probe(a, b, expected_delta, b_dur, a_dur, sr, step_s=180.0):
    """Run `_refine` at regular positions along `b`.

    Probe starts run from 60 s to `b_dur - 60` s in `step_s` increments;
    positions whose search window would fall outside `a` are skipped.
    Returns a list of `(b_start_s, delta, normalized_corr)` tuples.
    """
    hits = []
    for start in np.arange(60.0, b_dur - 60.0, step_s):
        center = start + expected_delta
        if center < 1.5 or center + 60.0 + 1.5 > a_dur:
            continue
        refined = _refine(a, b, start, expected_delta, sr)
        if refined:
            hits.append((start, refined[0], refined[1]))
    return hits
104
+
105
+
106
def compute_sync(a, b, a_dur, b_dur, sr=SR, partial=False, verbose=False):
    """Align PCM array `b` (source) to `a` (reference) and return a SyncResult.

    `a` and `b` are mono arrays sampled at `sr` Hz; `a_dur` / `b_dur` are
    their durations in seconds, passed explicitly so callers can use the
    container duration.

    With at least 3 good probes, a line is fitted through the probe deltas
    and the offset is reported at the midpoint of `b`, with the slope as
    drift. Otherwise full-overlap mode raises SyncError, while partial mode
    falls back to the median probe delta, or the coarse delta if no probe
    ran.
    """
    env_ref, env_rate = audio.log_envelope(a, sr)
    env_src, _ = audio.log_envelope(b, sr)
    coarse_d, coarse_v = coarse_offset(env_ref, env_src, env_rate)
    if verbose:
        print(" coarse delta = %+.4fs (xc/N=%.3f)" % (coarse_d, coarse_v))

    probes = _multi_probe(a, b, coarse_d, b_dur, a_dur, sr)
    good = np.array([abs(p[2]) > GOOD_NCOEF for p in probes], dtype=bool)
    n_good = int(good.sum())
    if verbose:
        print(" good probes: %d / %d" % (n_good, len(probes)))

    if n_good >= 3:
        starts = np.array([p[0] for p in probes])
        deltas = np.array([p[1] for p in probes])
        slope, intercept = np.polyfit(starts[good], deltas[good], 1)
        # Evaluate the fit at the source midpoint so residuals are symmetric.
        midpoint_delta = float(slope * (b_dur / 2) + intercept)
        return SyncResult(midpoint_delta, float(slope), coarse_v,
                          len(probes), n_good, None)

    if not partial:
        raise SyncError(
            "too few good probes (%d < 3); sync unreliable. If this is a "
            "short partial-coverage clip, use partial=True / --partial."
            % n_good)

    if not probes:
        return SyncResult(float(coarse_d), 0.0, coarse_v, 0, 0, "coarse")
    median_delta = float(np.median([p[1] for p in probes]))
    return SyncResult(median_delta, 0.0, coarse_v, len(probes),
                      n_good, "median")
142
+
143
+
144
def _overlap(delta, a_dur, b_dur):
    """Return the shared window as `(in_reference, in_source)` pairs.

    `in_reference` is `(start, end)` on the reference timeline, clamped to
    `[0, a_dur]`; `in_source` is the same window shifted by `-delta`.
    """
    in_reference = (max(0.0, delta), min(a_dur, delta + b_dur))
    in_source = (in_reference[0] - delta, in_reference[1] - delta)
    return in_reference, in_source
148
+
149
+
150
def sync_files(reference, source, partial=False, verbose=True):
    """Sync `source` to `reference` from their audio and write sidecar(s).

    Audio from both files is extracted to temporary PCM at `SR` Hz and
    passed to `compute_sync`. Full-overlap mode writes a sidecar for BOTH
    inputs (the reference gets delta=0); partial mode writes only the
    source's. Returns the source's sidecar Path.

    Raises SyncError when the resulting overlap window is shorter than 1 s
    (and whatever `compute_sync` raises).
    """
    reference, source = Path(reference), Path(source)
    a_dur = audio.media_duration(reference)
    b_dur = audio.media_duration(source)
    if verbose:
        mode_name = "partial-coverage" if partial else "full-overlap"
        print("Mode: %s" % mode_name)
        print("A (reference): %s duration=%.3fs" % (reference.name, a_dur))
        print("B (source): %s duration=%.3fs" % (source.name, b_dur))

    with tempfile.TemporaryDirectory() as td:
        workdir = Path(td)
        a_pcm = workdir / "a.pcm"
        b_pcm = workdir / "b.pcm"
        if verbose:
            print("Extracting mono PCM @ %d Hz..." % SR)
        audio.extract_pcm(reference, a_pcm, SR)
        audio.extract_pcm(source, b_pcm, SR)
        a = audio.read_pcm(a_pcm)
        b = audio.read_pcm(b_pcm)
        res = compute_sync(a, b, a_dur, b_dur, sr=SR, partial=partial,
                           verbose=verbose)

    if verbose:
        if res.coarse_corr < 0.3:
            print(" WARNING: low coarse correlation; sync may be unreliable.",
                  file=sys.stderr)
        summary = " delta=%+.6fs drift=%+.3e" % (res.delta_seconds,
                                                 res.drift_slope)
        if res.fallback:
            summary += " (fallback: %s)" % res.fallback
        print(summary)

    ref_ovl, src_ovl = _overlap(res.delta_seconds, a_dur, b_dur)
    if ref_ovl[1] - ref_ovl[0] < 1.0:
        raise SyncError("overlap window <1s; the two recordings barely share "
                        "content (delta=%.3fs)" % res.delta_seconds)

    src_sc = write_sidecar(
        source, source=source.name, reference=reference.name,
        delta_seconds=res.delta_seconds, drift_slope=res.drift_slope,
        overlap_in_reference=ref_ovl, overlap_in_source=src_ovl,
    )
    if verbose:
        print("Wrote %s" % src_sc)

    if partial:
        return src_sc

    # Full-overlap: the reference also gets a sidecar, aligned to itself.
    ref_sc = write_sidecar(
        reference, source=reference.name, reference=reference.name,
        delta_seconds=0.0, drift_slope=0.0,
        overlap_in_reference=ref_ovl, overlap_in_source=ref_ovl,
    )
    if verbose:
        print("Wrote %s" % ref_sc)
    return src_sc
polysync/verify.py ADDED
@@ -0,0 +1,118 @@
1
+ """Independent residual check for a (reference, source, sidecar) triple.
2
+
3
+ Re-extracts audio from BOTH originals NATIVELY (loudest stream, no ffmpeg
4
+ offset) and runs multi-probe cross-correlation inside the overlap window,
5
+ applying the sidecar's `delta_seconds` (and, with `apply_drift`, the slope) as
6
+ index arithmetic in numpy.
7
+
8
+ Why index arithmetic and not `ffmpeg -itsoffset`: `-itsoffset` shifts input
9
+ timestamps, but a headerless raw stream (`-f s16le`) has no timestamps to carry
10
+ the offset — ffmpeg silently drops it and inserts NO leading silence. Relying
11
+ on it lines the source's t=0 up with the reference's t=0 regardless of delta,
12
+ so every probe correlates the wrong region, peaks land in noise (ncoef ~0), and
13
+ verification falsely FAILs. Shifting indices ourselves matches exactly how the
14
+ offset was computed.
15
+
16
+ PASS = |median_residual_ms| <= 15 AND residual_spread_ms <= 1 frame at target fps.
17
+ A spread-only fail with a near-zero median is usually far-field-mic noise on a
18
+ wide/B-roll camera, not real desync — for camera-cut editing, trust the median.
19
+ """
20
+ import json
21
+ import sys
22
+ import tempfile
23
+ from pathlib import Path
24
+
25
+ import numpy as np
26
+ from scipy import signal
27
+
28
+ from . import audio
29
+
30
# Sample rate (Hz) used when extracting PCM for verification.
SR = 8000
31
+
32
+
33
def verify_files(reference, source, sidecar, probe_len=60.0, step=600.0,
                 max_frame_ms=33.33, apply_drift=False, verbose=True):
    """Re-measure the residual offset and store it in the sidecar.

    Probes of `probe_len` seconds are cut from the source every `step`
    seconds of reference time inside the sidecar's overlap window, shifted
    by `delta_seconds` (and rescaled by the drift slope when `apply_drift`
    is set), and cross-correlated against the reference with 0.5 s of slack
    on each side.

    The check passes when |median residual| <= 15 ms and the residual spread
    is <= `max_frame_ms`. The stats are written to the sidecar's
    `verification` field whether or not the check passes.

    Returns:
        (passed: bool, stats: dict)

    Raises:
        ValueError: if the overlap window is too short to probe, or no probe
            was usable.
    """
    reference, source, sidecar = Path(reference), Path(source), Path(sidecar)
    record = json.loads(sidecar.read_text())
    delta = float(record["delta_seconds"])
    drift_slope = float(record.get("drift_slope", 0.0))
    overlap_ref = record["overlap_in_reference"]

    if verbose:
        drift_state = "applied" if apply_drift else "not applied"
        print("Reference: %s" % reference.name)
        print("Source: %s" % source.name)
        print("delta_seconds = %+.6f drift_slope = %+.3e (%s)"
              % (delta, drift_slope, drift_state))

    with tempfile.TemporaryDirectory() as td:
        workdir = Path(td)
        ref_pcm = workdir / "ref.pcm"
        src_pcm = workdir / "src.pcm"
        audio.extract_pcm(reference, ref_pcm, SR)
        audio.extract_pcm(source, src_pcm, SR)
        ref = audio.read_pcm(ref_pcm)
        src = audio.read_pcm(src_pcm)

    probe_n = int(probe_len * SR)
    pad_n = int(0.5 * SR)
    first_t = max(60.0, float(overlap_ref[0]) + 1.0)
    last_t = float(overlap_ref[1]) - probe_len - 1.0
    if last_t <= first_t:
        raise ValueError("overlap window too short to verify")

    results = []
    for ref_t in np.arange(first_t, last_t, step):
        # Source-local time corresponding to this reference time.
        src_t = ref_t - delta
        if apply_drift:
            src_t = src_t / (1.0 + drift_slope)
        src_i = int(src_t * SR)
        ref_i = int(ref_t * SR)
        if src_i < 0 or src_i + probe_n > len(src):
            continue
        probe = src[src_i:src_i + probe_n]
        if np.abs(probe).mean() < 1.0:
            continue  # silence — nothing to correlate
        lo = max(0, ref_i - pad_n)
        hi = min(len(ref), ref_i + probe_n + pad_n)
        if hi - lo < probe_n:
            continue
        window = ref[lo:hi].astype(np.float32)
        corr = signal.correlate(audio.normalize(window),
                                audio.normalize(probe.astype(np.float32)),
                                mode="valid", method="fft")
        peak = int(np.argmax(np.abs(corr)))
        ncoef = float(corr[peak] / len(probe))
        # Where the probe landed in the reference vs. where it should be.
        residual_ms = ((lo + peak) / SR - ref_t) * 1000
        results.append((ref_t, residual_ms, ncoef))
        if verbose:
            print("t=%7.1fs residual=%+7.2f ms ncoef=%+.3f"
                  % (ref_t, residual_ms, ncoef))

    if not results:
        raise ValueError("no usable probes (all silence or out of overlap)")

    residuals = np.array([item[1] for item in results])
    median_residual_ms = float(np.median(residuals))
    # Spread = twice the largest deviation from the median.
    residual_spread_ms = float(
        np.max(np.abs(residuals - median_residual_ms)) * 2)
    median_ok = abs(median_residual_ms) <= 15.0
    passed = median_ok and residual_spread_ms <= max_frame_ms

    stats = {
        "median_residual_ms": round(median_residual_ms, 3),
        "residual_spread_ms": round(residual_spread_ms, 3),
        "probe_count": len(results),
        "drift_applied": bool(apply_drift),
    }
    record["verification"] = stats
    sidecar.write_text(json.dumps(record, indent=2, ensure_ascii=False))

    if verbose:
        verdict = "PASS" if passed else "FAIL"
        print("\nResidual: median=%+.2f ms spread=+-%.2f ms -> %s"
              % (median_residual_ms, residual_spread_ms / 2, verdict))
        if not passed and median_ok:
            print(" (spread-only fail with near-zero median is usually "
                  "far-field-mic noise, not real desync)", file=sys.stderr)
    return passed, stats
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: polysync
3
+ Version: 0.1.0
4
+ Summary: Multicam audio sync and director-style auto-edit — align N angles of one event by audio cross-correlation, then cut/PiP them into one MP4. Reversible sidecars, never re-encodes the originals.
5
+ Author: 王建硕 (Jian Shuo Wang)
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/jianshuo/polysync
8
+ Project-URL: Issues, https://github.com/jianshuo/polysync/issues
9
+ Keywords: multicam,audio-sync,video-editing,cross-correlation,ffmpeg,picture-in-picture,podcast,interview
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Topic :: Multimedia :: Video
17
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Analysis
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: numpy>=1.21
22
+ Requires-Dist: scipy>=1.7
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # polysync
28
+
29
+ **Multicam audio sync + director-style auto-edit.** Align N recordings of the
30
+ same event by audio cross-correlation, then cut or picture-in-picture them into
31
+ a single MP4 — driven entirely by who's talking.
32
+
33
+ What makes it different from "yet another sync tool":
34
+
35
+ - **Reversible sidecars, never re-encodes the originals.** Sync writes a tiny
36
+ `<input>.sync.json` next to each file holding a single offset. A 75-min 4K
37
+ 3-camera shoot is 250+ GB; baking offsets into re-encoded copies would double
38
+ that and lose quality. Downstream applies the offset with `ffmpeg -itsoffset`
39
+ at consume time. Originals are touched read-only, always.
40
+ - **Envelope cross-correlation, not raw waveform.** Matches the log-energy
41
+ envelope, which both mics hear regardless of their frequency response — robust
42
+ even when a second camera's on-board mic sounds nothing like the main one.
43
+ - **Clock-drift aware.** Cheap recorders drift 5–50 ppm; polysync fits the drift
44
+ across the recording and reports it separately, so long-form lip-sync can
45
+ correct it while camera-cut editing can ignore it.
46
+ - **Handles the messy real cases.** Auto-picks the loudest audio track (pro
47
+ cameras often leave track 1 dead), partial-coverage clips that only span part
48
+ of the session, and independent verification of the result.
49
+
50
+ ## Install
51
+
52
+ ```bash
53
+ pip install polysync # once published
54
+ # or, from a checkout:
55
+ pip install -e ".[dev]"
56
+ ```
57
+
58
+ Requires **Python ≥ 3.9** and **ffmpeg / ffprobe** on your `PATH`
59
+ (`brew install ffmpeg`, `apt install ffmpeg`, …). Python deps: `numpy`, `scipy`.
60
+
61
+ ## Quickstart
62
+
63
+ ```bash
64
+ # 1. Sync each angle to a reference camera (writes <file>.sync.json sidecars)
65
+ polysync sync CAM_A.mp4 CAM_B.mxf
66
+ polysync sync CAM_A.mp4 CAM_C.mxf
67
+
68
+ # 2. (optional) Verify the alignment — re-checks residual independently
69
+ polysync verify CAM_A.mp4 CAM_B.mxf CAM_B.mxf.sync.json
70
+
71
+ # 3. Build an auto-edit decision list (who's on screen each second)
72
+ polysync edit CAM_A.mp4 CAM_B.mxf CAM_C.mxf --out edl.json
73
+
74
+ # 4. Render — hard cuts, or with a picture-in-picture inset
75
+ polysync render-cuts edl.json --out final.mp4
76
+ polysync render-pip edl.json --out final.mp4 --pip bottom-right
77
+ ```
78
+
79
+ A clip that only covers **part** of the session (a Riverside / phone / lavalier
80
+ recording that started mid-way):
81
+
82
+ ```bash
83
+ polysync sync REFERENCE.mp4 PARTIAL.m4a --partial
84
+ ```
85
+
86
+ ## How it consumes the sidecar
87
+
88
+ `delta_seconds` is the source's `t=0` in the reference's timeline (positive =
89
+ source starts later). To align by hand:
90
+
91
+ ```bash
92
+ ffmpeg -itsoffset $(jq -r .delta_seconds CAM_B.mxf.sync.json) -i CAM_B.mxf \
93
+ -i CAM_A.mp4 -filter_complex "[0:v][1:v]hstack" out.mp4
94
+ ```
95
+
96
+ The `edit` / `render-*` commands read every sidecar automatically.
97
+
98
+ ## Python API
99
+
100
+ ```python
101
+ from polysync import compute_sync # pure-numpy core, unit-testable
102
+ from polysync.sync import sync_files # file → sidecar
103
+ from polysync.verify import verify_files
104
+ from polysync.edit import build_edl
105
+ ```
106
+
107
+ ## Status
108
+
109
+ Beta (0.1). Sync + verify are battle-tested on real Sony FX3/FX6 multicam
110
+ interview footage; the auto-edit is audio-energy-driven (no face detection).
111
+ Issues and PRs welcome.
112
+
113
+ ## License
114
+
115
+ MIT © 王建硕 (Jian Shuo Wang)
@@ -0,0 +1,16 @@
1
+ polysync/__init__.py,sha256=RgITjxkGmBXYPCZf1HB8k51BhzuAlJZ18_fdArxOQs4,738
2
+ polysync/audio.py,sha256=ynWo_o_EyOt5oeOV3DlWbmYGKSs18ccd4Kqnffe9qqY,4838
3
+ polysync/cli.py,sha256=BlkblUjiS47Fv75JK09yRJpsjqBRLeX2jMmH7LaK40Q,2725
4
+ polysync/sidecar.py,sha256=q8S0-NndFC3cTHJu-jWEUeIStwfpnTtBejMMSmuZYbg,3833
5
+ polysync/sync.py,sha256=rftcvHKNjWCvW9ajxGDiwaQ9rb-Our0wErkI6Iz5BqE,8301
6
+ polysync/verify.py,sha256=RMCzLsX37idtOj3M0kHrc4Vuy5fkQrUKCz7YmNIuGhU,4798
7
+ polysync/edit/__init__.py,sha256=DbaXJlHTigztesgHv1Drq5qnOVInbVqaJk-pqvXntBQ,328
8
+ polysync/edit/autoedit.py,sha256=CpwaQNNgw8OeZs71DJ-se5SF4e412UPbCi0Go-dSgIc,12230
9
+ polysync/edit/render_cuts.py,sha256=a_r4YrF0sd16Q1nfPJe5d6V-T7dby2WnHwZl9cvFPJ4,2617
10
+ polysync/edit/render_pip.py,sha256=1HSDq-aNCPC1l98iBh8DOZCS-G02TtJukzunvsmXCmA,5372
11
+ polysync-0.1.0.dist-info/licenses/LICENSE,sha256=hcmoQh1_1s1uNIhPR-r0iRSaW5podyrLeiRSN6aQETY,1083
12
+ polysync-0.1.0.dist-info/METADATA,sha256=cpCI34EHIzrUlFWd_uMOdv_KuDZ_lR8YQl9S6WM1ddU,4363
13
+ polysync-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
14
+ polysync-0.1.0.dist-info/entry_points.txt,sha256=NmYIEaQREjCdPPfVab4LZfuC8AlMMS1JqaTokiyXpGo,47
15
+ polysync-0.1.0.dist-info/top_level.txt,sha256=GQB7IrzkxyfeEJZAvijNYqf43U6cksUB8_qhV7OfvFE,9
16
+ polysync-0.1.0.dist-info/RECORD,,