plaudio 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
plaudio/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """Plaudio: voice-bank-first speaker labelling for the Plaud Note family."""
2
+ __version__ = "0.1.0"
@@ -0,0 +1 @@
1
+ """Command-line interface."""
@@ -0,0 +1 @@
1
+ """Subcommand handlers."""
@@ -0,0 +1,74 @@
1
+ """`plaudio clean TRANSCRIPT_MD [--corrections FILE]`"""
2
+ from __future__ import annotations
3
+ import argparse, pathlib, re, sys
4
+ from plaudio.core.clean import (
5
+ load_corrections, apply_corrections, strip_silence_fillers,
6
+ merge_fragments, reflow_speaker_paragraphs, fmt_ts, wrap_para,
7
+ )
8
+
9
# Matches the "## Transcript" heading line that opens the transcript section.
TRANSCRIPT_RE = re.compile(r"^## Transcript\s*$", re.M)
# One transcript segment of the form "**[MM:SS-MM:SS] Speaker:** text".
# The range separator may be an en dash, an em dash or a hyphen. The text group
# runs across newlines until the next line that starts with "**[" or the end
# of the searched string.
SEG_RE = re.compile(
    r"\*\*\[(\d+):(\d+)[–—\-](\d+):(\d+)\] ([^*:]+):\*\* "
    r"((?:.|\n(?!\*\*\[))+?)(?=\n\*\*\[|\Z)",
    re.MULTILINE,
)
# A "## " heading whose title does not start with "Transcript"; used to find
# where the transcript section ends.
NEXT_SECTION_RE = re.compile(r"^## (?!Transcript)", re.M)
16
+
17
def parse_transcript_block(content: str):
    """Extract the segments of the ``## Transcript`` section of *content*.

    Returns ``(segments, start, end)``. ``start`` is the offset of the
    transcript heading and ``end`` the offset of the next ``## `` section (or
    the end of *content*). Each segment is a dict with ``speaker``,
    ``t_start`` / ``t_end`` (seconds) and whitespace-normalised ``text``.
    When no transcript heading exists the result is ``([], -1, -1)``.
    """
    heading = TRANSCRIPT_RE.search(content)
    if heading is None:
        return [], -1, -1

    body_start = heading.end()
    remainder = content[body_start:]
    following = NEXT_SECTION_RE.search(remainder)
    body_len = following.start() if following else len(remainder)

    segments = [
        {
            "speaker": speaker.strip(),
            "t_start": int(m1) * 60 + int(s1),
            "t_end": int(m2) * 60 + int(s2),
            "text": " ".join(text.split()),
        }
        for m1, s1, m2, s2, speaker, text in (
            hit.groups() for hit in SEG_RE.finditer(remainder[:body_len])
        )
    ]
    return segments, heading.start(), body_start + body_len
35
+
36
def render_transcript(segments, meta_line):
    """Render *segments* back into a ``## Transcript`` markdown section.

    The section starts with the heading, a blank line, *meta_line* and another
    blank line. Every segment with non-blank text becomes one
    ``**[start–end] speaker:** text`` paragraph followed by a blank line.
    """
    out = ["## Transcript", "", meta_line, ""]
    for seg in segments:
        if seg["text"].strip():
            span = f"{fmt_ts(seg['t_start'])}–{fmt_ts(seg['t_end'])}"
            out += [f"**[{span}] {seg['speaker']}:** {wrap_para(seg['text'])}", ""]
    return "\n".join(out)
44
+
45
def cmd_clean(args: argparse.Namespace) -> int:
    """Clean the transcript section of a markdown file in place.

    Applies the optional corrections, strips silence fillers, collapses
    whitespace, merges fragments and reflows speaker paragraphs, then rewrites
    the ``## Transcript`` section and appends a ``<!-- cleaned: true -->``
    marker. A file that already carries the marker is skipped unless
    ``args.force`` is set.

    Returns 0 on success or skip, 1 when no transcript segments are found and
    2 when the file does not exist.
    """
    path = pathlib.Path(args.path).expanduser()
    if not path.exists():
        print(f"not found: {path}", file=sys.stderr)
        return 2
    corrections = (
        load_corrections(pathlib.Path(args.corrections).expanduser())
        if args.corrections
        else []
    )
    # Explicit UTF-8: the transcript contains en dashes and may contain
    # non-Latin text, so the locale default encoding must not decide.
    content = path.read_text(encoding="utf-8")
    if "<!-- cleaned: true -->" in content and not args.force:
        print(f" skip (already cleaned): {path.name}")
        return 0
    segments, start, end = parse_transcript_block(content)
    if not segments:
        print(f" no transcript block found: {path.name}", file=sys.stderr)
        return 1
    for seg in segments:
        seg["text"] = apply_corrections(seg["text"], corrections)
        seg["text"] = strip_silence_fillers(seg["text"])
        seg["text"] = re.sub(r"\s+", " ", seg["text"]).strip()
    segments = merge_fragments(segments)
    segments = reflow_speaker_paragraphs(segments)
    meta_line = "*Local pipeline transcript. Cleaned for readability.*"
    new_block = render_transcript(segments, meta_line) + "\n<!-- cleaned: true -->\n"
    # Only the span between the transcript heading and the next section is replaced.
    path.write_text(content[:start] + new_block + content[end:], encoding="utf-8")
    print(f" cleaned: {path.name}")
    return 0
68
+
69
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``clean`` subcommand and its options to *sub*."""
    parser = sub.add_parser("clean", help="clean a transcript markdown file")
    parser.add_argument("path")
    parser.add_argument(
        "--corrections",
        help="path to a JSON list of [wrong, right] pairs",
    )
    parser.add_argument("--force", action="store_true")
    parser.set_defaults(func=cmd_clean)
@@ -0,0 +1,97 @@
1
+ """`plaudio db ingest|search|list`"""
2
+ from __future__ import annotations
3
+ import argparse, os, pathlib, sys
4
+ from plaudio.core.corpus import TranscriptCorpus
5
+
6
+
7
+ def _corpus_path() -> pathlib.Path:
8
+ return pathlib.Path(os.environ.get("PLAUDIO_CORPUS", str(TranscriptCorpus.default_path()))).expanduser()
9
+
10
+
11
+ def _parse_speaker_map(items: list[str] | None) -> dict[str, str]:
12
+ out: dict[str, str] = {}
13
+ for item in items or []:
14
+ if "=" not in item:
15
+ continue
16
+ k, v = item.split("=", 1)
17
+ out[k.strip()] = v.strip()
18
+ return out
19
+
20
+
21
+ def cmd_ingest(args: argparse.Namespace) -> int:
22
+ corpus = TranscriptCorpus(_corpus_path())
23
+ p = pathlib.Path(args.plaud_json).expanduser()
24
+ if not p.exists():
25
+ print(f"transcript not found: {p}", file=sys.stderr)
26
+ return 2
27
+ corpus.ingest(p,
28
+ meeting_id=args.meeting_id,
29
+ date=args.date,
30
+ title=args.title,
31
+ speakers=_parse_speaker_map(args.speaker),
32
+ vault_note=args.vault_note,
33
+ audio_path=args.audio_path)
34
+ print(f"ingested meeting '{args.meeting_id}'")
35
+ return 0
36
+
37
+
38
+ def _fmt_ms(ms: int) -> str:
39
+ s = ms // 1000
40
+ return f"{s//60:02d}:{s%60:02d}"
41
+
42
+
43
+ def cmd_search(args: argparse.Namespace) -> int:
44
+ corpus = TranscriptCorpus(_corpus_path())
45
+ rows = corpus.search(args.query, speaker=args.speaker,
46
+ since=args.since, until=args.until, limit=args.limit)
47
+ if not rows:
48
+ print("(no matches)")
49
+ return 0
50
+ last = None
51
+ for r in rows:
52
+ if r["meeting_id"] != last:
53
+ print(f"\n-- {r['date']} | {r['title']} ({r['meeting_id']}) --")
54
+ last = r["meeting_id"]
55
+ speaker = r["speaker_name"] or r["speaker_label"]
56
+ print(f" [{_fmt_ms(r['start_ms'])}-{_fmt_ms(r['end_ms'])}] {speaker}: {r['content'][:200]}")
57
+ return 0
58
+
59
+
60
+ def cmd_list(args: argparse.Namespace) -> int:
61
+ corpus = TranscriptCorpus(_corpus_path())
62
+ rows = corpus.list_meetings()
63
+ if not rows:
64
+ print("(empty)")
65
+ return 0
66
+ for r in rows:
67
+ dur = f"{(r['duration_ms'] or 0)/60000:.1f}min" if r['duration_ms'] else "?"
68
+ print(f" {r['date']} {r['title'][:55]:<55s} {dur:>8s} "
69
+ f"{(r['language'] or '?'):>4s} {str(r['n_speakers'] or '?'):>2s}sp {r['n_segments']:>5d}seg")
70
+ return 0
71
+
72
+
73
+ def register(sub: argparse._SubParsersAction) -> None:
74
+ db = sub.add_parser("db", help="searchable transcripts corpus")
75
+ dbsub = db.add_subparsers(dest="db_cmd", required=True)
76
+
77
+ i = dbsub.add_parser("ingest", help="ingest a .plaud.json transcript into the corpus")
78
+ i.add_argument("plaud_json", help="path to .plaud.json file")
79
+ i.add_argument("--meeting-id", required=True, help="unique meeting identifier")
80
+ i.add_argument("--date", required=True, help="meeting date YYYY-MM-DD")
81
+ i.add_argument("--title", required=True, help="meeting title")
82
+ i.add_argument("--vault-note", help="path to vault note for this meeting")
83
+ i.add_argument("--audio-path", help="path to source audio file")
84
+ i.add_argument("--speaker", action="append", metavar="LABEL=Name",
85
+ help="speaker label to real-name mapping; repeat for multiple")
86
+ i.set_defaults(func=cmd_ingest)
87
+
88
+ s = dbsub.add_parser("search", help="full-text search across transcripts")
89
+ s.add_argument("query", help="search query (trigram, supports ZH+EN)")
90
+ s.add_argument("--speaker", help="filter to a specific speaker name or label")
91
+ s.add_argument("--since", metavar="YYYY-MM-DD", help="earliest meeting date")
92
+ s.add_argument("--until", metavar="YYYY-MM-DD", help="latest meeting date")
93
+ s.add_argument("--limit", type=int, default=50, help="max results (default 50)")
94
+ s.set_defaults(func=cmd_search)
95
+
96
+ lst = dbsub.add_parser("list", help="list all ingested meetings")
97
+ lst.set_defaults(func=cmd_list)
@@ -0,0 +1,85 @@
1
+ """`plaudio doctor` -- environment + dependency checks. High-leverage support tool."""
2
+ from __future__ import annotations
3
+ import argparse, os, pathlib, platform, shutil, sys
4
+
5
+
6
+ def _ok(name: str, detail: str = "") -> None:
7
+ print(f" ok {name}" + (f": {detail}" if detail else ""))
8
+
9
+
10
+ def _bad(name: str, detail: str = "") -> int:
11
+ print(f" fail {name}" + (f" -- {detail}" if detail else ""))
12
+ return 1
13
+
14
+
15
+ def cmd_doctor(args: argparse.Namespace) -> int:
16
+ print("Plaudio doctor -- environment check\n")
17
+ fails = 0
18
+
19
+ if platform.system() == "Darwin":
20
+ _ok("macOS", platform.mac_ver()[0])
21
+ else:
22
+ fails |= _bad("platform", f"{platform.system()} (Plaudio supports macOS only)")
23
+
24
+ if platform.machine() == "arm64":
25
+ _ok("Apple Silicon", "arm64")
26
+ else:
27
+ fails |= _bad("CPU", f"{platform.machine()} (Apple Silicon required for mlx-whisper)")
28
+
29
+ pv = sys.version_info
30
+ if pv >= (3, 11):
31
+ _ok("Python", f"{pv.major}.{pv.minor}.{pv.micro}")
32
+ else:
33
+ fails |= _bad("Python", f"{pv.major}.{pv.minor} (>= 3.11 required)")
34
+
35
+ ff = shutil.which("ffmpeg")
36
+ if ff:
37
+ _ok("ffmpeg", ff)
38
+ else:
39
+ fails |= _bad("ffmpeg", "not on PATH; install: brew install ffmpeg")
40
+
41
+ mw = shutil.which("mlx_whisper")
42
+ if mw:
43
+ _ok("mlx_whisper", mw)
44
+ else:
45
+ fails |= _bad("mlx_whisper", "not installed; pip install mlx-whisper")
46
+
47
+ try:
48
+ import pyannote.audio as _pa
49
+ _ok("pyannote.audio", getattr(_pa, "__version__", "?"))
50
+ except ImportError as e:
51
+ fails |= _bad("pyannote.audio", f"import failed: {e}")
52
+
53
+ token_path = pathlib.Path(
54
+ os.environ.get("PLAUDIO_HF_TOKEN_FILE", "~/.huggingface/token")
55
+ ).expanduser()
56
+ if token_path.exists() and token_path.read_text().strip():
57
+ _ok("HF token", str(token_path))
58
+ else:
59
+ fails |= _bad(
60
+ "HF token",
61
+ f"not found at {token_path}. pyannote/speaker-diarization-3.1 is gated; "
62
+ f"accept at https://huggingface.co/pyannote/speaker-diarization-3.1 and "
63
+ f"save your token at {token_path}",
64
+ )
65
+
66
+ try:
67
+ import torch
68
+ if torch.backends.mps.is_available():
69
+ _ok("torch MPS", "available")
70
+ else:
71
+ print(" - torch MPS not available (will fall back to CPU; slower)")
72
+ except ImportError:
73
+ fails |= _bad("torch", "not installed")
74
+
75
+ print()
76
+ if fails:
77
+ print("Doctor: some checks failed. See messages above.")
78
+ return 1
79
+ print("Doctor: all checks passed.")
80
+ return 0
81
+
82
+
83
+ def register(sub: argparse._SubParsersAction) -> None:
84
+ p = sub.add_parser("doctor", help="environment + dependency check")
85
+ p.set_defaults(func=cmd_doctor)
@@ -0,0 +1,56 @@
1
+ """`plaudio enrol AUDIO --name X --start S --end S`"""
2
+ from __future__ import annotations
3
+ import argparse, os, pathlib, sys
4
+ from plaudio.core.voicebank import VoiceBank
5
+
6
# Default Hugging Face token file; used as the --hf-token-file default.
DEFAULT_TOKEN_FILE = pathlib.Path("~/.huggingface/token").expanduser()

# Notice printed to stderr by cmd_enrol when no voicebank file exists yet.
# "{path}" is filled in with the voicebank path via str.format.
CONSENT_TEXT = """\
plaudio enrol: enrolling a person's voice creates a biometric profile.
Only enrol with the speaker's knowledge. The voicebank lives at
{path} (mode 0600); back it up explicitly if you want it elsewhere.
"""
13
+
14
+ def _bank_path() -> pathlib.Path:
15
+ return pathlib.Path(os.environ.get("PLAUDIO_VOICEBANK", str(VoiceBank.default_path()))).expanduser()
16
+
17
+ def cmd_enrol(args: argparse.Namespace) -> int:
18
+ bank_path = _bank_path()
19
+ if not bank_path.exists():
20
+ print(CONSENT_TEXT.format(path=bank_path), file=sys.stderr)
21
+ token_file = pathlib.Path(args.hf_token_file).expanduser()
22
+ if not token_file.exists():
23
+ print(f"HF token not found at {token_file}; create it or pass --hf-token-file.", file=sys.stderr)
24
+ return 2
25
+ hf_token = token_file.read_text().strip()
26
+ bank = VoiceBank.load(bank_path)
27
+ audio = pathlib.Path(args.audio).expanduser()
28
+ try:
29
+ profile = bank.enrol_from_audio(
30
+ audio,
31
+ name=args.name,
32
+ start_s=args.start,
33
+ end_s=args.end,
34
+ hf_token=hf_token,
35
+ notes=args.notes,
36
+ num_speakers=args.num_speakers,
37
+ )
38
+ except Exception as e:
39
+ print(f"enrolment failed: {e}", file=sys.stderr)
40
+ return 1
41
+ bank.save(bank_path)
42
+ print(f"enrolled {profile.name} (dim={profile.embedding_dim}, duration={profile.duration_s:.0f}s)")
43
+ print(f"voicebank: {bank_path}")
44
+ return 0
45
+
46
+ def register(sub: argparse._SubParsersAction) -> None:
47
+ p = sub.add_parser("enrol", help="enrol a voice profile from audio")
48
+ p.add_argument("audio")
49
+ p.add_argument("--name", required=True)
50
+ p.add_argument("--start", type=float, default=None, help="start (sec) of the clean window")
51
+ p.add_argument("--end", type=float, default=None, help="end (sec) of the clean window")
52
+ p.add_argument("--num-speakers", type=int, default=None,
53
+ help="hint to pyannote; helps when the audio has few speakers")
54
+ p.add_argument("--notes", default="")
55
+ p.add_argument("--hf-token-file", default=str(DEFAULT_TOKEN_FILE))
56
+ p.set_defaults(func=cmd_enrol)
@@ -0,0 +1,240 @@
1
+ """`plaudio label AUDIO TRANSCRIPT [--enrol] [--batch-label "L=Name,..."]`"""
2
+ from __future__ import annotations
3
+ import argparse, json, os, pathlib, platform, subprocess, sys
4
+ from collections import defaultdict
5
+ from plaudio.core.voicebank import VoiceBank
6
+
7
# Default Hugging Face token file; used as the --hf-token-file default.
DEFAULT_TOKEN_FILE = pathlib.Path("~/.huggingface/token").expanduser()
8
+
9
+ def _bank_path() -> pathlib.Path:
10
+ return pathlib.Path(os.environ.get("PLAUDIO_VOICEBANK", str(VoiceBank.default_path()))).expanduser()
11
+
12
def fmt_ms(ms):
    """Format a millisecond offset as ``MM:SS``, truncating to whole seconds."""
    minutes, seconds = divmod(int(ms / 1000), 60)
    return f"{minutes:02d}:{seconds:02d}"
13
+
14
def _merge_speaker_spans(segs, target_speaker, max_gap_ms):
    """Merge *target_speaker*'s segments into ``(start_ms, end_ms)`` spans.

    Segments are visited in ``start_time`` order. A segment that starts less
    than *max_gap_ms* after the end of the current span extends that span;
    otherwise it opens a new one.
    """
    spans = []
    for seg in sorted(segs, key=lambda x: x['start_time']):
        if seg['speaker'] != target_speaker:
            continue
        start, end = seg['start_time'], seg['end_time']
        if spans and start - spans[-1][1] < max_gap_ms:
            # max(): an overlapping or nested segment that ends earlier must
            # never shrink the span it is merged into.
            spans[-1][1] = max(spans[-1][1], end)
        else:
            spans.append([start, end])
    return [tuple(span) for span in spans]


def find_best_clip(segs, target_speaker, min_dur_s=15, max_dur_s=45):
    """Pick a playback clip for *target_speaker*.

    Segments separated by less than 3 s are merged; the longest merged span is
    chosen and truncated to *max_dur_s* seconds.

    Returns ``(start_ms, end_ms)``, or ``None`` when the speaker has no
    segments or the longest span is shorter than *min_dur_s* seconds.
    """
    spans = _merge_speaker_spans(segs, target_speaker, max_gap_ms=3000)
    if not spans:
        return None
    start, end = max(spans, key=lambda t: t[1] - t[0])
    if (end - start) / 1000 < min_dur_s:
        return None
    return (start, min(end, start + max_dur_s * 1000))


def find_enrol_window(segs, target_speaker, max_dur_s=180):
    """Pick the enrolment window for *target_speaker*.

    Segments separated by less than 5 s are merged; the longest merged span is
    chosen and truncated to *max_dur_s* seconds. There is no minimum duration.

    Returns ``(start_ms, end_ms)``, or ``None`` when the speaker has no segments.
    """
    spans = _merge_speaker_spans(segs, target_speaker, max_gap_ms=5000)
    if not spans:
        return None
    start, end = max(spans, key=lambda t: t[1] - t[0])
    return (start, min(end, start + max_dur_s * 1000))
48
+
49
def play_clip_bg(audio_path, start_ms, end_ms):
    """Extract ``[start_ms, end_ms)`` of *audio_path* with ffmpeg and play it.

    The clip is written as 16 kHz mono WAV to a uniquely named temporary file
    and played in a background process (``afplay`` on macOS, ``ffplay``
    elsewhere).

    Returns ``(proc, tmp_path)`` on success; the caller is responsible for
    stopping *proc* and deleting *tmp_path*. On failure returns
    ``(None, error_message)`` and removes the temporary file. Never raises.
    """
    import tempfile

    dur_s = (end_ms - start_ms) / 1000
    tmp = None
    try:
        # mkstemp gives an unpredictable name in the platform temp directory,
        # so a pre-existing file or symlink can never be overwritten.
        fd, tmp_name = tempfile.mkstemp(
            prefix=f"plaudio-clip-{start_ms}-{end_ms}-", suffix=".wav")
        os.close(fd)
        tmp = pathlib.Path(tmp_name)
        r = subprocess.run(
            ["ffmpeg", "-y", "-ss", str(start_ms/1000), "-t", str(dur_s),
             "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(tmp)],
            capture_output=True, text=True, timeout=30)
        if r.returncode != 0:
            error = f"ffmpeg exit {r.returncode}: {r.stderr[-300:]}"
        elif not tmp.exists() or tmp.stat().st_size == 0:
            error = "ffmpeg produced empty file"
        else:
            player = "afplay" if platform.system() == "Darwin" else "ffplay"
            args_list = [player, str(tmp)] if player == "afplay" \
                else [player, "-nodisp", "-autoexit", "-loglevel", "quiet", str(tmp)]
            proc = subprocess.Popen(args_list, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return proc, tmp
    except FileNotFoundError as e:
        error = f"missing executable: {e}"
    except Exception as e:
        # Best-effort helper: a timeout or any other failure becomes a message.
        error = f"unexpected: {e}"
    # Failure path: do not leave the temporary clip behind.
    if tmp is not None:
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
    return None, error
70
+
71
def stop_playback(proc, tmp):
    """Stop a background player and delete its temporary clip (best effort).

    A still-running *proc* is terminated and given 2 s to exit; if that raises,
    it is killed. *tmp* is then unlinked. Errors from the kill and from the
    unlink are deliberately ignored. Either argument may be ``None``.
    """
    still_running = proc is not None and proc.poll() is None
    if still_running:
        try:
            proc.terminate()
            proc.wait(timeout=2)
        except Exception:
            # Polite shutdown failed or timed out: escalate.
            try:
                proc.kill()
            except Exception:
                pass

    if tmp is None:
        return
    try:
        tmp.unlink(missing_ok=True)
    except Exception:
        pass
81
+
82
def save_labels(data, jpath, mapping):
    """Apply a ``{old_label: new_name}`` mapping to *data* and write it to *jpath*.

    *data* is mutated in place: every segment whose ``speaker`` is a key of
    *mapping* is renamed, and its first-ever label is kept under
    ``original_speaker`` (an existing value is never overwritten). The whole
    document is then written as indented JSON.
    """
    for seg in data['segments']:
        current = seg['speaker']
        if current in mapping:
            seg.setdefault('original_speaker', current)
            seg['speaker'] = mapping[current]
    # ensure_ascii=False emits non-ASCII names verbatim, so the file encoding
    # must be pinned rather than left to the locale default.
    jpath.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
89
+
90
def _enrol(name, audio, start_ms, end_ms, hf_token, num_speakers=None):
    """Enrol *name* from ``[start_ms, end_ms]`` of *audio* and save the voicebank.

    Failures are reported on stderr and swallowed so that one bad enrolment
    does not abort the remaining ones.
    """
    window = f"{fmt_ms(start_ms)}-{fmt_ms(end_ms)}"
    print(f" enrolling {name} from [{window}]...")
    bank = VoiceBank.load(_bank_path())
    try:
        bank.enrol_from_audio(
            pathlib.Path(audio),
            **{
                "name": name,
                "start_s": start_ms / 1000,
                "end_s": end_ms / 1000,
                "hf_token": hf_token,
                "notes": "via plaudio label interactive labelling",
                "num_speakers": num_speakers,
            },
        )
        bank.save(_bank_path())
        print(f" enrolled {name}")
    except Exception as err:
        print(f" enrolment failed: {err}", file=sys.stderr)
107
+
108
def cmd_label(args: argparse.Namespace) -> int:
    """Label the speaker clusters of a transcript JSON, optionally enrolling them.

    With ``--batch-label`` the comma-separated ``LABEL=Name`` pairs are applied
    in one pass. Otherwise every cluster with more than 5 s of airtime that is
    still named ``SPEAKER_*`` or ``Unknown`` is presented (text excerpt plus
    optional audio playback) and the user is prompted for a name; each answer
    is written to the transcript immediately. With ``--enrol`` the named
    speakers (except "Unknown") are enrolled into the voicebank afterwards.

    Returns 0 on success and 2 when the audio file, the transcript, the
    Hugging Face token (needed only with ``--enrol``) or the segments are
    missing.
    """
    audio = pathlib.Path(args.audio).expanduser()
    jpath = pathlib.Path(args.transcript).expanduser()
    if not audio.exists():
        print(f"audio not found: {audio}", file=sys.stderr)
        return 2
    if not jpath.exists():
        print(f"transcript not found: {jpath}", file=sys.stderr)
        return 2

    # The token is only consumed by enrolment, so plain (batch) labelling must
    # not fail just because no token file exists.
    hf_token = ""
    if args.enrol:
        tf = pathlib.Path(args.hf_token_file).expanduser()
        if not tf.exists():
            print(f"HF token not found at {tf}; enrolment needs it", file=sys.stderr)
            return 2
        hf_token = tf.read_text().strip()

    data = json.loads(jpath.read_text(encoding="utf-8"))
    segs = data.get('segments', [])
    if not segs:
        print("no segments in json", file=sys.stderr)
        return 2

    by_spk = defaultdict(list)
    for s in segs:
        by_spk[s['speaker']].append(s)
    # Total airtime per speaker in seconds; clusters of 5 s or less are ignored
    # in interactive mode. Most talkative speakers come first.
    durs = {k: sum((x['end_time']-x['start_time'])/1000 for x in v) for k, v in by_spk.items()}
    main_spk = sorted([k for k, v in durs.items() if v > 5], key=lambda k: -durs[k])

    if args.batch_label:
        mapping = {}
        enrol_windows = {}
        for pair in args.batch_label.split(","):
            if "=" not in pair:
                continue
            spk, name = pair.split("=", 1)
            spk, name = spk.strip(), name.strip()
            if spk not in by_spk:
                print(f" warn: {spk} not in transcript, skipping")
                continue
            mapping[spk] = name
            es = find_enrol_window(by_spk[spk], spk, max_dur_s=180)
            if es:
                enrol_windows[name] = es
        if mapping:
            save_labels(data, jpath, mapping)
        print(f"batch-labelled {len(mapping)} speakers: {mapping}")
        if args.enrol:
            print("\n=== Enrolling voice profiles ===")
            for name, (s_ms, e_ms) in enrol_windows.items():
                if name == "Unknown":
                    continue
                _enrol(name, audio, s_ms, e_ms, hf_token)
        return 0

    print(f"\n{len(main_spk)} clusters to label (> 5s)\n")
    mapping = {}
    enrol_windows = {}

    try:
        for spk in main_spk:
            if not (spk.startswith("SPEAKER_") or spk == "Unknown"):
                print(f"\n{spk}: already labelled, skipping")
                continue
            clip = find_best_clip(by_spk[spk], spk, args.min_dur, args.max_dur)
            if not clip:
                print(f" {spk}: no clean >={args.min_dur}s clip, skipping")
                continue
            start_ms, end_ms = clip
            dur_s = (end_ms - start_ms) / 1000
            print(f"\n{'='*70}")
            print(f"Speaker: {spk} | Airtime: {durs[spk]:.0f}s ({durs[spk]/60:.1f}min)")
            print(f"Clip: {fmt_ms(start_ms)}-{fmt_ms(end_ms)} ({dur_s:.0f}s)")
            in_window = [x for x in by_spk[spk] if x['start_time'] >= start_ms and x['start_time'] < end_ms]
            text = " ".join(x['content'] for x in in_window)
            print(f"Text: \"{text[:500]}\"")
            print()
            playback_proc = playback_tmp = None
            if not args.no_play:
                playback_proc, playback_tmp = play_clip_bg(audio, start_ms, end_ms)
                if playback_proc is None:
                    # On failure the second element is the error text, not a path.
                    print(f" playback error: {playback_tmp}")
                    playback_tmp = None
                else:
                    print(" (playing in background; type at any time to stop)")
            try:
                ans = input("name / [s]kip / [u]nknown / [r]eplay / [q]uit: ").strip()
            except EOFError:
                ans = 'q'
            finally:
                stop_playback(playback_proc, playback_tmp)
            while ans.lower() == 'r':
                if args.no_play:
                    print(" (--no-play; cannot replay)")
                    try:
                        ans = input("name / [s]kip / [u]nknown / [q]uit: ").strip()
                    except EOFError:
                        ans = 'q'
                    # Re-check the answer rather than leaving the loop: a second
                    # "r" must not fall through and be saved as the name "r".
                    continue
                pp, pt = play_clip_bg(audio, start_ms, end_ms)
                if pp is None:
                    print(f" replay error: {pt}")
                    pt = None  # pt held the error text, not a temp file
                else:
                    print(" (replaying)")
                try:
                    ans = input("name / [s]kip / [u]nknown / [r]eplay / [q]uit: ").strip()
                except EOFError:
                    ans = 'q'
                finally:
                    stop_playback(pp, pt)
            # Quitting reuses the Ctrl-C path so both end the loop the same way.
            if ans.lower() in ('q', 'quit'):
                raise KeyboardInterrupt
            if ans.lower() in ('s', 'skip', ''):
                continue
            if ans.lower() in ('u', 'unknown'):
                mapping[spk] = "Unknown"
                save_labels(data, jpath, {spk: "Unknown"})
                print(f" -> {spk} = Unknown (saved)")
                continue
            mapping[spk] = ans
            enrol_span = find_enrol_window(by_spk[spk], spk, max_dur_s=180)
            # Fall back to the playback clip when no enrolment window is found.
            enrol_windows[ans] = enrol_span if enrol_span else (start_ms, end_ms)
            es_start, es_end = enrol_windows[ans]
            save_labels(data, jpath, {spk: ans})
            print(f" -> {spk} = {ans} (saved); enrolment will use [{fmt_ms(es_start)}-{fmt_ms(es_end)}]")
    except KeyboardInterrupt:
        print("\n\n(quitting; labels saved so far)")

    print(f"\nwrote {len(mapping)} labels to {jpath}")
    if args.enrol and enrol_windows:
        print(f"\n=== Enrolling voice profiles ({len(enrol_windows)} people) ===")
        for name, (start_ms, end_ms) in enrol_windows.items():
            if name == "Unknown":
                continue
            _enrol(name, audio, start_ms, end_ms, hf_token)
    return 0
229
+
230
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``label`` subcommand and its options to *sub*."""
    parser = sub.add_parser("label", help="interactive speaker labelling")
    for positional in ("audio", "transcript"):
        parser.add_argument(positional)
    for switch in ("--enrol", "--no-play"):
        parser.add_argument(switch, action="store_true")
    parser.add_argument("--batch-label", type=str)
    for bound in ("--min-dur", "--max-dur"):
        parser.add_argument(bound, type=int, default=15)
    parser.add_argument("--hf-token-file", default=str(DEFAULT_TOKEN_FILE))
    parser.set_defaults(func=cmd_label)
@@ -0,0 +1,59 @@
1
+ """`plaudio match AUDIO TRANSCRIPT [--threshold T] [--report]`"""
2
+ from __future__ import annotations
3
+ import argparse
4
+ import json
5
+ import os
6
+ import pathlib
7
+ import sys
8
+ from plaudio.core.voicebank import VoiceBank
9
+ from plaudio.core.slidingmatch import SlidingMatcher, DEFAULT_THRESHOLD
10
+
11
+
12
def _bank_path() -> pathlib.Path:
    """Return the voicebank path: ``$PLAUDIO_VOICEBANK`` if set, else the default."""
    fallback = str(VoiceBank.default_path())
    location = os.environ.get("PLAUDIO_VOICEBANK", fallback)
    return pathlib.Path(location).expanduser()
16
+
17
+
18
def cmd_match(args: argparse.Namespace) -> int:
    """Relabel a transcript's segments by matching audio against the voicebank.

    The transcript JSON is loaded, its ``segments`` list is passed to
    ``SlidingMatcher.label_segments`` together with the audio, and the document
    is written back to the same file. With ``--report`` the total duration per
    speaker is printed afterwards.

    Returns 0 on success and 2 when the audio or the transcript is missing.
    """
    audio = pathlib.Path(args.audio).expanduser()
    jpath = pathlib.Path(args.transcript).expanduser()
    if not audio.exists():
        print(f"audio not found: {audio}", file=sys.stderr)
        return 2
    if not jpath.exists():
        print(f"transcript not found: {jpath}", file=sys.stderr)
        return 2
    bank = VoiceBank.load(_bank_path())
    matcher = SlidingMatcher(bank, threshold=args.threshold)
    # Explicit UTF-8 on both sides: the JSON is written with ensure_ascii=False,
    # so the locale default encoding must not decide how names are stored.
    data = json.loads(jpath.read_text(encoding="utf-8"))
    segs = data.get("segments", [])
    report = matcher.label_segments(audio, segs)
    jpath.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(
        f" {report.get('n_matched', 0)} / {report.get('n_total', 0)} windows matched"
        f" at threshold {args.threshold}"
    )
    print(
        f" coalesced into {report.get('n_runs', 0)} runs,"
        f" relabelled {report.get('n_segments_relabelled', 0)} segments"
    )
    if args.report:
        from collections import defaultdict

        # Seconds of speech per speaker, longest first.
        durs: dict[str, float] = defaultdict(float)
        for s in segs:
            durs[s["speaker"]] += (s["end_time"] - s["start_time"]) / 1000.0
        print("\nPer-speaker durations:")
        for k, v in sorted(durs.items(), key=lambda x: -x[1]):
            print(f" {k}: {v:.0f}s ({v/60:.1f}min)")
    return 0
51
+
52
+
53
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``match`` subcommand and its options to *sub*."""
    parser = sub.add_parser(
        "match",
        help="re-label transcript via voice-bank sliding match",
    )
    for positional in ("audio", "transcript"):
        parser.add_argument(positional)
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
    parser.add_argument("--report", action="store_true")
    parser.set_defaults(func=cmd_match)
@@ -0,0 +1,22 @@
1
+ """`plaudio plaud login|list|sync` -- stubs in v0.1, full impl in v0.2."""
2
+ from __future__ import annotations
3
+ import argparse
4
+
5
# Notice printed by cmd_stub for every `plaudio plaud` subcommand.
V02_MSG = (
    "Plaud cloud sync is not implemented in v0.1. See README -> Roadmap. "
    "For now: use the Plaud MCP via your editor's MCP host, or the Plaud "
    "mobile app, then point `plaudio transcribe` at the downloaded audio."
)
10
+
11
+
12
def cmd_stub(args: argparse.Namespace) -> int:
    """Print the not-implemented notice and return 0; *args* is unused."""
    notice = V02_MSG
    print(notice)
    return 0
15
+
16
+
17
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``plaud`` command; login, list and sync all run the stub."""
    parser = sub.add_parser("plaud", help="Plaud cloud sync (v0.2)")
    actions = parser.add_subparsers(dest="plaud_cmd", required=True)
    for action in ("login", "list", "sync"):
        actions.add_parser(action).set_defaults(func=cmd_stub)
@@ -0,0 +1,30 @@
1
+ """`plaudio transcribe AUDIO [--vocab FILE] [--language LANG] [--out DIR]`"""
2
+ from __future__ import annotations
3
+ import argparse, pathlib, sys
4
+ from plaudio.core.transcribe import transcribe, load_vocab_prompt, DEFAULT_MODEL
5
+
6
+
7
def cmd_transcribe(args: argparse.Namespace) -> int:
    """Transcribe an audio file and report segment count, timing and output path.

    The output directory defaults to the audio file's directory and the vocab
    prompt to the empty string. Returns 0 on success and 2 when ``transcribe``
    raises ``FileNotFoundError``.
    """
    audio = pathlib.Path(args.audio).expanduser()

    if args.out:
        out_dir = pathlib.Path(args.out).expanduser()
    else:
        out_dir = audio.parent

    if args.vocab:
        vocab = load_vocab_prompt(pathlib.Path(args.vocab).expanduser())
    else:
        vocab = ""

    try:
        segs, elapsed = transcribe(
            audio,
            out_dir=out_dir,
            language=args.language,
            vocab=vocab,
            model=args.model,
        )
    except FileNotFoundError as e:
        print(f"audio not found: {e}", file=sys.stderr)
        return 2

    count = len(segs)
    # Clamp the divisor so a near-zero elapsed time cannot divide by zero.
    rate = count / max(elapsed, 1e-3)
    print(f"ok {count} segments in {elapsed:.1f}s "
          f"({rate:.1f} seg/s)")
    print(f" output: {out_dir / (audio.stem + '.json')}")
    return 0
21
+
22
+
23
def register(sub: argparse._SubParsersAction) -> None:
    """Attach the ``transcribe`` subcommand and its options to *sub*."""
    parser = sub.add_parser("transcribe", help="mlx-whisper ASR on an audio file")
    parser.add_argument("audio")
    parser.add_argument(
        "--vocab",
        help="path to a vocab file (one term per line); default empty",
    )
    parser.add_argument("--language", default="en")
    parser.add_argument("--out", help="output directory; default: same as audio")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.set_defaults(func=cmd_transcribe)