plaudio 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plaudio/__init__.py +2 -0
- plaudio/cli/__init__.py +1 -0
- plaudio/cli/commands/__init__.py +1 -0
- plaudio/cli/commands/clean.py +74 -0
- plaudio/cli/commands/db.py +97 -0
- plaudio/cli/commands/doctor.py +85 -0
- plaudio/cli/commands/enrol.py +56 -0
- plaudio/cli/commands/label.py +240 -0
- plaudio/cli/commands/match.py +59 -0
- plaudio/cli/commands/plaud.py +22 -0
- plaudio/cli/commands/transcribe.py +30 -0
- plaudio/cli/commands/voicebank.py +91 -0
- plaudio/cli/main.py +40 -0
- plaudio/core/__init__.py +1 -0
- plaudio/core/clean.py +72 -0
- plaudio/core/corpus.py +157 -0
- plaudio/core/diarise.py +131 -0
- plaudio/core/slidingmatch.py +143 -0
- plaudio/core/transcribe.py +75 -0
- plaudio/core/voicebank.py +152 -0
- plaudio/plaud/__init__.py +1 -0
- plaudio/plaud/cloud.py +31 -0
- plaudio-0.1.0.dist-info/METADATA +162 -0
- plaudio-0.1.0.dist-info/RECORD +28 -0
- plaudio-0.1.0.dist-info/WHEEL +5 -0
- plaudio-0.1.0.dist-info/entry_points.txt +2 -0
- plaudio-0.1.0.dist-info/licenses/LICENSE +661 -0
- plaudio-0.1.0.dist-info/top_level.txt +1 -0
plaudio/__init__.py
ADDED
plaudio/cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Command-line interface."""
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Subcommand handlers."""
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""`plaudio clean TRANSCRIPT_MD [--corrections FILE]`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, pathlib, re, sys
|
|
4
|
+
from plaudio.core.clean import (
|
|
5
|
+
load_corrections, apply_corrections, strip_silence_fillers,
|
|
6
|
+
merge_fragments, reflow_speaker_paragraphs, fmt_ts, wrap_para,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Heading line that opens the transcript section of a markdown note.
TRANSCRIPT_RE = re.compile(r"^## Transcript\s*$", re.M)
# One rendered segment: "**[MM:SS-MM:SS] Speaker:** text". The separator may be
# an en dash, em dash or hyphen; the text may span lines and runs until the
# next line starting with "**[" or the end of the input.
SEG_RE = re.compile(
    r"\*\*\[(\d+):(\d+)[–—\-](\d+):(\d+)\] ([^*:]+):\*\* "
    r"((?:.|\n(?!\*\*\[))+?)(?=\n\*\*\[|\Z)",
    re.MULTILINE,
)
# Any level-2 heading other than "## Transcript"; it ends the transcript section.
NEXT_SECTION_RE = re.compile(r"^## (?!Transcript)", re.M)
# Marker comment written after a cleaned transcript block. It is not segment
# text and must never be parsed as part of the last segment.
_CLEANED_MARKER = "<!-- cleaned: true -->"


def parse_transcript_block(content: str):
    """Find the ``## Transcript`` section of *content* and parse its segments.

    Returns a ``(segments, start, end)`` tuple. ``segments`` is a list of
    dicts with keys ``speaker``, ``t_start``, ``t_end`` (both in seconds) and
    ``text`` (whitespace-normalised). ``start`` is the offset of the
    ``## Transcript`` heading in *content* and ``end`` the offset where the
    next ``## `` section begins (or ``len(content)``), so
    ``content[start:end]`` is the span a caller may replace.

    When no transcript heading exists the result is ``([], -1, -1)``.
    """
    heading = TRANSCRIPT_RE.search(content)
    if not heading:
        return [], -1, -1
    rest = content[heading.end():]
    following = NEXT_SECTION_RE.search(rest)
    end_idx = heading.end() + (following.start() if following else len(rest))
    # The last segment's text pattern extends to the end of the block, so a
    # trailing "cleaned" marker would otherwise be absorbed into that segment
    # and re-emitted as text on every forced re-clean. Strip it from the copy
    # we parse only; the returned offsets still refer to the original content.
    block = content[heading.end():end_idx].replace(_CLEANED_MARKER, "")
    segments = []
    for match in SEG_RE.finditer(block):
        m1, s1, m2, s2, speaker, text = match.groups()
        segments.append({
            "speaker": speaker.strip(),
            "t_start": int(m1) * 60 + int(s1),
            "t_end": int(m2) * 60 + int(s2),
            # Collapse newlines and runs of spaces into single spaces.
            "text": " ".join(text.split()),
        })
    return segments, heading.start(), end_idx
|
|
35
|
+
|
|
36
|
+
def render_transcript(segments, meta_line):
    """Render *segments* as the markdown body of a ``## Transcript`` section.

    The output starts with the heading, a blank line, *meta_line* and another
    blank line. Each segment whose text is not blank becomes one
    ``**[start–end] speaker:** text`` paragraph followed by an empty line.
    Segments with blank text are dropped.
    """
    out = ["## Transcript", "", meta_line, ""]
    for entry in segments:
        body = entry["text"]
        if body.strip():
            header = f"**[{fmt_ts(entry['t_start'])}–{fmt_ts(entry['t_end'])}] {entry['speaker']}:** "
            out.extend((header + f"{wrap_para(body)}", ""))
    return "\n".join(out)
|
|
44
|
+
|
|
45
|
+
def cmd_clean(args: argparse.Namespace) -> int:
    """Handle ``plaudio clean``: rewrite the transcript section of a note.

    Reads ``args.path``, applies the optional corrections file, strips silence
    fillers, normalises whitespace, merges fragments and reflows speaker
    paragraphs, then writes the file back with a ``<!-- cleaned: true -->``
    marker appended to the transcript block.

    Returns 0 on success or when the file is already marked as cleaned (and
    ``args.force`` is not set), 1 when no transcript segments were found, and
    2 when the path does not exist.
    """
    target = pathlib.Path(args.path).expanduser()
    if not target.exists():
        print(f"not found: {target}", file=sys.stderr)
        return 2

    if args.corrections:
        corrections = load_corrections(pathlib.Path(args.corrections).expanduser())
    else:
        corrections = []

    original = target.read_text()
    already_cleaned = "<!-- cleaned: true -->" in original
    if already_cleaned and not args.force:
        print(f" skip (already cleaned): {target.name}")
        return 0

    segments, start, end = parse_transcript_block(original)
    if not segments:
        print(f" no transcript block found: {target.name}", file=sys.stderr)
        return 1

    # Per-segment text clean-up, in the same order as the pipeline expects:
    # corrections first, then filler removal, then whitespace collapsing.
    for seg in segments:
        seg["text"] = apply_corrections(seg["text"], corrections)
        seg["text"] = strip_silence_fillers(seg["text"])
        seg["text"] = re.sub(r"\s+", " ", seg["text"]).strip()

    segments = reflow_speaker_paragraphs(merge_fragments(segments))

    meta_line = "*Local pipeline transcript. Cleaned for readability.*"
    rendered = render_transcript(segments, meta_line) + "\n<!-- cleaned: true -->\n"
    # Splice the new block over the old transcript span only.
    target.write_text(original[:start] + rendered + original[end:])
    print(f" cleaned: {target.name}")
    return 0
|
|
68
|
+
|
|
69
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``clean`` subcommand and its options to *sub*."""
    parser = sub.add_parser("clean", help="clean a transcript markdown file")
    parser.add_argument("path")
    parser.add_argument(
        "--corrections",
        help="path to a JSON list of [wrong, right] pairs",
    )
    parser.add_argument("--force", action="store_true")
    # The dispatcher is expected to call ``args.func(args)``.
    parser.set_defaults(func=cmd_clean)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""`plaudio db ingest|search|list`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, os, pathlib, sys
|
|
4
|
+
from plaudio.core.corpus import TranscriptCorpus
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _corpus_path() -> pathlib.Path:
    """Return the corpus location: ``$PLAUDIO_CORPUS`` or the class default."""
    fallback = str(TranscriptCorpus.default_path())
    configured = os.environ.get("PLAUDIO_CORPUS", fallback)
    return pathlib.Path(configured).expanduser()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _parse_speaker_map(items: list[str] | None) -> dict[str, str]:
|
|
12
|
+
out: dict[str, str] = {}
|
|
13
|
+
for item in items or []:
|
|
14
|
+
if "=" not in item:
|
|
15
|
+
continue
|
|
16
|
+
k, v = item.split("=", 1)
|
|
17
|
+
out[k.strip()] = v.strip()
|
|
18
|
+
return out
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def cmd_ingest(args: argparse.Namespace) -> int:
    """Handle ``plaudio db ingest``: add one transcript file to the corpus.

    Returns 2 when ``args.plaud_json`` does not exist, otherwise 0 after the
    corpus has ingested it.
    """
    corpus = TranscriptCorpus(_corpus_path())
    source = pathlib.Path(args.plaud_json).expanduser()
    if not source.exists():
        print(f"transcript not found: {source}", file=sys.stderr)
        return 2

    metadata = dict(
        meeting_id=args.meeting_id,
        date=args.date,
        title=args.title,
        speakers=_parse_speaker_map(args.speaker),
        vault_note=args.vault_note,
        audio_path=args.audio_path,
    )
    corpus.ingest(source, **metadata)
    print(f"ingested meeting '{args.meeting_id}'")
    return 0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _fmt_ms(ms: int) -> str:
    """Format a millisecond offset as zero-padded ``MM:SS`` (minutes may exceed 59)."""
    minutes, seconds = divmod(ms // 1000, 60)
    return f"{minutes:02d}:{seconds:02d}"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def cmd_search(args: argparse.Namespace) -> int:
    """Handle ``plaudio db search``: print matching segments grouped by meeting.

    A header line is printed whenever the meeting id changes from the previous
    row; each hit shows its time range, the speaker name (falling back to the
    speaker label) and at most the first 200 characters of the content.
    Always returns 0.
    """
    corpus = TranscriptCorpus(_corpus_path())
    hits = corpus.search(
        args.query,
        speaker=args.speaker,
        since=args.since,
        until=args.until,
        limit=args.limit,
    )
    if not hits:
        print("(no matches)")
        return 0

    current_meeting = None
    for hit in hits:
        if hit["meeting_id"] != current_meeting:
            print(f"\n-- {hit['date']} | {hit['title']} ({hit['meeting_id']}) --")
            current_meeting = hit["meeting_id"]
        who = hit["speaker_name"] or hit["speaker_label"]
        print(f" [{_fmt_ms(hit['start_ms'])}-{_fmt_ms(hit['end_ms'])}] {who}: {hit['content'][:200]}")
    return 0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def cmd_list(args: argparse.Namespace) -> int:
    """Handle ``plaudio db list``: print one summary line per ingested meeting.

    Missing duration, language or speaker count are shown as ``?``.
    Always returns 0.
    """
    corpus = TranscriptCorpus(_corpus_path())
    meetings = corpus.list_meetings()
    if not meetings:
        print("(empty)")
        return 0

    for row in meetings:
        duration_ms = row['duration_ms']
        # Falsy duration (None or 0) is reported as unknown.
        dur = f"{duration_ms/60000:.1f}min" if duration_ms else "?"
        print(f" {row['date']} {row['title'][:55]:<55s} {dur:>8s} "
              f"{(row['language'] or '?'):>4s} {str(row['n_speakers'] or '?'):>2s}sp {row['n_segments']:>5d}seg")
    return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``db`` command with its ``ingest``, ``search`` and ``list`` subcommands."""
    db_parser = sub.add_parser("db", help="searchable transcripts corpus")
    actions = db_parser.add_subparsers(dest="db_cmd", required=True)

    # db ingest
    ingest = actions.add_parser("ingest", help="ingest a .plaud.json transcript into the corpus")
    ingest.add_argument("plaud_json", help="path to .plaud.json file")
    ingest.add_argument("--meeting-id", required=True, help="unique meeting identifier")
    ingest.add_argument("--date", required=True, help="meeting date YYYY-MM-DD")
    ingest.add_argument("--title", required=True, help="meeting title")
    ingest.add_argument("--vault-note", help="path to vault note for this meeting")
    ingest.add_argument("--audio-path", help="path to source audio file")
    ingest.add_argument(
        "--speaker",
        action="append",
        metavar="LABEL=Name",
        help="speaker label to real-name mapping; repeat for multiple",
    )
    ingest.set_defaults(func=cmd_ingest)

    # db search
    search = actions.add_parser("search", help="full-text search across transcripts")
    search.add_argument("query", help="search query (trigram, supports ZH+EN)")
    search.add_argument("--speaker", help="filter to a specific speaker name or label")
    search.add_argument("--since", metavar="YYYY-MM-DD", help="earliest meeting date")
    search.add_argument("--until", metavar="YYYY-MM-DD", help="latest meeting date")
    search.add_argument("--limit", type=int, default=50, help="max results (default 50)")
    search.set_defaults(func=cmd_search)

    # db list
    listing = actions.add_parser("list", help="list all ingested meetings")
    listing.set_defaults(func=cmd_list)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""`plaudio doctor` -- environment + dependency checks. High-leverage support tool."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, os, pathlib, platform, shutil, sys
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _ok(name: str, detail: str = "") -> None:
    """Print a passing check line, with ``: detail`` appended when given."""
    suffix = f": {detail}" if detail else ""
    print(f" ok {name}" + suffix)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _bad(name: str, detail: str = "") -> int:
    """Print a failing check line (``-- detail`` appended when given) and return 1."""
    suffix = f" -- {detail}" if detail else ""
    print(f" fail {name}" + suffix)
    return 1
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cmd_doctor(args: argparse.Namespace) -> int:
    """Handle ``plaudio doctor``: run every environment check and report.

    Checks, in order: operating system, CPU architecture, Python version,
    ``ffmpeg`` and ``mlx_whisper`` on PATH, importability of
    ``pyannote.audio``, a non-empty Hugging Face token file
    (``$PLAUDIO_HF_TOKEN_FILE`` or ``~/.huggingface/token``) and ``torch``.
    An unavailable MPS backend is only reported, not counted as a failure.

    Returns 1 if any check failed, otherwise 0.
    """
    print("Plaudio doctor -- environment check\n")
    problems = 0

    os_name = platform.system()
    if os_name == "Darwin":
        _ok("macOS", platform.mac_ver()[0])
    else:
        problems += _bad("platform", f"{os_name} (Plaudio supports macOS only)")

    arch = platform.machine()
    if arch == "arm64":
        _ok("Apple Silicon", "arm64")
    else:
        problems += _bad("CPU", f"{arch} (Apple Silicon required for mlx-whisper)")

    pv = sys.version_info
    if pv < (3, 11):
        problems += _bad("Python", f"{pv.major}.{pv.minor} (>= 3.11 required)")
    else:
        _ok("Python", f"{pv.major}.{pv.minor}.{pv.micro}")

    # Command-line tools that must be discoverable on PATH.
    required_tools = (
        ("ffmpeg", "not on PATH; install: brew install ffmpeg"),
        ("mlx_whisper", "not installed; pip install mlx-whisper"),
    )
    for tool, hint in required_tools:
        location = shutil.which(tool)
        if location:
            _ok(tool, location)
        else:
            problems += _bad(tool, hint)

    try:
        import pyannote.audio as _pa
        _ok("pyannote.audio", getattr(_pa, "__version__", "?"))
    except ImportError as exc:
        problems += _bad("pyannote.audio", f"import failed: {exc}")

    token_path = pathlib.Path(
        os.environ.get("PLAUDIO_HF_TOKEN_FILE", "~/.huggingface/token")
    ).expanduser()
    # The file must exist and contain something other than whitespace.
    if token_path.exists() and token_path.read_text().strip():
        _ok("HF token", str(token_path))
    else:
        problems += _bad(
            "HF token",
            f"not found at {token_path}. pyannote/speaker-diarization-3.1 is gated; "
            f"accept at https://huggingface.co/pyannote/speaker-diarization-3.1 and "
            f"save your token at {token_path}",
        )

    try:
        import torch
        if torch.backends.mps.is_available():
            _ok("torch MPS", "available")
        else:
            # Informational only: not counted as a failure.
            print(" - torch MPS not available (will fall back to CPU; slower)")
    except ImportError:
        problems += _bad("torch", "not installed")

    print()
    if problems:
        print("Doctor: some checks failed. See messages above.")
        return 1
    print("Doctor: all checks passed.")
    return 0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the argument-less ``doctor`` subcommand to *sub*."""
    parser = sub.add_parser("doctor", help="environment + dependency check")
    parser.set_defaults(func=cmd_doctor)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""`plaudio enrol AUDIO --name X --start S --end S`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, os, pathlib, sys
|
|
4
|
+
from plaudio.core.voicebank import VoiceBank
|
|
5
|
+
|
|
6
|
+
# Default location of the Hugging Face token file (used as the CLI default).
DEFAULT_TOKEN_FILE = pathlib.Path("~/.huggingface/token").expanduser()

# Notice printed to stderr by cmd_enrol when the voicebank file does not exist yet.
CONSENT_TEXT = """\
plaudio enrol: enrolling a person's voice creates a biometric profile.
Only enrol with the speaker's knowledge. The voicebank lives at
{path} (mode 0600); back it up explicitly if you want it elsewhere.
"""


def _bank_path() -> pathlib.Path:
    """Return the voicebank location: ``$PLAUDIO_VOICEBANK`` or the class default."""
    fallback = str(VoiceBank.default_path())
    configured = os.environ.get("PLAUDIO_VOICEBANK", fallback)
    return pathlib.Path(configured).expanduser()
|
|
16
|
+
|
|
17
|
+
def cmd_enrol(args: argparse.Namespace) -> int:
    """Handle ``plaudio enrol``: add one voice profile to the voicebank.

    Prints the consent notice to stderr when the voicebank file does not exist
    yet, reads the Hugging Face token, enrols ``args.name`` from ``args.audio``
    and saves the bank.

    Returns 2 when the token file is missing, 1 when enrolment raises, and 0
    on success.
    """
    bank_file = _bank_path()
    if not bank_file.exists():
        print(CONSENT_TEXT.format(path=bank_file), file=sys.stderr)

    token_file = pathlib.Path(args.hf_token_file).expanduser()
    if not token_file.exists():
        print(
            f"HF token not found at {token_file}; create it or pass --hf-token-file.",
            file=sys.stderr,
        )
        return 2
    hf_token = token_file.read_text().strip()

    bank = VoiceBank.load(bank_file)
    audio = pathlib.Path(args.audio).expanduser()
    try:
        profile = bank.enrol_from_audio(
            audio,
            name=args.name,
            start_s=args.start,
            end_s=args.end,
            hf_token=hf_token,
            notes=args.notes,
            num_speakers=args.num_speakers,
        )
    except Exception as exc:
        # CLI boundary: report any enrolment failure as exit code 1.
        print(f"enrolment failed: {exc}", file=sys.stderr)
        return 1

    bank.save(bank_file)
    print(f"enrolled {profile.name} (dim={profile.embedding_dim}, duration={profile.duration_s:.0f}s)")
    print(f"voicebank: {bank_file}")
    return 0
|
|
45
|
+
|
|
46
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``enrol`` subcommand and its options to *sub*."""
    parser = sub.add_parser("enrol", help="enrol a voice profile from audio")
    parser.add_argument("audio")
    parser.add_argument("--name", required=True)
    parser.add_argument(
        "--start", type=float, default=None,
        help="start (sec) of the clean window",
    )
    parser.add_argument(
        "--end", type=float, default=None,
        help="end (sec) of the clean window",
    )
    parser.add_argument(
        "--num-speakers", type=int, default=None,
        help="hint to pyannote; helps when the audio has few speakers",
    )
    parser.add_argument("--notes", default="")
    parser.add_argument("--hf-token-file", default=str(DEFAULT_TOKEN_FILE))
    parser.set_defaults(func=cmd_enrol)
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""`plaudio label AUDIO TRANSCRIPT [--enrol] [--batch-label "L=Name,..."]`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, json, os, pathlib, platform, subprocess, sys
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from plaudio.core.voicebank import VoiceBank
|
|
6
|
+
|
|
7
|
+
# Default location of the Hugging Face token file (used as the CLI default).
DEFAULT_TOKEN_FILE = pathlib.Path("~/.huggingface/token").expanduser()


def _bank_path() -> pathlib.Path:
    """Resolve the voicebank file: ``$PLAUDIO_VOICEBANK`` wins over the class default."""
    default_location = str(VoiceBank.default_path())
    chosen = os.environ.get("PLAUDIO_VOICEBANK", default_location)
    return pathlib.Path(chosen).expanduser()
|
|
11
|
+
|
|
12
|
+
def fmt_ms(ms):
    """Format a millisecond offset as zero-padded ``MM:SS`` (sub-second part truncated)."""
    minutes, seconds = divmod(int(ms / 1000), 60)
    return f"{minutes:02d}:{seconds:02d}"
|
|
13
|
+
|
|
14
|
+
def _speaker_spans(segs, target_speaker, max_gap_ms):
    """Merge *target_speaker*'s segments into ``(start_ms, end_ms)`` spans.

    Segments are visited in ``start_time`` order; a segment that starts less
    than *max_gap_ms* after the current span's end extends that span,
    otherwise it opens a new one. Segments of other speakers are ignored.
    """
    spans = []
    cur_start = cur_end = None
    for seg in sorted(segs, key=lambda x: x['start_time']):
        if seg['speaker'] != target_speaker:
            continue
        if cur_start is None:
            cur_start, cur_end = seg['start_time'], seg['end_time']
        elif seg['start_time'] - cur_end < max_gap_ms:
            # A segment nested inside the current span ends earlier than the
            # span does; never let it pull the span's end backwards.
            cur_end = max(cur_end, seg['end_time'])
        else:
            spans.append((cur_start, cur_end))
            cur_start, cur_end = seg['start_time'], seg['end_time']
    if cur_start is not None:
        spans.append((cur_start, cur_end))
    return spans


def find_best_clip(segs, target_speaker, min_dur_s=15, max_dur_s=45):
    """Pick an audition clip for *target_speaker*.

    Segments separated by less than 3 s are merged; the longest merged span is
    chosen (the earliest one on ties). Returns ``(start_ms, end_ms)`` with the
    end capped at ``start + max_dur_s``, or ``None`` when the speaker has no
    segments or the longest span is shorter than *min_dur_s* seconds.
    """
    spans = _speaker_spans(segs, target_speaker, max_gap_ms=3000)
    if not spans:
        return None
    longest = max(spans, key=lambda t: t[1] - t[0])
    if (longest[1] - longest[0]) / 1000 < min_dur_s:
        return None
    return (longest[0], min(longest[1], longest[0] + max_dur_s * 1000))


def find_enrol_window(segs, target_speaker, max_dur_s=180):
    """Pick the enrolment window for *target_speaker*.

    Same selection as :func:`find_best_clip` but with a 5 s merge gap and no
    minimum duration. Returns ``(start_ms, end_ms)`` capped at *max_dur_s*
    seconds, or ``None`` when the speaker has no segments.
    """
    spans = _speaker_spans(segs, target_speaker, max_gap_ms=5000)
    if not spans:
        return None
    longest = max(spans, key=lambda t: t[1] - t[0])
    return (longest[0], min(longest[1], longest[0] + max_dur_s * 1000))
|
|
48
|
+
|
|
49
|
+
def play_clip_bg(audio_path, start_ms, end_ms):
    """Extract ``[start_ms, end_ms)`` of *audio_path* and play it in the background.

    ffmpeg writes the clip as 16 kHz mono WAV to a private temporary file,
    which is then handed to ``afplay`` on macOS or ``ffplay`` elsewhere.

    Returns ``(proc, tmp_path)`` on success, where ``proc`` is the running
    player process and ``tmp_path`` the clip file the caller must delete.
    On any failure returns ``(None, message)`` with a human-readable error;
    the temporary file is removed before returning in that case.
    """
    import tempfile  # function-scope import: this module does not import tempfile

    tmp = None
    dur_s = (end_ms - start_ms) / 1000
    try:
        # mkstemp gives a unique, owner-only file in the platform temp dir,
        # instead of a predictable name under a hard-coded /tmp.
        fd, tmp_name = tempfile.mkstemp(
            prefix=f"plaudio-clip-{start_ms}-{end_ms}-", suffix=".wav")
        os.close(fd)
        tmp = pathlib.Path(tmp_name)
        r = subprocess.run(
            # -y: overwrite the (empty) file mkstemp just created.
            ["ffmpeg", "-y", "-ss", str(start_ms/1000), "-t", str(dur_s),
             "-i", str(audio_path), "-ar", "16000", "-ac", "1", str(tmp)],
            capture_output=True, text=True, timeout=30)
        if r.returncode != 0:
            error = f"ffmpeg exit {r.returncode}: {r.stderr[-300:]}"
        elif not tmp.exists() or tmp.stat().st_size == 0:
            error = "ffmpeg produced empty file"
        else:
            player = "afplay" if platform.system() == "Darwin" else "ffplay"
            args_list = [player, str(tmp)] if player == "afplay" \
                else [player, "-nodisp", "-autoexit", "-loglevel", "quiet", str(tmp)]
            proc = subprocess.Popen(args_list, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return proc, tmp
    except FileNotFoundError as e:
        error = f"missing executable: {e}"
    except Exception as e:
        # Best-effort helper: every other failure is reported, not raised.
        error = f"unexpected: {e}"

    # Failure path: do not leave the clip file behind.
    if tmp is not None:
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
    return None, error
|
|
70
|
+
|
|
71
|
+
def stop_playback(proc, tmp):
    """Stop a background player and delete its clip file, ignoring all errors.

    *proc* may be ``None`` or an already-finished process; *tmp* may be
    ``None``. A running process is terminated and given 2 s to exit, after
    which it is killed.
    """
    still_running = proc is not None and proc.poll() is None
    if still_running:
        try:
            proc.terminate()
            proc.wait(timeout=2)
        except Exception:
            # Terminate or wait failed (e.g. timed out): escalate to kill.
            try:
                proc.kill()
            except Exception:
                pass

    if tmp is None:
        return
    try:
        tmp.unlink(missing_ok=True)
    except Exception:
        pass
|
|
81
|
+
|
|
82
|
+
def save_labels(data, jpath, mapping):
    """Rename speakers in ``data['segments']`` per *mapping* and write *data* to *jpath*.

    Segments are modified in place. The first time a segment is renamed its
    previous label is kept under ``original_speaker``; later renames leave
    that key untouched. The whole document is written as indented JSON with
    non-ASCII characters left unescaped.
    """
    for seg in data['segments']:
        current = seg['speaker']
        if current not in mapping:
            continue
        seg.setdefault('original_speaker', current)
        seg['speaker'] = mapping[current]
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    jpath.write_text(payload)
|
|
89
|
+
|
|
90
|
+
def _enrol(name, audio, start_ms, end_ms, hf_token, num_speakers=None):
    """Enrol *name* from the ``[start_ms, end_ms]`` window of *audio* and save the bank.

    Failures during enrolment or saving are printed to stderr and swallowed,
    so one failed speaker does not abort the caller's loop.
    """
    window = f"[{fmt_ms(start_ms)}-{fmt_ms(end_ms)}]"
    print(f" enrolling {name} from {window}...")
    bank = VoiceBank.load(_bank_path())
    try:
        bank.enrol_from_audio(
            pathlib.Path(audio),
            name=name,
            # The voicebank API takes seconds; callers here work in milliseconds.
            start_s=start_ms / 1000,
            end_s=end_ms / 1000,
            hf_token=hf_token,
            notes="via plaudio label interactive labelling",
            num_speakers=num_speakers,
        )
        bank.save(_bank_path())
        print(f" enrolled {name}")
    except Exception as exc:
        print(f" enrolment failed: {exc}", file=sys.stderr)
|
|
107
|
+
|
|
108
|
+
def cmd_label(args: argparse.Namespace) -> int:
    """Handle ``plaudio label``: name the speaker clusters of a transcript.

    Two modes:

    * ``--batch-label "L=Name,..."``: apply the given label-to-name pairs
      without prompting.
    * interactive (default): for every cluster with more than 5 s of airtime
      that still has a ``SPEAKER_*`` or ``Unknown`` label, play a clip and ask
      for a name. Each answer is saved to the transcript immediately.

    With ``--enrol`` every newly named speaker (except ``Unknown``) is then
    enrolled into the voicebank.

    Returns 2 when the audio, transcript or (with ``--enrol``) token file is
    missing or the transcript has no segments, otherwise 0.
    """
    audio = pathlib.Path(args.audio).expanduser()
    jpath = pathlib.Path(args.transcript).expanduser()
    if not audio.exists():
        print(f"audio not found: {audio}", file=sys.stderr)
        return 2
    if not jpath.exists():
        print(f"transcript not found: {jpath}", file=sys.stderr)
        return 2

    # The token is only ever passed to _enrol(), which only runs under
    # --enrol; plain (batch) labelling must not fail for lack of a token.
    hf_token = ""
    if args.enrol:
        tf = pathlib.Path(args.hf_token_file).expanduser()
        if not tf.exists():
            print(f"HF token not found at {tf}; enrolment needs it", file=sys.stderr)
            return 2
        hf_token = tf.read_text().strip()

    data = json.loads(jpath.read_text())
    segs = data.get('segments', [])
    if not segs:
        print("no segments in json", file=sys.stderr)
        return 2

    # Group segments per speaker label and compute total airtime in seconds.
    by_spk = defaultdict(list)
    for s in segs:
        by_spk[s['speaker']].append(s)
    durs = {k: sum((x['end_time']-x['start_time'])/1000 for x in v) for k, v in by_spk.items()}
    # Clusters worth labelling: more than 5 s of speech, longest first.
    main_spk = sorted([k for k, v in durs.items() if v > 5], key=lambda k: -durs[k])

    if args.batch_label:
        mapping = {}
        enrol_windows = {}
        for pair in args.batch_label.split(","):
            if "=" not in pair:
                continue
            spk, name = pair.split("=", 1)
            spk, name = spk.strip(), name.strip()
            if spk not in by_spk:
                print(f" warn: {spk} not in transcript, skipping")
                continue
            mapping[spk] = name
            es = find_enrol_window(by_spk[spk], spk, max_dur_s=180)
            if es:
                enrol_windows[name] = es
        if mapping:
            save_labels(data, jpath, mapping)
            print(f"batch-labelled {len(mapping)} speakers: {mapping}")
        # Same guard as the interactive path: no header when nothing to enrol.
        if args.enrol and enrol_windows:
            print("\n=== Enrolling voice profiles ===")
            for name, (s_ms, e_ms) in enrol_windows.items():
                if name == "Unknown":
                    continue
                _enrol(name, audio, s_ms, e_ms, hf_token)
        return 0

    print(f"\n{len(main_spk)} clusters to label (> 5s)\n")
    mapping = {}
    enrol_windows = {}

    try:
        for spk in main_spk:
            if not (spk.startswith("SPEAKER_") or spk == "Unknown"):
                print(f"\n{spk}: already labelled, skipping")
                continue
            clip = find_best_clip(by_spk[spk], spk, args.min_dur, args.max_dur)
            if not clip:
                print(f" {spk}: no clean >={args.min_dur}s clip, skipping")
                continue
            start_ms, end_ms = clip
            dur_s = (end_ms - start_ms) / 1000
            print(f"\n{'='*70}")
            print(f"Speaker: {spk} | Airtime: {durs[spk]:.0f}s ({durs[spk]/60:.1f}min)")
            print(f"Clip: {fmt_ms(start_ms)}-{fmt_ms(end_ms)} ({dur_s:.0f}s)")
            # Show the words spoken inside the clip as a reading aid.
            in_window = [x for x in by_spk[spk] if x['start_time'] >= start_ms and x['start_time'] < end_ms]
            text = " ".join(x['content'] for x in in_window)
            print(f"Text: \"{text[:500]}\"")
            print()
            playback_proc = playback_tmp = None
            if not args.no_play:
                playback_proc, playback_tmp = play_clip_bg(audio, start_ms, end_ms)
                if playback_proc is None:
                    # On failure the second element is the error message.
                    print(f" playback error: {playback_tmp}")
                    playback_tmp = None
                else:
                    print(" (playing in background; type at any time to stop)")
            try:
                ans = input("name / [s]kip / [u]nknown / [r]eplay / [q]uit: ").strip()
            except EOFError:
                ans = 'q'
            finally:
                stop_playback(playback_proc, playback_tmp)
            while ans.lower() == 'r':
                if args.no_play:
                    print(" (--no-play; cannot replay)")
                    try:
                        ans = input("name / [s]kip / [u]nknown / [q]uit: ").strip()
                    except EOFError:
                        ans = 'q'
                    # Re-check the loop condition so a repeated "r" prompts
                    # again instead of being stored as the speaker's name.
                    continue
                pp, pt = play_clip_bg(audio, start_ms, end_ms)
                if pp is None:
                    print(f" replay error: {pt}")
                    pt = None  # error message, not a file to delete
                else:
                    print(" (replaying)")
                try:
                    ans = input("name / [s]kip / [u]nknown / [r]eplay / [q]uit: ").strip()
                except EOFError:
                    ans = 'q'
                finally:
                    stop_playback(pp, pt)
            # Quit reuses the Ctrl-C path so both end the session the same way.
            if ans.lower() in ('q', 'quit'):
                raise KeyboardInterrupt
            if ans.lower() in ('s', 'skip', ''):
                continue
            if ans.lower() in ('u', 'unknown'):
                mapping[spk] = "Unknown"
                save_labels(data, jpath, {spk: "Unknown"})
                print(f" -> {spk} = Unknown (saved)")
                continue
            mapping[spk] = ans
            enrol_span = find_enrol_window(by_spk[spk], spk, max_dur_s=180)
            # Fall back to the audition clip when no enrolment window exists.
            enrol_windows[ans] = enrol_span if enrol_span else (start_ms, end_ms)
            es_start, es_end = enrol_windows[ans]
            save_labels(data, jpath, {spk: ans})
            print(f" -> {spk} = {ans} (saved); enrolment will use [{fmt_ms(es_start)}-{fmt_ms(es_end)}]")
    except KeyboardInterrupt:
        print("\n\n(quitting; labels saved so far)")

    print(f"\nwrote {len(mapping)} labels to {jpath}")
    if args.enrol and enrol_windows:
        print(f"\n=== Enrolling voice profiles ({len(enrol_windows)} people) ===")
        for name, (start_ms, end_ms) in enrol_windows.items():
            if name == "Unknown":
                continue
            _enrol(name, audio, start_ms, end_ms, hf_token)
    return 0
|
|
229
|
+
|
|
230
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``label`` subcommand and its options to *sub*."""
    parser = sub.add_parser("label", help="interactive speaker labelling")
    for positional in ("audio", "transcript"):
        parser.add_argument(positional)
    for flag in ("--enrol", "--no-play"):
        parser.add_argument(flag, action="store_true")
    parser.add_argument("--batch-label", type=str)
    parser.add_argument("--min-dur", type=int, default=15)
    # NOTE(review): this default equals --min-dur, whereas find_best_clip's own
    # max_dur_s default is 45 -- confirm that 15 is intended.
    parser.add_argument("--max-dur", type=int, default=15)
    parser.add_argument("--hf-token-file", default=str(DEFAULT_TOKEN_FILE))
    parser.set_defaults(func=cmd_label)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""`plaudio match AUDIO TRANSCRIPT [--threshold T] [--report]`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import pathlib
|
|
7
|
+
import sys
|
|
8
|
+
from plaudio.core.voicebank import VoiceBank
|
|
9
|
+
from plaudio.core.slidingmatch import SlidingMatcher, DEFAULT_THRESHOLD
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _bank_path() -> pathlib.Path:
    """Return the voicebank file path, honouring ``$PLAUDIO_VOICEBANK`` when set."""
    class_default = str(VoiceBank.default_path())
    location = os.environ.get("PLAUDIO_VOICEBANK", class_default)
    return pathlib.Path(location).expanduser()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def cmd_match(args: argparse.Namespace) -> int:
    """Handle ``plaudio match``: relabel transcript segments from the voicebank.

    Loads the voicebank, lets the sliding matcher label the transcript's
    segments against the audio, writes the transcript back and prints the
    matcher's counters. With ``--report`` the total speaking time per speaker
    is printed, longest first.

    Returns 2 when the audio or transcript file is missing, otherwise 0.
    """
    audio = pathlib.Path(args.audio).expanduser()
    if not audio.exists():
        print(f"audio not found: {audio}", file=sys.stderr)
        return 2
    jpath = pathlib.Path(args.transcript).expanduser()
    if not jpath.exists():
        print(f"transcript not found: {jpath}", file=sys.stderr)
        return 2

    bank = VoiceBank.load(_bank_path())
    matcher = SlidingMatcher(bank, threshold=args.threshold)
    data = json.loads(jpath.read_text())
    segs = data.get("segments", [])
    report = matcher.label_segments(audio, segs)
    # The whole document is written back after the matcher has run.
    jpath.write_text(json.dumps(data, ensure_ascii=False, indent=2))

    matched = report.get('n_matched', 0)
    total = report.get('n_total', 0)
    print(
        f" {matched} / {total} windows matched"
        f" at threshold {args.threshold}"
    )
    runs = report.get('n_runs', 0)
    relabelled = report.get('n_segments_relabelled', 0)
    print(
        f" coalesced into {runs} runs,"
        f" relabelled {relabelled} segments"
    )

    if not args.report:
        return 0

    from collections import defaultdict

    # Seconds of speech per speaker label (segment times are in milliseconds).
    airtime: dict[str, float] = defaultdict(float)
    for seg in segs:
        airtime[seg["speaker"]] += (seg["end_time"] - seg["start_time"]) / 1000.0
    print("\nPer-speaker durations:")
    for speaker, seconds in sorted(airtime.items(), key=lambda x: -x[1]):
        print(f" {speaker}: {seconds:.0f}s ({seconds/60:.1f}min)")
    return 0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``match`` subcommand and its options to *sub*."""
    parser = sub.add_parser("match", help="re-label transcript via voice-bank sliding match")
    for positional in ("audio", "transcript"):
        parser.add_argument(positional)
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
    parser.add_argument("--report", action="store_true")
    parser.set_defaults(func=cmd_match)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""`plaudio plaud login|list|sync` -- stubs in v0.1, full impl in v0.2."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
# Message printed by every `plaudio plaud ...` subcommand in this version.
V02_MSG = (
    "Plaud cloud sync is not implemented in v0.1. See README -> Roadmap. "
    "For now: use the Plaud MCP via your editor's MCP host, or the Plaud "
    "mobile app, then point `plaudio transcribe` at the downloaded audio."
)


def cmd_stub(args: argparse.Namespace) -> int:
    """Print the not-implemented notice; *args* is ignored. Always returns 0."""
    exit_code = 0
    print(V02_MSG)
    return exit_code
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``plaud`` command; ``login``, ``list`` and ``sync`` all map to the stub."""
    plaud = sub.add_parser("plaud", help="Plaud cloud sync (v0.2)")
    actions = plaud.add_subparsers(dest="plaud_cmd", required=True)
    for action in ("login", "list", "sync"):
        actions.add_parser(action).set_defaults(func=cmd_stub)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""`plaudio transcribe AUDIO [--vocab FILE] [--language LANG] [--out DIR]`"""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse, pathlib, sys
|
|
4
|
+
from plaudio.core.transcribe import transcribe, load_vocab_prompt, DEFAULT_MODEL
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def cmd_transcribe(args: argparse.Namespace) -> int:
    """Handle ``plaudio transcribe``: run ASR on one audio file.

    Output goes to ``args.out`` or, when that is not given, next to the audio
    file. An optional vocab file is loaded into a prompt for the transcriber.

    Returns 2 when transcription raises ``FileNotFoundError``, otherwise 0
    after printing the segment count, elapsed time and output path.
    """
    audio = pathlib.Path(args.audio).expanduser()
    if args.out:
        out_dir = pathlib.Path(args.out).expanduser()
    else:
        out_dir = audio.parent
    if args.vocab:
        vocab = load_vocab_prompt(pathlib.Path(args.vocab).expanduser())
    else:
        vocab = ""

    try:
        segs, elapsed = transcribe(
            audio,
            out_dir=out_dir,
            language=args.language,
            vocab=vocab,
            model=args.model,
        )
    except FileNotFoundError as exc:
        print(f"audio not found: {exc}", file=sys.stderr)
        return 2

    # Guard the rate against a zero elapsed time.
    print(f"ok {len(segs)} segments in {elapsed:.1f}s "
          f"({len(segs)/max(elapsed,1e-3):.1f} seg/s)")
    print(f" output: {out_dir / (audio.stem + '.json')}")
    return 0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def register(sub: argparse._SubParsersAction) -> None:
    """Add the ``transcribe`` subcommand and its options to *sub*."""
    parser = sub.add_parser("transcribe", help="mlx-whisper ASR on an audio file")
    parser.add_argument("audio")
    parser.add_argument(
        "--vocab",
        help="path to a vocab file (one term per line); default empty",
    )
    parser.add_argument("--language", default="en")
    parser.add_argument("--out", help="output directory; default: same as audio")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.set_defaults(func=cmd_transcribe)
|