fow-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. fly_on_the_wall/__init__.py +3 -0
  2. fly_on_the_wall/audio.py +164 -0
  3. fly_on_the_wall/audio_metadata.py +241 -0
  4. fly_on_the_wall/cache.py +26 -0
  5. fly_on_the_wall/cleanup.py +29 -0
  6. fly_on_the_wall/cli.py +641 -0
  7. fly_on_the_wall/cli_costs.py +81 -0
  8. fly_on_the_wall/cli_menu.py +163 -0
  9. fly_on_the_wall/cli_publish.py +141 -0
  10. fly_on_the_wall/cli_speaker_review.py +315 -0
  11. fly_on_the_wall/cli_watch.py +209 -0
  12. fly_on_the_wall/config.py +92 -0
  13. fly_on_the_wall/costs.py +169 -0
  14. fly_on_the_wall/db.py +508 -0
  15. fly_on_the_wall/doctor.py +142 -0
  16. fly_on_the_wall/embeddings.py +142 -0
  17. fly_on_the_wall/exporting.py +155 -0
  18. fly_on_the_wall/glossary.py +31 -0
  19. fly_on_the_wall/meetings.py +382 -0
  20. fly_on_the_wall/normalization.py +166 -0
  21. fly_on_the_wall/people.py +82 -0
  22. fly_on_the_wall/people_embeddings.py +68 -0
  23. fly_on_the_wall/pipeline.py +120 -0
  24. fly_on_the_wall/processing.py +427 -0
  25. fly_on_the_wall/providers/__init__.py +1 -0
  26. fly_on_the_wall/providers/elevenlabs.py +145 -0
  27. fly_on_the_wall/providers/openai_analysis.py +195 -0
  28. fly_on_the_wall/providers/openai_cleanup.py +91 -0
  29. fly_on_the_wall/publishing.py +410 -0
  30. fly_on_the_wall/reanalysis.py +172 -0
  31. fly_on_the_wall/recording_quality.py +141 -0
  32. fly_on_the_wall/rendering.py +115 -0
  33. fly_on_the_wall/secrets.py +93 -0
  34. fly_on_the_wall/service_pricing.py +75 -0
  35. fly_on_the_wall/setup.py +221 -0
  36. fly_on_the_wall/speaker_identity.py +173 -0
  37. fly_on_the_wall/speaker_matching.py +134 -0
  38. fly_on_the_wall/speakers.py +221 -0
  39. fly_on_the_wall/storage.py +53 -0
  40. fly_on_the_wall/voice_samples.py +125 -0
  41. fly_on_the_wall/watch.py +347 -0
  42. fow_cli-0.1.0.dist-info/METADATA +447 -0
  43. fow_cli-0.1.0.dist-info/RECORD +46 -0
  44. fow_cli-0.1.0.dist-info/WHEEL +4 -0
  45. fow_cli-0.1.0.dist-info/entry_points.txt +2 -0
  46. fow_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,142 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.util
4
+ import sys
5
+ from dataclasses import dataclass
6
+ from shutil import which
7
+
8
+ from fly_on_the_wall.config import default_config_path, load_config
9
+ from fly_on_the_wall.db import database
10
+ from fly_on_the_wall.secrets import get_api_key_status
11
+ from fly_on_the_wall.storage import storage_paths
12
+
13
+
14
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable outcome of one environment diagnostic."""

    # Label identifying what was checked (for example "python" or "ffmpeg").
    name: str
    # True when the check passed.
    ok: bool
    # Human-readable value or explanation reported for the check.
    detail: str
19
+
20
+
21
def run_checks() -> list[DoctorCheck]:
    """Run every environment diagnostic and return the results in report order.

    The report covers the Python version (3.12 or newer passes), the ffmpeg
    executable on PATH, the config and storage locations (informational,
    always passing), the API keys for the configured transcription provider
    and for OpenAI, and the speaker-embedding checks.

    Returns:
        One ``DoctorCheck`` per diagnostic.
    """
    paths = storage_paths()
    # Resolve ffmpeg once so the pass/fail flag and the reported path can
    # never disagree.
    ffmpeg_path = which("ffmpeg")
    version = sys.version_info
    checks = [
        DoctorCheck(
            name="python",
            ok=version >= (3, 12),
            detail=f"{version.major}.{version.minor}.{version.micro}",
        ),
        DoctorCheck(
            name="ffmpeg",
            ok=ffmpeg_path is not None,
            detail=ffmpeg_path or "not found",
        ),
        DoctorCheck(
            name="config path",
            ok=True,
            detail=str(default_config_path()),
        ),
        DoctorCheck(
            name="storage path",
            ok=True,
            detail=str(paths.root),
        ),
    ]

    config = load_config()
    provider = config.default_transcription_provider
    # dict.fromkeys keeps the order (provider first, then OpenAI) while
    # collapsing the pair when the transcription provider is itself "openai",
    # which would otherwise report "openai api key" twice.
    for key_provider in dict.fromkeys((provider, "openai")):
        status = get_api_key_status(key_provider)
        checks.append(
            DoctorCheck(
                name=f"{key_provider} api key",
                ok=status.available,
                detail=_secret_detail(status.source, status.env_var),
            )
        )
    checks.extend(_speaker_embedding_checks())
    return checks
66
+
67
+
68
def _speaker_embedding_checks() -> list[DoctorCheck]:
    """Build the three speaker-embedding diagnostics.

    Reports whether ``pyannote.audio`` can be found, whether every stored
    voice sample has an embedding, and whether local speakers either have at
    least one embedding or could get one because pyannote is available.
    """
    has_pyannote = _module_available("pyannote.audio")
    totals = _speaker_embedding_counts()
    samples = totals["voice_samples"]
    samples_embedded = totals["embedded_voice_samples"]
    speakers = totals["local_speakers"]
    speakers_embedded = totals["embedded_local_speakers"]

    if has_pyannote:
        pyannote_detail = "available for speaker embeddings"
    else:
        pyannote_detail = "missing; install the identity extra for local speaker embeddings"

    # No samples at all counts as a pass; otherwise all of them must be embedded.
    samples_ok = samples == 0 or samples_embedded == samples
    # Local speakers pass when there are none, when any is embedded, or when
    # pyannote is available.
    speakers_ok = speakers == 0 or speakers_embedded > 0 or has_pyannote

    return [
        DoctorCheck(name="pyannote.audio", ok=has_pyannote, detail=pyannote_detail),
        DoctorCheck(
            name="voice sample embeddings",
            ok=samples_ok,
            detail=f"{samples_embedded}/{samples} voice samples embedded",
        ),
        DoctorCheck(
            name="local speaker embeddings",
            ok=speakers_ok,
            detail=f"{speakers_embedded}/{speakers} local speakers embedded",
        ),
    ]
92
+
93
+
94
def _speaker_embedding_counts() -> dict[str, int]:
    """Count voice samples and local speakers, and how many of each are embedded.

    Returns:
        A dict with the keys ``voice_samples``, ``embedded_voice_samples``,
        ``local_speakers`` and ``embedded_local_speakers``. NULL aggregates
        (for example ``SUM`` over an empty table) are reported as 0.
    """
    voice_query = """
        SELECT COUNT(*) AS total,
        SUM(CASE WHEN embedding_path IS NOT NULL THEN 1 ELSE 0 END) AS embedded
        FROM voice_samples
        """
    local_query = """
        SELECT COUNT(DISTINCT local_speakers.id) AS total,
        COUNT(DISTINCT local_speaker_embeddings.local_speaker_id) AS embedded
        FROM local_speakers
        LEFT JOIN local_speaker_embeddings
        ON local_speaker_embeddings.local_speaker_id = local_speakers.id
        """
    with database() as connection:
        voice_row = connection.execute(voice_query).fetchone()
        local_row = connection.execute(local_query).fetchone()
        columns = (
            ("voice_samples", voice_row, "total"),
            ("embedded_voice_samples", voice_row, "embedded"),
            ("local_speakers", local_row, "total"),
            ("embedded_local_speakers", local_row, "embedded"),
        )
        # `or 0` turns a NULL aggregate into a zero count.
        return {key: int(row[column] or 0) for key, row, column in columns}
118
+
119
+
120
+ def _module_available(module_name: str) -> bool:
121
+ try:
122
+ return importlib.util.find_spec(module_name) is not None
123
+ except ModuleNotFoundError:
124
+ return False
125
+
126
+
127
+ def _secret_detail(source: str, env_var: str | None) -> str:
128
+ if source == "env":
129
+ return f"{env_var} is set"
130
+ if source == "keyring":
131
+ return "set in OS keyring"
132
+ if source == "missing":
133
+ return f"{env_var} is not set and no keyring entry was found"
134
+ return "unknown provider"
135
+
136
+
137
def has_failures(checks: list[DoctorCheck]) -> bool:
    """Return True when at least one check did not pass."""
    return not all(check.ok for check in checks)
139
+
140
+
141
def check_names(checks: list[DoctorCheck]) -> set[str]:
    """Return the distinct names of the given checks."""
    names: set[str] = set()
    for check in checks:
        names.add(check.name)
    return names
@@ -0,0 +1,142 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from sqlite3 import Connection
8
+ from typing import Protocol
9
+ from uuid import uuid4
10
+
11
+ from fly_on_the_wall.storage import StoragePaths, storage_paths
12
+
13
# Model identifier that PyannoteEmbeddingBackend exposes as its model_name and
# passes to Model.from_pretrained.
DEFAULT_EMBEDDING_MODEL = "pyannote/wespeaker-voxceleb-resnet34-LM"
14
+
15
+
16
class EmbeddingBackend(Protocol):
    """Structural interface for objects that turn an audio file into an embedding vector."""

    # Identifier of the model; stored alongside every cached embedding.
    model_name: str

    def embed(self, audio_path: Path) -> list[float]:
        """Return the embedding vector for the audio file at ``audio_path``."""
        ...
20
+
21
+
22
@dataclass(frozen=True)
class CachedEmbedding:
    """Immutable record of an embedding that has been written to disk."""

    # Name of the model that produced the vector.
    model_name: str
    # Location of the JSON file holding the embedding.
    path: Path
    # The embedding values themselves.
    vector: list[float]
27
+
28
+
29
class PyannoteEmbeddingBackend:
    """Embedding backend that runs a pretrained pyannote.audio model over a whole file."""

    model_name = DEFAULT_EMBEDDING_MODEL

    def __init__(self) -> None:
        """Load the pretrained model and prepare whole-file inference.

        Raises:
            RuntimeError: If ``pyannote.audio`` cannot be imported.
        """
        try:
            # Imported lazily so the package works without the optional dependency.
            from pyannote.audio import Inference, Model
        except ImportError as exc:
            raise RuntimeError("pyannote.audio is required for local speaker embeddings.") from exc

        model = Model.from_pretrained(self.model_name)
        self._inference = Inference(model, window="whole")

    def embed(self, audio_path: Path) -> list[float]:
        """Return the embedding of ``audio_path`` as a flat list of floats.

        The inference result may be array-like (anything with ``tolist``) or a
        plain iterable. A result wrapped in leading singleton dimensions, such
        as a ``(1, D)`` array, is unwrapped to its single row so the returned
        vector is always one-dimensional.
        """
        raw = self._inference(str(audio_path))
        values = raw.tolist() if hasattr(raw, "tolist") else list(raw)
        # Peel off batch-style wrappers: [[v0, v1, ...]] -> [v0, v1, ...].
        # Without this, float() would be applied to a nested list and fail.
        while isinstance(values, list) and len(values) == 1 and isinstance(values[0], (list, tuple)):
            values = list(values[0])
        return [float(value) for value in values]
46
+
47
+
48
def cache_voice_sample_embedding(
    connection: Connection,
    voice_sample_id: str,
    backend: EmbeddingBackend,
    storage: StoragePaths | None = None,
) -> CachedEmbedding:
    """Embed one voice sample, write the vector to disk and record it in the database.

    Args:
        connection: Open database connection; rows must support access by column name.
        voice_sample_id: Primary key of the row in ``voice_samples``.
        backend: Embedding backend used to compute the vector.
        storage: Storage layout to write under; defaults to ``storage_paths()``.

    Returns:
        The model name, file path and vector of the cached embedding.

    Raises:
        ValueError: If no voice sample with that id exists.
    """
    row = connection.execute(
        "SELECT person_id, audio_path FROM voice_samples WHERE id = ?", (voice_sample_id,)
    ).fetchone()
    if row is None:
        raise ValueError(f"Voice sample does not exist: {voice_sample_id}")

    layout = storage or storage_paths()
    embedding = backend.embed(Path(row["audio_path"]))
    # Voice-sample embeddings are grouped in one directory per person.
    person_dir = layout.artifacts / "embeddings" / "voice-samples" / row["person_id"]
    written_to = _write_embedding(person_dir, voice_sample_id, backend.model_name, embedding)

    # The connection context manager wraps the update in a transaction.
    with connection:
        connection.execute(
            """
            UPDATE voice_samples
            SET embedding_model = ?, embedding_path = ?
            WHERE id = ?
            """,
            (backend.model_name, str(written_to), voice_sample_id),
        )
    return CachedEmbedding(backend.model_name, written_to, embedding)
79
+
80
+
81
def cache_local_speaker_embedding(
    connection: Connection,
    local_speaker_id: str,
    audio_path: Path,
    backend: EmbeddingBackend,
    storage: StoragePaths | None = None,
) -> CachedEmbedding:
    """Embed audio for a local speaker, write the vector to disk and upsert its record.

    Args:
        connection: Open database connection.
        local_speaker_id: Primary key of the row in ``local_speakers``.
        audio_path: Audio file to embed.
        backend: Embedding backend used to compute the vector.
        storage: Storage layout to write under; defaults to ``storage_paths()``.

    Returns:
        The model name, file path and vector of the cached embedding.

    Raises:
        ValueError: If no local speaker with that id exists.
    """
    exists = connection.execute(
        "SELECT 1 FROM local_speakers WHERE id = ?", (local_speaker_id,)
    ).fetchone()
    if exists is None:
        raise ValueError(f"Local speaker does not exist: {local_speaker_id}")

    layout = storage or storage_paths()
    embedding = backend.embed(audio_path)
    target_dir = layout.artifacts / "embeddings" / "local-speakers"
    written_to = _write_embedding(target_dir, local_speaker_id, backend.model_name, embedding)

    # A fresh id is used only when a new row is inserted; on conflict the
    # existing row keeps its id and just gets the new paths.
    row_values = (
        str(uuid4()),
        local_speaker_id,
        str(audio_path),
        backend.model_name,
        str(written_to),
    )
    with connection:
        connection.execute(
            """
            INSERT INTO local_speaker_embeddings(
            id, local_speaker_id, audio_path, embedding_model, embedding_path
            ) VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(local_speaker_id, embedding_model) DO UPDATE SET
            audio_path = excluded.audio_path,
            embedding_path = excluded.embedding_path
            """,
            row_values,
        )
    return CachedEmbedding(backend.model_name, written_to, embedding)
119
+
120
+
121
def read_embedding(path: Path) -> list[float]:
    """Load the ``vector`` entry of an embedding JSON file as a list of floats."""
    payload = json.loads(path.read_text())
    return list(map(float, payload["vector"]))
124
+
125
+
126
+ def cosine_similarity(left: list[float], right: list[float]) -> float:
127
+ if len(left) != len(right):
128
+ raise ValueError("Embedding vectors must have the same length.")
129
+ numerator = sum(a * b for a, b in zip(left, right, strict=True))
130
+ left_norm = math.sqrt(sum(a * a for a in left))
131
+ right_norm = math.sqrt(sum(b * b for b in right))
132
+ if left_norm == 0 or right_norm == 0:
133
+ return 0.0
134
+ return numerator / (left_norm * right_norm)
135
+
136
+
137
+ def _write_embedding(directory: Path, source_id: str, model_name: str, vector: list[float]) -> Path:
138
+ safe_model_name = model_name.replace("/", "--")
139
+ path = directory / f"{source_id}.{safe_model_name}.json"
140
+ path.parent.mkdir(parents=True, exist_ok=True)
141
+ path.write_text(json.dumps({"model": model_name, "vector": vector}) + "\n")
142
+ return path
@@ -0,0 +1,155 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import re
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime
8
+ from pathlib import Path
9
+ from sqlite3 import Connection
10
+ from uuid import uuid4
11
+
12
+ from fly_on_the_wall.storage import StoragePaths, storage_paths
13
+
14
+
15
@dataclass(frozen=True)
class ExportResult:
    """Immutable description of one completed export."""

    # Identifier of the export.
    id: str
    # Directory that holds every file of this export.
    output_dir: Path
    # Markdown transcript file.
    transcript_path: Path
    # Markdown analysis file.
    analysis_path: Path
    # JSON manifest describing the export.
    manifest_path: Path
22
+
23
+
24
def export_markdown_transcript(
    connection: Connection,
    meeting_id: str,
    transcript: str,
    analysis: str,
    storage: StoragePaths | None = None,
) -> ExportResult:
    """Write a meeting's transcript and analysis as Markdown and record the export.

    A new directory ``<exports>/<meeting slug>/<UTC timestamp>-<id prefix>`` is
    created containing ``transcript.md``, ``analysis.md`` and ``manifest.json``;
    a row is then inserted into the ``exports`` table.

    Args:
        connection: Open database connection; rows must support access by column name.
        meeting_id: Primary key of the meeting to export.
        transcript: Raw transcript text to render.
        analysis: Analysis text; surrounding whitespace is trimmed.
        storage: Storage layout to write under; defaults to ``storage_paths()``.

    Returns:
        The export id and the paths of the files that were written.

    Raises:
        ValueError: If the meeting does not exist.
        FileExistsError: If the export directory already exists.
    """
    meeting = connection.execute(
        """
        SELECT meetings.*, audio_metadata.recorded_at, audio_metadata.recorded_at_confidence
        FROM meetings
        LEFT JOIN audio_metadata ON audio_metadata.meeting_id = meetings.id
        WHERE meetings.id = ?
        """,
        (meeting_id,),
    ).fetchone()
    if meeting is None:
        raise ValueError(f"Meeting does not exist: {meeting_id}")

    export_id = str(uuid4())
    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    paths = storage or storage_paths()
    output_dir = paths.exports / meeting["slug"] / f"{timestamp}-{export_id[:8]}"
    transcript_path = output_dir / "transcript.md"
    analysis_path = output_dir / "analysis.md"
    manifest_path = output_dir / "manifest.json"
    # exist_ok=False: never write into (and overwrite) an earlier export.
    output_dir.mkdir(parents=True, exist_ok=False)

    markdown = _markdown_document(dict(meeting), transcript)
    analysis_markdown = analysis.strip() + "\n"
    # Write the exact UTF-8 bytes that the manifest hashes are computed over.
    # Text-mode writes would use the locale encoding and translate newlines,
    # so the recorded SHA-256 values could disagree with the files on disk
    # (and non-ASCII text could fail to encode at all).
    transcript_path.write_bytes(markdown.encode("utf-8"))
    analysis_path.write_bytes(analysis_markdown.encode("utf-8"))

    manifest = {
        "id": export_id,
        "meeting_id": meeting_id,
        "format": "markdown",
        "transcript_path": str(transcript_path),
        "analysis_path": str(analysis_path),
        "transcript_sha256": _sha256(markdown),
        "analysis_sha256": _sha256(analysis_markdown),
    }
    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

    with connection:
        connection.execute(
            """
            INSERT INTO exports(id, meeting_id, format, output_dir, manifest_path)
            VALUES (?, ?, ?, ?, ?)
            """,
            (export_id, meeting_id, "markdown", str(output_dir), str(manifest_path)),
        )
    return ExportResult(export_id, output_dir, transcript_path, analysis_path, manifest_path)
81
+
82
+
83
def _markdown_document(meeting: dict, transcript: str) -> str:
    """Render the meeting header and speaker turns as one Markdown document.

    The header lists title, date, time, placeholder location/position and the
    participants in order of first appearance; each turn follows as a bold
    speaker label with its text. The result ends with exactly one newline.
    """
    turns = _readable_turns(transcript)
    people = _participants(turns)
    date, time = _date_time(_meeting_timestamp(meeting))
    people_line = ", ".join(people) if people else "Unknown"

    lines = [
        f"# {meeting['title']}",
        "",
        f"Date: {date}",
        f"Time: {time}",
        "Location: Unknown",
        "Position: Unknown",
        f"People: {people_line}",
        "",
        "## Manuscript",
        "",
    ]
    # Each turn is followed by a blank line so turns render as separate paragraphs.
    for speaker, text in turns:
        lines.extend((f"**{speaker}:** {text}", ""))
    return "\n".join(lines).rstrip() + "\n"
103
+
104
+
105
def _readable_turns(transcript: str) -> list[tuple[str, str]]:
    """Split a transcript into ``(speaker, text)`` turns with stable unknown-speaker labels.

    Turns are separated by blank lines. Speakers parsed as "Unknown" are
    numbered "Unknown speaker N" in order of first appearance, keyed by their
    source label when one is present so the same source keeps the same number.
    """
    numbered: dict[str, str] = {}
    result: list[tuple[str, str]] = []
    for raw_block in transcript.split("\n\n"):
        block = raw_block.strip()
        if not block:
            continue
        speaker, text, source_label = _parse_turn(block)
        if speaker == "Unknown":
            key = source_label or speaker
            # setdefault keeps the first number assigned to this key.
            speaker = numbered.setdefault(key, f"Unknown speaker {len(numbered) + 1}")
        result.append((speaker, text))
    return result
117
+
118
+
119
+ def _parse_turn(block: str) -> tuple[str, str, str | None]:
120
+ speaker, separator, text = block.partition(":")
121
+ if not separator:
122
+ return "Unknown", block, None
123
+
124
+ match = re.match(
125
+ r"^(?P<name>.*?)(?:\s+\[[^\]]+\])?(?:\s+\((?P<source>[^)]+)\))?$",
126
+ speaker.strip(),
127
+ )
128
+ if match is None:
129
+ return speaker.strip() or "Unknown", text.strip(), None
130
+ return match.group("name").strip() or "Unknown", text.strip(), match.group("source")
131
+
132
+
133
+ def _participants(turns: list[tuple[str, str]]) -> list[str]:
134
+ participants: list[str] = []
135
+ for speaker, _ in turns:
136
+ if speaker not in participants:
137
+ participants.append(speaker)
138
+ return participants
139
+
140
+
141
+ def _date_time(created_at: str | None) -> tuple[str, str]:
142
+ if not created_at:
143
+ return "Unknown", "Unknown"
144
+ date, _, time = created_at.partition(" ")
145
+ return date or "Unknown", time or "Unknown"
146
+
147
+
148
+ def _meeting_timestamp(meeting: dict) -> str | None:
149
+ if meeting.get("recorded_at_confidence") in {"high", "medium"}:
150
+ return meeting.get("recorded_at")
151
+ return meeting.get("created_at")
152
+
153
+
154
+ def _sha256(value: str) -> str:
155
+ return hashlib.sha256(value.encode()).hexdigest()
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import yaml
7
+
8
+
9
def load_glossary_terms(path: Path | None) -> list[str]:
    """Load the glossary YAML file and return its unique string terms, sorted.

    Args:
        path: Location of the glossary file. None, a missing path or a
            directory all yield an empty list.

    Returns:
        Every string found anywhere in the YAML document, de-duplicated and
        sorted.
    """
    # A directory passes exists() but cannot be read as text, so treat it like
    # a missing glossary instead of crashing.
    if path is None or not path.exists() or path.is_dir():
        return []
    # YAML files are UTF-8 by convention; do not depend on the locale encoding.
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    return sorted(set(_collect_terms(data)))
14
+
15
+
16
+ def _collect_terms(value: Any) -> list[str]:
17
+ if value is None:
18
+ return []
19
+ if isinstance(value, str):
20
+ return [value]
21
+ if isinstance(value, list):
22
+ terms: list[str] = []
23
+ for item in value:
24
+ terms.extend(_collect_terms(item))
25
+ return terms
26
+ if isinstance(value, dict):
27
+ terms = []
28
+ for item in value.values():
29
+ terms.extend(_collect_terms(item))
30
+ return terms
31
+ return []