fow-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. fly_on_the_wall/__init__.py +3 -0
  2. fly_on_the_wall/audio.py +164 -0
  3. fly_on_the_wall/audio_metadata.py +241 -0
  4. fly_on_the_wall/cache.py +26 -0
  5. fly_on_the_wall/cleanup.py +29 -0
  6. fly_on_the_wall/cli.py +641 -0
  7. fly_on_the_wall/cli_costs.py +81 -0
  8. fly_on_the_wall/cli_menu.py +163 -0
  9. fly_on_the_wall/cli_publish.py +141 -0
  10. fly_on_the_wall/cli_speaker_review.py +315 -0
  11. fly_on_the_wall/cli_watch.py +209 -0
  12. fly_on_the_wall/config.py +92 -0
  13. fly_on_the_wall/costs.py +169 -0
  14. fly_on_the_wall/db.py +508 -0
  15. fly_on_the_wall/doctor.py +142 -0
  16. fly_on_the_wall/embeddings.py +142 -0
  17. fly_on_the_wall/exporting.py +155 -0
  18. fly_on_the_wall/glossary.py +31 -0
  19. fly_on_the_wall/meetings.py +382 -0
  20. fly_on_the_wall/normalization.py +166 -0
  21. fly_on_the_wall/people.py +82 -0
  22. fly_on_the_wall/people_embeddings.py +68 -0
  23. fly_on_the_wall/pipeline.py +120 -0
  24. fly_on_the_wall/processing.py +427 -0
  25. fly_on_the_wall/providers/__init__.py +1 -0
  26. fly_on_the_wall/providers/elevenlabs.py +145 -0
  27. fly_on_the_wall/providers/openai_analysis.py +195 -0
  28. fly_on_the_wall/providers/openai_cleanup.py +91 -0
  29. fly_on_the_wall/publishing.py +410 -0
  30. fly_on_the_wall/reanalysis.py +172 -0
  31. fly_on_the_wall/recording_quality.py +141 -0
  32. fly_on_the_wall/rendering.py +115 -0
  33. fly_on_the_wall/secrets.py +93 -0
  34. fly_on_the_wall/service_pricing.py +75 -0
  35. fly_on_the_wall/setup.py +221 -0
  36. fly_on_the_wall/speaker_identity.py +173 -0
  37. fly_on_the_wall/speaker_matching.py +134 -0
  38. fly_on_the_wall/speakers.py +221 -0
  39. fly_on_the_wall/storage.py +53 -0
  40. fly_on_the_wall/voice_samples.py +125 -0
  41. fly_on_the_wall/watch.py +347 -0
  42. fow_cli-0.1.0.dist-info/METADATA +447 -0
  43. fow_cli-0.1.0.dist-info/RECORD +46 -0
  44. fow_cli-0.1.0.dist-info/WHEEL +4 -0
  45. fow_cli-0.1.0.dist-info/entry_points.txt +2 -0
  46. fow_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,382 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import re
5
+ import shutil
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from sqlite3 import Connection
9
+ from uuid import uuid4
10
+
11
+ from fly_on_the_wall.audio_metadata import extract_and_store_audio_metadata
12
+ from fly_on_the_wall.config import AppConfig
13
+ from fly_on_the_wall.storage import StoragePaths, ensure_storage_layout
14
+
15
+
16
@dataclass(frozen=True)
class Meeting:
    """Immutable, in-memory view of one row of the ``meetings`` table.

    Only the columns the import flow needs are carried; the two trailing
    fields are optional so that older rows without them still load.
    """

    # Primary key; ``import_meeting`` generates it as a UUID4 string.
    id: str
    # URL/filesystem-safe name, unique across meetings (see ``unique_slug``).
    slug: str
    # Title currently shown for the meeting.
    title: str
    # Where ``title`` came from: "manual", "filename" or "generated".
    title_source: str
    # Language value copied from the application config at import time.
    language: str
    # Location of the copy of the audio inside the managed storage tree.
    imported_audio_path: Path
    # Hex SHA-256 of the audio file, used to detect repeated imports.
    audio_sha256: str | None = None
    # Last generated title, kept even when a manual title is in effect.
    generated_title: str | None = None
26
+
27
+
28
@dataclass(frozen=True)
class DeleteMeetingResult:
    """Immutable summary returned by ``delete_meeting``."""

    # Primary key of the meeting that was deleted.
    id: str
    # Slug of the meeting that was deleted.
    slug: str
    # Candidate paths that do not exist once deletion has finished.
    removed_paths: tuple[Path, ...]
33
+
34
+
35
def import_meeting(
    connection: Connection,
    audio_path: Path,
    title: str | None,
    config: AppConfig,
    storage: StoragePaths | None = None,
    description: str | None = None,
) -> Meeting:
    """Import an audio file as a new meeting, or return the one already stored.

    The file is identified by its SHA-256 digest: when a meeting with the same
    digest exists, that meeting is returned and nothing is copied or inserted.
    Otherwise the audio is copied to ``<storage.audio>/<slug>/<file name>`` and
    a row is inserted into ``meetings``.

    Args:
        connection: Open SQLite connection holding the ``meetings`` table.
        audio_path: Audio file to import.
        title: Explicit title; when falsy the file stem is used and the title
            source is recorded as ``"filename"`` instead of ``"manual"``.
        config: Application config; only ``config.language`` is read here.
        storage: Storage layout; resolved with ``ensure_storage_layout()``
            when omitted.
        description: Optional free-text description stored with the meeting.

    Returns:
        The newly created meeting, or the existing one for duplicate audio.

    Raises:
        FileNotFoundError: If ``audio_path`` is not an existing regular file.
    """
    if not audio_path.is_file():
        raise FileNotFoundError(f"Audio file does not exist: {audio_path}")

    audio_sha256 = file_sha256(audio_path)
    existing = get_meeting_by_audio_sha256(connection, audio_sha256)
    if existing is not None:
        return _meeting_from_row(existing)

    paths = storage or ensure_storage_layout()
    meeting_id = str(uuid4())
    provisional_title = title or audio_path.stem
    title_source = "manual" if title else "filename"
    slug = unique_slug(connection, slugify(provisional_title))
    imported_audio_path = paths.audio / slug / audio_path.name
    imported_audio_path.parent.mkdir(parents=True, exist_ok=True)
    # The copy stays outside the ``try`` below on purpose: if it fails (for
    # example because source and destination are the same file) the cleanup
    # must not touch the destination.
    shutil.copy2(audio_path, imported_audio_path)

    try:
        with connection:
            connection.execute(
                """
                INSERT INTO meetings(
                    id,
                    slug,
                    title,
                    title_source,
                    description,
                    language,
                    original_audio_path,
                    imported_audio_path,
                    audio_sha256
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    meeting_id,
                    slug,
                    provisional_title,
                    title_source,
                    description,
                    config.language,
                    str(audio_path),
                    str(imported_audio_path),
                    audio_sha256,
                ),
            )
    except Exception:
        # The row was rolled back, so the freshly copied audio would be an
        # orphan that no meeting references; remove it before re-raising.
        try:
            imported_audio_path.unlink(missing_ok=True)
            # Only succeeds when the slug directory is now empty.
            imported_audio_path.parent.rmdir()
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise

    # NOTE(review): the reviewed copy of this file had lost its indentation;
    # metadata extraction is assumed to run after the insert transaction has
    # been committed — confirm against the upstream source.
    extract_and_store_audio_metadata(connection, meeting_id, imported_audio_path, paths)

    return Meeting(
        id=meeting_id,
        slug=slug,
        title=provisional_title,
        title_source=title_source,
        language=config.language,
        imported_audio_path=imported_audio_path,
        audio_sha256=audio_sha256,
    )
98
+
99
+
100
def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is consumed in 1 MiB blocks so that large recordings are never
    loaded into memory in one piece.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(block_size):
            hasher.update(block)
    return hasher.hexdigest()
106
+
107
+
108
def slugify(value: str) -> str:
    """Turn an arbitrary title into a lowercase ASCII slug.

    Accented Latin letters are folded to their base letter first (``é`` ->
    ``e``) so they survive instead of being turned into separators. Every
    remaining run of characters outside ``a-z0-9`` collapses into a single
    ``-``, and leading/trailing dashes are trimmed.

    Returns:
        The slug, or ``"meeting"`` when nothing usable is left (empty input,
        punctuation only, or a script with no ASCII decomposition).
    """
    import unicodedata

    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks keeps the base letter. Plain ASCII input passes through unchanged.
    decomposed = unicodedata.normalize("NFKD", value)
    folded = "".join(char for char in decomposed if not unicodedata.combining(char))
    slug = re.sub(r"[^a-z0-9]+", "-", folded.lower()).strip("-")
    return slug or "meeting"
111
+
112
+
113
def unique_slug(connection: Connection, base_slug: str) -> str:
    """Return *base_slug*, or the first ``<base_slug>-N`` not yet in use.

    ``N`` starts at 2 and grows by one until no meeting has the candidate slug.
    """
    candidate = base_slug
    counter = 1
    while _slug_exists(connection, candidate):
        counter += 1
        candidate = f"{base_slug}-{counter}"
    return candidate
120
+
121
+
122
+ def _slug_exists(connection: Connection, slug: str) -> bool:
123
+ row = connection.execute("SELECT 1 FROM meetings WHERE slug = ?", (slug,)).fetchone()
124
+ return row is not None
125
+
126
+
127
def list_meetings(connection: Connection) -> list[dict]:
    """Return summary columns of every meeting, newest ``created_at`` first.

    Each row is converted with ``dict(row)``, so the connection must yield
    mapping-style rows (presumably ``sqlite3.Row`` — confirm where the
    connection is created).
    """
    query = """
        SELECT id, slug, title, title_source, generated_title, language, created_at
        FROM meetings
        ORDER BY created_at DESC
    """
    records = connection.execute(query).fetchall()
    return list(map(dict, records))
138
+
139
+
140
def get_meeting(connection: Connection, meeting_id_or_slug: str) -> dict | None:
    """Look up one meeting by primary key or by slug.

    Returns:
        The full row as a dict, or ``None`` when neither column matches.
    """
    query = """
        SELECT * FROM meetings
        WHERE id = ? OR slug = ?
    """
    # The same value is bound twice: once for the id test, once for the slug.
    match = connection.execute(query, (meeting_id_or_slug, meeting_id_or_slug)).fetchone()
    if match is None:
        return None
    return dict(match)
149
+
150
+
151
def get_meeting_by_audio_sha256(connection: Connection, audio_sha256: str) -> dict | None:
    """Return the meeting whose stored audio digest equals *audio_sha256*.

    Returns:
        The full row as a dict, or ``None`` when no meeting has that digest.
    """
    cursor = connection.execute("SELECT * FROM meetings WHERE audio_sha256 = ?", (audio_sha256,))
    match = cursor.fetchone()
    return dict(match) if match is not None else None
154
+
155
+
156
def latest_completed_provider_run(connection: Connection, meeting_id: str, provider: str = "elevenlabs") -> dict | None:
    """Return the most recent finished run of *provider* for a meeting.

    Only runs whose status is ``'done'`` are considered; they are ranked by
    ``completed_at`` and then ``created_at``, newest first.

    Returns:
        The full ``provider_runs`` row as a dict, or ``None`` if there is none.
    """
    query = """
        SELECT * FROM provider_runs
        WHERE meeting_id = ? AND provider = ? AND status = 'done'
        ORDER BY completed_at DESC, created_at DESC
        LIMIT 1
    """
    newest = connection.execute(query, (meeting_id, provider)).fetchone()
    if newest is None:
        return None
    return dict(newest)
167
+
168
+
169
def update_generated_title(connection: Connection, meeting_id: str, generated_title: str) -> None:
    """Record a generated title without overriding a manual one.

    The title is stripped; a blank result makes the call a no-op. ``generated_title``
    is always stored. The visible ``title`` (and ``title_source = 'generated'``)
    is replaced only when the current title source is not ``"manual"``.

    Raises:
        ValueError: If no meeting has the id *meeting_id*.
    """
    cleaned = generated_title.strip()
    if cleaned == "":
        return

    with connection:
        current = connection.execute(
            "SELECT title_source FROM meetings WHERE id = ?", (meeting_id,)
        ).fetchone()
        if current is None:
            raise ValueError(f"Meeting not found: {meeting_id}")

        if current["title_source"] != "manual":
            # No manual title to protect: the generated one becomes the title.
            statement = """
                UPDATE meetings
                SET title = ?, title_source = 'generated', generated_title = ?,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
            """
            arguments = (cleaned, cleaned, meeting_id)
        else:
            # Keep the manual title; only remember the generated suggestion.
            statement = """
                UPDATE meetings
                SET generated_title = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
            """
            arguments = (cleaned, meeting_id)
        connection.execute(statement, arguments)
198
+
199
+
200
def rename_meeting(connection: Connection, meeting_id_or_slug: str, title: str) -> dict:
    """Give a meeting a manual title and return the refreshed row.

    The title is stripped before it is stored, and ``title_source`` becomes
    ``'manual'``.

    Raises:
        ValueError: If the meeting cannot be found (before or after the
            update) or if the stripped title is empty.
    """
    current = get_meeting(connection, meeting_id_or_slug)
    if current is None:
        raise ValueError(f"Meeting not found: {meeting_id_or_slug}")

    cleaned = title.strip()
    if not cleaned:
        raise ValueError("Meeting title cannot be empty.")

    target_id = current["id"]
    statement = """
        UPDATE meetings
        SET title = ?, title_source = 'manual', updated_at = CURRENT_TIMESTAMP
        WHERE id = ?
    """
    with connection:
        connection.execute(statement, (cleaned, target_id))

    # Re-read so the caller sees the stored title and the new timestamp.
    refreshed = get_meeting(connection, target_id)
    if refreshed is None:
        raise ValueError(f"Meeting not found: {meeting_id_or_slug}")
    return refreshed
222
+
223
+
224
def _meeting_from_row(row: dict) -> Meeting:
    """Build a ``Meeting`` from a ``meetings`` row given as a dict.

    ``id``, ``slug``, ``title``, ``language`` and ``imported_audio_path`` are
    required keys. ``title_source`` falls back to ``"manual"`` and the two
    optional fields fall back to ``None`` when the key is absent.
    """
    required = {column: row[column] for column in ("id", "slug", "title", "language")}
    return Meeting(
        **required,
        title_source=row.get("title_source", "manual"),
        imported_audio_path=Path(row["imported_audio_path"]),
        audio_sha256=row.get("audio_sha256"),
        generated_title=row.get("generated_title"),
    )
235
+
236
+
237
def meeting_stage_status(connection: Connection, meeting_id_or_slug: str) -> list[dict]:
    """Return the pipeline stage rows of a meeting, ordered by stage name.

    Returns:
        One dict per stage with ``stage_name``, ``status``, ``error_message``
        and ``updated_at``; an empty list when the meeting does not exist.
    """
    found = get_meeting(connection, meeting_id_or_slug)
    if found is None:
        return []

    query = """
        SELECT stage_name, status, error_message, updated_at
        FROM pipeline_stages
        WHERE meeting_id = ?
        ORDER BY stage_name
    """
    stage_rows = connection.execute(query, (found["id"],)).fetchall()
    return list(map(dict, stage_rows))
253
+
254
+
255
def delete_meeting(
    connection: Connection,
    meeting_id_or_slug: str,
    storage: StoragePaths | None = None,
    delete_published: bool = False,
) -> DeleteMeetingResult:
    """Delete a meeting's database rows and the files that belong to it.

    Everything that has to be looked up (paths, speaker ids, voice sample
    ids) is collected before any row is deleted. Corrections, voice samples
    and the meeting row are then removed in one transaction, and only after
    that are the files and directories removed from disk.

    Args:
        connection: Open SQLite connection.
        meeting_id_or_slug: Primary key or slug of the meeting.
        storage: Storage layout; resolved with ``ensure_storage_layout()``
            when omitted.
        delete_published: Also remove the ``output_path`` of every published
            item of the meeting.

    Returns:
        The meeting's id and slug plus every candidate path that does not
        exist once removal has finished.

    Raises:
        ValueError: If the meeting does not exist.
    """
    record = get_meeting(connection, meeting_id_or_slug)
    if record is None:
        raise ValueError(f"Meeting not found: {meeting_id_or_slug}")

    meeting_id = record["id"]
    layout = storage or ensure_storage_layout()
    targets = _meeting_owned_paths(connection, record, layout)
    if delete_published:
        targets += _published_paths(connection, meeting_id)
    speaker_ids = _local_speaker_ids(connection, meeting_id)
    sample_ids = _meeting_voice_sample_ids(connection, meeting_id)

    with connection:
        _delete_corrections(connection, meeting_id, speaker_ids)
        _delete_voice_samples(connection, sample_ids)
        connection.execute("DELETE FROM meetings WHERE id = ?", (meeting_id,))

    for target in targets:
        _remove_path(target)

    gone = tuple(target for target in targets if not target.exists())
    return DeleteMeetingResult(id=meeting_id, slug=record["slug"], removed_paths=gone)
285
+
286
+
287
def _meeting_owned_paths(connection: Connection, meeting: dict, storage: StoragePaths) -> list[Path]:
    """Collect every file or directory that belongs to *meeting*.

    Sources are the meeting's folders under the audio, artifacts and exports
    roots, the imported audio file, the output directory and manifest of each
    export row, and the audio/embedding file of each voice sample taken from
    this meeting. The list is passed through ``_deduplicate_paths``.
    """
    slug = meeting["slug"]
    meeting_id = meeting["id"]
    candidates = [
        storage.audio / slug,
        storage.artifacts / meeting_id,
        storage.exports / slug,
    ]

    imported_audio = meeting.get("imported_audio_path")
    if imported_audio:
        candidates.append(Path(imported_audio))

    export_rows = connection.execute(
        "SELECT output_dir, manifest_path FROM exports WHERE meeting_id = ?", (meeting_id,)
    ).fetchall()
    for export in export_rows:
        candidates.extend((Path(export["output_dir"]), Path(export["manifest_path"])))

    sample_rows = connection.execute(
        """
        SELECT audio_path, embedding_path
        FROM voice_samples
        WHERE source_meeting_id = ?
        """,
        (meeting_id,),
    ).fetchall()
    for sample in sample_rows:
        candidates.append(Path(sample["audio_path"]))
        embedding = sample["embedding_path"]
        if embedding:  # a sample may not have an embedding yet
            candidates.append(Path(embedding))

    return _deduplicate_paths(candidates)
317
+
318
+
319
+ def _published_paths(connection: Connection, meeting_id: str) -> list[Path]:
320
+ return [
321
+ Path(row["output_path"])
322
+ for row in connection.execute(
323
+ "SELECT output_path FROM published_items WHERE meeting_id = ?",
324
+ (meeting_id,),
325
+ ).fetchall()
326
+ ]
327
+
328
+
329
+ def _local_speaker_ids(connection: Connection, meeting_id: str) -> list[str]:
330
+ return [
331
+ row["id"]
332
+ for row in connection.execute("SELECT id FROM local_speakers WHERE meeting_id = ?", (meeting_id,)).fetchall()
333
+ ]
334
+
335
+
336
+ def _meeting_voice_sample_ids(connection: Connection, meeting_id: str) -> list[str]:
337
+ return [
338
+ row["id"]
339
+ for row in connection.execute(
340
+ "SELECT id FROM voice_samples WHERE source_meeting_id = ?", (meeting_id,)
341
+ ).fetchall()
342
+ ]
343
+
344
+
345
+ def _delete_corrections(connection: Connection, meeting_id: str, local_speaker_ids: list[str]) -> None:
346
+ connection.execute("DELETE FROM corrections WHERE meeting_id = ?", (meeting_id,))
347
+ if local_speaker_ids:
348
+ placeholders = ", ".join("?" for _ in local_speaker_ids)
349
+ connection.execute(
350
+ f"DELETE FROM corrections WHERE local_speaker_id IN ({placeholders})",
351
+ local_speaker_ids,
352
+ )
353
+
354
+
355
+ def _delete_voice_samples(connection: Connection, voice_sample_ids: list[str]) -> None:
356
+ if not voice_sample_ids:
357
+ return
358
+ placeholders = ", ".join("?" for _ in voice_sample_ids)
359
+ connection.execute(
360
+ f"DELETE FROM voice_samples WHERE id IN ({placeholders})",
361
+ voice_sample_ids,
362
+ )
363
+
364
+
365
+ def _remove_path(path: Path) -> None:
366
+ if not path.exists():
367
+ return
368
+ if path.is_dir():
369
+ shutil.rmtree(path)
370
+ else:
371
+ path.unlink()
372
+
373
+
374
+ def _deduplicate_paths(paths: list[Path]) -> list[Path]:
375
+ seen: set[Path] = set()
376
+ deduplicated: list[Path] = []
377
+ for path in sorted(paths, key=lambda item: len(item.parts), reverse=True):
378
+ resolved = path.expanduser()
379
+ if resolved not in seen:
380
+ seen.add(resolved)
381
+ deduplicated.append(resolved)
382
+ return deduplicated
@@ -0,0 +1,166 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from sqlite3 import Connection
7
+ from typing import Any
8
+ from uuid import uuid4
9
+
10
+
11
@dataclass(frozen=True)
class NormalizedSegment:
    """One uninterrupted run of speech by a single speaker.

    Produced from a provider transcript by ``normalize_elevenlabs_response``.
    """

    # Fresh UUID4 string assigned when the segment is built.
    id: str
    # Zero-based position of the segment within the normalized transcript.
    sequence: int
    # Provider speaker id, or "Unknown" when the provider gave none.
    speaker_label: str
    # Concatenated, stripped text of the words in the run.
    text: str
    # First numeric "start" among the words (presumably seconds — confirm).
    start_time: float | None
    # Last numeric "end" among the words (presumably seconds — confirm).
    end_time: float | None
    # The transcript's "language_code", when present.
    language: str | None
20
+
21
+
22
def normalize_provider_run(connection: Connection, provider_run_id: str) -> list[NormalizedSegment]:
    """Rebuild the ``segments`` rows of a provider run from its raw response.

    The raw JSON response referenced by the run is parsed and normalized into
    speaker segments. Existing segments of the run are deleted and the new
    ones inserted in a single transaction, so repeating the call replaces the
    previous result instead of duplicating it.

    Args:
        connection: Open SQLite connection.
        provider_run_id: Primary key of the ``provider_runs`` row.

    Returns:
        The normalized segments, in transcript order.

    Raises:
        ValueError: If the provider run does not exist.
    """
    provider_run = connection.execute(
        "SELECT * FROM provider_runs WHERE id = ?", (provider_run_id,)
    ).fetchone()
    if provider_run is None:
        raise ValueError(f"Provider run does not exist: {provider_run_id}")

    meeting_id = provider_run["meeting_id"]
    raw_response_path = Path(provider_run["raw_response_path"])
    # JSON is UTF-8 by specification; relying on the platform default
    # encoding would garble non-ASCII transcripts on non-UTF-8 locales.
    response = json.loads(raw_response_path.read_text(encoding="utf-8"))
    segments = normalize_elevenlabs_response(response)

    with connection:
        connection.execute("DELETE FROM segments WHERE provider_run_id = ?", (provider_run_id,))
        for segment in segments:
            local_speaker_id = _ensure_local_speaker(
                connection,
                meeting_id,
                provider_run_id,
                segment.speaker_label,
            )
            connection.execute(
                """
                INSERT INTO segments(
                    id,
                    meeting_id,
                    provider_run_id,
                    local_speaker_id,
                    sequence,
                    start_time,
                    end_time,
                    text,
                    language,
                    source_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    segment.id,
                    meeting_id,
                    provider_run_id,
                    local_speaker_id,
                    segment.sequence,
                    segment.start_time,
                    segment.end_time,
                    segment.text,
                    segment.language,
                    # Keep the provider's own label next to the resolved speaker.
                    json.dumps({"speaker_label": segment.speaker_label}, ensure_ascii=False),
                ),
            )

    return segments
70
+
71
+
72
def normalize_elevenlabs_response(response: dict[str, Any]) -> list[NormalizedSegment]:
    """Convert a provider response into a flat list of speaker segments.

    Every transcript in the response is split into runs of consecutive words
    by the same speaker, and each run with text becomes one segment. Sequence
    numbers are consecutive from 0 across all transcripts; runs that yield no
    segment do not consume a number.
    """
    segments: list[NormalizedSegment] = []
    for transcript in _iter_transcripts(response):
        language = transcript.get("language_code")
        for label, run in _speaker_word_groups(transcript.get("words", [])):
            # The next sequence number always equals the count emitted so far.
            built = _build_segment(len(segments), label, run, language)
            if built is not None:
                segments.append(built)
    return segments
83
+
84
+
85
+ def _speaker_word_groups(words: list[dict[str, Any]]) -> list[tuple[str, list[dict[str, Any]]]]:
86
+ groups: list[tuple[str, list[dict[str, Any]]]] = []
87
+ current_speaker: str | None = None
88
+ current_words: list[dict[str, Any]] = []
89
+
90
+ for word in words:
91
+ if word.get("type") == "audio_event":
92
+ continue
93
+ speaker = word.get("speaker_id") or "Unknown"
94
+ if current_speaker is not None and speaker != current_speaker:
95
+ groups.append((current_speaker, current_words))
96
+ current_words = []
97
+ current_speaker = speaker
98
+ current_words.append(word)
99
+
100
+ if current_speaker is not None:
101
+ groups.append((current_speaker, current_words))
102
+ return groups
103
+
104
+
105
def _build_segment(
    sequence: int,
    speaker_label: str | None,
    words: list[dict[str, Any]],
    language: str | None,
) -> NormalizedSegment | None:
    """Assemble one segment from a run of words, or ``None`` if it is empty.

    The text is the plain concatenation of every word's ``text`` (no
    separator is inserted), stripped of surrounding whitespace. The segment
    starts at the first numeric ``start`` and ends at the last numeric ``end``.

    Returns:
        The segment, or ``None`` when there are no words or no text remains.
    """
    text = "".join(str(word.get("text", "")) for word in words).strip() if words else ""
    if not text:
        return None

    begins = _first_number(words, "start")
    finishes = _last_number(words, "end")
    return NormalizedSegment(
        id=str(uuid4()),
        sequence=sequence,
        speaker_label=speaker_label or "Unknown",
        text=text,
        start_time=begins,
        end_time=finishes,
        language=language,
    )
125
+
126
+
127
+ def _iter_transcripts(response: dict[str, Any]) -> list[dict[str, Any]]:
128
+ transcripts = response.get("transcripts")
129
+ if isinstance(transcripts, list):
130
+ return [transcript for transcript in transcripts if isinstance(transcript, dict)]
131
+ return [response]
132
+
133
+
134
+ def _ensure_local_speaker(connection: Connection, meeting_id: str, provider_run_id: str, label: str) -> str:
135
+ row = connection.execute(
136
+ "SELECT id FROM local_speakers WHERE provider_run_id = ? AND label = ?",
137
+ (provider_run_id, label),
138
+ ).fetchone()
139
+ if row is not None:
140
+ return row["id"]
141
+
142
+ local_speaker_id = str(uuid4())
143
+ connection.execute(
144
+ """
145
+ INSERT INTO local_speakers(id, meeting_id, provider_run_id, label)
146
+ VALUES (?, ?, ?, ?)
147
+ """,
148
+ (local_speaker_id, meeting_id, provider_run_id, label),
149
+ )
150
+ return local_speaker_id
151
+
152
+
153
+ def _first_number(words: list[dict[str, Any]], key: str) -> float | None:
154
+ for word in words:
155
+ value = word.get(key)
156
+ if isinstance(value, int | float):
157
+ return float(value)
158
+ return None
159
+
160
+
161
+ def _last_number(words: list[dict[str, Any]], key: str) -> float | None:
162
+ for word in reversed(words):
163
+ value = word.get(key)
164
+ if isinstance(value, int | float):
165
+ return float(value)
166
+ return None
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from sqlite3 import Connection
5
+ from uuid import uuid4
6
+
7
+
8
@dataclass(frozen=True)
class Person:
    """Immutable view of one row of the ``people`` table."""

    # Primary key; ``create_person`` generates it as a UUID4 string.
    id: str
    # Name shown for the person; stored stripped of surrounding whitespace.
    display_name: str
    # True for the person flagged through ``set_user_person``.
    is_user: bool = False
13
+
14
+
15
def create_person(connection: Connection, display_name: str) -> Person:
    """Insert a new person and return it.

    The name is stripped first. The new person gets a fresh UUID4 id and is
    never flagged as the user.

    Raises:
        ValueError: If the stripped name is empty.
    """
    cleaned = display_name.strip()
    if not cleaned:
        raise ValueError("Person display name cannot be empty.")

    created = Person(id=str(uuid4()), display_name=cleaned, is_user=False)
    with connection:
        connection.execute(
            "INSERT INTO people(id, display_name) VALUES (?, ?)",
            (created.id, created.display_name),
        )
    return created
26
+
27
+
28
def list_people(connection: Connection) -> list[Person]:
    """Return every person, ordered by display name."""
    cursor = connection.execute("SELECT id, display_name, is_user FROM people ORDER BY display_name")
    return list(map(_person_from_row, cursor.fetchall()))
31
+
32
+
33
def get_person(connection: Connection, person_id_or_name: str) -> Person | None:
    """Find a person by exact id or by display name.

    The name comparison goes through SQL ``lower()`` on both sides, so it
    ignores letter case.

    Returns:
        The matching person, or ``None`` when nothing matches.
    """
    query = """
        SELECT id, display_name, is_user FROM people
        WHERE id = ? OR lower(display_name) = lower(?)
    """
    match = connection.execute(query, (person_id_or_name, person_id_or_name)).fetchone()
    return _person_from_row(match) if match is not None else None
44
+
45
+
46
def get_user_person(connection: Connection) -> Person | None:
    """Return a person whose ``is_user`` flag is set, or ``None`` if none is."""
    flagged = connection.execute(
        "SELECT id, display_name, is_user FROM people WHERE is_user = 1 LIMIT 1"
    ).fetchone()
    if flagged is None:
        return None
    return _person_from_row(flagged)
49
+
50
+
51
def set_user_person(connection: Connection, person_id_or_name: str) -> Person:
    """Flag one person as the user, clearing the flag on everybody else.

    Both updates run in a single transaction.

    Returns:
        The chosen person with ``is_user=True``.

    Raises:
        ValueError: If no person matches *person_id_or_name*.
    """
    chosen = get_person(connection, person_id_or_name)
    if chosen is None:
        raise ValueError(f"Person not found: {person_id_or_name}")

    with connection:
        # Clear the flag on every row first so exactly one row carries it.
        connection.execute("UPDATE people SET is_user = 0")
        connection.execute(
            "UPDATE people SET is_user = 1, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
            (chosen.id,),
        )
    return Person(id=chosen.id, display_name=chosen.display_name, is_user=True)
63
+
64
+
65
def unset_user_person(connection: Connection) -> Person | None:
    """Clear the user flag from the person that currently carries it.

    Returns:
        That person with ``is_user=False``, or ``None`` when nobody was
        flagged (in which case nothing is written).
    """
    flagged = get_user_person(connection)
    if flagged is not None:
        with connection:
            connection.execute(
                "UPDATE people SET is_user = 0, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (flagged.id,),
            )
        return Person(id=flagged.id, display_name=flagged.display_name, is_user=False)
    return None
75
+
76
+
77
def _person_from_row(row) -> Person:
    """Build a ``Person`` from a row exposing ``id``, ``display_name`` and ``is_user``."""
    person_id = row["id"]
    name = row["display_name"]
    # The flag is queried with ``is_user = 1`` elsewhere, i.e. stored as a
    # number; coerce it to a real bool.
    flagged = bool(row["is_user"])
    return Person(id=person_id, display_name=name, is_user=flagged)
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from sqlite3 import Connection
5
+
6
+ from fly_on_the_wall.embeddings import (
7
+ EmbeddingBackend,
8
+ PyannoteEmbeddingBackend,
9
+ cache_voice_sample_embedding,
10
+ )
11
+ from fly_on_the_wall.storage import StoragePaths
12
+
13
+
14
@dataclass(frozen=True)
class PeopleEmbeddingStatus:
    """Counts describing how many voice samples have an embedding."""

    # Number of rows in ``people``.
    people: int
    # Number of rows in ``voice_samples``.
    voice_samples: int
    # Voice samples whose ``embedding_path`` is set.
    embedded_voice_samples: int
    # Voice samples without an embedding (total minus embedded).
    missing_voice_sample_embeddings: int
20
+
21
+
22
@dataclass(frozen=True)
class PeopleEmbeddingBackfillResult:
    """Outcome counters of one ``backfill_people_embeddings`` run."""

    # Voice samples for which an embedding was cached successfully.
    embedded: int
    # Voice samples whose embedding attempt raised a handled error.
    failed: int
26
+
27
+
28
def people_embedding_status(connection: Connection) -> PeopleEmbeddingStatus:
    """Count people, voice samples, and how many samples have an embedding."""
    people_count = connection.execute("SELECT COUNT(*) FROM people").fetchone()[0]
    sample_counts = connection.execute(
        """
        SELECT COUNT(*) AS total,
               SUM(CASE WHEN embedding_path IS NOT NULL THEN 1 ELSE 0 END) AS embedded
        FROM voice_samples
        """
    ).fetchone()

    # SUM() over zero rows is NULL, hence the "or 0" fallbacks.
    total_samples = int(sample_counts["total"] or 0)
    with_embedding = int(sample_counts["embedded"] or 0)
    without_embedding = total_samples - with_embedding
    return PeopleEmbeddingStatus(
        people=int(people_count),
        voice_samples=total_samples,
        embedded_voice_samples=with_embedding,
        missing_voice_sample_embeddings=without_embedding,
    )
45
+
46
+
47
def backfill_people_embeddings(
    connection: Connection,
    storage: StoragePaths | None = None,
    backend: EmbeddingBackend | None = None,
) -> PeopleEmbeddingBackfillResult:
    """Compute embeddings for every voice sample that has none yet.

    Samples are processed oldest first. A sample whose embedding attempt
    raises ``FileNotFoundError``, ``RuntimeError`` or ``ValueError`` is
    counted as failed and the run continues with the next one.

    Args:
        connection: Open SQLite connection.
        storage: Storage layout, passed through to the embedding cache.
        backend: Embedding backend; a ``PyannoteEmbeddingBackend`` is created
            when omitted.

    Returns:
        How many samples were embedded and how many failed.
    """
    engine = backend or PyannoteEmbeddingBackend()
    pending = connection.execute(
        """
        SELECT id FROM voice_samples
        WHERE embedding_path IS NULL
        ORDER BY created_at
        """
    ).fetchall()

    succeeded = 0
    errors = 0
    for sample in pending:
        try:
            cache_voice_sample_embedding(connection, sample["id"], engine, storage)
        except (FileNotFoundError, RuntimeError, ValueError):
            errors += 1
        else:
            succeeded += 1
    return PeopleEmbeddingBackfillResult(embedded=succeeded, failed=errors)