fow-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fly_on_the_wall/__init__.py +3 -0
- fly_on_the_wall/audio.py +164 -0
- fly_on_the_wall/audio_metadata.py +241 -0
- fly_on_the_wall/cache.py +26 -0
- fly_on_the_wall/cleanup.py +29 -0
- fly_on_the_wall/cli.py +641 -0
- fly_on_the_wall/cli_costs.py +81 -0
- fly_on_the_wall/cli_menu.py +163 -0
- fly_on_the_wall/cli_publish.py +141 -0
- fly_on_the_wall/cli_speaker_review.py +315 -0
- fly_on_the_wall/cli_watch.py +209 -0
- fly_on_the_wall/config.py +92 -0
- fly_on_the_wall/costs.py +169 -0
- fly_on_the_wall/db.py +508 -0
- fly_on_the_wall/doctor.py +142 -0
- fly_on_the_wall/embeddings.py +142 -0
- fly_on_the_wall/exporting.py +155 -0
- fly_on_the_wall/glossary.py +31 -0
- fly_on_the_wall/meetings.py +382 -0
- fly_on_the_wall/normalization.py +166 -0
- fly_on_the_wall/people.py +82 -0
- fly_on_the_wall/people_embeddings.py +68 -0
- fly_on_the_wall/pipeline.py +120 -0
- fly_on_the_wall/processing.py +427 -0
- fly_on_the_wall/providers/__init__.py +1 -0
- fly_on_the_wall/providers/elevenlabs.py +145 -0
- fly_on_the_wall/providers/openai_analysis.py +195 -0
- fly_on_the_wall/providers/openai_cleanup.py +91 -0
- fly_on_the_wall/publishing.py +410 -0
- fly_on_the_wall/reanalysis.py +172 -0
- fly_on_the_wall/recording_quality.py +141 -0
- fly_on_the_wall/rendering.py +115 -0
- fly_on_the_wall/secrets.py +93 -0
- fly_on_the_wall/service_pricing.py +75 -0
- fly_on_the_wall/setup.py +221 -0
- fly_on_the_wall/speaker_identity.py +173 -0
- fly_on_the_wall/speaker_matching.py +134 -0
- fly_on_the_wall/speakers.py +221 -0
- fly_on_the_wall/storage.py +53 -0
- fly_on_the_wall/voice_samples.py +125 -0
- fly_on_the_wall/watch.py +347 -0
- fow_cli-0.1.0.dist-info/METADATA +447 -0
- fow_cli-0.1.0.dist-info/RECORD +46 -0
- fow_cli-0.1.0.dist-info/WHEEL +4 -0
- fow_cli-0.1.0.dist-info/entry_points.txt +2 -0
- fow_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Sequence
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from sqlite3 import Connection
|
|
6
|
+
|
|
7
|
+
# Stage statuses are plain strings; this alias only documents intent.
StageStatus = str

# Status values written to the ``status`` column of ``pipeline_stages``.
PENDING: StageStatus = "pending"
RUNNING: StageStatus = "running"
DONE: StageStatus = "done"
FAILED: StageStatus = "failed"
STALE: StageStatus = "stale"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PipelineError(RuntimeError):
    """Signal that a stage cannot run (unknown or unfinished dependency)."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class Stage:
    """Immutable description of a single pipeline step.

    ``run`` is called with the database connection and the meeting id.
    """

    name: str
    run: Callable[[Connection, str], None]
    # Names of the stages that must be recorded as done before this one runs.
    dependencies: tuple[str, ...] = ()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def run_pipeline(
    connection: Connection,
    meeting_id: str,
    stages: Sequence[Stage],
    force: bool = False,
) -> list[str]:
    """Run ``stages`` in order for one meeting.

    Before each stage its dependencies are checked, which raises
    ``PipelineError`` when one is unknown or not done. A stage whose stored
    status is already done is skipped unless ``force`` is true.

    Returns the names of the stages that were actually executed.
    """
    known_stages = {candidate.name: candidate for candidate in stages}
    executed: list[str] = []

    for current in stages:
        _ensure_dependencies_done(connection, meeting_id, current, known_stages)
        # With ``force`` the stored status is not consulted at all.
        if force or get_stage_status(connection, meeting_id, current.name) != DONE:
            run_stage(connection, meeting_id, current)
            executed.append(current.name)

    return executed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_stage(connection: Connection, meeting_id: str, stage: Stage) -> None:
    """Execute one stage and record its status around the call.

    The stage is marked running first, then done on success. If the stage
    raises, it is marked failed with the exception text and the exception is
    re-raised unchanged.
    """
    name = stage.name
    set_stage_status(connection, meeting_id, name, RUNNING)
    try:
        stage.run(connection, meeting_id)
    except Exception as error:
        set_stage_status(connection, meeting_id, name, FAILED, str(error))
        raise
    else:
        set_stage_status(connection, meeting_id, name, DONE)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_stage_status(connection: Connection, meeting_id: str, stage_name: str) -> StageStatus | None:
    """Return the stored status of a stage for a meeting, or ``None`` if no row exists."""
    query = "SELECT status FROM pipeline_stages WHERE meeting_id = ? AND stage_name = ?"
    record = connection.execute(query, (meeting_id, stage_name)).fetchone()
    if record is None:
        return None
    # The row is read by column name, so it must support name-based lookup.
    return record["status"]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def set_stage_status(
    connection: Connection,
    meeting_id: str,
    stage_name: str,
    status: StageStatus,
    error_message: str | None = None,
) -> None:
    """Insert or update the status row for one stage of one meeting.

    ``started_at`` is stamped when the status is ``'running'`` and otherwise
    kept on update. ``completed_at`` is stamped only for ``'done'`` and
    cleared for any other status. ``updated_at`` is always refreshed. The
    write runs inside ``with connection``.
    """
    upsert_sql = """
        INSERT INTO pipeline_stages(
            meeting_id,
            stage_name,
            status,
            error_message,
            started_at,
            completed_at,
            updated_at
        ) VALUES (
            ?,
            ?,
            ?,
            ?,
            CASE WHEN ? = 'running' THEN CURRENT_TIMESTAMP ELSE NULL END,
            CASE WHEN ? = 'done' THEN CURRENT_TIMESTAMP ELSE NULL END,
            CURRENT_TIMESTAMP
        )
        ON CONFLICT(meeting_id, stage_name) DO UPDATE SET
            status = excluded.status,
            error_message = excluded.error_message,
            started_at = CASE
                WHEN excluded.status = 'running' THEN CURRENT_TIMESTAMP
                ELSE pipeline_stages.started_at
            END,
            completed_at = CASE
                WHEN excluded.status = 'done' THEN CURRENT_TIMESTAMP
                ELSE NULL
            END,
            updated_at = CURRENT_TIMESTAMP
    """
    # The status is bound three times: the stored value plus the two CASE tests.
    bindings = (meeting_id, stage_name, status, error_message, status, status)
    with connection:
        connection.execute(upsert_sql, bindings)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def mark_stale(connection: Connection, meeting_id: str, stage_names: Sequence[str]) -> None:
    """Record the stale status for each named stage of the meeting, one write per stage."""
    for name in stage_names:
        set_stage_status(connection, meeting_id, name, STALE)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _ensure_dependencies_done(
    connection: Connection, meeting_id: str, stage: Stage, stage_by_name: dict[str, Stage]
) -> None:
    """Raise ``PipelineError`` unless every dependency of ``stage`` is known and done."""
    for required in stage.dependencies:
        if required not in stage_by_name:
            raise PipelineError(f"Unknown dependency {required!r} for stage {stage.name!r}.")
        dependency_status = get_stage_status(connection, meeting_id, required)
        if dependency_status != DONE:
            raise PipelineError(f"Stage {stage.name!r} requires {required!r} to be done.")
|
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from sqlite3 import Connection
|
|
8
|
+
|
|
9
|
+
from fly_on_the_wall.cache import read_cached_text, text_sha256, write_cached_text
|
|
10
|
+
from fly_on_the_wall.cleanup import deterministic_cleanup
|
|
11
|
+
from fly_on_the_wall.config import AppConfig
|
|
12
|
+
from fly_on_the_wall.costs import record_openai_usage
|
|
13
|
+
from fly_on_the_wall.embeddings import EmbeddingBackend
|
|
14
|
+
from fly_on_the_wall.exporting import ExportResult, export_markdown_transcript
|
|
15
|
+
from fly_on_the_wall.glossary import load_glossary_terms
|
|
16
|
+
from fly_on_the_wall.meetings import (
|
|
17
|
+
Meeting,
|
|
18
|
+
get_meeting,
|
|
19
|
+
import_meeting,
|
|
20
|
+
latest_completed_provider_run,
|
|
21
|
+
update_generated_title,
|
|
22
|
+
)
|
|
23
|
+
from fly_on_the_wall.normalization import normalize_provider_run
|
|
24
|
+
from fly_on_the_wall.providers.elevenlabs import run_transcription
|
|
25
|
+
from fly_on_the_wall.providers.openai_analysis import (
|
|
26
|
+
DEFAULT_ANALYSIS_MODEL,
|
|
27
|
+
AnalysisRequest,
|
|
28
|
+
OpenAIAnalysisError,
|
|
29
|
+
OpenAIRequestOptions,
|
|
30
|
+
TitleRequest,
|
|
31
|
+
analyze_meeting,
|
|
32
|
+
fallback_analysis,
|
|
33
|
+
suggest_meeting_title,
|
|
34
|
+
)
|
|
35
|
+
from fly_on_the_wall.providers.openai_cleanup import (
|
|
36
|
+
CLEANUP_PROMPT_VERSION,
|
|
37
|
+
OpenAICleanupError,
|
|
38
|
+
cleanup_transcript,
|
|
39
|
+
)
|
|
40
|
+
from fly_on_the_wall.providers.openai_cleanup import (
|
|
41
|
+
DEFAULT_MODEL as DEFAULT_CLEANUP_MODEL,
|
|
42
|
+
)
|
|
43
|
+
from fly_on_the_wall.publishing import publish_enabled_targets
|
|
44
|
+
from fly_on_the_wall.recording_quality import (
|
|
45
|
+
RecordingIgnoredError,
|
|
46
|
+
assess_after_transcription,
|
|
47
|
+
assess_before_transcription,
|
|
48
|
+
store_recording_quality,
|
|
49
|
+
)
|
|
50
|
+
from fly_on_the_wall.rendering import render_named_transcript
|
|
51
|
+
from fly_on_the_wall.secrets import get_api_key
|
|
52
|
+
from fly_on_the_wall.speaker_identity import match_provider_run_speakers
|
|
53
|
+
from fly_on_the_wall.storage import StoragePaths, ensure_storage_layout
|
|
54
|
+
|
|
55
|
+
# Transcription hook: called with (connection, meeting id, audio path, storage
# paths) and returns the id of the provider run it produced.
TranscribeFn = Callable[[Connection, str, Path, StoragePaths], str]
# Callback that receives each progress message as a string.
ProgressFn = Callable[[str], None]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class ProcessResult:
    """Outcome of processing or refreshing a meeting."""

    # Meeting as re-read from the database once processing finished.
    meeting: Meeting
    # Id of the provider run the export was built from.
    provider_run_id: str
    # Result returned by the markdown export step.
    export: ExportResult
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class RefreshContext:
    """Bundle of everything the refresh steps need for one meeting."""

    connection: Connection
    meeting: Meeting
    config: AppConfig
    paths: StoragePaths
    # Optional free-text meeting description passed to the OpenAI steps.
    description: str | None
    # Optional backend handed to speaker identity matching.
    embedding_backend: EmbeddingBackend | None
    # Progress reporter shared by all steps.
    progress: TimedProgress
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass(frozen=True)
class TranscriptArtifacts:
    """Pair of transcript texts produced by the cleanup step."""

    # Output of the deterministic cleanup pass.
    deterministic: str
    # Final text to export: the OpenAI-cleaned text, or the deterministic one as fallback.
    cleaned: str
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def process_audio(
    connection: Connection,
    audio_path: Path,
    title: str | None,
    config: AppConfig,
    storage: StoragePaths | None = None,
    description: str | None = None,
    transcribe_fn: TranscribeFn | None = None,
    embedding_backend: EmbeddingBackend | None = None,
    progress: ProgressFn | None = None,
) -> ProcessResult:
    """Import an audio file, transcribe it if needed, and run the refresh steps.

    A completed provider run for the meeting is reused instead of transcribing
    again. ``transcribe_fn`` replaces the default ElevenLabs transcription when
    given.

    Raises:
        RecordingIgnoredError: when the pre-transcription quality check reports
            the status ``"empty"`` or ``"nonsense"``.
    """
    layout = storage or ensure_storage_layout()
    reporter = TimedProgress(progress)

    # NOTE(review): step boundaries were reconstructed from a flattened listing — confirm.
    with reporter.step("Importing audio"):
        meeting = import_meeting(connection, audio_path, title, config, layout, description)
    duration_label = _audio_duration_label(connection, meeting.id)
    reporter.message(f"Audio duration: {duration_label}")

    early_quality = assess_before_transcription(connection, meeting)
    if early_quality is not None:
        store_recording_quality(connection, meeting.id, early_quality)
        if early_quality.status in {"empty", "nonsense"}:
            reporter.message(f"Ignoring recording ({early_quality.reason})")
            raise RecordingIgnoredError(meeting, early_quality)

    previous_run = latest_completed_provider_run(connection, meeting.id)
    if previous_run is not None:
        reporter.message("Reusing completed ElevenLabs transcription")
        provider_run_id = previous_run["id"]
    else:
        with reporter.step("Transcribing audio with ElevenLabs"):
            transcribe = transcribe_fn or _run_elevenlabs_transcription
            provider_run_id = transcribe(connection, meeting.id, meeting.imported_audio_path, layout)

    context = RefreshContext(
        connection=connection,
        meeting=meeting,
        config=config,
        paths=layout,
        description=description,
        embedding_backend=embedding_backend,
        progress=reporter,
    )
    export = _refresh_provider_run(context, provider_run_id)
    reporter.message(f"Done ({reporter.total_elapsed()})")
    # Re-read the meeting so the result reflects changes made during the refresh.
    refreshed_meeting = _meeting_from_database(connection, meeting.id)
    return ProcessResult(refreshed_meeting, provider_run_id, export)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def refresh_meeting(
    connection: Connection,
    meeting_id_or_slug: str,
    config: AppConfig,
    storage: StoragePaths | None = None,
    embedding_backend: EmbeddingBackend | None = None,
    progress: ProgressFn | None = None,
) -> ProcessResult:
    """Re-run the refresh steps for an existing meeting from its latest completed transcription.

    Raises:
        ValueError: when the meeting does not exist or has no completed
            transcription.
    """
    layout = storage or ensure_storage_layout()

    row = get_meeting(connection, meeting_id_or_slug)
    if row is None:
        raise ValueError(f"Meeting not found: {meeting_id_or_slug}")

    latest_run = latest_completed_provider_run(connection, row["id"])
    if latest_run is None:
        raise ValueError(f"No completed transcription found for meeting: {meeting_id_or_slug}")

    meeting = Meeting(
        id=row["id"],
        slug=row["slug"],
        title=row["title"],
        title_source=row.get("title_source", "manual"),
        language=row["language"],
        imported_audio_path=Path(row["imported_audio_path"]),
        audio_sha256=row.get("audio_sha256"),
        generated_title=row.get("generated_title"),
    )

    reporter = TimedProgress(progress)
    reporter.message(f"Refreshing meeting {meeting.slug}")
    context = RefreshContext(
        connection=connection,
        meeting=meeting,
        config=config,
        paths=layout,
        description=row.get("description"),
        embedding_backend=embedding_backend,
        progress=reporter,
    )
    provider_run_id = latest_run["id"]
    export = _refresh_provider_run(context, provider_run_id)
    reporter.message(f"Done ({reporter.total_elapsed()})")
    # Re-read the meeting so the result reflects changes made during the refresh.
    refreshed_meeting = _meeting_from_database(connection, meeting.id)
    return ProcessResult(refreshed_meeting, provider_run_id, export)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _refresh_provider_run(context: RefreshContext, provider_run_id: str) -> ExportResult:
    """Turn a provider run into an exported transcript.

    Steps, in order: normalize, quality check, speaker matching, render,
    deterministic cleanup, optional OpenAI cleanup, analysis, title
    suggestion, markdown export, publishing.

    Raises:
        RecordingIgnoredError: when the post-transcription quality status is
            ``"empty"`` or ``"nonsense"``.
    """
    connection = context.connection
    reporter = context.progress

    # NOTE(review): step boundaries were reconstructed from a flattened listing — confirm.
    with reporter.step("Normalizing transcript"):
        segments = normalize_provider_run(connection, provider_run_id)
    quality = assess_after_transcription(connection, context.meeting, segments)
    store_recording_quality(connection, context.meeting.id, quality)
    if quality.status in {"empty", "nonsense"}:
        reporter.message(f"Ignoring recording ({quality.reason})")
        raise RecordingIgnoredError(context.meeting, quality)

    with reporter.step("Matching speaker identities"):
        try:
            match_provider_run_speakers(
                connection,
                provider_run_id,
                context.embedding_backend,
                context.paths,
            )
        except RuntimeError as error:
            # Best effort: a matching failure is reported and the refresh continues.
            reporter.message(f"Speaker identity matching skipped ({error})")

    with reporter.step("Rendering named transcript"):
        named_transcript = render_named_transcript(connection, provider_run_id, storage=context.paths)

    with reporter.step("Running deterministic cleanup"):
        deterministic_transcript = deterministic_cleanup(named_transcript)

    artifacts = _cleanup_transcript(context, deterministic_transcript)
    cleaned_text = artifacts.cleaned
    analysis = _analyze_transcript(context, cleaned_text)
    _suggest_and_apply_title(context, cleaned_text, analysis)

    with reporter.step("Exporting markdown"):
        export = export_markdown_transcript(
            connection, context.meeting.id, cleaned_text, analysis, context.paths
        )
    _publish_enabled_targets(connection, context.meeting.id, reporter)
    return export
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _cleanup_transcript(context: RefreshContext, deterministic_transcript: str) -> TranscriptArtifacts:
    """Optionally run the OpenAI "light" cleanup on top of the deterministic transcript.

    The OpenAI pass runs only when the configured cleanup mode is ``"light"``
    and an OpenAI API key is available. Results are cached per meeting under
    a key built from the model, prompt version, description, glossary and
    input text. If the OpenAI call fails, the deterministic text is returned
    as the cleaned text.
    """
    unchanged = TranscriptArtifacts(deterministic_transcript, deterministic_transcript)

    # Short-circuit keeps the key lookup from running when the mode is not "light".
    llm_cleanup_enabled = context.config.cleanup_mode == "light" and bool(get_api_key("openai"))
    if not llm_cleanup_enabled:
        return unchanged

    glossary_terms = load_glossary_terms(context.config.glossary_path)
    key_parts = [
        DEFAULT_CLEANUP_MODEL,
        CLEANUP_PROMPT_VERSION,
        context.description or "",
        "\n".join(glossary_terms),
        deterministic_transcript,
    ]
    cache_key = text_sha256("\n".join(key_parts))
    cache_dir = context.paths.artifacts / context.meeting.id / "llm-cleanup"

    cached = read_cached_text(cache_dir, cache_key)
    if cached is not None:
        context.progress.message("Reusing OpenAI light cleanup")
        return TranscriptArtifacts(deterministic_transcript, cached)

    def record_usage(response):
        # Forward the provider response to the usage/cost recorder for this meeting.
        return record_openai_usage(
            context.connection,
            meeting_id=context.meeting.id,
            model=DEFAULT_CLEANUP_MODEL,
            service="cleanup",
            response=response,
        )

    with context.progress.step("Running OpenAI light cleanup"):
        try:
            cleaned = cleanup_transcript(
                deterministic_transcript,
                glossary_terms=glossary_terms,
                meeting_context=context.description,
                usage_callback=record_usage,
            )
            write_cached_text(cache_dir, cache_key, cleaned)
            return TranscriptArtifacts(deterministic_transcript, cleaned)
        except OpenAICleanupError as error:
            context.progress.message(f"OpenAI cleanup failed; exporting deterministic cleanup ({error})")
            return unchanged
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _publish_enabled_targets(connection: Connection, meeting_id: str, progress: TimedProgress) -> None:
    """Publish the meeting to its enabled targets and report one message per result.

    A ``ValueError`` from the publishing call is reported as a skip, not propagated.
    """
    try:
        results = publish_enabled_targets(connection, meeting_id)
    except ValueError as error:
        progress.message(f"Publishing skipped ({error})")
    else:
        for outcome in results:
            progress.message(f"Published to {outcome.target.name}: {outcome.output_path}")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _suggest_and_apply_title(
    context: RefreshContext,
    transcript: str,
    analysis: str,
) -> None:
    """Generate a meeting title with OpenAI (or reuse a cached one) and store it.

    Does nothing without an OpenAI API key or when the stored title source is
    ``"manual"``. A generation failure is reported and leaves the title
    untouched. Only a non-blank title is written to the database.

    Raises:
        ValueError: when the meeting no longer exists in the database.
    """
    if not get_api_key("openai"):
        return

    meeting_id = context.meeting.id
    stored_meeting = get_meeting(context.connection, meeting_id)
    if stored_meeting is None:
        raise ValueError(f"Meeting not found: {meeting_id}")
    if stored_meeting.get("title_source") == "manual":
        return

    cache_key = text_sha256("\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", transcript, analysis]))
    cache_dir = context.paths.artifacts / meeting_id / "generated-title"

    def record_usage(response):
        # Forward the provider response to the usage/cost recorder for this meeting.
        return record_openai_usage(
            context.connection,
            meeting_id=context.meeting.id,
            model=DEFAULT_ANALYSIS_MODEL,
            service="title",
            response=response,
        )

    generated_title = read_cached_text(cache_dir, cache_key)
    if generated_title is not None:
        context.progress.message("Reusing generated meeting title")
    else:
        with context.progress.step("Generating meeting title"):
            try:
                request = TitleRequest(
                    transcript,
                    analysis,
                    meeting_context=context.description,
                    options=OpenAIRequestOptions(usage_callback=record_usage),
                )
                generated_title = suggest_meeting_title(request)
            except OpenAIAnalysisError as error:
                context.progress.message(f"Meeting title generation failed ({error})")
                return
            write_cached_text(cache_dir, cache_key, generated_title)

    if generated_title.strip():
        update_generated_title(context.connection, meeting_id, generated_title)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _run_elevenlabs_transcription(
    connection: Connection, meeting_id: str, audio_path: Path, storage: StoragePaths
) -> str:
    """Default transcription hook: delegate to ``run_transcription`` and return its provider run id."""
    provider_run_id = run_transcription(connection, meeting_id, audio_path, storage)
    return provider_run_id
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _meeting_from_database(connection: Connection, meeting_id: str) -> Meeting:
    """Load a meeting row and convert it into a ``Meeting`` value.

    Raises:
        ValueError: when no meeting with ``meeting_id`` exists.
    """
    record = get_meeting(connection, meeting_id)
    if record is None:
        raise ValueError(f"Meeting not found: {meeting_id}")

    # Optional columns are read with ``.get`` so an absent key yields the default.
    title_source = record.get("title_source", "manual")
    audio_sha256 = record.get("audio_sha256")
    generated_title = record.get("generated_title")
    return Meeting(
        id=record["id"],
        slug=record["slug"],
        title=record["title"],
        title_source=title_source,
        language=record["language"],
        imported_audio_path=Path(record["imported_audio_path"]),
        audio_sha256=audio_sha256,
        generated_title=generated_title,
    )
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _report(progress: ProgressFn | None, message: str) -> None:
    """Pass ``message`` to the progress callback; do nothing when there is no callback."""
    if progress is None:
        return
    progress(message)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class TimedProgress:
    """Progress reporter that also tracks the time elapsed since its creation."""

    def __init__(self, progress: ProgressFn | None) -> None:
        """Store the optional callback and start the monotonic clock."""
        self.progress = progress
        self.started_at = time.monotonic()

    def message(self, message: str) -> None:
        """Send ``message`` to the callback, if one was given."""
        _report(self.progress, message)

    def step(self, label: str) -> TimedStep:
        """Return a context manager that announces and times the step ``label``."""
        timed_step = TimedStep(self, label)
        return timed_step

    def total_elapsed(self) -> str:
        """Return the formatted time elapsed since this reporter was created."""
        elapsed_seconds = time.monotonic() - self.started_at
        return _format_elapsed(elapsed_seconds)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
class TimedStep:
    """Context manager that announces a step and reports how long it took.

    On entry the label is reported. On exit a message with the elapsed time
    is reported: "completed in" when the body finished normally, "failed
    after" when it raised. Exceptions are never suppressed.
    """

    def __init__(self, progress: TimedProgress, label: str) -> None:
        """Remember the reporter and label; the clock starts on ``__enter__``."""
        self.progress = progress
        self.label = label
        self.started_at = 0.0

    def __enter__(self) -> TimedStep:
        """Start timing, announce the step, and return the step itself."""
        self.started_at = time.monotonic()
        self.progress.message(self.label)
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Report the elapsed time; returning ``None`` lets any exception propagate."""
        elapsed = _format_elapsed(time.monotonic() - self.started_at)
        if exc_type is None:
            self.progress.message(f"{self.label} completed in {elapsed}")
        else:
            # Do not claim success for a step whose body raised.
            self.progress.message(f"{self.label} failed after {elapsed}")
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _audio_duration_label(connection: Connection, meeting_id: str) -> str:
    """Return the meeting's stored audio duration as a label, or ``"Unknown"`` when none is stored."""
    seconds = _audio_duration_from_metadata(connection, meeting_id)
    return "Unknown" if seconds is None else _format_duration(seconds)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _audio_duration_from_metadata(connection: Connection, meeting_id: str) -> float | None:
    """Read ``duration_seconds`` for the meeting from ``audio_metadata``.

    Returns ``None`` when there is no row or the stored value is NULL.
    """
    query = "SELECT duration_seconds FROM audio_metadata WHERE meeting_id = ?"
    record = connection.execute(query, (meeting_id,)).fetchone()
    stored = None if record is None else record["duration_seconds"]
    return None if stored is None else float(stored)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _format_duration(seconds: float) -> str:
    """Format a duration as ``"Xh Ym Zs"``, dropping leading zero units.

    The value is truncated to whole seconds. Minutes are shown whenever hours
    are, so one hour exactly is ``"1h 0m 0s"``.
    """
    whole_seconds = int(seconds)
    hours, leftover = divmod(whole_seconds, 3600)
    minutes, secs = divmod(leftover, 60)

    label = f"{secs}s"
    if hours or minutes:
        label = f"{minutes}m {label}"
    if hours:
        label = f"{hours}h {label}"
    return label
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _format_elapsed(seconds: float) -> str:
    """Format an elapsed time: two decimals below one second, otherwise a duration label."""
    return f"{seconds:.2f}s" if seconds < 1 else _format_duration(seconds)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _analyze_transcript(
    context: RefreshContext,
    transcript: str,
) -> str:
    """Return the meeting analysis for ``transcript``.

    Without an OpenAI API key, or when the OpenAI call fails, the fallback
    analysis is returned and nothing is cached. Otherwise the analysis is
    read from, or written to, the per-meeting cache keyed by model,
    description and transcript.
    """
    if not get_api_key("openai"):
        return fallback_analysis("OPENAI_API_KEY is missing")

    cache_key = text_sha256("\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", transcript]))
    cache_dir = context.paths.artifacts / context.meeting.id / "analysis"

    cached = read_cached_text(cache_dir, cache_key)
    if cached is not None:
        context.progress.message("Reusing meeting analysis")
        return cached

    def record_usage(response):
        # Forward the provider response to the usage/cost recorder for this meeting.
        return record_openai_usage(
            context.connection,
            meeting_id=context.meeting.id,
            model=DEFAULT_ANALYSIS_MODEL,
            service="analysis",
            response=response,
        )

    with context.progress.step("Analyzing meeting"):
        try:
            request = AnalysisRequest(
                transcript,
                meeting_context=context.description,
                options=OpenAIRequestOptions(usage_callback=record_usage),
            )
            analysis = analyze_meeting(request)
        except OpenAIAnalysisError as error:
            context.progress.message(f"Meeting analysis failed; exporting fallback analysis ({error})")
            return fallback_analysis(str(error))

    write_cached_text(cache_dir, cache_key, analysis)
    return analysis
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Transcription and processing provider integrations."""
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from sqlite3 import Connection
|
|
6
|
+
from typing import Any
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
from fly_on_the_wall.costs import record_service_usage
|
|
12
|
+
from fly_on_the_wall.secrets import get_api_key
|
|
13
|
+
from fly_on_the_wall.storage import StoragePaths, storage_paths
|
|
14
|
+
|
|
15
|
+
# Endpoint that transcription requests are POSTed to.
API_URL = "https://api.elevenlabs.io/v1/speech-to-text"
# Provider name used for API-key lookup and stored on provider runs and usage records.
PROVIDER = "elevenlabs"
# Value sent as ``model_id`` and stored on provider runs and usage records.
MODEL = "scribe_v2"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ElevenLabsError(RuntimeError):
    """Signal a failed ElevenLabs transcription (missing API key, HTTP error, or request failure)."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def transcribe_audio(
    audio_path: Path,
    api_key: str | None = None,
    client: httpx.Client | None = None,
    num_speakers: int | None = None,
    diarization_threshold: float | None = None,
    no_verbatim: bool = False,
) -> dict[str, Any]:
    """Upload an audio file to the ElevenLabs speech-to-text endpoint.

    Args:
        audio_path: File to upload; it is opened in binary mode.
        api_key: API key to use; falls back to the stored ElevenLabs key.
        client: HTTP client to reuse. When omitted, a client with a
            600-second timeout is created and closed before returning.
        num_speakers: Sent as ``num_speakers`` when given.
        diarization_threshold: Sent as ``diarization_threshold`` when given.
        no_verbatim: Sent as the lowercase string ``"true"`` or ``"false"``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ElevenLabsError: when no API key is available, the server answers
            with an error status, the request itself fails, or the response
            body is not valid JSON.
    """
    resolved_api_key = api_key or get_api_key(PROVIDER)
    if not resolved_api_key:
        raise ElevenLabsError("Missing ELEVENLABS_API_KEY.")

    # Multipart form fields are sent as strings.
    data = {
        "model_id": MODEL,
        "tag_audio_events": "true",
        "timestamps_granularity": "word",
        "diarize": "true",
        "temperature": "0",
        "seed": "1",
        "no_verbatim": str(no_verbatim).lower(),
    }
    if num_speakers is not None:
        data["num_speakers"] = str(num_speakers)
    if diarization_threshold is not None:
        data["diarization_threshold"] = str(diarization_threshold)

    # Only close a client created here; a caller-supplied client stays open.
    close_client = client is None
    http_client = httpx.Client(timeout=600) if close_client else client
    try:
        with audio_path.open("rb") as audio_file:
            response = http_client.post(
                API_URL,
                headers={"xi-api-key": resolved_api_key},
                data=data,
                files={"file": (audio_path.name, audio_file)},
            )
        response.raise_for_status()
        try:
            return response.json()
        except ValueError as exc:
            # A success status with a non-JSON body must surface as ElevenLabsError too.
            raise ElevenLabsError("ElevenLabs returned a response that is not valid JSON.") from exc
    except httpx.HTTPStatusError as exc:
        message = f"ElevenLabs HTTP {exc.response.status_code}: {exc.response.text}"
        raise ElevenLabsError(message) from exc
    except httpx.HTTPError as exc:
        raise ElevenLabsError(f"ElevenLabs request failed: {exc}") from exc
    finally:
        if close_client:
            http_client.close()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def run_transcription(
    connection: Connection,
    meeting_id: str,
    audio_path: Path,
    storage: StoragePaths | None = None,
    client: httpx.Client | None = None,
    api_key: str | None = None,
) -> str:
    """Transcribe a meeting's audio and record the run in the database.

    A ``provider_runs`` row is inserted with the status ``"running"`` before
    the request. On success the raw response is written as JSON under the
    meeting's ``provider-runs`` artifact directory, usage is recorded in audio
    seconds, and the run is marked ``"done"``. On any exception the run is
    marked ``"failed"`` and the exception is re-raised.

    Returns:
        The id of the new provider run.
    """
    paths = storage or storage_paths()
    provider_run_id = str(uuid4())
    raw_response_path = paths.artifacts / meeting_id / "provider-runs" / f"{provider_run_id}.raw.json"
    raw_response_path.parent.mkdir(parents=True, exist_ok=True)

    _insert_provider_run(connection, provider_run_id, meeting_id, raw_response_path, "running")
    try:
        response = transcribe_audio(audio_path, api_key=api_key, client=client)
        # ensure_ascii=False keeps non-ASCII text verbatim, so the file must be
        # written as UTF-8 explicitly rather than in the locale encoding.
        raw_response_path.write_text(
            json.dumps(response, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
        # A missing or null duration is recorded as 0 seconds.
        duration = float(response.get("audio_duration_secs") or 0)
        record_service_usage(
            connection,
            meeting_id=meeting_id,
            provider_run_id=provider_run_id,
            provider=PROVIDER,
            model=MODEL,
            service="transcription",
            unit="audio_second",
            input_quantity=duration,
            usage={"audio_duration_secs": duration},
        )
    except Exception:
        _set_provider_run_status(connection, provider_run_id, "failed")
        raise

    _set_provider_run_status(connection, provider_run_id, "done")
    return provider_run_id
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _insert_provider_run(
    connection: Connection,
    provider_run_id: str,
    meeting_id: str,
    raw_response_path: Path,
    status: str,
) -> None:
    """Insert a ``provider_runs`` row for this provider and model.

    The raw response path is stored as a string. The write runs inside
    ``with connection``.
    """
    insert_sql = """
        INSERT INTO provider_runs(
            id,
            meeting_id,
            provider,
            model,
            raw_response_path,
            status
        ) VALUES (?, ?, ?, ?, ?, ?)
    """
    bindings = (provider_run_id, meeting_id, PROVIDER, MODEL, str(raw_response_path), status)
    with connection:
        connection.execute(insert_sql, bindings)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _set_provider_run_status(connection: Connection, provider_run_id: str, status: str) -> None:
    """Update a provider run's status; ``completed_at`` is stamped only for ``'done'``.

    For any other status the existing ``completed_at`` value is left as it is.
    """
    update_sql = """
        UPDATE provider_runs
        SET status = ?,
            completed_at = CASE
                WHEN ? = 'done' THEN CURRENT_TIMESTAMP
                ELSE completed_at
            END
        WHERE id = ?
    """
    # The status is bound twice: the stored value and the CASE test.
    bindings = (status, status, provider_run_id)
    with connection:
        connection.execute(update_sql, bindings)
|