fow-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fly_on_the_wall/__init__.py +3 -0
- fly_on_the_wall/audio.py +164 -0
- fly_on_the_wall/audio_metadata.py +241 -0
- fly_on_the_wall/cache.py +26 -0
- fly_on_the_wall/cleanup.py +29 -0
- fly_on_the_wall/cli.py +641 -0
- fly_on_the_wall/cli_costs.py +81 -0
- fly_on_the_wall/cli_menu.py +163 -0
- fly_on_the_wall/cli_publish.py +141 -0
- fly_on_the_wall/cli_speaker_review.py +315 -0
- fly_on_the_wall/cli_watch.py +209 -0
- fly_on_the_wall/config.py +92 -0
- fly_on_the_wall/costs.py +169 -0
- fly_on_the_wall/db.py +508 -0
- fly_on_the_wall/doctor.py +142 -0
- fly_on_the_wall/embeddings.py +142 -0
- fly_on_the_wall/exporting.py +155 -0
- fly_on_the_wall/glossary.py +31 -0
- fly_on_the_wall/meetings.py +382 -0
- fly_on_the_wall/normalization.py +166 -0
- fly_on_the_wall/people.py +82 -0
- fly_on_the_wall/people_embeddings.py +68 -0
- fly_on_the_wall/pipeline.py +120 -0
- fly_on_the_wall/processing.py +427 -0
- fly_on_the_wall/providers/__init__.py +1 -0
- fly_on_the_wall/providers/elevenlabs.py +145 -0
- fly_on_the_wall/providers/openai_analysis.py +195 -0
- fly_on_the_wall/providers/openai_cleanup.py +91 -0
- fly_on_the_wall/publishing.py +410 -0
- fly_on_the_wall/reanalysis.py +172 -0
- fly_on_the_wall/recording_quality.py +141 -0
- fly_on_the_wall/rendering.py +115 -0
- fly_on_the_wall/secrets.py +93 -0
- fly_on_the_wall/service_pricing.py +75 -0
- fly_on_the_wall/setup.py +221 -0
- fly_on_the_wall/speaker_identity.py +173 -0
- fly_on_the_wall/speaker_matching.py +134 -0
- fly_on_the_wall/speakers.py +221 -0
- fly_on_the_wall/storage.py +53 -0
- fly_on_the_wall/voice_samples.py +125 -0
- fly_on_the_wall/watch.py +347 -0
- fow_cli-0.1.0.dist-info/METADATA +447 -0
- fow_cli-0.1.0.dist-info/RECORD +46 -0
- fow_cli-0.1.0.dist-info/WHEEL +4 -0
- fow_cli-0.1.0.dist-info/entry_points.txt +2 -0
- fow_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
fly_on_the_wall/audio.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from select import select
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AudioError(RuntimeError):
    """Error raised by this module when an audio operation cannot complete."""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_duration(audio_path: Path) -> float:
    """Return the ``format=duration`` value ffprobe reports for ``audio_path``.

    Args:
        audio_path: File handed to ffprobe.

    Returns:
        ffprobe's duration output converted to ``float``.

    Raises:
        AudioError: If ffprobe's output cannot be converted to a float, or
            (via ``_run``) if the ffprobe invocation itself fails.
    """
    probe_command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(audio_path),
    ]
    raw_duration = _run(probe_command).stdout.strip()
    try:
        return float(raw_duration)
    except ValueError as exc:
        raise AudioError(f"Could not read duration for {audio_path}") from exc
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def probe_metadata(audio_path: Path) -> dict:
    """Return ffprobe's format and stream description of ``audio_path``.

    ffprobe is asked for JSON output (``-print_format json``) covering both
    ``-show_format`` and ``-show_streams``; the decoded JSON is returned as-is.

    Args:
        audio_path: File handed to ffprobe.

    Returns:
        The value produced by ``json.loads`` on ffprobe's stdout.

    Raises:
        AudioError: If ffprobe's stdout is not valid JSON, or (via ``_run``)
            if the ffprobe invocation itself fails.
    """
    result = _run(
        [
            "ffprobe",
            # "error" rather than "quiet": quiet silences stderr entirely, so a
            # failed probe would surface through _run without any diagnostic.
            # Log output goes to stderr, so the JSON on stdout is unaffected.
            "-v",
            "error",
            "-print_format",
            "json",
            "-show_format",
            "-show_streams",
            str(audio_path),
        ]
    )
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        raise AudioError(f"Could not parse metadata for {audio_path}") from exc
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def convert_to_wav(input_path: Path, output_path: Path) -> Path:
    """Transcode ``input_path`` to ``output_path`` with ffmpeg.

    The parent directory of ``output_path`` is created if needed and an
    existing output file is overwritten (``-y``). No explicit format flag is
    passed, so the output format is left to ffmpeg.

    Returns:
        ``output_path``, unchanged.

    Raises:
        AudioError: Propagated from ``_run`` when ffmpeg is missing or fails.
    """
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    ffmpeg_command = ["ffmpeg", "-y", "-i", str(input_path), str(output_path)]
    _run(ffmpeg_command)
    return output_path
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def normalize_for_embedding(input_path: Path, output_path: Path) -> Path:
    """Re-encode ``input_path`` with ffmpeg using ``-ac 1 -ar 16000``.

    The parent directory of ``output_path`` is created if needed and an
    existing output file is overwritten (``-y``).

    Returns:
        ``output_path``, unchanged.

    Raises:
        AudioError: Propagated from ``_run`` when ffmpeg is missing or fails.
    """
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    source, target = str(input_path), str(output_path)
    # -ac 1 / -ar 16000: single channel at a 16000 Hz sample rate.
    ffmpeg_command = ["ffmpeg", "-y", "-i", source, "-ac", "1", "-ar", "16000", target]
    _run(ffmpeg_command)
    return output_path
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def extract_clip(input_path: Path, output_path: Path, start: float, end: float) -> Path:
    """Cut the ``start``..``end`` range of ``input_path`` into ``output_path``.

    Args:
        input_path: Source file passed to ffmpeg via ``-i``.
        output_path: Destination file; its parent directory is created.
        start: Value for ``-ss``; negative values are clamped to ``0.0``.
        end: Value for ``-to``; must be strictly greater than ``start``.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``end <= start`` (checked before anything is created).
        AudioError: Propagated from ``_run`` when ffmpeg is missing or fails.
    """
    if end <= start:
        raise ValueError("Clip end must be greater than start.")

    clip_start = max(0.0, start)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # -ss/-to are placed before -i, i.e. they are given as input options.
    ffmpeg_command = ["ffmpeg", "-y"]
    ffmpeg_command += ["-ss", f"{clip_start:.3f}"]
    ffmpeg_command += ["-to", f"{end:.3f}"]
    ffmpeg_command += ["-i", str(input_path), str(output_path)]
    _run(ffmpeg_command)
    return output_path
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def play_audio(audio_path: Path, player: str = "ffplay", stop_on_enter: bool = False) -> None:
    """Play ``audio_path`` with ``player`` and block until playback is over.

    Args:
        audio_path: File to play.
        player: Executable used for playback.
        stop_on_enter: When true, playback runs through ``_run_until_enter``
            instead of ``_run``.
    """
    command = audio_playback_command(audio_path, player)
    runner = _run_until_enter if stop_on_enter else _run
    runner(command)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def audio_playback_command(audio_path: Path, player: str = "ffplay") -> list[str]:
    """Build the argv list used to play ``audio_path`` with ``player``.

    ``ffplay`` gets ``-nodisp -autoexit`` in front of the path; any other
    player is invoked with the path as its only argument.
    """
    target = str(audio_path)
    if player != "ffplay":
        return [player, target]
    return ["ffplay", "-nodisp", "-autoexit", target]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def start_audio_playback(audio_path: Path, player: str = "ffplay") -> subprocess.Popen:
    """Start playing ``audio_path`` in the background and return the process.

    The player's stdout and stderr are discarded. The caller owns the returned
    ``Popen`` object.

    Raises:
        AudioError: If the player executable cannot be found.
    """
    argv = audio_playback_command(audio_path, player)
    silenced = subprocess.DEVNULL
    try:
        return subprocess.Popen(argv, stdout=silenced, stderr=silenced)
    except FileNotFoundError as exc:
        raise AudioError(f"Required audio tool not found: {argv[0]}") from exc
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def stop_audio_playback(process: subprocess.Popen) -> None:
    """Stop ``process`` if it is still running.

    Sends ``terminate()`` and waits up to two seconds; if the process has not
    exited by then it is killed and reaped. A process that has already exited
    is left untouched.
    """
    if process.poll() is None:
        process.terminate()
        try:
            process.wait(timeout=2)
        except subprocess.TimeoutExpired:
            # Did not react to terminate() in time: force it down and reap it.
            process.kill()
            process.wait()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _run_until_enter(command: list[str]) -> None:
|
|
132
|
+
try:
|
|
133
|
+
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
134
|
+
except FileNotFoundError as exc:
|
|
135
|
+
raise AudioError(f"Required audio tool not found: {command[0]}") from exc
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
while process.poll() is None:
|
|
139
|
+
if sys.stdin.isatty():
|
|
140
|
+
ready, _, _ = select([sys.stdin], [], [], 0.1)
|
|
141
|
+
if ready:
|
|
142
|
+
sys.stdin.readline()
|
|
143
|
+
process.terminate()
|
|
144
|
+
break
|
|
145
|
+
else:
|
|
146
|
+
time.sleep(0.1)
|
|
147
|
+
except KeyboardInterrupt:
|
|
148
|
+
process.terminate()
|
|
149
|
+
finally:
|
|
150
|
+
try:
|
|
151
|
+
process.wait(timeout=2)
|
|
152
|
+
except subprocess.TimeoutExpired:
|
|
153
|
+
process.kill()
|
|
154
|
+
process.wait()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _run(command: list[str]) -> subprocess.CompletedProcess[str]:
|
|
158
|
+
try:
|
|
159
|
+
return subprocess.run(command, check=True, capture_output=True, text=True)
|
|
160
|
+
except FileNotFoundError as exc:
|
|
161
|
+
raise AudioError(f"Required audio tool not found: {command[0]}") from exc
|
|
162
|
+
except subprocess.CalledProcessError as exc:
|
|
163
|
+
message = exc.stderr.strip() or exc.stdout.strip() or str(exc)
|
|
164
|
+
raise AudioError(f"Audio command failed: {message}") from exc
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from sqlite3 import Connection
|
|
9
|
+
|
|
10
|
+
from fly_on_the_wall.audio import AudioError, probe_metadata
|
|
11
|
+
from fly_on_the_wall.storage import StoragePaths
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class NormalizedAudioMetadata:
    """Immutable, flattened view of one audio file's probed metadata.

    Every field is optional and defaults to ``None``. Field order is part of
    the positional constructor signature and must not be changed.
    """

    # When the recording was made, which lookup produced that value, and how
    # much that lookup is trusted.
    recorded_at: str | None = None
    recorded_at_source: str | None = None
    recorded_at_confidence: str | None = None

    # Technical properties of the file and its audio stream.
    duration_seconds: float | None = None
    size_bytes: int | None = None
    bit_rate: int | None = None
    codec: str | None = None
    sample_rate: int | None = None
    channels: int | None = None
    channel_layout: str | None = None
    container_format: str | None = None

    # Descriptive tags copied from the file's metadata.
    metadata_title: str | None = None
    metadata_artist: str | None = None
    metadata_album: str | None = None
    metadata_genre: str | None = None
    metadata_comment: str | None = None
    metadata_encoder: str | None = None
    device_or_software: str | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_and_store_audio_metadata(
    connection: Connection,
    meeting_id: str,
    audio_path: Path,
    storage: StoragePaths,
) -> None:
    """Probe ``audio_path`` and persist both the raw and normalized metadata.

    The raw ffprobe result is written to
    ``<storage.artifacts>/<meeting_id>/audio-metadata.ffprobe.json`` and a
    normalized row is upserted into the ``audio_metadata`` table, keyed by
    ``meeting_id`` (used for both ``id`` and ``meeting_id``).

    Probing is best-effort: if ``probe_metadata`` raises ``AudioError`` the
    function returns without writing anything.
    """
    try:
        probed = probe_metadata(audio_path)
    except AudioError:
        return

    artifact_dir = storage.artifacts / meeting_id
    artifact_dir.mkdir(parents=True, exist_ok=True)
    raw_metadata_path = artifact_dir / "audio-metadata.ffprobe.json"
    raw_metadata_path.write_text(json.dumps(probed, indent=2) + "\n")

    normalized = normalize_audio_metadata(probed, audio_path)

    insert_sql = """
        INSERT OR REPLACE INTO audio_metadata(
            id,
            meeting_id,
            raw_metadata_path,
            recorded_at,
            recorded_at_source,
            recorded_at_confidence,
            duration_seconds,
            size_bytes,
            bit_rate,
            codec,
            sample_rate,
            channels,
            channel_layout,
            container_format,
            metadata_title,
            metadata_artist,
            metadata_album,
            metadata_genre,
            metadata_comment,
            metadata_encoder,
            device_or_software
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    # Values are listed in the same order as the columns above.
    row = (
        meeting_id,
        meeting_id,
        str(raw_metadata_path),
        normalized.recorded_at,
        normalized.recorded_at_source,
        normalized.recorded_at_confidence,
        normalized.duration_seconds,
        normalized.size_bytes,
        normalized.bit_rate,
        normalized.codec,
        normalized.sample_rate,
        normalized.channels,
        normalized.channel_layout,
        normalized.container_format,
        normalized.metadata_title,
        normalized.metadata_artist,
        normalized.metadata_album,
        normalized.metadata_genre,
        normalized.metadata_comment,
        normalized.metadata_encoder,
        normalized.device_or_software,
    )
    # The connection context manager commits on success, rolls back on error.
    with connection:
        connection.execute(insert_sql, row)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def normalize_audio_metadata(raw_metadata: dict, audio_path: Path) -> NormalizedAudioMetadata:
    """Flatten a raw ffprobe result into a ``NormalizedAudioMetadata``.

    Args:
        raw_metadata: Decoded ffprobe JSON; its ``format`` entry and first
            audio stream are consulted. A missing or non-dict ``format`` is
            treated as empty.
        audio_path: The probed file, forwarded to ``_recorded_at`` for its
            filename and mtime fallbacks.

    Returns:
        The normalized record. Tags from the container take precedence over
        tags of the same name on the audio stream.
    """
    stream = _first_audio_stream(raw_metadata)

    container = raw_metadata.get("format")
    if not isinstance(container, dict):
        container = {}

    # Later entries win in the merge, so container-level tags override
    # stream-level ones.
    tags = {
        **_normalized_tags(stream.get("tags")),
        **_normalized_tags(container.get("tags")),
    }
    when, when_source, when_confidence = _recorded_at(tags, audio_path)
    raw_bit_rate = container.get("bit_rate") or stream.get("bit_rate")

    return NormalizedAudioMetadata(
        recorded_at=when,
        recorded_at_source=when_source,
        recorded_at_confidence=when_confidence,
        duration_seconds=_optional_float(container.get("duration")),
        size_bytes=_optional_int(container.get("size")),
        bit_rate=_optional_int(raw_bit_rate),
        codec=_optional_str(stream.get("codec_name")),
        sample_rate=_optional_int(stream.get("sample_rate")),
        channels=_optional_int(stream.get("channels")),
        channel_layout=_optional_str(stream.get("channel_layout")),
        container_format=_optional_str(container.get("format_name")),
        metadata_title=tags.get("title"),
        metadata_artist=tags.get("artist"),
        metadata_album=tags.get("album"),
        metadata_genre=tags.get("genre"),
        metadata_comment=tags.get("comment"),
        metadata_encoder=tags.get("encoder"),
        device_or_software=tags.get("artist") or tags.get("encoder"),
    )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _first_audio_stream(raw_metadata: dict) -> dict:
|
|
137
|
+
streams = raw_metadata.get("streams")
|
|
138
|
+
if not isinstance(streams, list):
|
|
139
|
+
return {}
|
|
140
|
+
for stream in streams:
|
|
141
|
+
if isinstance(stream, dict) and stream.get("codec_type") == "audio":
|
|
142
|
+
return stream
|
|
143
|
+
return {}
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _normalized_tags(tags: object) -> dict[str, str]:
|
|
147
|
+
if not isinstance(tags, dict):
|
|
148
|
+
return {}
|
|
149
|
+
return {str(key).lower(): str(value).strip() for key, value in tags.items() if str(value).strip()}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _recorded_at(tags: dict[str, str], audio_path: Path) -> tuple[str | None, str | None, str | None]:
    """Work out when the recording was made, trying sources in a fixed order.

    Order: the ``date``, ``creation_time`` and
    ``com.apple.quicktime.creationdate`` tags; a timestamp embedded in the
    ``title`` tag; two filename patterns; finally the file's mtime.

    Returns:
        ``(timestamp, source, confidence)`` for the first source that yields a
        value, or ``(None, None, None)`` when even the mtime is unavailable.
    """
    for tag_name in ("date", "creation_time", "com.apple.quicktime.creationdate"):
        stamp = _parse_datetime(tags.get(tag_name))
        if stamp is not None:
            return stamp, f"metadata.{tag_name}", "medium"

    # Remaining text-based sources: (parser, input, source label, confidence).
    file_name = audio_path.name
    fallbacks = (
        (_parse_philips_title_datetime, tags.get("title"), "metadata.title", "high"),
        (_parse_recup_filename_datetime, file_name, "filename.recup", "medium"),
        (_parse_recorder_filename_datetime, file_name, "filename.recorder", "medium"),
    )
    for parse, candidate, source, confidence in fallbacks:
        stamp = parse(candidate)
        if stamp is not None:
            return stamp, source, confidence

    # Last resort: the file's modification time (local time, whole seconds).
    try:
        modified = datetime.fromtimestamp(audio_path.stat().st_mtime).replace(microsecond=0)
    except OSError:
        return None, None, None
    return _format_datetime(modified), "filesystem.mtime", "low"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _parse_philips_title_datetime(value: str | None) -> str | None:
|
|
179
|
+
if not value:
|
|
180
|
+
return None
|
|
181
|
+
match = re.search(r"(\d{4}-\d{2}-\d{2})[ T](\d{2}:\d{2}:\d{2})", value)
|
|
182
|
+
if match is None:
|
|
183
|
+
return None
|
|
184
|
+
return _parse_datetime(f"{match.group(1)} {match.group(2)}")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _parse_recup_filename_datetime(value: str) -> str | None:
|
|
188
|
+
match = re.search(r"DV-(\d{4})-(\d{2})-(\d{2})-(\d{2})(\d{2})(\d{2})", value)
|
|
189
|
+
if match is None:
|
|
190
|
+
return None
|
|
191
|
+
year, month, day, hour, minute, second = match.groups()
|
|
192
|
+
return _parse_datetime(f"{year}-{month}-{day} {hour}:{minute}:{second}")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _parse_recorder_filename_datetime(value: str) -> str | None:
|
|
196
|
+
match = re.search(r"(\d{2})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})", value)
|
|
197
|
+
if match is None:
|
|
198
|
+
return None
|
|
199
|
+
year, month, day, hour, minute, second = match.groups()
|
|
200
|
+
return _parse_datetime(f"20{year}-{month}-{day} {hour}:{minute}:{second}")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _parse_datetime(value: str | None) -> str | None:
|
|
204
|
+
if not value:
|
|
205
|
+
return None
|
|
206
|
+
normalized = value.strip().replace("Z", "+00:00")
|
|
207
|
+
for fmt in (None, "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"):
|
|
208
|
+
try:
|
|
209
|
+
if fmt is None:
|
|
210
|
+
parsed = datetime.fromisoformat(normalized)
|
|
211
|
+
else:
|
|
212
|
+
parsed = datetime.strptime(normalized, fmt)
|
|
213
|
+
except ValueError:
|
|
214
|
+
continue
|
|
215
|
+
return _format_datetime(parsed.replace(tzinfo=None))
|
|
216
|
+
return None
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _format_datetime(value: datetime) -> str:
|
|
220
|
+
return value.strftime("%Y-%m-%d %H:%M:%S")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _optional_str(value: object) -> str | None:
|
|
224
|
+
if value is None:
|
|
225
|
+
return None
|
|
226
|
+
text = str(value).strip()
|
|
227
|
+
return text or None
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _optional_int(value: object) -> int | None:
|
|
231
|
+
try:
|
|
232
|
+
return int(value) if value is not None else None
|
|
233
|
+
except (TypeError, ValueError):
|
|
234
|
+
return None
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _optional_float(value: object) -> float | None:
|
|
238
|
+
try:
|
|
239
|
+
return float(value) if value is not None else None
|
|
240
|
+
except (TypeError, ValueError):
|
|
241
|
+
return None
|
fly_on_the_wall/cache.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def text_sha256(value: str) -> str:
    """Return the hex SHA-256 digest of ``value`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(value.encode("utf-8"))
    return digest.hexdigest()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def read_cached_text(cache_dir: Path, cache_key: str) -> str | None:
    """Return the cached text stored under ``cache_key``, or ``None`` on a miss.

    Args:
        cache_dir: Directory holding the cache files.
        cache_key: Key of the entry; the file is ``<cache_dir>/<cache_key>.txt``.

    Returns:
        The cached text, or ``None`` when no entry exists or the stored bytes
        are not valid UTF-8.
    """
    path = _cache_path(cache_dir, cache_key)
    try:
        # Read directly instead of exists()-then-read so an entry removed in
        # between cannot turn a miss into an exception.
        return path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
    except UnicodeDecodeError:
        # Entry written earlier under a non-UTF-8 locale default: treat it as
        # a miss so the caller regenerates and overwrites it.
        return None


def write_cached_text(cache_dir: Path, cache_key: str, value: str) -> Path:
    """Store ``value`` under ``cache_key`` and return the file it was written to.

    The cache directory is created if needed. Text is written as UTF-8
    regardless of the platform's locale so any transcript text can be stored.
    """
    path = _cache_path(cache_dir, cache_key)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(value, encoding="utf-8")
    return path


def _cache_path(cache_dir: Path, cache_key: str) -> Path:
    """Return the file path that backs ``cache_key`` inside ``cache_dir``."""
    return cache_dir / f"{cache_key}.txt"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def deterministic_cleanup(transcript: str) -> str:
    """Tidy a ``Speaker: text`` transcript without changing its wording.

    Blank lines and turns with no text are dropped, whitespace inside each
    turn is collapsed, and consecutive turns by the same speaker are merged
    into one. Turns are emitted as ``Speaker: text`` separated by blank lines.
    """
    merged_turns: list[tuple[str, str]] = []
    for raw_line in transcript.splitlines():
        if not raw_line.strip():
            continue
        speaker, utterance = _parse_turn(raw_line)
        utterance = normalize_whitespace(utterance)
        if not utterance:
            continue
        same_speaker = bool(merged_turns) and merged_turns[-1][0] == speaker
        if not same_speaker:
            merged_turns.append((speaker, utterance))
            continue
        # Same speaker as the previous turn: extend that turn instead.
        last_speaker, last_text = merged_turns[-1]
        merged_turns[-1] = (last_speaker, normalize_whitespace(f"{last_text} {utterance}"))

    rendered = [f"{speaker}: {text}" for speaker, text in merged_turns]
    return "\n\n".join(rendered)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def normalize_whitespace(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", value)
    return collapsed.strip()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_turn(line: str) -> tuple[str, str]:
|
|
26
|
+
if ":" not in line:
|
|
27
|
+
return "Unknown", line
|
|
28
|
+
speaker, text = line.split(":", 1)
|
|
29
|
+
return normalize_whitespace(speaker), text
|