fow-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fly_on_the_wall/__init__.py +3 -0
- fly_on_the_wall/audio.py +164 -0
- fly_on_the_wall/audio_metadata.py +241 -0
- fly_on_the_wall/cache.py +26 -0
- fly_on_the_wall/cleanup.py +29 -0
- fly_on_the_wall/cli.py +641 -0
- fly_on_the_wall/cli_costs.py +81 -0
- fly_on_the_wall/cli_menu.py +163 -0
- fly_on_the_wall/cli_publish.py +141 -0
- fly_on_the_wall/cli_speaker_review.py +315 -0
- fly_on_the_wall/cli_watch.py +209 -0
- fly_on_the_wall/config.py +92 -0
- fly_on_the_wall/costs.py +169 -0
- fly_on_the_wall/db.py +508 -0
- fly_on_the_wall/doctor.py +142 -0
- fly_on_the_wall/embeddings.py +142 -0
- fly_on_the_wall/exporting.py +155 -0
- fly_on_the_wall/glossary.py +31 -0
- fly_on_the_wall/meetings.py +382 -0
- fly_on_the_wall/normalization.py +166 -0
- fly_on_the_wall/people.py +82 -0
- fly_on_the_wall/people_embeddings.py +68 -0
- fly_on_the_wall/pipeline.py +120 -0
- fly_on_the_wall/processing.py +427 -0
- fly_on_the_wall/providers/__init__.py +1 -0
- fly_on_the_wall/providers/elevenlabs.py +145 -0
- fly_on_the_wall/providers/openai_analysis.py +195 -0
- fly_on_the_wall/providers/openai_cleanup.py +91 -0
- fly_on_the_wall/publishing.py +410 -0
- fly_on_the_wall/reanalysis.py +172 -0
- fly_on_the_wall/recording_quality.py +141 -0
- fly_on_the_wall/rendering.py +115 -0
- fly_on_the_wall/secrets.py +93 -0
- fly_on_the_wall/service_pricing.py +75 -0
- fly_on_the_wall/setup.py +221 -0
- fly_on_the_wall/speaker_identity.py +173 -0
- fly_on_the_wall/speaker_matching.py +134 -0
- fly_on_the_wall/speakers.py +221 -0
- fly_on_the_wall/storage.py +53 -0
- fly_on_the_wall/voice_samples.py +125 -0
- fly_on_the_wall/watch.py +347 -0
- fow_cli-0.1.0.dist-info/METADATA +447 -0
- fow_cli-0.1.0.dist-info/RECORD +46 -0
- fow_cli-0.1.0.dist-info/WHEEL +4 -0
- fow_cli-0.1.0.dist-info/entry_points.txt +2 -0
- fow_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
import sys
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from shutil import which
|
|
7
|
+
|
|
8
|
+
from fly_on_the_wall.config import default_config_path, load_config
|
|
9
|
+
from fly_on_the_wall.db import database
|
|
10
|
+
from fly_on_the_wall.secrets import get_api_key_status
|
|
11
|
+
from fly_on_the_wall.storage import storage_paths
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable record describing the outcome of one diagnostic."""

    # Label identifying what was checked (e.g. "python", "ffmpeg").
    name: str
    # True when the diagnostic passed.
    ok: bool
    # Free-form text describing what was found.
    detail: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_checks() -> list[DoctorCheck]:
    """Run every diagnostic and return the results in a fixed order.

    The list contains, in order: the Python version check, the ffmpeg lookup,
    the config path, the storage root, the API key status of the configured
    transcription provider, the OpenAI API key status, and finally the
    speaker-embedding checks.

    Returns:
        One ``DoctorCheck`` per diagnostic.
    """
    paths = storage_paths()
    # Resolve ffmpeg once so that ``ok`` and ``detail`` always describe the
    # same lookup instead of two independent PATH searches.
    ffmpeg_path = which("ffmpeg")
    version = sys.version_info
    checks = [
        DoctorCheck(
            name="python",
            ok=version >= (3, 12),
            detail=f"{version.major}.{version.minor}.{version.micro}",
        ),
        DoctorCheck(
            name="ffmpeg",
            ok=ffmpeg_path is not None,
            detail=ffmpeg_path or "not found",
        ),
        # The two path entries are informational and therefore always pass.
        DoctorCheck(
            name="config path",
            ok=True,
            detail=str(default_config_path()),
        ),
        DoctorCheck(
            name="storage path",
            ok=True,
            detail=str(paths.root),
        ),
    ]

    config = load_config()
    provider = config.default_transcription_provider
    provider_status = get_api_key_status(provider)
    checks.append(
        DoctorCheck(
            name=f"{provider} api key",
            ok=provider_status.available,
            detail=_secret_detail(provider_status.source, provider_status.env_var),
        )
    )
    openai_status = get_api_key_status("openai")
    checks.append(
        DoctorCheck(
            name="openai api key",
            ok=openai_status.available,
            detail=_secret_detail(openai_status.source, openai_status.env_var),
        )
    )
    checks.extend(_speaker_embedding_checks())
    return checks
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _speaker_embedding_checks() -> list[DoctorCheck]:
    """Build the diagnostics for pyannote and the cached speaker embeddings."""
    has_pyannote = _module_available("pyannote.audio")
    totals = _speaker_embedding_counts()

    samples_total = totals["voice_samples"]
    samples_embedded = totals["embedded_voice_samples"]
    speakers_total = totals["local_speakers"]
    speakers_embedded = totals["embedded_local_speakers"]

    if has_pyannote:
        pyannote_detail = "available for speaker embeddings"
    else:
        pyannote_detail = "missing; install the identity extra for local speaker embeddings"

    # Voice samples pass when there are none, or every one has an embedding.
    samples_ok = samples_total == 0 or samples_embedded == samples_total
    # Local speakers pass when there are none, at least one is embedded, or
    # pyannote is installed.
    speakers_ok = speakers_total == 0 or speakers_embedded > 0 or has_pyannote

    return [
        DoctorCheck(name="pyannote.audio", ok=has_pyannote, detail=pyannote_detail),
        DoctorCheck(
            name="voice sample embeddings",
            ok=samples_ok,
            detail=f"{samples_embedded}/{samples_total} voice samples embedded",
        ),
        DoctorCheck(
            name="local speaker embeddings",
            ok=speakers_ok,
            detail=f"{speakers_embedded}/{speakers_total} local speakers embedded",
        ),
    ]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _speaker_embedding_counts() -> dict[str, int]:
    """Count voice samples and local speakers, in total and with embeddings.

    Returns:
        A dict with the keys ``voice_samples``, ``embedded_voice_samples``,
        ``local_speakers`` and ``embedded_local_speakers``.
    """
    voice_query = """
        SELECT COUNT(*) AS total,
               SUM(CASE WHEN embedding_path IS NOT NULL THEN 1 ELSE 0 END) AS embedded
        FROM voice_samples
    """
    local_query = """
        SELECT COUNT(DISTINCT local_speakers.id) AS total,
               COUNT(DISTINCT local_speaker_embeddings.local_speaker_id) AS embedded
        FROM local_speakers
        LEFT JOIN local_speaker_embeddings
            ON local_speaker_embeddings.local_speaker_id = local_speakers.id
    """
    with database() as connection:
        voice_row = connection.execute(voice_query).fetchone()
        local_row = connection.execute(local_query).fetchone()

        counts: dict[str, int] = {}
        # ``or 0`` maps a NULL aggregate to zero before the int conversion.
        for key, row, column in (
            ("voice_samples", voice_row, "total"),
            ("embedded_voice_samples", voice_row, "embedded"),
            ("local_speakers", local_row, "total"),
            ("embedded_local_speakers", local_row, "embedded"),
        ):
            counts[key] = int(row[column] or 0)
        return counts
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _module_available(module_name: str) -> bool:
    """Return whether *module_name* can be located on this interpreter.

    Uses ``importlib.util.find_spec``, which imports parent packages of a
    dotted name but not the module itself.

    Args:
        module_name: Absolute dotted module name, e.g. ``"pyannote.audio"``.

    Returns:
        True when a spec is found (or the module is already loaded), False
        when the module or one of its parents cannot be imported.
    """
    try:
        return importlib.util.find_spec(module_name) is not None
    except ImportError:
        # ImportError covers ModuleNotFoundError (missing parent package) as
        # well as a parent package whose own import fails; either way the
        # module is unusable and the diagnostic must not crash.
        return False
    except ValueError:
        # find_spec raises ValueError for a module that is already in
        # sys.modules but has no usable __spec__; it is loaded, so report it
        # as available.
        return module_name in sys.modules
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _secret_detail(source: str, env_var: str | None) -> str:
|
|
128
|
+
if source == "env":
|
|
129
|
+
return f"{env_var} is set"
|
|
130
|
+
if source == "keyring":
|
|
131
|
+
return "set in OS keyring"
|
|
132
|
+
if source == "missing":
|
|
133
|
+
return f"{env_var} is not set and no keyring entry was found"
|
|
134
|
+
return "unknown provider"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def has_failures(checks: list[DoctorCheck]) -> bool:
    """Report whether at least one of *checks* did not pass."""
    everything_ok = all(check.ok for check in checks)
    return not everything_ok
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def check_names(checks: list[DoctorCheck]) -> set[str]:
    """Collect the distinct ``name`` values of *checks*."""
    names: set[str] = set()
    for check in checks:
        names.add(check.name)
    return names
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from sqlite3 import Connection
|
|
8
|
+
from typing import Protocol
|
|
9
|
+
from uuid import uuid4
|
|
10
|
+
|
|
11
|
+
from fly_on_the_wall.storage import StoragePaths, storage_paths
|
|
12
|
+
|
|
13
|
+
# Model identifier used as PyannoteEmbeddingBackend.model_name and passed to
# pyannote's Model.from_pretrained when that backend is constructed.
DEFAULT_EMBEDDING_MODEL = "pyannote/wespeaker-voxceleb-resnet34-LM"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EmbeddingBackend(Protocol):
    """Structural interface for objects that embed an audio file."""

    # Identifier of the model; stored alongside every cached vector.
    model_name: str

    def embed(self, audio_path: Path) -> list[float]:
        """Return the embedding vector for the audio at *audio_path*."""
        ...
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class CachedEmbedding:
    """Immutable description of an embedding that was written to disk."""

    # Identifier of the model that produced the vector.
    model_name: str
    # Location of the JSON file holding the vector.
    path: Path
    # The embedding values themselves.
    vector: list[float]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class PyannoteEmbeddingBackend:
    """Embedding backend that runs a pretrained pyannote.audio model."""

    model_name = DEFAULT_EMBEDDING_MODEL

    def __init__(self) -> None:
        """Load the pretrained model and prepare whole-file inference.

        Raises:
            RuntimeError: If ``pyannote.audio`` cannot be imported.
        """
        # Imported here rather than at module level so that importing this
        # module does not require pyannote.audio to be installed.
        try:
            from pyannote.audio import Inference, Model
        except ImportError as exc:
            raise RuntimeError("pyannote.audio is required for local speaker embeddings.") from exc

        model = Model.from_pretrained(self.model_name)
        # window="whole" asks for one embedding for the entire file.
        self._inference = Inference(model, window="whole")

    def embed(self, audio_path: Path) -> list[float]:
        """Return the embedding of the audio file as a flat list of floats.

        Args:
            audio_path: Audio file handed to the pyannote inference object.
        """
        embedding = self._inference(str(audio_path))
        values = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
        # NOTE(review): the model card documents the whole-file result as a
        # (1 x D) array, which tolist() turns into [[...]]; unwrap singleton
        # outer dimensions so float() always receives scalars.
        while len(values) == 1 and isinstance(values[0], (list, tuple)):
            values = list(values[0])
        return [float(value) for value in values]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def cache_voice_sample_embedding(
    connection: Connection,
    voice_sample_id: str,
    backend: EmbeddingBackend,
    storage: StoragePaths | None = None,
) -> CachedEmbedding:
    """Embed a stored voice sample, write the vector to disk and record it.

    Args:
        connection: Database connection holding the ``voice_samples`` table.
        voice_sample_id: Primary key of the voice sample to embed.
        backend: Backend that computes the embedding.
        storage: Storage layout; ``storage_paths()`` is used when falsy.

    Returns:
        The model name, file path and vector of the cached embedding.

    Raises:
        ValueError: If no voice sample with *voice_sample_id* exists.
    """
    row = connection.execute(
        "SELECT person_id, audio_path FROM voice_samples WHERE id = ?", (voice_sample_id,)
    ).fetchone()
    if row is None:
        raise ValueError(f"Voice sample does not exist: {voice_sample_id}")

    layout = storage or storage_paths()
    source_audio = Path(row["audio_path"])
    vector = backend.embed(source_audio)

    # Embeddings are grouped per person below artifacts/embeddings/voice-samples.
    target_dir = layout.artifacts / "embeddings" / "voice-samples" / row["person_id"]
    embedding_path = _write_embedding(target_dir, voice_sample_id, backend.model_name, vector)

    update_sql = """
        UPDATE voice_samples
        SET embedding_model = ?, embedding_path = ?
        WHERE id = ?
    """
    parameters = (backend.model_name, str(embedding_path), voice_sample_id)
    # The connection context manager commits on success and rolls back on error.
    with connection:
        connection.execute(update_sql, parameters)
    return CachedEmbedding(backend.model_name, embedding_path, vector)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def cache_local_speaker_embedding(
    connection: Connection,
    local_speaker_id: str,
    audio_path: Path,
    backend: EmbeddingBackend,
    storage: StoragePaths | None = None,
) -> CachedEmbedding:
    """Embed audio for a local speaker, write the vector and upsert its row.

    Args:
        connection: Database connection holding the speaker tables.
        local_speaker_id: Primary key of the local speaker.
        audio_path: Audio file to embed.
        backend: Backend that computes the embedding.
        storage: Storage layout; ``storage_paths()`` is used when falsy.

    Returns:
        The model name, file path and vector of the cached embedding.

    Raises:
        ValueError: If no local speaker with *local_speaker_id* exists.
    """
    speaker_row = connection.execute(
        "SELECT 1 FROM local_speakers WHERE id = ?", (local_speaker_id,)
    ).fetchone()
    if speaker_row is None:
        raise ValueError(f"Local speaker does not exist: {local_speaker_id}")

    layout = storage or storage_paths()
    vector = backend.embed(audio_path)
    target_dir = layout.artifacts / "embeddings" / "local-speakers"
    embedding_path = _write_embedding(target_dir, local_speaker_id, backend.model_name, vector)

    # A fresh id is generated every time; on conflict the existing row keeps
    # its id and only the audio and embedding paths are refreshed.
    embedding_id = str(uuid4())
    upsert_sql = """
        INSERT INTO local_speaker_embeddings(
            id, local_speaker_id, audio_path, embedding_model, embedding_path
        ) VALUES (?, ?, ?, ?, ?)
        ON CONFLICT(local_speaker_id, embedding_model) DO UPDATE SET
            audio_path = excluded.audio_path,
            embedding_path = excluded.embedding_path
    """
    parameters = (
        embedding_id,
        local_speaker_id,
        str(audio_path),
        backend.model_name,
        str(embedding_path),
    )
    with connection:
        connection.execute(upsert_sql, parameters)
    return CachedEmbedding(backend.model_name, embedding_path, vector)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def read_embedding(path: Path) -> list[float]:
    """Load the ``"vector"`` entry of an embedding JSON file as floats."""
    payload = json.loads(path.read_text())
    return list(map(float, payload["vector"]))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def cosine_similarity(left: list[float], right: list[float]) -> float:
|
|
127
|
+
if len(left) != len(right):
|
|
128
|
+
raise ValueError("Embedding vectors must have the same length.")
|
|
129
|
+
numerator = sum(a * b for a, b in zip(left, right, strict=True))
|
|
130
|
+
left_norm = math.sqrt(sum(a * a for a in left))
|
|
131
|
+
right_norm = math.sqrt(sum(b * b for b in right))
|
|
132
|
+
if left_norm == 0 or right_norm == 0:
|
|
133
|
+
return 0.0
|
|
134
|
+
return numerator / (left_norm * right_norm)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _write_embedding(directory: Path, source_id: str, model_name: str, vector: list[float]) -> Path:
    """Write *vector* as JSON under *directory* and return the file path.

    The file is named ``<source_id>.<model>.json`` where every ``/`` in the
    model name is replaced by ``--``. Missing directories are created.
    """
    model_slug = "--".join(model_name.split("/"))
    target = directory / "{}.{}.json".format(source_id, model_slug)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({"model": model_name, "vector": vector})
    target.write_text(f"{payload}\n")
    return target
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from sqlite3 import Connection
|
|
10
|
+
from uuid import uuid4
|
|
11
|
+
|
|
12
|
+
from fly_on_the_wall.storage import StoragePaths, storage_paths
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class ExportResult:
    """Immutable summary of one export: its id and the paths it produced."""

    # Identifier of the export row.
    id: str
    # Directory that holds all files of this export.
    output_dir: Path
    # Path of the rendered transcript Markdown file.
    transcript_path: Path
    # Path of the analysis Markdown file.
    analysis_path: Path
    # Path of the JSON manifest describing the export.
    manifest_path: Path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def export_markdown_transcript(
    connection: Connection,
    meeting_id: str,
    transcript: str,
    analysis: str,
    storage: StoragePaths | None = None,
) -> ExportResult:
    """Write a Markdown export of a meeting and record it in ``exports``.

    A new directory ``<exports>/<meeting slug>/<timestamp>-<id prefix>`` is
    created containing ``transcript.md``, ``analysis.md`` and
    ``manifest.json``; the manifest stores the SHA-256 of both documents.

    Args:
        connection: Database connection holding the meeting tables.
        meeting_id: Primary key of the meeting to export.
        transcript: Transcript text, one speaker turn per blank-line block.
        analysis: Analysis Markdown; surrounding whitespace is stripped.
        storage: Storage layout; ``storage_paths()`` is used when falsy.

    Returns:
        The export id and the paths of the files that were written.

    Raises:
        ValueError: If no meeting with *meeting_id* exists.
        FileExistsError: If the export directory already exists.
    """
    meeting = connection.execute(
        """
        SELECT meetings.*, audio_metadata.recorded_at, audio_metadata.recorded_at_confidence
        FROM meetings
        LEFT JOIN audio_metadata ON audio_metadata.meeting_id = meetings.id
        WHERE meetings.id = ?
        """,
        (meeting_id,),
    ).fetchone()
    if meeting is None:
        raise ValueError(f"Meeting does not exist: {meeting_id}")

    export_id = str(uuid4())
    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    paths = storage or storage_paths()
    output_dir = paths.exports / meeting["slug"] / f"{timestamp}-{export_id[:8]}"
    transcript_path = output_dir / "transcript.md"
    analysis_path = output_dir / "analysis.md"
    manifest_path = output_dir / "manifest.json"
    # exist_ok=False: an export must never overwrite an earlier one.
    output_dir.mkdir(parents=True, exist_ok=False)

    markdown = _markdown_document(dict(meeting), transcript)
    analysis_markdown = analysis.strip() + "\n"
    # Write the exact UTF-8 bytes that _sha256 hashes. Text mode would use
    # the locale encoding and translate newlines on some platforms, so the
    # manifest hashes would not match the files (and non-ASCII text could
    # fail to encode at all).
    transcript_path.write_bytes(markdown.encode("utf-8"))
    analysis_path.write_bytes(analysis_markdown.encode("utf-8"))
    manifest = {
        "id": export_id,
        "meeting_id": meeting_id,
        "format": "markdown",
        "transcript_path": str(transcript_path),
        "analysis_path": str(analysis_path),
        "transcript_sha256": _sha256(markdown),
        "analysis_sha256": _sha256(analysis_markdown),
    }
    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")

    with connection:
        connection.execute(
            """
            INSERT INTO exports(id, meeting_id, format, output_dir, manifest_path)
            VALUES (?, ?, ?, ?, ?)
            """,
            (export_id, meeting_id, "markdown", str(output_dir), str(manifest_path)),
        )
    return ExportResult(export_id, output_dir, transcript_path, analysis_path, manifest_path)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _markdown_document(meeting: dict, transcript: str) -> str:
    """Render the meeting header and its speaker turns as Markdown.

    The result ends with exactly one newline.
    """
    turns = _readable_turns(transcript)
    people = _participants(turns)
    date, time = _date_time(_meeting_timestamp(meeting))
    people_text = ", ".join(people) if people else "Unknown"

    header = [
        f"# {meeting['title']}",
        "",
        f"Date: {date}",
        f"Time: {time}",
        "Location: Unknown",
        "Position: Unknown",
        f"People: {people_text}",
        "",
        "## Manuscript",
        "",
    ]
    # Each turn becomes one bold-labelled paragraph followed by a blank line.
    body = [line for speaker, text in turns for line in (f"**{speaker}:** {text}", "")]
    document = "\n".join(header + body)
    return document.rstrip() + "\n"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _readable_turns(transcript: str) -> list[tuple[str, str]]:
    """Split *transcript* into ``(speaker, text)`` turns.

    Blocks are separated by blank lines. Speakers parsed as ``"Unknown"`` are
    renamed ``"Unknown speaker N"``, numbered in order of first appearance
    and keyed by their source label when one is present.
    """
    aliases: dict[str, str] = {}
    result: list[tuple[str, str]] = []
    for raw_block in transcript.split("\n\n"):
        chunk = raw_block.strip()
        if not chunk:
            continue
        speaker, text, source_label = _parse_turn(chunk)
        if speaker == "Unknown":
            alias_key = source_label or speaker
            # setdefault only stores the new alias the first time a key is seen.
            speaker = aliases.setdefault(alias_key, f"Unknown speaker {len(aliases) + 1}")
        result.append((speaker, text))
    return result
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _parse_turn(block: str) -> tuple[str, str, str | None]:
    """Split one transcript block into speaker name, text and source label.

    The expected label shape is ``Name [tag] (source): text`` where the
    bracketed tag and the parenthesised source are both optional.

    Args:
        block: One stripped transcript block.

    Returns:
        ``(name, text, source)``. *name* is ``"Unknown"`` when the block has
        no colon or the label carries no name; *source* is the parenthesised
        label, or None when there is none.
    """
    split_at = _label_separator_index(block)
    if split_at < 0:
        return "Unknown", block, None
    speaker = block[:split_at].strip()
    text = block[split_at + 1 :].strip()

    match = re.match(
        r"^(?P<name>.*?)(?:\s+\[[^\]]+\])?(?:\s+\((?P<source>[^)]+)\))?$",
        speaker,
    )
    if match is None:
        return speaker or "Unknown", text, None
    return match.group("name").strip() or "Unknown", text, match.group("source")


def _label_separator_index(block: str) -> int:
    """Return the index of the colon that ends the speaker label, or -1."""
    depth = 0
    for index, char in enumerate(block):
        if char in "[(":
            depth += 1
        elif char in "])":
            depth = max(depth - 1, 0)
        elif char == ":" and depth == 0:
            # Colons inside [...] or (...) (e.g. a "[00:01:23]" tag) belong
            # to the label itself and must not end it.
            return index
    # No colon outside brackets: keep the historical first-colon behaviour.
    return block.find(":")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _participants(turns: list[tuple[str, str]]) -> list[str]:
    """Return the distinct speakers of *turns* in order of first appearance."""
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(speaker for speaker, _ in turns))
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _date_time(created_at: str | None) -> tuple[str, str]:
    """Split a timestamp string into its date and time parts.

    Both ``"YYYY-MM-DD HH:MM:SS"`` and the ISO-8601 form with a ``T``
    separator are understood.

    Args:
        created_at: Timestamp text, or None/empty when unknown.

    Returns:
        ``(date, time)``; a missing part is reported as ``"Unknown"``.
    """
    if not created_at:
        return "Unknown", "Unknown"
    date, separator, time = created_at.partition(" ")
    if not separator:
        # No space: accept the ISO "T" separator, but only after a plain
        # calendar date so unrelated strings containing "T" are untouched.
        iso_date, t_separator, iso_time = created_at.partition("T")
        if t_separator and re.fullmatch(r"\d{4}-\d{2}-\d{2}", iso_date):
            date, time = iso_date, iso_time
    return date or "Unknown", time or "Unknown"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _meeting_timestamp(meeting: dict) -> str | None:
    """Pick the timestamp to show for a meeting.

    The recorded time is used when its confidence is ``"high"`` or
    ``"medium"`` and a value is actually present; otherwise the meeting's
    ``created_at`` is returned (None when that is missing too).
    """
    if meeting.get("recorded_at_confidence") in {"high", "medium"}:
        recorded_at = meeting.get("recorded_at")
        # A trusted confidence with no value must not hide created_at.
        if recorded_at:
            return recorded_at
    return meeting.get("created_at")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _sha256(value: str) -> str:
    """Return the hexadecimal SHA-256 digest of *value* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(value.encode())
    return digest.hexdigest()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def load_glossary_terms(path: Path | None) -> list[str]:
    """Load the distinct glossary terms from a YAML file, sorted.

    Args:
        path: Location of the glossary file, or None when not configured.

    Returns:
        Every string found anywhere in the YAML document, de-duplicated and
        sorted; an empty list when *path* is None or does not exist.
    """
    if path is None or not path.exists():
        return []
    # Read as UTF-8 explicitly: the locale default would mis-decode
    # non-ASCII terms on platforms whose default encoding is not UTF-8.
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    return sorted(set(_collect_terms(data)))
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _collect_terms(value: Any) -> list[str]:
    """Gather every string nested inside *value*, depth-first.

    Strings are returned as-is, lists and dict values are searched
    recursively, and anything else (including None) contributes nothing.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        children = value
    elif isinstance(value, dict):
        # Only the values are searched; dict keys are never terms.
        children = value.values()
    else:
        return []
    return [term for child in children for term in _collect_terms(child)]
|