fow-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {fow_cli-0.2.0 → fow_cli-0.3.0}/CHANGELOG.md +9 -0
  2. {fow_cli-0.2.0 → fow_cli-0.3.0}/PKG-INFO +29 -1
  3. {fow_cli-0.2.0 → fow_cli-0.3.0}/README.md +28 -0
  4. {fow_cli-0.2.0 → fow_cli-0.3.0}/pyproject.toml +1 -1
  5. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/__init__.py +1 -1
  6. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli.py +2 -0
  7. fow_cli-0.3.0/src/fly_on_the_wall/cli_glossary.py +124 -0
  8. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/db.py +11 -1
  9. fow_cli-0.3.0/src/fly_on_the_wall/glossary.py +207 -0
  10. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/processing.py +29 -8
  11. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/providers/elevenlabs.py +19 -4
  12. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/providers/openai_analysis.py +22 -4
  13. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/providers/openai_cleanup.py +12 -3
  14. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/publishing.py +32 -3
  15. fow_cli-0.2.0/src/fly_on_the_wall/glossary.py +0 -31
  16. {fow_cli-0.2.0 → fow_cli-0.3.0}/.gitignore +0 -0
  17. {fow_cli-0.2.0 → fow_cli-0.3.0}/LICENSE +0 -0
  18. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/api_keys.py +0 -0
  19. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/audio.py +0 -0
  20. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/audio_metadata.py +0 -0
  21. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cache.py +0 -0
  22. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cleanup.py +0 -0
  23. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli_costs.py +0 -0
  24. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli_menu.py +0 -0
  25. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli_publish.py +0 -0
  26. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli_speaker_review.py +0 -0
  27. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/cli_watch.py +0 -0
  28. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/config.py +0 -0
  29. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/costs.py +0 -0
  30. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/doctor.py +0 -0
  31. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/embeddings.py +0 -0
  32. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/exporting.py +0 -0
  33. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/meetings.py +0 -0
  34. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/normalization.py +0 -0
  35. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/people.py +0 -0
  36. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/people_embeddings.py +0 -0
  37. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/pipeline.py +0 -0
  38. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/providers/__init__.py +0 -0
  39. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/py.typed +0 -0
  40. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/reanalysis.py +0 -0
  41. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/recording_quality.py +0 -0
  42. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/rendering.py +0 -0
  43. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/secrets.py +0 -0
  44. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/service_pricing.py +0 -0
  45. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/setup.py +0 -0
  46. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/speaker_identity.py +0 -0
  47. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/speaker_matching.py +0 -0
  48. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/speakers.py +0 -0
  49. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/storage.py +0 -0
  50. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/voice_samples.py +0 -0
  51. {fow_cli-0.2.0 → fow_cli-0.3.0}/src/fly_on_the_wall/watch.py +0 -0
@@ -2,6 +2,15 @@
2
2
 
3
3
  All notable changes to Fly on the Wall are documented here.
4
4
 
5
+ ## [0.3.0] - 2026-06-13
6
+
7
+ ### Added
8
+
9
+ - Added glossary management with `fow glossary` commands.
10
+ - Added glossary and known-person hints for ElevenLabs transcription keyterms.
11
+ - Added glossary guidance to OpenAI cleanup, analysis, and title generation.
12
+ - Added Obsidian `participants` frontmatter links for known meeting speakers.
13
+
5
14
  ## [0.2.0] - 2026-06-09
6
15
 
7
16
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fow-cli
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Personal CLI note-taker for turning meeting audio into cleaned meeting manuscripts.
5
5
  Project-URL: Repository, https://github.com/henriksvensson/fly-on-the-wall
6
6
  License-Expression: MIT
@@ -50,6 +50,8 @@ Issues and suggestions are welcome via GitHub Issues, but the project is provide
50
50
 
51
51
  Audio is sent to configured transcription/AI providers during processing. Optional speaker identity embeddings run locally when installed with the `identity` extra. External providers may charge usage-based fees depending on your provider account, pricing plan, and processing volume.
52
52
 
53
+ Glossary/keyterm hints are sent to ElevenLabs when processing new recordings. ElevenLabs currently documents this as a billable add-on to speech-to-text usage.
54
+
53
55
  ## Development Transparency
54
56
 
55
57
  This project was developed as an agentic coding project using [OpenCode](https://opencode.ai/) with [OpenAI](https://openai.com/) GPT-5.5. Code quality checks were supported by CodeScene's [CodeHealth](https://codescene.com/product/code-health) analysis.
@@ -271,6 +273,32 @@ fow people embeddings status
271
273
  fow people embeddings backfill
272
274
  ```
273
275
 
276
+ ## Glossary
277
+
278
+ Use the glossary for names, company names, project names, product names, acronyms, and domain-specific phrases that transcription or cleanup models may spell incorrectly.
279
+
280
+ Add terms with optional context:
281
+
282
+ ```bash
283
+ fow glossary add "Hejare" --description "Company name"
284
+ fow glossary add "Datadrivna" --description "The phrase data driven in Swedish"
285
+ fow glossary add "Ants" --description "Company name"
286
+ fow glossary add "TT" --description "Company name, short for Theodora Tech"
287
+ ```
288
+
289
+ Manage terms:
290
+
291
+ ```bash
292
+ fow glossary list
293
+ fow glossary show "Hejare"
294
+ fow glossary update "TT" --description "Company name, short for Theodora Tech"
295
+ fow glossary disable "Ants"
296
+ fow glossary enable "Ants"
297
+ fow glossary remove "Ants"
298
+ ```
299
+
300
+ During processing, `fow` combines enabled glossary terms with known people names. The combined list is sent to ElevenLabs as transcription keyterms for new transcriptions, and to OpenAI cleanup, analysis, and title generation as spelling/context guidance. Corrections are model-mediated; `fow` does not do deterministic search-and-replace from the glossary.
301
+
274
302
  ## Watched Folders
275
303
 
276
304
  Fly on the Wall can watch local folders, mounted Dropbox/rclone folders, and removable recorder folders.
@@ -20,6 +20,8 @@ Issues and suggestions are welcome via GitHub Issues, but the project is provide
20
20
 
21
21
  Audio is sent to configured transcription/AI providers during processing. Optional speaker identity embeddings run locally when installed with the `identity` extra. External providers may charge usage-based fees depending on your provider account, pricing plan, and processing volume.
22
22
 
23
+ Glossary/keyterm hints are sent to ElevenLabs when processing new recordings. ElevenLabs currently documents this as a billable add-on to speech-to-text usage.
24
+
23
25
  ## Development Transparency
24
26
 
25
27
  This project was developed as an agentic coding project using [OpenCode](https://opencode.ai/) with [OpenAI](https://openai.com/) GPT-5.5. Code quality checks were supported by CodeScene's [CodeHealth](https://codescene.com/product/code-health) analysis.
@@ -241,6 +243,32 @@ fow people embeddings status
241
243
  fow people embeddings backfill
242
244
  ```
243
245
 
246
+ ## Glossary
247
+
248
+ Use the glossary for names, company names, project names, product names, acronyms, and domain-specific phrases that transcription or cleanup models may spell incorrectly.
249
+
250
+ Add terms with optional context:
251
+
252
+ ```bash
253
+ fow glossary add "Hejare" --description "Company name"
254
+ fow glossary add "Datadrivna" --description "The phrase data driven in Swedish"
255
+ fow glossary add "Ants" --description "Company name"
256
+ fow glossary add "TT" --description "Company name, short for Theodora Tech"
257
+ ```
258
+
259
+ Manage terms:
260
+
261
+ ```bash
262
+ fow glossary list
263
+ fow glossary show "Hejare"
264
+ fow glossary update "TT" --description "Company name, short for Theodora Tech"
265
+ fow glossary disable "Ants"
266
+ fow glossary enable "Ants"
267
+ fow glossary remove "Ants"
268
+ ```
269
+
270
+ During processing, `fow` combines enabled glossary terms with known people names. The combined list is sent to ElevenLabs as transcription keyterms for new transcriptions, and to OpenAI cleanup, analysis, and title generation as spelling/context guidance. Corrections are model-mediated; `fow` does not do deterministic search-and-replace from the glossary.
271
+
244
272
  ## Watched Folders
245
273
 
246
274
  Fly on the Wall can watch local folders, mounted Dropbox/rclone folders, and removable recorder folders.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "fow-cli"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  description = "Personal CLI note-taker for turning meeting audio into cleaned meeting manuscripts."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -1,3 +1,3 @@
1
1
  """Fly on the Wall CLI application."""
2
2
 
3
- __version__ = "0.2.0"
3
+ __version__ = "0.3.0"
@@ -9,6 +9,7 @@ from rich.table import Table
9
9
 
10
10
  from fly_on_the_wall import __version__
11
11
  from fly_on_the_wall.cli_costs import costs_app
12
+ from fly_on_the_wall.cli_glossary import glossary_app
12
13
  from fly_on_the_wall.cli_publish import publish_app
13
14
  from fly_on_the_wall.cli_speaker_review import speakers_review
14
15
  from fly_on_the_wall.cli_watch import watch_app
@@ -78,6 +79,7 @@ app.add_typer(meetings_app, name="meetings")
78
79
  meetings_app.add_typer(meeting_speakers_app, name="speakers")
79
80
  app.add_typer(refresh_app, name="refresh")
80
81
  app.add_typer(secrets_app, name="secrets")
82
+ app.add_typer(glossary_app, name="glossary")
81
83
  app.add_typer(watch_app, name="watch")
82
84
  app.add_typer(publish_app, name="publish")
83
85
  app.add_typer(costs_app, name="costs")
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Annotated
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from fly_on_the_wall.db import database
10
+ from fly_on_the_wall.glossary import (
11
+ create_glossary_term,
12
+ get_glossary_term,
13
+ list_glossary_terms,
14
+ remove_glossary_term,
15
+ update_glossary_term,
16
+ )
17
+
18
+ glossary_app = typer.Typer(help="Manage transcription and cleanup glossary terms.", no_args_is_help=True)
19
+ console = Console()
20
+
21
+
22
@glossary_app.command("add")
def glossary_add(
    term: str,
    description: Annotated[str | None, typer.Option("--description", "-d", help="Optional context.")] = None,
) -> None:
    """Add a word or phrase to the glossary."""
    # NOTE: the docstring above doubles as the Typer help text, so it stays short.
    with database() as connection:
        try:
            created = create_glossary_term(connection, term, description)
        except ValueError as exc:
            # markup=False: the message may echo user-supplied text, and Rich would
            # otherwise interpret square brackets in it as console markup.
            console.print(str(exc), markup=False)
            raise typer.Exit(code=1) from exc
    # The stored term is user input; print it literally rather than as Rich markup
    # (a term such as "[/x]" would raise MarkupError, "[bold]" would vanish).
    console.print(f"Added glossary term: {created.term}", markup=False)
35
+
36
+
37
@glossary_app.command("list")
def glossary_list(
    all_terms: Annotated[bool, typer.Option("--all", help="Include disabled terms.")] = False,
) -> None:
    """List glossary terms."""
    # Local import keeps this block self-contained; rich is already a dependency
    # of this module (Console, Table).
    from rich.markup import escape

    with database() as connection:
        terms = list_glossary_terms(connection, include_disabled=all_terms)
    if not terms:
        console.print("No glossary terms found.")
        return

    table = Table(title="Glossary")
    table.add_column("Term")
    table.add_column("Description")
    table.add_column("Enabled")
    for term in terms:
        # Table cells given as str are parsed as Rich markup; escape the
        # user-supplied values so brackets are shown literally instead of being
        # interpreted as (possibly invalid) style tags.
        table.add_row(
            escape(term.term),
            escape(term.description or ""),
            "yes" if term.enabled else "no",
        )
    console.print(table)
55
+
56
+
57
@glossary_app.command("show")
def glossary_show(term: str) -> None:
    """Show one glossary term."""
    with database() as connection:
        found = get_glossary_term(connection, term)
    if found is None:
        # markup=False throughout: every line below echoes user-supplied text,
        # which must not be parsed as Rich console markup.
        console.print(f"Glossary term not found: {term}", markup=False)
        raise typer.Exit(code=1)
    console.print(f"Term: {found.term}", markup=False)
    console.print(f"Description: {found.description or ''}", markup=False)
    console.print(f"Enabled: {'yes' if found.enabled else 'no'}", markup=False)
    console.print(f"ID: {found.id}", markup=False)
69
+
70
+
71
@glossary_app.command("update")
def glossary_update(
    term: str,
    new_term: Annotated[str | None, typer.Option("--term", help="Replace the glossary term text.")] = None,
    description: Annotated[str | None, typer.Option("--description", "-d", help="Replace the description.")] = None,
) -> None:
    """Update a glossary term or description."""
    with database() as connection:
        try:
            updated = update_glossary_term(connection, term, term=new_term, description=description)
        except ValueError as exc:
            # The error text can contain the term the user typed; print it
            # literally so square brackets are not treated as Rich markup.
            console.print(str(exc), markup=False)
            raise typer.Exit(code=1) from exc
    console.print(f"Updated glossary term: {updated.term}", markup=False)
85
+
86
+
87
@glossary_app.command("enable")
def glossary_enable(term: str) -> None:
    """Enable a glossary term."""
    # Thin CLI wrapper: the shared helper performs the update and reporting.
    _set_enabled(term, enabled=True)
91
+
92
+
93
@glossary_app.command("disable")
def glossary_disable(term: str) -> None:
    """Disable a glossary term without deleting it."""
    # Thin CLI wrapper: the shared helper performs the update and reporting.
    _set_enabled(term, enabled=False)
97
+
98
+
99
@glossary_app.command("remove")
def glossary_remove(
    term: str,
    yes: Annotated[bool, typer.Option("--yes", "-y", help="Remove without confirmation.")] = False,
) -> None:
    """Remove a glossary term."""
    # Ask before touching the database unless --yes was given.
    if not yes and not typer.confirm(f"Remove glossary term '{term}'?", default=False):
        console.print("Cancelled.")
        return
    with database() as connection:
        removed = remove_glossary_term(connection, term)
    if not removed:
        # markup=False: the term is user input and must not be parsed as Rich
        # markup (e.g. "[/x]" would raise MarkupError instead of being shown).
        console.print(f"Glossary term not found: {term}", markup=False)
        raise typer.Exit(code=1)
    console.print(f"Removed glossary term: {term}", markup=False)
114
+
115
+
116
def _set_enabled(term: str, enabled: bool) -> None:
    """Set the enabled flag of a glossary term and report the outcome.

    Shared by the ``enable`` and ``disable`` commands. Exits with status 1
    when the update raises ``ValueError`` (for example, term not found).
    """
    with database() as connection:
        try:
            updated = update_glossary_term(connection, term, enabled=enabled)
        except ValueError as exc:
            # The message echoes what the user typed; print it literally so
            # square brackets are not interpreted as Rich console markup.
            console.print(str(exc), markup=False)
            raise typer.Exit(code=1) from exc
    state = "Enabled" if enabled else "Disabled"
    console.print(f"{state} glossary term: {updated.term}", markup=False)
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
 
9
9
  from fly_on_the_wall.storage import ensure_storage_layout, storage_paths
10
10
 
11
- SCHEMA_VERSION = 17
11
+ SCHEMA_VERSION = 18
12
12
 
13
13
  SCHEMA_STATEMENTS = (
14
14
  """
@@ -43,6 +43,16 @@ SCHEMA_STATEMENTS = (
43
43
  )
44
44
  """,
45
45
  """
46
+ CREATE TABLE IF NOT EXISTS glossary_terms (
47
+ id TEXT PRIMARY KEY,
48
+ term TEXT NOT NULL UNIQUE,
49
+ description TEXT,
50
+ enabled INTEGER NOT NULL DEFAULT 1,
51
+ created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
52
+ updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
53
+ )
54
+ """,
55
+ """
46
56
  CREATE TABLE IF NOT EXISTS pipeline_stages (
47
57
  id INTEGER PRIMARY KEY AUTOINCREMENT,
48
58
  meeting_id TEXT NOT NULL,
@@ -0,0 +1,207 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from sqlite3 import Connection
6
+ from typing import Any
7
+ from uuid import uuid4
8
+
9
+ import yaml
10
+
11
+ UNSUPPORTED_KEYTERM_CHARS = set("<>{}[]\\")
12
+
13
+
14
@dataclass(frozen=True)
class GlossaryTerm:
    """Immutable view of one row of the ``glossary_terms`` table."""

    # Primary key of the row.
    id: str
    # The word or phrase itself.
    term: str
    # Optional free-text context; None when no description is stored.
    description: str | None
    # Whether the term is currently enabled.
    enabled: bool
20
+
21
+
22
def create_glossary_term(connection: Connection, term: str, description: str | None = None) -> GlossaryTerm:
    """Insert a new glossary term and return the stored row.

    The term and description are whitespace-normalized before storage; a
    description that normalizes to an empty string is stored as NULL.

    Raises:
        ValueError: if the term is empty after normalization, or if a term
            with the same text already exists.
    """
    from sqlite3 import IntegrityError

    normalized = _normalize_term(term)
    normalized_description = _normalize_optional(description)
    term_id = str(uuid4())
    try:
        with connection:
            connection.execute(
                """
                INSERT INTO glossary_terms(id, term, description)
                VALUES (?, ?, ?)
                """,
                (term_id, normalized, normalized_description),
            )
    except IntegrityError as exc:
        # The table declares `term` UNIQUE. Surface the collision as the
        # ValueError callers already handle instead of a raw sqlite3 error.
        raise ValueError(f"Glossary term already exists: {normalized}") from exc
    # Re-read by the freshly generated id so the lookup cannot match another row.
    return get_glossary_term(connection, term_id)  # type: ignore[return-value]
35
+
36
+
37
def get_glossary_term(connection: Connection, term_or_id: str) -> GlossaryTerm | None:
    """Look up a glossary term by its id or by its exact term text.

    Returns the first matching row as a ``GlossaryTerm``, or ``None`` when
    nothing matches.
    """
    cursor = connection.execute(
        """
        SELECT * FROM glossary_terms
        WHERE id = ? OR term = ?
        """,
        (term_or_id, term_or_id),
    )
    match = cursor.fetchone()
    if match is None:
        return None
    return _term_from_row(match)
46
+
47
+
48
def list_glossary_terms(connection: Connection, include_disabled: bool = False) -> list[GlossaryTerm]:
    """Return glossary terms ordered case-insensitively by term text.

    Only enabled terms are returned unless ``include_disabled`` is true.
    """
    enabled_filter = "" if include_disabled else " WHERE enabled = 1"
    statement = "SELECT * FROM glossary_terms" + enabled_filter + " ORDER BY lower(term)"
    rows = connection.execute(statement).fetchall()
    return [_term_from_row(row) for row in rows]
54
+
55
+
56
def update_glossary_term(
    connection: Connection,
    term_or_id: str,
    *,
    term: str | None = None,
    description: str | None = None,
    enabled: bool | None = None,
) -> GlossaryTerm:
    """Update the text, description and/or enabled flag of a glossary term.

    Arguments left as ``None`` keep the stored value. A description that
    normalizes to an empty string clears the stored description.

    Raises:
        ValueError: if no term matches ``term_or_id``, if the new term text is
            empty after normalization, or if the new term text collides with
            another existing term.
    """
    from sqlite3 import IntegrityError

    existing = get_glossary_term(connection, term_or_id)
    if existing is None:
        raise ValueError(f"Glossary term not found: {term_or_id}")

    updated_term = existing.term if term is None else _normalize_term(term)
    updated_description = existing.description if description is None else _normalize_optional(description)
    updated_enabled = existing.enabled if enabled is None else enabled
    try:
        with connection:
            connection.execute(
                """
                UPDATE glossary_terms
                SET term = ?,
                    description = ?,
                    enabled = ?,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (updated_term, updated_description, int(updated_enabled), existing.id),
            )
    except IntegrityError as exc:
        # Renaming onto an existing term violates the UNIQUE constraint on
        # `term`; report it as the ValueError callers already handle.
        raise ValueError(f"Glossary term already exists: {updated_term}") from exc
    return get_glossary_term(connection, existing.id)  # type: ignore[return-value]
84
+
85
+
86
def remove_glossary_term(connection: Connection, term_or_id: str) -> bool:
    """Delete the glossary term matching ``term_or_id``.

    Returns ``True`` when a term was found and deleted, ``False`` when no
    term matched.
    """
    target = get_glossary_term(connection, term_or_id)
    if target is not None:
        # Delete by primary key inside a transaction.
        with connection:
            connection.execute("DELETE FROM glossary_terms WHERE id = ?", (target.id,))
    return target is not None
93
+
94
+
95
def glossary_prompt_lines(connection: Connection, legacy_glossary_path: Path | None = None) -> list[str]:
    """Build the glossary lines used as prompt guidance.

    Lines come from three sources, in this order: enabled database terms
    (rendered as ``term: description`` when a description exists), terms from
    the legacy glossary file, and people display names. Entries are
    de-duplicated case-insensitively; the first occurrence wins.
    """
    collected: list[str] = []
    known: set[str] = set()

    def remember(name: str, line: str) -> None:
        # Keep `line` only if `name` has not been seen yet (casefold compare).
        folded = name.casefold()
        if folded in known:
            return
        known.add(folded)
        collected.append(line)

    for entry in list_glossary_terms(connection):
        rendered = f"{entry.term}: {entry.description}" if entry.description else entry.term
        remember(entry.term, rendered)

    for legacy in load_glossary_terms(legacy_glossary_path):
        remember(legacy, legacy)

    for person in _people_names(connection):
        remember(person, person)

    return collected
122
+
123
+
124
def transcription_keyterms(connection: Connection, legacy_glossary_path: Path | None = None) -> list[str]:
    """Collect keyterms to send with a transcription request.

    Candidates are taken from enabled database terms, then the legacy
    glossary file, then people display names. Each candidate goes through
    ``_append_keyterm``, and the result is capped at the first 1000 entries.
    """
    accepted: list[str] = []
    known: set[str] = set()
    # Producers are called lazily, one after the other, in priority order.
    producers = (
        lambda: [item.term for item in list_glossary_terms(connection)],
        lambda: load_glossary_terms(legacy_glossary_path),
        lambda: _people_names(connection),
    )
    for produce in producers:
        for candidate in produce():
            _append_keyterm(accepted, known, candidate)
    return accepted[:1000]
134
+
135
+
136
def load_glossary_terms(path: Path | None) -> list[str]:
    """Load terms from a legacy YAML glossary file.

    Returns an empty list when ``path`` is ``None`` or does not exist.
    Otherwise every string found anywhere in the YAML document is collected,
    de-duplicated, and returned sorted case-insensitively.
    """
    if path is None or not path.exists():
        return []
    # Read explicitly as UTF-8: the locale default would mis-decode non-ASCII
    # terms (e.g. Swedish names) on platforms whose default is not UTF-8.
    data = yaml.safe_load(path.read_text(encoding="utf-8"))
    return sorted(set(_collect_terms(data)), key=str.casefold)
141
+
142
+
143
+ def _append_keyterm(terms: list[str], seen: set[str], value: str) -> None:
144
+ normalized = " ".join(value.split())
145
+ key = normalized.casefold()
146
+ if key in seen or not _valid_keyterm(normalized):
147
+ return
148
+ seen.add(key)
149
+ terms.append(normalized)
150
+
151
+
152
+ def _valid_keyterm(value: str) -> bool:
153
+ return (
154
+ bool(value)
155
+ and len(value) < 50
156
+ and len(value.split()) <= 5
157
+ and not any(char in value for char in UNSUPPORTED_KEYTERM_CHARS)
158
+ )
159
+
160
+
161
+ def _people_names(connection: Connection) -> list[str]:
162
+ return [
163
+ str(row["display_name"])
164
+ for row in connection.execute("SELECT display_name FROM people ORDER BY lower(display_name)").fetchall()
165
+ ]
166
+
167
+
168
+ def _normalize_term(value: str) -> str:
169
+ normalized = " ".join(value.split())
170
+ if not normalized:
171
+ raise ValueError("Glossary term cannot be empty")
172
+ return normalized
173
+
174
+
175
+ def _normalize_optional(value: str | None) -> str | None:
176
+ if value is None:
177
+ return None
178
+ normalized = " ".join(value.split())
179
+ return normalized or None
180
+
181
+
182
def _term_from_row(row: Any) -> GlossaryTerm:
    """Convert a ``glossary_terms`` row (indexable by column name) to a ``GlossaryTerm``."""
    identifier = row["id"]
    text = row["term"]
    note = row["description"]
    # The enabled column is an INTEGER in the database; expose it as a bool.
    is_enabled = bool(row["enabled"])
    return GlossaryTerm(id=identifier, term=text, description=note, enabled=is_enabled)
189
+
190
+
191
+ def _collect_terms(value: Any) -> list[str]:
192
+ if value is None:
193
+ return []
194
+ if isinstance(value, str):
195
+ normalized = " ".join(value.split())
196
+ return [normalized] if normalized else []
197
+ if isinstance(value, list):
198
+ terms: list[str] = []
199
+ for item in value:
200
+ terms.extend(_collect_terms(item))
201
+ return terms
202
+ if isinstance(value, dict):
203
+ terms = []
204
+ for item in value.values():
205
+ terms.extend(_collect_terms(item))
206
+ return terms
207
+ return []
@@ -13,7 +13,7 @@ from fly_on_the_wall.config import AppConfig
13
13
  from fly_on_the_wall.costs import record_openai_usage
14
14
  from fly_on_the_wall.embeddings import EmbeddingBackend
15
15
  from fly_on_the_wall.exporting import ExportResult, export_markdown_transcript
16
- from fly_on_the_wall.glossary import load_glossary_terms
16
+ from fly_on_the_wall.glossary import glossary_prompt_lines, transcription_keyterms
17
17
  from fly_on_the_wall.meetings import (
18
18
  Meeting,
19
19
  get_meeting,
@@ -107,8 +107,17 @@ def process_audio(
107
107
  existing_provider_run = latest_completed_provider_run(connection, meeting.id)
108
108
  if existing_provider_run is None:
109
109
  with timed_progress.step("Transcribing audio with ElevenLabs"):
110
- resolved_transcribe = transcribe_fn or _run_elevenlabs_transcription
111
- provider_run_id = resolved_transcribe(connection, meeting.id, meeting.imported_audio_path, paths)
110
+ if transcribe_fn is not None:
111
+ provider_run_id = transcribe_fn(connection, meeting.id, meeting.imported_audio_path, paths)
112
+ else:
113
+ keyterms = transcription_keyterms(connection, config.glossary_path)
114
+ provider_run_id = _run_elevenlabs_transcription(
115
+ connection,
116
+ meeting.id,
117
+ meeting.imported_audio_path,
118
+ paths,
119
+ keyterms,
120
+ )
112
121
  else:
113
122
  timed_progress.message("Reusing completed ElevenLabs transcription")
114
123
  provider_run_id = existing_provider_run["id"]
@@ -200,7 +209,7 @@ def _cleanup_transcript(context: RefreshContext, deterministic_transcript: str)
200
209
  if context.config.cleanup_mode != "light" or not get_api_key("openai"):
201
210
  return TranscriptArtifacts(deterministic_transcript, deterministic_transcript)
202
211
 
203
- glossary_terms = load_glossary_terms(context.config.glossary_path)
212
+ glossary_terms = glossary_prompt_lines(context.connection, context.config.glossary_path)
204
213
  cleanup_cache_key = text_sha256(
205
214
  "\n".join(
206
215
  [
@@ -260,7 +269,10 @@ def _suggest_and_apply_title(
260
269
  if meeting.get("title_source") == "manual":
261
270
  return
262
271
 
263
- title_cache_key = text_sha256("\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", transcript, analysis]))
272
+ glossary_terms = glossary_prompt_lines(context.connection, context.config.glossary_path)
273
+ title_cache_key = text_sha256(
274
+ "\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", "\n".join(glossary_terms), transcript, analysis])
275
+ )
264
276
  title_cache_dir = context.paths.artifacts / context.meeting.id / "generated-title"
265
277
  cached_title = read_cached_text(title_cache_dir, title_cache_key)
266
278
  if cached_title is not None:
@@ -274,6 +286,7 @@ def _suggest_and_apply_title(
274
286
  transcript,
275
287
  analysis,
276
288
  meeting_context=context.description,
289
+ glossary_terms=glossary_terms,
277
290
  options=OpenAIRequestOptions(
278
291
  usage_callback=lambda response: _record_openai_usage(
279
292
  context, DEFAULT_ANALYSIS_MODEL, "title", response
@@ -291,9 +304,13 @@ def _suggest_and_apply_title(
291
304
 
292
305
 
293
306
  def _run_elevenlabs_transcription(
294
- connection: Connection, meeting_id: str, audio_path: Path, storage: StoragePaths
307
+ connection: Connection,
308
+ meeting_id: str,
309
+ audio_path: Path,
310
+ storage: StoragePaths,
311
+ keyterms: list[str] | None = None,
295
312
  ) -> str:
296
- return run_transcription(connection, meeting_id, audio_path, storage)
313
+ return run_transcription(connection, meeting_id, audio_path, storage, keyterms=keyterms)
297
314
 
298
315
 
299
316
  def _meeting_from_database(connection: Connection, meeting_id: str) -> Meeting:
@@ -398,7 +415,10 @@ def _analyze_transcript(
398
415
  if not get_api_key("openai"):
399
416
  return fallback_analysis("OPENAI_API_KEY is missing")
400
417
 
401
- analysis_cache_key = text_sha256("\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", transcript]))
418
+ glossary_terms = glossary_prompt_lines(context.connection, context.config.glossary_path)
419
+ analysis_cache_key = text_sha256(
420
+ "\n".join([DEFAULT_ANALYSIS_MODEL, context.description or "", "\n".join(glossary_terms), transcript])
421
+ )
402
422
  analysis_cache_dir = context.paths.artifacts / context.meeting.id / "analysis"
403
423
  cached_analysis = read_cached_text(analysis_cache_dir, analysis_cache_key)
404
424
  if cached_analysis is not None:
@@ -411,6 +431,7 @@ def _analyze_transcript(
411
431
  AnalysisRequest(
412
432
  transcript,
413
433
  meeting_context=context.description,
434
+ glossary_terms=glossary_terms,
414
435
  options=OpenAIRequestOptions(
415
436
  usage_callback=lambda response: _record_openai_usage(
416
437
  context, DEFAULT_ANALYSIS_MODEL, "analysis", response
@@ -28,6 +28,7 @@ def transcribe_audio(
28
28
  num_speakers: int | None = None,
29
29
  diarization_threshold: float | None = None,
30
30
  no_verbatim: bool = False,
31
+ keyterms: list[str] | None = None,
31
32
  ) -> dict[str, Any]:
32
33
  resolved_api_key = api_key or get_api_key(PROVIDER)
33
34
  if not resolved_api_key:
@@ -46,6 +47,8 @@ def transcribe_audio(
46
47
  data["num_speakers"] = str(num_speakers)
47
48
  if diarization_threshold is not None:
48
49
  data["diarization_threshold"] = str(diarization_threshold)
50
+ if keyterms:
51
+ data["keyterms"] = json.dumps(keyterms, ensure_ascii=False)
49
52
 
50
53
  close_client = client is None
51
54
  http_client = client or httpx.Client(timeout=600)
@@ -76,15 +79,17 @@ def run_transcription(
76
79
  storage: StoragePaths | None = None,
77
80
  client: httpx.Client | None = None,
78
81
  api_key: str | None = None,
82
+ keyterms: list[str] | None = None,
79
83
  ) -> str:
80
84
  paths = storage or storage_paths()
81
85
  provider_run_id = str(uuid4())
82
86
  raw_response_path = paths.artifacts / meeting_id / "provider-runs" / f"{provider_run_id}.raw.json"
83
87
  raw_response_path.parent.mkdir(parents=True, exist_ok=True)
84
88
 
85
- _insert_provider_run(connection, provider_run_id, meeting_id, raw_response_path, "running")
89
+ settings = {"keyterms": keyterms or []}
90
+ _insert_provider_run(connection, provider_run_id, meeting_id, raw_response_path, "running", settings)
86
91
  try:
87
- response = transcribe_audio(audio_path, api_key=api_key, client=client)
92
+ response = transcribe_audio(audio_path, api_key=api_key, client=client, keyterms=keyterms)
88
93
  raw_response_path.write_text(json.dumps(response, indent=2, ensure_ascii=False) + "\n")
89
94
  duration = float(response.get("audio_duration_secs") or 0)
90
95
  record_service_usage(
@@ -112,6 +117,7 @@ def _insert_provider_run(
112
117
  meeting_id: str,
113
118
  raw_response_path: Path,
114
119
  status: str,
120
+ settings: dict[str, Any] | None = None,
115
121
  ) -> None:
116
122
  with connection:
117
123
  connection.execute(
@@ -121,11 +127,20 @@ def _insert_provider_run(
121
127
  meeting_id,
122
128
  provider,
123
129
  model,
130
+ settings_json,
124
131
  raw_response_path,
125
132
  status
126
- ) VALUES (?, ?, ?, ?, ?, ?)
133
+ ) VALUES (?, ?, ?, ?, ?, ?, ?)
127
134
  """,
128
- (provider_run_id, meeting_id, PROVIDER, MODEL, str(raw_response_path), status),
135
+ (
136
+ provider_run_id,
137
+ meeting_id,
138
+ PROVIDER,
139
+ MODEL,
140
+ json.dumps(settings or {}, sort_keys=True),
141
+ str(raw_response_path),
142
+ status,
143
+ ),
129
144
  )
130
145
 
131
146
 
@@ -28,6 +28,7 @@ class OpenAIRequestOptions:
28
28
  class AnalysisRequest:
29
29
  transcript_markdown: str
30
30
  meeting_context: str | None = None
31
+ glossary_terms: list[str] | None = None
31
32
  options: OpenAIRequestOptions = field(default_factory=OpenAIRequestOptions)
32
33
 
33
34
 
@@ -36,6 +37,7 @@ class TitleRequest:
36
37
  transcript_markdown: str
37
38
  analysis_markdown: str
38
39
  meeting_context: str | None = None
40
+ glossary_terms: list[str] | None = None
39
41
  options: OpenAIRequestOptions = field(default_factory=OpenAIRequestOptions)
40
42
 
41
43
 
@@ -50,7 +52,7 @@ class ChatCompletionRequest:
50
52
  def analyze_meeting(request: AnalysisRequest) -> str:
51
53
  return _post_chat_completion(
52
54
  ChatCompletionRequest(
53
- system_prompt=_system_prompt(request.meeting_context),
55
+ system_prompt=_system_prompt(request.meeting_context, request.glossary_terms),
54
56
  user_prompt=request.transcript_markdown,
55
57
  options=request.options,
56
58
  timeout_seconds=180,
@@ -61,7 +63,7 @@ def analyze_meeting(request: AnalysisRequest) -> str:
61
63
  def suggest_meeting_title(request: TitleRequest) -> str:
62
64
  content = _post_chat_completion(
63
65
  ChatCompletionRequest(
64
- system_prompt=_title_system_prompt(request.meeting_context),
66
+ system_prompt=_title_system_prompt(request.meeting_context, request.glossary_terms),
65
67
  user_prompt=(f"Transcript:\n{request.transcript_markdown}\n\nAnalysis:\n{request.analysis_markdown}"),
66
68
  options=request.options,
67
69
  timeout_seconds=60,
@@ -148,8 +150,9 @@ None identified.
148
150
  """.strip()
149
151
 
150
152
 
151
- def _system_prompt(meeting_context: str | None) -> str:
153
+ def _system_prompt(meeting_context: str | None, glossary_terms: list[str] | None) -> str:
152
154
  context = meeting_context or "none"
155
+ glossary = _format_glossary(glossary_terms)
153
156
  return f"""
154
157
  You analyze meeting transcripts for a personal note-taker.
155
158
  Return concise Markdown with exactly these headings:
@@ -163,12 +166,17 @@ Return concise Markdown with exactly these headings:
163
166
  Keep it short and prioritized. Do not invent facts.
164
167
  If a section has no useful content, write "None identified."
165
168
  For action items, use: - Owner: task. Due: date or Not mentioned.
169
+ Use the glossary spellings when the transcript appears to refer to these names or domain terms.
170
+ Do not insert glossary terms unless the transcript context supports them.
166
171
  Meeting context: {context}
172
+ Known names and terms:
173
+ {glossary}
167
174
  """.strip()
168
175
 
169
176
 
170
- def _title_system_prompt(meeting_context: str | None) -> str:
177
+ def _title_system_prompt(meeting_context: str | None, glossary_terms: list[str] | None) -> str:
171
178
  context = meeting_context or "none"
179
+ glossary = _format_glossary(glossary_terms)
172
180
  return f"""
173
181
  You name meeting transcripts for a personal note-taker.
174
182
  Return only one title, with no Markdown, labels, quotes, or punctuation wrapper.
@@ -177,10 +185,20 @@ Prefer concrete names, projects, organizations, and topics from the transcript.
177
185
  Do not include dates unless the date is central to the meeting topic.
178
186
  Do not return generic titles like "Meeting Summary" or "Team Meeting".
179
187
  If the transcript has no meaningful content, return an empty string.
188
+ Use the glossary spellings when the transcript appears to refer to these names or domain terms.
189
+ Do not insert glossary terms unless the transcript context supports them.
180
190
  Meeting context: {context}
191
+ Known names and terms:
192
+ {glossary}
181
193
  """.strip()
182
194
 
183
195
 
196
+ def _format_glossary(glossary_terms: list[str] | None) -> str:
197
+ if not glossary_terms:
198
+ return "- none"
199
+ return "\n".join(f"- {term}" for term in glossary_terms)
200
+
201
+
184
202
  def _extract_content(response: dict[str, Any]) -> str:
185
203
  try:
186
204
  content = response["choices"][0]["message"]["content"]
@@ -10,7 +10,7 @@ from fly_on_the_wall.secrets import get_api_key
10
10
  API_URL = "https://api.openai.com/v1/chat/completions"
11
11
  DEFAULT_MODEL = "gpt-5.4-mini"
12
12
  DEFAULT_CLEANUP_TIMEOUT_SECONDS = 1800
13
- CLEANUP_PROMPT_VERSION = "2026-06-04-manuscript-cleanup-v4"
13
+ CLEANUP_PROMPT_VERSION = "2026-06-13-manuscript-cleanup-glossary-v5"
14
14
 
15
15
 
16
16
  class OpenAICleanupError(RuntimeError):
@@ -61,7 +61,7 @@ def cleanup_transcript(
61
61
 
62
62
 
63
63
  def _system_prompt(glossary_terms: list[str] | None, meeting_context: str | None) -> str:
64
- glossary = ", ".join(glossary_terms or []) or "none"
64
+ glossary = _format_glossary(glossary_terms)
65
65
  context = meeting_context or "none"
66
66
  return f"""
67
67
  You clean meeting transcripts into readable manuscript-style dialogue.
@@ -78,12 +78,21 @@ of an idiom, or used with clear literal/comparative meaning, such as "på samma
78
78
  Prefer complete readable sentences over literal STT fragments, but do not summarize,
79
79
  invent details, remove uncertainty markers, or add new content.
80
80
  Preserve standalone acknowledgements such as yes/no/okay/mm and Swedish ja/nej/okej/mm.
81
+ Use the glossary spellings when the transcript appears to refer to these names or domain terms.
82
+ Do not insert glossary terms unless the transcript context supports them.
81
83
  Return only the cleaned manuscript.
82
84
  Meeting context: {context}
83
- Glossary terms: {glossary}
85
+ Known names and terms:
86
+ {glossary}
84
87
  """.strip()
85
88
 
86
89
 
90
+ def _format_glossary(glossary_terms: list[str] | None) -> str:
91
+ if not glossary_terms:
92
+ return "- none"
93
+ return "\n".join(f"- {term}" for term in glossary_terms)
94
+
95
+
87
96
  def _extract_content(response: dict[str, Any]) -> str:
88
97
  try:
89
98
  content = response["choices"][0]["message"]["content"]
@@ -133,7 +133,8 @@ def publish_meeting(connection: Connection, meeting_id_or_slug: str, target_iden
133
133
  analysis_markdown = _read_analysis_markdown(analysis_path)
134
134
  manifest = json.loads(manifest_path.read_text())
135
135
  output_path = _published_output_path(connection, meeting, target)
136
- content = _obsidian_note(meeting, transcript_markdown, analysis_markdown, manifest)
136
+ participants = _meeting_participants(connection, meeting["id"])
137
+ content = _obsidian_note(meeting, transcript_markdown, analysis_markdown, manifest, participants)
137
138
  content_hash = _sha256(content)
138
139
 
139
140
  output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -271,7 +272,34 @@ def _published_output_path(connection: Connection, meeting: dict, target: Publis
271
272
  return target.path / filename
272
273
 
273
274
 
274
- def _obsidian_note(meeting: dict, transcript_markdown: str, analysis_markdown: str, manifest: dict) -> str:
275
+ def _meeting_participants(connection: Connection, meeting_id: str) -> list[str]:
276
+ rows = connection.execute(
277
+ """
278
+ SELECT DISTINCT people.display_name
279
+ FROM speaker_assignments
280
+ JOIN local_speakers ON local_speakers.id = speaker_assignments.local_speaker_id
281
+ JOIN people ON people.id = speaker_assignments.person_id
282
+ WHERE local_speakers.meeting_id = ?
283
+ AND speaker_assignments.status = 'known'
284
+ ORDER BY lower(people.display_name)
285
+ """,
286
+ (meeting_id,),
287
+ ).fetchall()
288
+ return [_obsidian_people_link(row["display_name"]) for row in rows]
289
+
290
+
291
+ def _obsidian_people_link(display_name: str) -> str:
292
+ safe_name = display_name.replace("[[", "").replace("]]", "").replace("|", "-").strip()
293
+ return f"[[People/{safe_name}]]"
294
+
295
+
296
+ def _obsidian_note(
297
+ meeting: dict,
298
+ transcript_markdown: str,
299
+ analysis_markdown: str,
300
+ manifest: dict,
301
+ participants: list[str] | None = None,
302
+ ) -> str:
275
303
  date, time = _date_time(_meeting_timestamp(meeting))
276
304
  frontmatter = {
277
305
  "title": meeting["title"],
@@ -284,6 +312,7 @@ def _obsidian_note(meeting: dict, transcript_markdown: str, analysis_markdown: s
284
312
  "recorded_at": meeting.get("recorded_at"),
285
313
  "duration_seconds": meeting.get("duration_seconds"),
286
314
  "recording_quality": meeting.get("recording_quality_status"),
315
+ "participants": participants or None,
287
316
  "tags": ["meetings", "fly-on-the-wall"],
288
317
  }
289
318
  lines = ["---", *_yaml_lines(frontmatter), "---", ""]
@@ -330,7 +359,7 @@ def _yaml_lines(values: dict) -> list[str]:
330
359
 
331
360
  def _yaml_scalar(value: object) -> str:
332
361
  text = str(value)
333
- if re.search(r"[:#\n,]", text):
362
+ if re.search(r"[:#\n,\[\]{}]", text):
334
363
  return json.dumps(text, ensure_ascii=False)
335
364
  return text
336
365
 
@@ -1,31 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from pathlib import Path
4
- from typing import Any
5
-
6
- import yaml
7
-
8
-
9
- def load_glossary_terms(path: Path | None) -> list[str]:
10
- if path is None or not path.exists():
11
- return []
12
- data = yaml.safe_load(path.read_text())
13
- return sorted(set(_collect_terms(data)))
14
-
15
-
16
- def _collect_terms(value: Any) -> list[str]:
17
- if value is None:
18
- return []
19
- if isinstance(value, str):
20
- return [value]
21
- if isinstance(value, list):
22
- terms: list[str] = []
23
- for item in value:
24
- terms.extend(_collect_terms(item))
25
- return terms
26
- if isinstance(value, dict):
27
- terms = []
28
- for item in value.values():
29
- terms.extend(_collect_terms(item))
30
- return terms
31
- return []
File without changes
File without changes