chirp-notes-ai 0.0.1a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. chirp_notes_ai-0.0.1a0/.codespellrc +3 -0
  2. chirp_notes_ai-0.0.1a0/.docs/DEVELOPMENT.md +85 -0
  3. chirp_notes_ai-0.0.1a0/.docs/architecture.md +59 -0
  4. chirp_notes_ai-0.0.1a0/.docs/chunking.md +73 -0
  5. chirp_notes_ai-0.0.1a0/.docs/diarization-roadmap.md +139 -0
  6. chirp_notes_ai-0.0.1a0/.docs/embeddings.md +96 -0
  7. chirp_notes_ai-0.0.1a0/.docs/hybrid-retrieval.md +48 -0
  8. chirp_notes_ai-0.0.1a0/.docs/imgs/chirp-logo.png +0 -0
  9. chirp_notes_ai-0.0.1a0/.github/actions/quality-checks/action.yml +63 -0
  10. chirp_notes_ai-0.0.1a0/.github/workflows/main-build.yml +40 -0
  11. chirp_notes_ai-0.0.1a0/.github/workflows/pr-checks.yml +46 -0
  12. chirp_notes_ai-0.0.1a0/.github/workflows/publish.yaml +40 -0
  13. chirp_notes_ai-0.0.1a0/.github/workflows/shared-build-and-test.yaml +193 -0
  14. chirp_notes_ai-0.0.1a0/.github/workflows/shared-publish-package.yaml +109 -0
  15. chirp_notes_ai-0.0.1a0/.gitignore +217 -0
  16. chirp_notes_ai-0.0.1a0/.pre-commit-config.yaml +30 -0
  17. chirp_notes_ai-0.0.1a0/.vscode/extensions.json +38 -0
  18. chirp_notes_ai-0.0.1a0/.vscode/settings.json +92 -0
  19. chirp_notes_ai-0.0.1a0/AGENTS.md +26 -0
  20. chirp_notes_ai-0.0.1a0/CLAUDE.md +20 -0
  21. chirp_notes_ai-0.0.1a0/LICENSE +21 -0
  22. chirp_notes_ai-0.0.1a0/Makefile +231 -0
  23. chirp_notes_ai-0.0.1a0/PKG-INFO +226 -0
  24. chirp_notes_ai-0.0.1a0/README.md +198 -0
  25. chirp_notes_ai-0.0.1a0/_bmad-output/implementation-artifacts/deferred-work.md +42 -0
  26. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/epic.md +125 -0
  27. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.1-audio-capture-module.md +147 -0
  28. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.2-recorder-integration.md +101 -0
  29. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.3-blackhole-removal.md +125 -0
  30. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/epic.md +105 -0
  31. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.1-storage-rewrite.md +116 -0
  32. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.2-command-surface-prune.md +89 -0
  33. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.3-notes-sub-app.md +77 -0
  34. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.4-transcribe-queue-checklist.md +101 -0
  35. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.5-init-polish.md +102 -0
  36. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.6-record-verify.md +74 -0
  37. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.7-ask-verify.md +63 -0
  38. chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.8-search-keyword-rewrite.md +304 -0
  39. chirp_notes_ai-0.0.1a0/chirp/__init__.py +0 -0
  40. chirp_notes_ai-0.0.1a0/chirp/about.py +215 -0
  41. chirp_notes_ai-0.0.1a0/chirp/branding.py +89 -0
  42. chirp_notes_ai-0.0.1a0/chirp/cli.py +1155 -0
  43. chirp_notes_ai-0.0.1a0/chirp/exceptions.py +30 -0
  44. chirp_notes_ai-0.0.1a0/chirp/init_flow.py +699 -0
  45. chirp_notes_ai-0.0.1a0/config/__init__.py +0 -0
  46. chirp_notes_ai-0.0.1a0/config/settings.py +154 -0
  47. chirp_notes_ai-0.0.1a0/notes/__init__.py +0 -0
  48. chirp_notes_ai-0.0.1a0/notes/constants.py +1 -0
  49. chirp_notes_ai-0.0.1a0/notes/manual_note_manager.py +131 -0
  50. chirp_notes_ai-0.0.1a0/notes/note_editor.py +738 -0
  51. chirp_notes_ai-0.0.1a0/notes/note_generator.py +515 -0
  52. chirp_notes_ai-0.0.1a0/notes/template_engine.py +122 -0
  53. chirp_notes_ai-0.0.1a0/notes_chat/__init__.py +1 -0
  54. chirp_notes_ai-0.0.1a0/notes_chat/bm25.py +106 -0
  55. chirp_notes_ai-0.0.1a0/notes_chat/cache.py +73 -0
  56. chirp_notes_ai-0.0.1a0/notes_chat/cli.py +167 -0
  57. chirp_notes_ai-0.0.1a0/notes_chat/config.py +5 -0
  58. chirp_notes_ai-0.0.1a0/notes_chat/index.py +373 -0
  59. chirp_notes_ai-0.0.1a0/notes_chat/interactive.py +277 -0
  60. chirp_notes_ai-0.0.1a0/notes_chat/prompting.py +771 -0
  61. chirp_notes_ai-0.0.1a0/notes_chat/retrieval.py +409 -0
  62. chirp_notes_ai-0.0.1a0/notes_chat/search_keyword.py +397 -0
  63. chirp_notes_ai-0.0.1a0/notes_chat/time_ranges.py +139 -0
  64. chirp_notes_ai-0.0.1a0/notes_chat/types.py +34 -0
  65. chirp_notes_ai-0.0.1a0/pyproject.toml +137 -0
  66. chirp_notes_ai-0.0.1a0/recorder/__init__.py +0 -0
  67. chirp_notes_ai-0.0.1a0/recorder/audio_recorder.py +341 -0
  68. chirp_notes_ai-0.0.1a0/recorder/device_manager.py +237 -0
  69. chirp_notes_ai-0.0.1a0/recorder/live_audio.py +228 -0
  70. chirp_notes_ai-0.0.1a0/recorder/live_dashboard.py +359 -0
  71. chirp_notes_ai-0.0.1a0/recorder/live_session.py +306 -0
  72. chirp_notes_ai-0.0.1a0/recorder/live_transcriber.py +277 -0
  73. chirp_notes_ai-0.0.1a0/recorder/live_types.py +32 -0
  74. chirp_notes_ai-0.0.1a0/recorder/meeting_monitor.py +80 -0
  75. chirp_notes_ai-0.0.1a0/recorder/vad_chunker.py +191 -0
  76. chirp_notes_ai-0.0.1a0/renovate.json +8 -0
  77. chirp_notes_ai-0.0.1a0/scripts/README.md +192 -0
  78. chirp_notes_ai-0.0.1a0/scripts/debug_live_transcript.py +33 -0
  79. chirp_notes_ai-0.0.1a0/scripts/pyproject.toml +8 -0
  80. chirp_notes_ai-0.0.1a0/scripts/test_live_vad.py +125 -0
  81. chirp_notes_ai-0.0.1a0/scripts/update_pr_version.py +42 -0
  82. chirp_notes_ai-0.0.1a0/scripts/update_release_version.py +28 -0
  83. chirp_notes_ai-0.0.1a0/scripts/uv.lock +8 -0
  84. chirp_notes_ai-0.0.1a0/scripts/version_utils.py +77 -0
  85. chirp_notes_ai-0.0.1a0/templates/daily_notes.md +5 -0
  86. chirp_notes_ai-0.0.1a0/templates/meeting_section.md +34 -0
  87. chirp_notes_ai-0.0.1a0/tests/__init__.py +0 -0
  88. chirp_notes_ai-0.0.1a0/tests/notes_chat/test_interactive.py +180 -0
  89. chirp_notes_ai-0.0.1a0/tests/notes_chat/test_search_keyword.py +222 -0
  90. chirp_notes_ai-0.0.1a0/tests/test_about.py +53 -0
  91. chirp_notes_ai-0.0.1a0/tests/test_ask_sources.py +241 -0
  92. chirp_notes_ai-0.0.1a0/tests/test_audio_recorder.py +228 -0
  93. chirp_notes_ai-0.0.1a0/tests/test_auto_indexing.py +77 -0
  94. chirp_notes_ai-0.0.1a0/tests/test_batch_processor.py +360 -0
  95. chirp_notes_ai-0.0.1a0/tests/test_bm25.py +151 -0
  96. chirp_notes_ai-0.0.1a0/tests/test_cache.py +147 -0
  97. chirp_notes_ai-0.0.1a0/tests/test_ci_versioning.py +47 -0
  98. chirp_notes_ai-0.0.1a0/tests/test_cli_commands.py +486 -0
  99. chirp_notes_ai-0.0.1a0/tests/test_cli_startup.py +48 -0
  100. chirp_notes_ai-0.0.1a0/tests/test_device_manager.py +466 -0
  101. chirp_notes_ai-0.0.1a0/tests/test_file_utils.py +152 -0
  102. chirp_notes_ai-0.0.1a0/tests/test_index_manifest.py +200 -0
  103. chirp_notes_ai-0.0.1a0/tests/test_init_flow.py +442 -0
  104. chirp_notes_ai-0.0.1a0/tests/test_live_transcriber.py +293 -0
  105. chirp_notes_ai-0.0.1a0/tests/test_manual_notes.py +76 -0
  106. chirp_notes_ai-0.0.1a0/tests/test_note_editor.py +22 -0
  107. chirp_notes_ai-0.0.1a0/tests/test_note_generator.py +242 -0
  108. chirp_notes_ai-0.0.1a0/tests/test_parse_timeframe.py +82 -0
  109. chirp_notes_ai-0.0.1a0/tests/test_prompting.py +281 -0
  110. chirp_notes_ai-0.0.1a0/tests/test_record_view.py +188 -0
  111. chirp_notes_ai-0.0.1a0/tests/test_retrieval_merge.py +208 -0
  112. chirp_notes_ai-0.0.1a0/tests/test_settings.py +57 -0
  113. chirp_notes_ai-0.0.1a0/tests/test_time_ranges.py +106 -0
  114. chirp_notes_ai-0.0.1a0/tests/test_vad_chunker.py +211 -0
  115. chirp_notes_ai-0.0.1a0/tests/test_whisper_transcriber.py +461 -0
  116. chirp_notes_ai-0.0.1a0/transcriber/__init__.py +0 -0
  117. chirp_notes_ai-0.0.1a0/transcriber/batch_processor.py +398 -0
  118. chirp_notes_ai-0.0.1a0/transcriber/compression.py +60 -0
  119. chirp_notes_ai-0.0.1a0/transcriber/whisper_transcriber.py +332 -0
  120. chirp_notes_ai-0.0.1a0/utils/__init__.py +0 -0
  121. chirp_notes_ai-0.0.1a0/utils/file_utils.py +192 -0
  122. chirp_notes_ai-0.0.1a0/utils/popup_manager.py +81 -0
  123. chirp_notes_ai-0.0.1a0/utils/time_utils.py +152 -0
  124. chirp_notes_ai-0.0.1a0/uv.lock +3092 -0
@@ -0,0 +1,3 @@
1
+ [codespell]
2
+ skip = .git,__pycache__,.pytest_cache,*.pyc,.venv,venv,dist,build,*.egg-info,htmlcov,to-transcribe,transcription-out,notes-out,uv.lock
3
+ ignore-words-list = blackhole,chirp,ollama,llama,pyaudio,portaudio,whisper,pydantic,typer,pytest,mypy,ruff,htmlcov,pycache,pyproject,transcriber,transcriptions,mkdir,framerate,setnchannels,setsampwidth,setframerate,writeframes,maxInputChannels,maxOutputChannels,defaultSampleRate,hostApi,applescript,osascript,autodiscovery,elif,datetime,isoformat,fromisoformat,strftime,strptime,mtime,microsecond,unlink,iterdir,mkdir,pathlib,gzip,yaml,toml,venv,async,asyncio,threading,subprocess,kwargs,args,enum,uuid,onnxruntime,ctranslate,huggingface,tokenizers,flatbuffers,fsspec,httpcore,httpx,idna,mdurl,mpmath,protobuf,shellingham,sniffio,sympy,tqdm,urllib,coloredlogs,humanfriendly,filelock,certifi,charset,normalizer,annotated,anyio,chirp,transcriber,transcriptions,transcribe,iterm
@@ -0,0 +1,85 @@
1
+ # Development Guide
2
+
3
+ This document is for contributors working on Chirp. End users should start with the top-level `README.md`.
4
+
5
+ ## Prerequisites
6
+
7
+ - Python 3.11+
8
+ - macOS for local audio-capture development
9
+ - Homebrew
10
+ - Git
11
+ - Ollama for note-generation and retrieval flows
12
+
13
+ ## Setup
14
+
15
+ ```bash
16
+ git clone <repository-url>
17
+ cd chirp-ai-note-app
18
+ make dev-install
19
+ ```
20
+
21
+ This installs system dependencies, syncs the Python environment with `uv`, installs the package editable, and enables pre-commit hooks.
22
+
23
+ If you only need the editable install in an already-prepared environment:
24
+
25
+ ```bash
26
+ make install-venv
27
+ ```
28
+
29
+ ## Quality checks
30
+
31
+ ```bash
32
+ make check
33
+ make test
34
+ make test-coverage
35
+ make lint-fix
36
+ make format
37
+ make type-check
38
+ ```
39
+
40
+ Targeted test helpers are also available:
41
+
42
+ ```bash
43
+ make test-file FILE=tests/test_settings.py
44
+ make test-match PATTERN=slugify
45
+ make test-failed
46
+ ```
47
+
48
+ ## Useful CLI checks
49
+
50
+ ```bash
51
+ uv run chirp --help
52
+ uv run chirp init --recheck
53
+ make verify-deps
54
+ ```
55
+
56
+ ## Project structure
57
+
58
+ ```text
59
+ chirp-ai-note-app/
60
+ ├── chirp/ # Typer CLI entrypoint and high-level flows
61
+ ├── config/ # Pydantic settings and config-path helpers
62
+ ├── recorder/ # Audio recording, device handling, live transcription
63
+ ├── transcriber/ # Whisper transcription and batch processing
64
+ ├── notes/ # Note generation, templates, manual editing
65
+ ├── notes_chat/ # Retrieval, keyword search, chat flows, indexing
66
+ ├── utils/ # Shared filesystem and time helpers
67
+ ├── templates/ # Prompt and note templates
68
+ ├── scripts/ # Dev/debug helper scripts
69
+ ├── tests/ # Pytest suite
70
+ ├── AGENTS.md # Canonical contributor guidance
71
+ └── README.md # Canonical user-facing readme
72
+ ```
73
+
74
+ ## Notes storage
75
+
76
+ - Config file: `~/.chirp/config.toml`
77
+ - Default notes root: `~/Documents/chirp`
78
+ - Each note lives in its own folder with `audio.wav`, `transcript.txt`, `notes.md`, and `meta.toml`
79
+
80
+ ## Contributing
81
+
82
+ 1. Branch from `main`.
83
+ 2. Keep changes scoped.
84
+ 3. Run the relevant tests plus `make check`.
85
+ 4. Open a PR with context about behavior changes and any CLI-facing output changes.
@@ -0,0 +1,59 @@
1
+ # Chirp Architecture (at a glance)
2
+
3
+ This guide shows how Chirp moves from audio to answers using a simple, hybrid RAG pipeline. For implementation details, see `notes_chat/`, `notes/`, `transcriber/`, and `config/`.
4
+
5
+ ## Ingestion (index build)
6
+
7
+ ```mermaid
8
+ flowchart LR
9
+ Notes[notes-out/*.md] --> Chunk[Chunk]
10
+ Chunk --> Embed[Embed]
11
+ Embed --> Chroma[(Chroma)]
12
+ Chunk --> BM25[BM25 Corpus]
13
+ ```
14
+
15
+ ## Retrieval (ask)
16
+
17
+ ```mermaid
18
+ flowchart LR
19
+ Question[Question] --> QEmb[Embed]
20
+ QEmb --> Chroma[(Chroma)]
21
+ Question --> BM25Q[BM25 Search]
22
+ Chroma --> VecHits[Vector Hits]
23
+ BM25Q --> LexHits[Lexical Hits]
24
+ VecHits --> Merge[Merge + Dedupe]
25
+ LexHits --> Merge
26
+ Merge --> Context[Context Budget]
27
+ Context --> LLM[LLM]
28
+ LLM --> Answer[Answer + Sources]
29
+ ```
30
+
31
+ - Chunking: section-aware with overlap. See: [Chunking Strategy](./chunking.md)
32
+ - Embeddings: Ollama embeddings; same model for chunks and queries. See: [Embedding Backend](./embeddings.md)
33
+ - Dedupe key: `(path, content_hash)` so overlapping or repeated text doesn’t show twice
34
+ - Why hybrid? See: [Hybrid Retrieval](./hybrid-retrieval.md)
35
+
36
+ ## Components (what exists)
37
+
38
+ - CLI (`chirp`): entry point to record, process notes, index, and chat
39
+ - Recorder + Transcriber: produce transcription and notes
40
+ - Note Generator: writes `notes-out/*.md`
41
+ - Indexer: chunks, embeds, and stores in Chroma; rebuilds BM25
42
+ - Retriever: hybrid search (vector + BM25), merges and builds context
43
+ - LLM: answers using the built context
44
+ - Storage:
45
+ - Chroma (persistent at `.notes_index/chroma`)
46
+ - BM25 corpus (at `.notes_index/bm25.json`)
47
+
48
+ ## Operations (quick refs)
49
+
50
+ - Manual notes saved via the CLI are auto-indexed when `notes_chat.auto_index` is enabled.
51
+ - See the main `README.md` for commands and usage.
52
+
53
+ ## Configuration
54
+
55
+ - Main settings live in `~/.chirp/config.toml` (paths, models, and RAG tuning); the settings schema is defined in `config/settings.py`
56
+ - Notable knobs (see linked docs for behavior):
57
+
58
+ - Chunking: `notes_chat.chunk_size`, `notes_chat.overlap` → [Chunking](./chunking.md)
59
+ - Embeddings: `notes_chat.emb_model`, `models.ollama_url` → [Embeddings](./embeddings.md)
@@ -0,0 +1,73 @@
1
+ # Chunking Strategy
2
+
3
+ This document defines the chunking strategy used for indexing notes into the RAG pipeline.
4
+
5
+ - Code reference: `notes_chat/index.py`
6
+ - Config knobs: `notes_chat.chunk_size`, `notes_chat.overlap` in `config/config.yaml` / `config/settings.py`
7
+ - Defaults: `chunk_size: 1000` characters, `overlap: 200` characters
8
+
9
+ ## Goals
10
+
11
+ - Preserve semantic boundaries by splitting around second-level headings first (`##` in Markdown)
12
+ - Keep chunks under a target character budget for efficient embedding and retrieval
13
+ - Add overlap to reduce information loss at boundaries
14
+
15
+ ## Inputs & Outputs
16
+
17
+ - Input: Markdown note text (from `notes-out/*.md`) and extracted metadata
18
+ - Output: List of chunks with fields:
19
+ - `id`: `<file_stem>_<section_index>` or `<file_stem>_<section_index>_<chunk_index>`
20
+ - `content`: the chunk text
21
+ - `meta`: `title`, `date`, `participants`, `duration`, etc.
22
+ - `content_hash`: stable hash of normalized content for de-duplication
23
+
24
+ ## Algorithm
25
+
26
+ 1. Section-aware split
27
+
28
+ - Split the document on `\n##`, effectively chunking by second-level headings while retaining the heading text in the section.
29
+ - Skip empty sections and very short ones: sections with `< 50` characters are ignored.
30
+ - Note: Sections are defined as meetings and/or single transcripts.
31
+
32
+ 2. Size check per section
33
+
34
+ - If `len(section) <= chunk_size`, emit the whole section as a single chunk.
35
+ - Else, split the large section with overlapping windows (see below).
36
+
37
+ 3. Overlapping windows for large sections
38
+
39
+ - Convert character budgets to approximate word windows using a 6 characters-per-word heuristic:
40
+ - `chunk_words = chunk_size // 6`
41
+ - `overlap_words = overlap // 6`
42
+ - Slide a window across the section’s words:
43
+ - `start = 0`
44
+ - `end = min(start + chunk_words, total_words)`
45
+ - Emit `" ".join(words[start:end])`
46
+ - Set `start = end - overlap_words` (floors at 0) and repeat until the end of the section.
47
+
48
+ 4. Metadata and IDs
49
+
50
+ - Each chunk gets a deterministic `id` and carries `meta` plus `content_hash` (used for de-duplication).
51
+
52
+ ## Defaults & Tuning
53
+
54
+ - Defaults from `config/config.yaml`:
55
+ - `notes_chat.chunk_size: 1000` (about 166 words per chunk, using the 6 characters-per-word heuristic)
56
+ - `notes_chat.overlap: 200` (about 33 words of overlap, using the same heuristic)
57
+ - Increase `chunk_size` if your sections are dense and short, or you want fewer, larger chunks.
58
+ - Increase `overlap` if you see boundary-loss in answers; decrease for faster indexing/search.
59
+
60
+ ## Edge Cases & Notes
61
+
62
+ - No headings: the entire file acts as a single section and will be either a single chunk or split by word windows.
63
+ - Very short files/sections (`< 50` chars) are ignored to avoid noisy chunks.
64
+ - Non-ASCII/whitespace: tokenization uses `str.split()` (whitespace); very long tokens (e.g., URLs) may push beyond targets.
65
+ - IDs and signatures (`content_hash`) help merge/dedupe across hybrid retrieval (Chroma + BM25).
66
+
67
+ ## Rationale
68
+
69
+ - Section-first splitting aligns chunks with human-authored structure.
70
+ - Overlap preserves context across chunk boundaries, improving recall in semantic search.
71
+ - Word-based windows derived from character budgets keep behavior stable while allowing intuitive char-sized tuning.
72
+
73
+ See also: [Architecture](./architecture.md)
@@ -0,0 +1,139 @@
1
+ # Diarization Roadmap (Local, macOS-first)
2
+
3
+ This document outlines a pragmatic, phased plan to add speaker detection to Chirp. It keeps everything local, starts simple, and gives a clear upgrade path. No implementation is committed yet—this is a planning guide for when we're ready.
4
+
5
+ - Goals
6
+ - Tag “You” vs “Others” reliably during meetings
7
+ - Optionally split “Others” into Speaker A/B/C
8
+ - Keep fully local; no cloud calls or gated model requirements by default
9
+ - Minimal impact on the current CLI until we flip it on
10
+
11
+ - Scope
12
+ - macOS focus to start (BlackHole + Aggregate/Multi-Output devices)
13
+ - Compatible with existing transcription (faster-whisper)
14
+ - Future-compatible with Linux/Windows if desired
15
+
16
+ ## Phase 0 — Prep and Guardrails
17
+
18
+ - Config placeholders (no behavior change yet)
19
+ - `diarization.enabled: false`
20
+ - `diarization.backend: "speechbrain" | "pyannote"`
21
+ - `diarization.overlap: false`
22
+ - `diarization.align_words: true`
23
+ - `diarization.mic_label.enabled: true`
24
+ - Recorder readiness
25
+ - Dual capture (Mic + System via BlackHole) or Aggregate Device input (2 channels)
26
+ - Keep sample rate aligned (e.g., 48 kHz)
27
+ - Tooling
28
+ - `chirp status` and `chirp devices` surface device hints (Aggregate/Multi-Output)
29
+ - Acceptance criteria
30
+ - Config keys exist and are ignored safely when disabled
31
+ - Clear doc pointers; no runtime breakage if diarization is off
32
+
33
+ ## Phase 1 — Mic-based “You” Labeling (No Other-speaker splits)
34
+
35
+ - Approach
36
+ - Record two synchronized streams: Mic (you) and System (others)
37
+ - Run VAD on Mic to build a "you speaking" time mask
38
+ - Assign transcript words/segments to "You" when they overlap the mask; otherwise "Other"
39
+ - Pros: Zero model downloads, fast, robust with headphones
40
+ - Limitations: No separation among “Others,” no overlap handling
41
+ - Config ideas
42
+ - `diarization.mic_label.threshold: 0.4` (overlap proportion)
43
+ - `diarization.vad: webrtc`
44
+ - Acceptance criteria
45
+ - Words you speak are labeled “You” with high precision on headphone setups
46
+ - No regression to existing transcription/notes when disabled
47
+
48
+ ## Phase 2 — SpeechBrain Diarization for “Others”
49
+
50
+ - Approach
51
+ - VAD on System stream → short speech segments (~1.5–3.0s)
52
+ - ECAPA‑TDNN embeddings (SpeechBrain) for each segment
53
+ - Cluster embeddings (Agglomerative/Spectral, cosine distance) into Speaker A/B/C
54
+ - Map words to speakers by timestamp overlap (±100–200 ms padding)
55
+ - Keep Mic-based “You” override: if Mic is active, label as “You”
56
+ - Pros: Fully local, no gated models, Apple Silicon friendly
57
+ - Limitations: Not overlap‑aware; best for typical meetings
58
+ - Config ideas
59
+ - `diarization.backend: "speechbrain"`
60
+ - `diarization.max_speakers: null` (auto 2–6 with override)
61
+ - `diarization.min_speech_ms: 250`
62
+ - `diarization.max_silence_ms: 400`
63
+ - `diarization.frame_ms: 30`
64
+ - Acceptance criteria
65
+ - Two‑speaker meetings label cleanly into “You” and one other speaker
66
+ - Three‑plus speaker meetings are reasonable (some fragmentation acceptable)
67
+
68
+ ## Phase 3 — Overlap Handling
69
+
70
+ - Approach
71
+ - Basic: If both Mic and System show strong activity, mark words as "Overlap" or choose dominant energy
72
+ - Advanced: Enable overlap‑aware diarization (e.g., pyannote backend) to permit multiple concurrent speakers
73
+ - Word assignment: choose the speaker with highest local activity; if Mic active and dominant, prefer “You”
74
+ - Pros: Better attribution during cross‑talk
75
+ - Limitations: Heavier models if you enable pyannote (requires HF token)
76
+ - Config ideas
77
+ - `diarization.overlap: true`
78
+ - `diarization.energy_ratio_threshold: 2.0`
79
+ - Acceptance criteria
80
+ - Cross‑talk segments aren’t misattributed wholesale; predictable tie‑breaking
81
+
82
+ ## Phase 4 — Performance, Reliability, and Fallbacks
83
+
84
+ - Performance
85
+ - Chunk long audio (2–5 min with overlaps) to bound memory; stitch results
86
+ - CPU acceptable on M‑series; keep offline, non‑realtime
87
+ - Reliability
88
+ - Friendly errors if models unavailable; clear instructions in `chirp status/test`
89
+ - Fallbacks
90
+ - If `backend: pyannote` is requested but models/HF token absent, fall back to SpeechBrain with a warning
91
+ - Acceptance criteria
92
+ - 30–90 minute meetings process reliably with clear messaging and no hard failures
93
+
94
+ ## Phase 5 — Optional Identification and UX
95
+
96
+ - Identify "Me"
97
+ - Optional enrollment: store a local ECAPA voiceprint; relabel diarized cluster as “You” via cosine similarity
98
+ - Keep others anonymous (Speaker A/B/C)
99
+ - CLI/UX
100
+ - Flags: `--diarize`, `--overlap`, `--identify-me`
101
+ - Templates: compact vs detailed speaker labels
102
+ - Indexing
103
+ - Include `speaker` and `is_you` metadata for future filters
104
+ - Acceptance criteria
105
+ - Easy toggles, predictable output, speaker tags improve note readability
106
+
107
+ ## Risks and Mitigations
108
+
109
+ - Mic bleed (no headphones): higher false “You” rate → increase thresholds; recommend headphones
110
+ - Double‑talk: resolve with overlap heuristics or pyannote backend when needed
111
+ - Fragmentation: merge adjacent same‑speaker segments; tune VAD thresholds
112
+ - Model weight size/network: default to SpeechBrain; make pyannote optional
113
+
114
+ ## Testing Plan (incremental)
115
+
116
+ - Unit tests
117
+ - VAD mask generation (Mic/System) with synthetic signals
118
+ - Word‑to‑mask overlap labeling
119
+ - Embedding clustering on toy datasets (2–3 speakers)
120
+ - Fixtures
121
+ - Short (2–5 min) two‑speaker and three‑speaker samples
122
+ - Headphones vs. speaker playback scenarios
123
+ - CLI smoke
124
+ - `chirp transcribe --diarize` produces labeled utterances; disabled mode unchanged
125
+
126
+ ## Next Steps (when ready)
127
+
128
+ - Keep diarization disabled by default
129
+ - Implement Phase 1 first (mic‑based labeling) for immediate value
130
+ - Add Phase 2 (SpeechBrain) behind `backend: speechbrain`
131
+ - Consider Phase 3 (overlap) only if needed; pyannote as an optional backend
132
+
133
+ ---
134
+
135
+ References
136
+
137
+ - SpeechBrain ECAPA TDNN: <https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb>
138
+ - WebRTC VAD: <https://webrtc.org/> (Python bindings: `webrtcvad`)
139
+ - pyannote (optional backend): <https://github.com/pyannote/pyannote-audio>
@@ -0,0 +1,96 @@
1
+ # Embedding Backend
2
+
3
+ This document explains how embeddings are generated and used in Chirp’s RAG pipeline.
4
+
5
+ - Code references: `notes_chat/index.py`, `notes_chat/retrieval.py`, `config/config.yaml`, `config/settings.py`, `notes_chat/prompting.py`
6
+ - Default backend: [Ollama](https://ollama.com) HTTP API
7
+ - Default embedding model: `nomic-embed-text`
8
+
9
+ ## Overview
10
+
11
+ Embeddings convert text into high-dimensional vectors that preserve semantic similarity. Chirp uses embeddings to:
12
+
13
+ - Index note chunks into a vector database (Chroma)
14
+ - Embed queries at retrieval time and run vector similarity search
15
+
16
+ ```mermaid
17
+ flowchart LR
18
+ A[Chunk Text] -->|POST /api/embeddings| O[Ollama]
19
+ O --> V[Vector]
20
+ V --> C[(Chroma Collection 'notes')]
21
+ ```
22
+
23
+ ## Configuration
24
+
25
+ - `~/.chirp/config.toml`
26
+ - `notes_chat.emb_model`: embedding model name (default `nomic-embed-text`)
27
+ - `models.ollama_url`: Ollama server URL (default `http://localhost:11434`)
28
+ - `config/settings.py` hydrates these into `ChirpSettings` used across the app.
29
+
30
+ ## Indexing Flow
31
+
32
+ - Implemented in `notes_chat/index.py`:
33
+ 1. Chunk notes (section-aware + overlapping windows)
34
+ 2. For each chunk, compute `content_hash` and call Ollama embeddings:
35
+ - Endpoint: `POST {ollama_url}/api/embeddings`
36
+ - Payload: `{ "model": emb_model, "prompt": chunk_text }`
37
+ - Response: `{ "embedding": [float, ...] }`
38
+ 3. Upsert into Chroma with `ids`, `documents`, `embeddings`, and metadata (including `content_hash`)
39
+ 4. Rebuild the BM25 corpus (`.notes_index/bm25.json`) from Chroma documents
40
+
41
+ Key method signatures:
42
+
43
+ - `_get_embeddings(texts: list[str]) -> list[list[float]]`
44
+ - `collection.add(ids, documents, embeddings, metadatas)`
45
+
46
+ ## Retrieval Flow
47
+
48
+ - Implemented in `notes_chat/retrieval.py`:
49
+ 1. Parse time filter (if present)
50
+ 2. Compute query embedding via Ollama:
51
+ - Endpoint: `POST {ollama_url}/api/embeddings`
52
+ - Payload: `{ "model": emb_model, "prompt": query }`
53
+ 3. Query Chroma for top-k semantic matches
54
+ 4. Query BM25 for lexical matches
55
+ 5. Merge + dedupe using `(path, content_hash)`
56
+ 6. Build context under a character budget and pass to the LLM for answering
57
+
58
+ ## Determinism and Model Choice
59
+
60
+ - Embedding calls are stateless and do not stream.
61
+ - The chosen model `nomic-embed-text` provides a general-purpose English embedding suitable for note-sized chunks.
62
+ - You can swap `notes_chat.emb_model` to another Ollama-compatible embedding model if desired.
63
+
64
+ ## Error Handling
65
+
66
+ - Indexing (`_get_embeddings`):
67
+ - A non-200 response causes indexing of that entire file to fail, and the file is skipped.
68
+ - Connection errors are caught and surfaced via a console message.
69
+ - Retrieval (`_get_query_embedding`):
70
+ - Returns `None` on error; retrieval will still return BM25-only results or an informative suggestion if nothing is found.
71
+
72
+ Common failure modes and fixes:
73
+
74
+ - “Failed to get embeddings”: ensure Ollama is running and the model is pulled.
75
+ - Timeouts: usually caused by large models or long prompts. Verify that `ollama serve` is running and that the machine has enough local resources.
76
+
77
+ ## Troubleshooting
78
+
79
+ - Verify Ollama:
80
+ - `curl {ollama_url}/api/version`
81
+ - `curl {ollama_url}/api/tags` (ensure `notes_chat.emb_model` is listed)
82
+ - From project root:
83
+ - Rebuild index: `uv run chirp notes index --force`
84
+
85
+ ## Extensibility
86
+
87
+ - Add other embedding backends by implementing equivalents of:
88
+ - Index: `_get_embeddings(texts)`
89
+ - Retrieval: `_get_query_embedding(query)`
90
+ - Keep `content_hash` unchanged—only the embedding vectors change.
91
+ - Consider adding model-specific normalization or truncation if needed by the target API.
92
+
93
+ ## Privacy
94
+
95
+ - With Ollama running locally, text never leaves your machine.
96
+ - If you later switch to a hosted embedding API, review data policies and redact sensitive content as needed before embedding.
@@ -0,0 +1,48 @@
1
+ # Hybrid Retrieval: Embeddings + BM25
2
+
3
+ This note explains why Chirp uses hybrid retrieval (semantic embeddings + BM25 lexical search), how it works, and when you might tune or change it.
4
+
5
+ ## TL;DR
6
+
7
+ - Keep both: embeddings catch semantic matches; BM25 catches exact terms (IDs, names, phrases).
8
+ - Merge the results, dedupe by `(path, content_hash)`, then build a context under a fixed character budget.
9
+
10
+ ## How it works
11
+
12
+ See the retrieval diagram in the Architecture doc: [Architecture → Retrieval (ask)](./architecture.md#retrieval-ask).
13
+
14
+ - Embeddings: query and chunks are embedded with the same model; Chroma returns top-k by cosine similarity.
15
+ - BM25: ranks chunks by lexical overlap; strong for exact tokens, IDs, acronyms, and phrases.
16
+ - Merge + Dedupe: combine lists and deduplicate using `(path, content_hash)`.
17
+ - Context: allocate text across top chunks within `ctx_char_budget`, then prompt the LLM and attach sources.
18
+
19
+ ## Why hybrid?
20
+
21
+ - Short or specific queries: BM25 shines on IDs (e.g., `jira-123`), names, codes, dates, and quoted phrases.
22
+ - Paraphrased or fuzzy queries: embeddings retrieve semantically related content even if words differ.
23
+ - Local and fast: both run locally (Ollama + Chroma + BM25 corpus) with low overhead.
24
+
25
+ ## When to change it
26
+
27
+ - Stronger embeddings + re-ranker: if you add a cross-encoder/LLM re-ranking step, embeddings-only can be competitive.
28
+ - Domain-specific tokens: increase BM25 weight if queries often include IDs or exact terms.
29
+ - Minimal needs: if queries are always natural language, embeddings-only may be sufficient.
30
+
31
+ ## Tuning tips
32
+
33
+ - k-values: start with `k=10` for both; keep totals small to avoid noisy merges.
34
+ - Fusion: simple score normalization or Reciprocal Rank Fusion (RRF) keeps logic robust.
35
+ - Routing: detect quotes, ALL-CAPS acronyms, many digits → boost BM25’s influence.
36
+ - Budgeting: round-robin or interleaving across sources when constructing the context.
37
+
38
+ ## Failure modes and guardrails
39
+
40
+ - Empty or low-similarity vectors: backfill from BM25.
41
+ - Duplicate content across notes: dedupe with `(path, content_hash)`.
42
+ - Very long chunks: rely on chunking and overlaps to keep embeddings effective (see [Chunking](./chunking.md)).
43
+ - Index freshness: when `notes_chat.auto_index` is enabled, manual note saves trigger an index update.
44
+
45
+ ## References
46
+
47
+ - Embeddings backend: [Embeddings](./embeddings.md)
48
+ - Architecture overview: [Architecture](./architecture.md)
@@ -0,0 +1,63 @@
1
+ name: 'Quality Checks'
2
+ description: 'Run formatting, linting, spell check, and type checking'
3
+
4
+ inputs:
5
+ python-version:
6
+ description: 'Python version to use'
7
+ required: false
8
+ default: '3.11'
9
+ uv-version:
10
+ description: 'UV version to use'
11
+ required: false
12
+ default: 'latest'
13
+
14
+ runs:
15
+ using: composite
16
+ steps:
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v6
19
+ with:
20
+ python-version: ${{ inputs.python-version }}
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v8.1.0
24
+ with:
25
+ version: ${{ inputs.uv-version }}
26
+ enable-cache: true
27
+
28
+ - name: Install system dependencies (macOS)
29
+ shell: bash
30
+ run: |
31
+ if [[ "$RUNNER_OS" == "macOS" ]]; then
32
+ brew install portaudio
33
+ fi
34
+
35
+ - name: Install Python dependencies
36
+ shell: bash
37
+ run: |
38
+ uv sync --all-extras
39
+
40
+ - name: Run formatting check
41
+ shell: bash
42
+ run: |
43
+ uv run ruff format --check .
44
+
45
+ - name: Run linting
46
+ shell: bash
47
+ run: |
48
+ uv run ruff check .
49
+
50
+ - name: Run spell check
51
+ shell: bash
52
+ run: |
53
+ uv run codespell
54
+
55
+ - name: Run type checking
56
+ shell: bash
57
+ run: |
58
+ uv run mypy
59
+
60
+ - name: Validate code compilation
61
+ shell: bash
62
+ run: |
63
+ make validate
@@ -0,0 +1,40 @@
1
+ name: Main Branch Build
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ quality-and-build:
10
+ runs-on: macos-latest
11
+
12
+ steps:
13
+ - name: Checkout code
14
+ uses: actions/checkout@v6
15
+
16
+ - name: Run quality checks
17
+ uses: ./.github/actions/quality-checks
18
+
19
+ - name: Run tests
20
+ run: |
21
+ uv run pytest tests/ -v --cov=chirp --cov=config --cov=recorder --cov=transcriber --cov=notes --cov=utils --cov-report=xml
22
+
23
+ - name: Build package
24
+ run: |
25
+ uv build
26
+
27
+ - name: Upload build artifacts
28
+ uses: actions/upload-artifact@v7
29
+ with:
30
+ name: chirp-package
31
+ path: dist/
32
+ retention-days: 30
33
+
34
+ - name: Upload coverage reports
35
+ uses: codecov/codecov-action@v6
36
+ with:
37
+ file: ./coverage.xml
38
+ flags: unittests
39
+ name: codecov-umbrella
40
+ fail_ci_if_error: false
@@ -0,0 +1,46 @@
1
+ name: PR Quality Checks
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [ main ]
6
+ types: [opened, synchronize, reopened, ready_for_review]
7
+
8
+ concurrency:
9
+ group: ${{ github.workflow }}-${{ github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ quality-checks:
14
+ if: github.event.pull_request.draft == false
15
+ runs-on: macos-latest
16
+ steps:
17
+ - name: ⤵️ Checkout code
18
+ uses: actions/checkout@v6
19
+
20
+ - name: 🔎 Run quality checks (format, lint, spell, type, validate)
21
+ uses: ./.github/actions/quality-checks
22
+ with:
23
+ python-version: '3.11'
24
+
25
+ build-and-test:
26
+ needs: quality-checks
27
+ if: github.event.pull_request.draft == false
28
+ uses: ./.github/workflows/shared-build-and-test.yaml
29
+ with:
30
+ python-version: '3.11'
31
+ os-matrix: '["macos-latest"]'
32
+ upload-artifacts: true
33
+ update-version-for-pr: true
34
+ run-integration-tests: false
35
+
36
+ publish-to-test-pypi:
37
+ needs: build-and-test
38
+ if: github.event.pull_request.draft == false
39
+ uses: ./.github/workflows/shared-publish-package.yaml
40
+ with:
41
+ python-version: '3.11'
42
+ is-test-pypi: true
43
+ secrets: inherit
44
+ permissions:
45
+ pull-requests: write
46
+ id-token: write