PyPI - chirp-notes-ai - Versions diffs - 0.0.1a0__tar.gz - Mend

chirp-notes-ai 0.0.1a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

chirp_notes_ai-0.0.1a0/.codespellrc +3 -0
chirp_notes_ai-0.0.1a0/.docs/DEVELOPMENT.md +85 -0
chirp_notes_ai-0.0.1a0/.docs/architecture.md +59 -0
chirp_notes_ai-0.0.1a0/.docs/chunking.md +73 -0
chirp_notes_ai-0.0.1a0/.docs/diarization-roadmap.md +139 -0
chirp_notes_ai-0.0.1a0/.docs/embeddings.md +96 -0
chirp_notes_ai-0.0.1a0/.docs/hybrid-retrieval.md +48 -0
chirp_notes_ai-0.0.1a0/.docs/imgs/chirp-logo.png +0 -0
chirp_notes_ai-0.0.1a0/.github/actions/quality-checks/action.yml +63 -0
chirp_notes_ai-0.0.1a0/.github/workflows/main-build.yml +40 -0
chirp_notes_ai-0.0.1a0/.github/workflows/pr-checks.yml +46 -0
chirp_notes_ai-0.0.1a0/.github/workflows/publish.yaml +40 -0
chirp_notes_ai-0.0.1a0/.github/workflows/shared-build-and-test.yaml +193 -0
chirp_notes_ai-0.0.1a0/.github/workflows/shared-publish-package.yaml +109 -0
chirp_notes_ai-0.0.1a0/.gitignore +217 -0
chirp_notes_ai-0.0.1a0/.pre-commit-config.yaml +30 -0
chirp_notes_ai-0.0.1a0/.vscode/extensions.json +38 -0
chirp_notes_ai-0.0.1a0/.vscode/settings.json +92 -0
chirp_notes_ai-0.0.1a0/AGENTS.md +26 -0
chirp_notes_ai-0.0.1a0/CLAUDE.md +20 -0
chirp_notes_ai-0.0.1a0/LICENSE +21 -0
chirp_notes_ai-0.0.1a0/Makefile +231 -0
chirp_notes_ai-0.0.1a0/PKG-INFO +226 -0
chirp_notes_ai-0.0.1a0/README.md +198 -0
chirp_notes_ai-0.0.1a0/_bmad-output/implementation-artifacts/deferred-work.md +42 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/epic.md +125 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.1-audio-capture-module.md +147 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.2-recorder-integration.md +101 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-audio-capture/stories/2.3-blackhole-removal.md +125 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/epic.md +105 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.1-storage-rewrite.md +116 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.2-command-surface-prune.md +89 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.3-notes-sub-app.md +77 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.4-transcribe-queue-checklist.md +101 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.5-init-polish.md +102 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.6-record-verify.md +74 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.7-ask-verify.md +63 -0
chirp_notes_ai-0.0.1a0/_bmad-output/planning-artifacts/epic-wireframe-alignment/stories/1.8-search-keyword-rewrite.md +304 -0
chirp_notes_ai-0.0.1a0/chirp/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/chirp/about.py +215 -0
chirp_notes_ai-0.0.1a0/chirp/branding.py +89 -0
chirp_notes_ai-0.0.1a0/chirp/cli.py +1155 -0
chirp_notes_ai-0.0.1a0/chirp/exceptions.py +30 -0
chirp_notes_ai-0.0.1a0/chirp/init_flow.py +699 -0
chirp_notes_ai-0.0.1a0/config/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/config/settings.py +154 -0
chirp_notes_ai-0.0.1a0/notes/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/notes/constants.py +1 -0
chirp_notes_ai-0.0.1a0/notes/manual_note_manager.py +131 -0
chirp_notes_ai-0.0.1a0/notes/note_editor.py +738 -0
chirp_notes_ai-0.0.1a0/notes/note_generator.py +515 -0
chirp_notes_ai-0.0.1a0/notes/template_engine.py +122 -0
chirp_notes_ai-0.0.1a0/notes_chat/__init__.py +1 -0
chirp_notes_ai-0.0.1a0/notes_chat/bm25.py +106 -0
chirp_notes_ai-0.0.1a0/notes_chat/cache.py +73 -0
chirp_notes_ai-0.0.1a0/notes_chat/cli.py +167 -0
chirp_notes_ai-0.0.1a0/notes_chat/config.py +5 -0
chirp_notes_ai-0.0.1a0/notes_chat/index.py +373 -0
chirp_notes_ai-0.0.1a0/notes_chat/interactive.py +277 -0
chirp_notes_ai-0.0.1a0/notes_chat/prompting.py +771 -0
chirp_notes_ai-0.0.1a0/notes_chat/retrieval.py +409 -0
chirp_notes_ai-0.0.1a0/notes_chat/search_keyword.py +397 -0
chirp_notes_ai-0.0.1a0/notes_chat/time_ranges.py +139 -0
chirp_notes_ai-0.0.1a0/notes_chat/types.py +34 -0
chirp_notes_ai-0.0.1a0/pyproject.toml +137 -0
chirp_notes_ai-0.0.1a0/recorder/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/recorder/audio_recorder.py +341 -0
chirp_notes_ai-0.0.1a0/recorder/device_manager.py +237 -0
chirp_notes_ai-0.0.1a0/recorder/live_audio.py +228 -0
chirp_notes_ai-0.0.1a0/recorder/live_dashboard.py +359 -0
chirp_notes_ai-0.0.1a0/recorder/live_session.py +306 -0
chirp_notes_ai-0.0.1a0/recorder/live_transcriber.py +277 -0
chirp_notes_ai-0.0.1a0/recorder/live_types.py +32 -0
chirp_notes_ai-0.0.1a0/recorder/meeting_monitor.py +80 -0
chirp_notes_ai-0.0.1a0/recorder/vad_chunker.py +191 -0
chirp_notes_ai-0.0.1a0/renovate.json +8 -0
chirp_notes_ai-0.0.1a0/scripts/README.md +192 -0
chirp_notes_ai-0.0.1a0/scripts/debug_live_transcript.py +33 -0
chirp_notes_ai-0.0.1a0/scripts/pyproject.toml +8 -0
chirp_notes_ai-0.0.1a0/scripts/test_live_vad.py +125 -0
chirp_notes_ai-0.0.1a0/scripts/update_pr_version.py +42 -0
chirp_notes_ai-0.0.1a0/scripts/update_release_version.py +28 -0
chirp_notes_ai-0.0.1a0/scripts/uv.lock +8 -0
chirp_notes_ai-0.0.1a0/scripts/version_utils.py +77 -0
chirp_notes_ai-0.0.1a0/templates/daily_notes.md +5 -0
chirp_notes_ai-0.0.1a0/templates/meeting_section.md +34 -0
chirp_notes_ai-0.0.1a0/tests/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/tests/notes_chat/test_interactive.py +180 -0
chirp_notes_ai-0.0.1a0/tests/notes_chat/test_search_keyword.py +222 -0
chirp_notes_ai-0.0.1a0/tests/test_about.py +53 -0
chirp_notes_ai-0.0.1a0/tests/test_ask_sources.py +241 -0
chirp_notes_ai-0.0.1a0/tests/test_audio_recorder.py +228 -0
chirp_notes_ai-0.0.1a0/tests/test_auto_indexing.py +77 -0
chirp_notes_ai-0.0.1a0/tests/test_batch_processor.py +360 -0
chirp_notes_ai-0.0.1a0/tests/test_bm25.py +151 -0
chirp_notes_ai-0.0.1a0/tests/test_cache.py +147 -0
chirp_notes_ai-0.0.1a0/tests/test_ci_versioning.py +47 -0
chirp_notes_ai-0.0.1a0/tests/test_cli_commands.py +486 -0
chirp_notes_ai-0.0.1a0/tests/test_cli_startup.py +48 -0
chirp_notes_ai-0.0.1a0/tests/test_device_manager.py +466 -0
chirp_notes_ai-0.0.1a0/tests/test_file_utils.py +152 -0
chirp_notes_ai-0.0.1a0/tests/test_index_manifest.py +200 -0
chirp_notes_ai-0.0.1a0/tests/test_init_flow.py +442 -0
chirp_notes_ai-0.0.1a0/tests/test_live_transcriber.py +293 -0
chirp_notes_ai-0.0.1a0/tests/test_manual_notes.py +76 -0
chirp_notes_ai-0.0.1a0/tests/test_note_editor.py +22 -0
chirp_notes_ai-0.0.1a0/tests/test_note_generator.py +242 -0
chirp_notes_ai-0.0.1a0/tests/test_parse_timeframe.py +82 -0
chirp_notes_ai-0.0.1a0/tests/test_prompting.py +281 -0
chirp_notes_ai-0.0.1a0/tests/test_record_view.py +188 -0
chirp_notes_ai-0.0.1a0/tests/test_retrieval_merge.py +208 -0
chirp_notes_ai-0.0.1a0/tests/test_settings.py +57 -0
chirp_notes_ai-0.0.1a0/tests/test_time_ranges.py +106 -0
chirp_notes_ai-0.0.1a0/tests/test_vad_chunker.py +211 -0
chirp_notes_ai-0.0.1a0/tests/test_whisper_transcriber.py +461 -0
chirp_notes_ai-0.0.1a0/transcriber/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/transcriber/batch_processor.py +398 -0
chirp_notes_ai-0.0.1a0/transcriber/compression.py +60 -0
chirp_notes_ai-0.0.1a0/transcriber/whisper_transcriber.py +332 -0
chirp_notes_ai-0.0.1a0/utils/__init__.py +0 -0
chirp_notes_ai-0.0.1a0/utils/file_utils.py +192 -0
chirp_notes_ai-0.0.1a0/utils/popup_manager.py +81 -0
chirp_notes_ai-0.0.1a0/utils/time_utils.py +152 -0
chirp_notes_ai-0.0.1a0/uv.lock +3092 -0

chirp_notes_ai-0.0.1a0/.codespellrc ADDED Viewed

@@ -0,0 +1,3 @@
+[codespell]
+skip = .git,__pycache__,.pytest_cache,*.pyc,.venv,venv,dist,build,*.egg-info,htmlcov,to-transcribe,transcription-out,notes-out,uv.lock
+ignore-words-list = blackhole,chirp,ollama,llama,pyaudio,portaudio,whisper,pydantic,typer,pytest,mypy,ruff,htmlcov,pycache,pyproject,transcriber,transcriptions,mkdir,framerate,setnchannels,setsampwidth,setframerate,writeframes,maxInputChannels,maxOutputChannels,defaultSampleRate,hostApi,applescript,osascript,autodiscovery,elif,datetime,isoformat,fromisoformat,strftime,strptime,mtime,microsecond,unlink,iterdir,mkdir,pathlib,gzip,yaml,toml,venv,async,asyncio,threading,subprocess,kwargs,args,enum,uuid,onnxruntime,ctranslate,huggingface,tokenizers,flatbuffers,fsspec,httpcore,httpx,idna,mdurl,mpmath,protobuf,shellingham,sniffio,sympy,tqdm,urllib,coloredlogs,humanfriendly,filelock,certifi,charset,normalizer,annotated,anyio,chirp,transcriber,transcriptions,transcribe,iterm

chirp_notes_ai-0.0.1a0/.docs/DEVELOPMENT.md ADDED Viewed

@@ -0,0 +1,85 @@
+# Development Guide
+This document is for contributors working on Chirp. End users should start with the top-level `README.md`.
+## Prerequisites
+- Python 3.11+
+- macOS for local audio-capture development
+- Homebrew
+- Git
+- Ollama for note-generation and retrieval flows
+## Setup
+```bash
+git clone <repository-url>
+cd chirp-ai-note-app
+make dev-install
+```
+This installs system dependencies, syncs the Python environment with `uv`, installs the package in editable mode, and enables pre-commit hooks.
+If you only need the editable install in an already-prepared environment:
+```bash
+make install-venv
+```
+## Quality checks
+```bash
+make check
+make test
+make test-coverage
+make lint-fix
+make format
+make type-check
+```
+Targeted test helpers are also available:
+```bash
+make test-file FILE=tests/test_settings.py
+make test-match PATTERN=slugify
+make test-failed
+```
+## Useful CLI checks
+```bash
+uv run chirp --help
+uv run chirp init --recheck
+make verify-deps
+```
+## Project structure
+```text
+chirp-ai-note-app/
+├── chirp/           # Typer CLI entrypoint and high-level flows
+├── config/          # Pydantic settings and config-path helpers
+├── recorder/        # Audio recording, device handling, live transcription
+├── transcriber/     # Whisper transcription and batch processing
+├── notes/           # Note generation, templates, manual editing
+├── notes_chat/      # Retrieval, keyword search, chat flows, indexing
+├── utils/           # Shared filesystem and time helpers
+├── templates/       # Prompt and note templates
+├── scripts/         # Dev/debug helper scripts
+├── tests/           # Pytest suite
+├── AGENTS.md        # Canonical contributor guidance
+└── README.md        # Canonical user-facing readme
+```
+## Notes storage
+- Config file: `~/.chirp/config.toml`
+- Default notes root: `~/Documents/chirp`
+- Each note lives in its own folder with `audio.wav`, `transcript.txt`, `notes.md`, and `meta.toml`
+## Contributing
+1. Branch from `main`.
+2. Keep changes scoped.
+3. Run the relevant tests plus `make check`.
+4. Open a PR with context about behavior changes and any CLI-facing output changes.

chirp_notes_ai-0.0.1a0/.docs/architecture.md ADDED Viewed

@@ -0,0 +1,59 @@
+# Chirp Architecture (at a glance)
+This guide shows how Chirp moves from audio to answers using a simple, hybrid RAG pipeline. For implementation details, see `notes_chat/`, `notes/`, `transcriber/`, and `config/`.
+## Ingestion (index build)
+```mermaid
+flowchart LR
+  Notes[notes-out/*.md] --> Chunk[Chunk]
+  Chunk --> Embed[Embed]
+  Embed --> Chroma[(Chroma)]
+  Chunk --> BM25[BM25 Corpus]
+```
+## Retrieval (ask)
+```mermaid
+flowchart LR
+  Question[Question] --> QEmb[Embed]
+  QEmb --> Chroma[(Chroma)]
+  Question --> BM25Q[BM25 Search]
+  Chroma --> VecHits[Vector Hits]
+  BM25Q --> LexHits[Lexical Hits]
+  VecHits --> Merge[Merge + Dedupe]
+  LexHits --> Merge
+  Merge --> Context[Context Budget]
+  Context --> LLM[LLM]
+  LLM --> Answer[Answer + Sources]
+```
+- Chunking: section-aware with overlap. See: [Chunking Strategy](./chunking.md)
+- Embeddings: Ollama embeddings; same model for chunks and queries. See: [Embedding Backend](./embeddings.md)
+- Dedupe key: `(path, content_hash)` so overlapping or repeated text doesn’t show twice
+- Why hybrid? See: [Hybrid Retrieval](./hybrid-retrieval.md)
+## Components (what exists)
+- CLI (`chirp`): entry point to record, process notes, index, and chat
+- Recorder + Transcriber: produce transcription and notes
+- Note Generator: writes `notes-out/*.md`
+- Indexer: chunks, embeds, and stores in Chroma; rebuilds BM25
+- Retriever: hybrid search (vector + BM25), merges and builds context
+- LLM: answers using the built context
+- Storage:
+  - Chroma (persistent at `.notes_index/chroma`)
+  - BM25 corpus (at `.notes_index/bm25.json`)
+## Operations (quick refs)
+- Manual notes saved via the CLI are auto-indexed when `notes_chat.auto_index` is enabled.
+- See the main `README.md` for commands and usage.
+## Configuration
+- Main settings live in `config/config.yaml` (paths, models, and RAG tuning)
+- Notable knobs (see linked docs for behavior):
+  - Chunking: `notes_chat.chunk_size`, `notes_chat.overlap` → [Chunking](./chunking.md)
+  - Embeddings: `notes_chat.emb_model`, `models.ollama_url` → [Embeddings](./embeddings.md)

chirp_notes_ai-0.0.1a0/.docs/chunking.md ADDED Viewed

@@ -0,0 +1,73 @@
+# Chunking Strategy
+This document defines the chunking strategy used for indexing notes into the RAG pipeline.
+- Code reference: `notes_chat/index.py`
+- Config knobs: `notes_chat.chunk_size`, `notes_chat.overlap` in `config/config.yaml` / `config/settings.py`
+- Defaults: `chunk_size: 1000` characters, `overlap: 200` characters
+## Goals
+- Preserve semantic boundaries by splitting around second-level headings first (`##` in Markdown)
+- Keep chunks under a target character budget for efficient embedding and retrieval
+- Add overlap to reduce information loss at boundaries
+## Inputs & Outputs
+- Input: Markdown note text (from `notes-out/*.md`) and extracted metadata
+- Output: List of chunks with fields:
+  - `id`: `<file_stem>_<section_index>` or `<file_stem>_<section_index>_<chunk_index>`
+  - `content`: the chunk text
+  - `meta`: `title`, `date`, `participants`, `duration`, etc.
+  - `content_hash`: stable hash of normalized content for de-duplication
+## Algorithm
+1. Section-aware split
+   - Split the document on `\n##`, effectively chunking by second-level headings while retaining the heading text in the section.
+   - Skip empty sections and very short ones: sections with `< 50` characters are ignored.
+   - Note: Sections are defined as meetings and/or single transcripts.
+2. Size check per section
+   - If `len(section) <= chunk_size`, emit the whole section as a single chunk.
+   - Else, split the large section with overlapping windows (see below).
+3. Overlapping windows for large sections
+   - Convert character budgets to approximate word windows using a 6 characters-per-word heuristic:
+     - `chunk_words = chunk_size // 6`
+     - `overlap_words = overlap // 6`
+   - Slide a window across the section’s words:
+     - `start = 0`
+     - `end = min(start + chunk_words, total_words)`
+     - Emit `" ".join(words[start:end])`
+     - Set `start = end - overlap_words` (floors at 0) and repeat until the end of the section.
+4. Metadata and IDs
+   - Each chunk gets a deterministic `id` and carries `meta` plus `content_hash` (used for de-duplication).
+## Defaults & Tuning
+- Defaults from `config/config.yaml`:
+  - `notes_chat.chunk_size: 1000` (approx. 166 words per chunk)
+  - `notes_chat.overlap: 200` (approx. 33 words of overlap)
+- Increase `chunk_size` if your sections are dense and short, or you want fewer, larger chunks.
+- Increase `overlap` if you see boundary-loss in answers; decrease for faster indexing/search.
+## Edge Cases & Notes
+- No headings: the entire file acts as a single section and will be either a single chunk or split by word windows.
+- Very short files/sections (`< 50` chars) are ignored to avoid noisy chunks.
+- Non-ASCII/whitespace: tokenization uses `str.split()` (whitespace); very long tokens (e.g., URLs) may push beyond targets.
+- IDs and signatures (`content_hash`) help merge/dedupe across hybrid retrieval (Chroma + BM25).
+## Rationale
+- Section-first splitting aligns chunks with human-authored structure.
+- Overlap preserves context across chunk boundaries, improving recall in semantic search.
+- Word-based windows derived from character budgets keep behavior stable while allowing intuitive char-sized tuning.
+See also: [Architecture](./architecture.md)

chirp_notes_ai-0.0.1a0/.docs/diarization-roadmap.md ADDED Viewed

@@ -0,0 +1,139 @@
+# Diarization Roadmap (Local, macOS-first)
+This document outlines a pragmatic, phased plan to add speaker detection to Chirp. It keeps everything local, starts simple, and gives a clear upgrade path. No implementation is committed yet—this is a planning guide for when we're ready.
+- Goals
+  - Tag “You” vs “Others” reliably during meetings
+  - Optionally split “Others” into Speaker A/B/C
+  - Keep fully local; no cloud calls or gated model requirements by default
+  - Minimal impact on the current CLI until we flip it on
+- Scope
+  - macOS focus to start (BlackHole + Aggregate/Multi-Output devices)
+  - Compatible with existing transcription (faster-whisper)
+  - Future-compatible with Linux/Windows if desired
+## Phase 0 — Prep and Guardrails
+- Config placeholders (no behavior change yet)
+  - `diarization.enabled: false`
+  - `diarization.backend: "speechbrain" | "pyannote"`
+  - `diarization.overlap: false`
+  - `diarization.align_words: true`
+  - `diarization.mic_label.enabled: true`
+- Recorder readiness
+  - Dual capture (Mic + System via BlackHole) or Aggregate Device input (2 channels)
+  - Keep sample rate aligned (e.g., 48 kHz)
+- Tooling
+  - `chirp status` and `chirp devices` surface device hints (Aggregate/Multi-Output)
+- Acceptance criteria
+  - Config keys exist and are ignored safely when disabled
+  - Clear doc pointers; no runtime breakage if diarization is off
+## Phase 1 — Mic-based “You” Labeling (No Other-speaker splits)
+- Approach
+  - Record two synchronized streams: Mic (you) and System (others)
+  - Run VAD on Mic to build a "you speaking" time mask
+  - Assign transcript words/segments to "You" when they overlap the mask; otherwise "Other"
+- Pros: Zero model downloads, fast, robust with headphones
+- Limitations: No separation among “Others,” no overlap handling
+- Config ideas
+  - `diarization.mic_label.threshold: 0.4` (overlap proportion)
+  - `diarization.vad: webrtc`
+- Acceptance criteria
+  - Words you speak are labeled “You” with high precision on headphone setups
+  - No regression to existing transcription/notes when disabled
+## Phase 2 — SpeechBrain Diarization for “Others”
+- Approach
+  - VAD on System stream → short speech segments (~1.5–3.0s)
+  - ECAPA‑TDNN embeddings (SpeechBrain) for each segment
+  - Cluster embeddings (Agglomerative/Spectral, cosine distance) into Speaker A/B/C
+  - Map words to speakers by timestamp overlap (±100–200 ms padding)
+  - Keep Mic-based “You” override: if Mic is active, label as “You”
+- Pros: Fully local, no gated models, Apple Silicon friendly
+- Limitations: Not overlap‑aware; best for typical meetings
+- Config ideas
+  - `diarization.backend: "speechbrain"`
+  - `diarization.max_speakers: null` (auto 2–6 with override)
+  - `diarization.min_speech_ms: 250`
+  - `diarization.max_silence_ms: 400`
+  - `diarization.frame_ms: 30`
+- Acceptance criteria
+  - Two‑speaker meetings label cleanly into “You” and one other speaker
+  - Three‑plus speaker meetings are reasonable (some fragmentation acceptable)
+## Phase 3 — Overlap Handling
+- Approach
+  - Basic: If both Mic and System show strong activity, mark words as "Overlap" or choose dominant energy
+  - Advanced: Enable overlap‑aware diarization (e.g., pyannote backend) to permit multiple concurrent speakers
+  - Word assignment: choose the speaker with highest local activity; if Mic active and dominant, prefer “You”
+- Pros: Better attribution during cross‑talk
+- Limitations: Heavier models if you enable pyannote (requires HF token)
+- Config ideas
+  - `diarization.overlap: true`
+  - `diarization.energy_ratio_threshold: 2.0`
+- Acceptance criteria
+  - Cross‑talk segments aren’t misattributed wholesale; predictable tie‑breaking
+## Phase 4 — Performance, Reliability, and Fallbacks
+- Performance
+  - Chunk long audio (2–5 min with overlaps) to bound memory; stitch results
+  - CPU acceptable on M‑series; keep offline, non‑realtime
+- Reliability
+  - Friendly errors if models unavailable; clear instructions in `chirp status/test`
+- Fallbacks
+  - If `backend: pyannote` is requested but models/HF token absent, fall back to SpeechBrain with a warning
+- Acceptance criteria
+  - 30–90 minute meetings process reliably with clear messaging and no hard failures
+## Phase 5 — Optional Identification and UX
+- Identify "Me"
+  - Optional enrollment: store a local ECAPA voiceprint; relabel diarized cluster as “You” via cosine similarity
+  - Keep others anonymous (Speaker A/B/C)
+- CLI/UX
+  - Flags: `--diarize`, `--overlap`, `--identify-me`
+  - Templates: compact vs detailed speaker labels
+- Indexing
+  - Include `speaker` and `is_you` metadata for future filters
+- Acceptance criteria
+  - Easy toggles, predictable output, speaker tags improve note readability
+## Risks and Mitigations
+- Mic bleed (no headphones): higher false “You” rate → increase thresholds; recommend headphones
+- Double‑talk: resolve with overlap heuristics or pyannote backend when needed
+- Fragmentation: merge adjacent same‑speaker segments; tune VAD thresholds
+- Model weight size/network: default to SpeechBrain; make pyannote optional
+## Testing Plan (incremental)
+- Unit tests
+  - VAD mask generation (Mic/System) with synthetic signals
+  - Word‑to‑mask overlap labeling
+  - Embedding clustering on toy datasets (2–3 speakers)
+- Fixtures
+  - Short (2–5 min) two‑speaker and three‑speaker samples
+  - Headphones vs. speaker playback scenarios
+- CLI smoke
+  - `chirp transcribe --diarize` produces labeled utterances; disabled mode unchanged
+## Next Steps (when ready)
+- Keep diarization disabled by default
+- Implement Phase 1 first (mic‑based labeling) for immediate value
+- Add Phase 2 (SpeechBrain) behind `backend: speechbrain`
+- Consider Phase 3 (overlap) only if needed; pyannote as an optional backend
+---
+References
+- SpeechBrain ECAPA TDNN: <https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb>
+- WebRTC VAD: <https://webrtc.org/> (Python bindings: `webrtcvad`)
+- pyannote (optional backend): <https://github.com/pyannote/pyannote-audio>

chirp_notes_ai-0.0.1a0/.docs/embeddings.md ADDED Viewed

@@ -0,0 +1,96 @@
+# Embedding Backend
+This document explains how embeddings are generated and used in Chirp’s RAG pipeline.
+- Code references: `notes_chat/index.py`, `notes_chat/retrieval.py`, `config/config.yaml`, `config/settings.py`, `notes_chat/prompting.py`
+- Default backend: [Ollama](https://ollama.com) HTTP API
+- Default embedding model: `nomic-embed-text`
+## Overview
+Embeddings convert text into high-dimensional vectors that preserve semantic similarity. Chirp uses embeddings to:
+- Index note chunks into a vector database (Chroma)
+- Embed queries at retrieval time and run vector similarity search
+```mermaid
+flowchart LR
+  A[Chunk Text] -->|POST /api/embeddings| O[Ollama]
+  O --> V[Vector]
+  V --> C[(Chroma Collection 'notes')]
+```
+## Configuration
+- `config/config.yaml`
+  - `notes_chat.emb_model`: embedding model name (default `nomic-embed-text`)
+  - `models.ollama_url`: Ollama server URL (default `http://localhost:11434`)
+- `config/settings.py` hydrates these into `ChirpSettings` used across the app.
+## Indexing Flow
+- Implemented in `notes_chat/index.py`:
+  1. Chunk notes (section-aware + overlapping windows)
+  2. For each chunk, compute `content_hash` and call Ollama embeddings:
+     - Endpoint: `POST {ollama_url}/api/embeddings`
+     - Payload: `{ "model": emb_model, "prompt": chunk_text }`
+     - Response: `{ "embedding": [float, ...] }`
+  3. Upsert into Chroma with `ids`, `documents`, `embeddings`, and metadata (including `content_hash`)
+  4. Rebuild the BM25 lexicon (`.notes_index/bm25.json`) from Chroma documents
+Key method signatures:
+- `_get_embeddings(texts: list[str]) -> list[list[float]]`
+- `collection.add(ids, documents, embeddings, metadatas)`
+## Retrieval Flow
+- Implemented in `notes_chat/retrieval.py`:
+  1. Parse time filter (if present)
+  2. Compute query embedding via Ollama:
+     - Endpoint: `POST {ollama_url}/api/embeddings`
+     - Payload: `{ "model": emb_model, "prompt": query }`
+  3. Query Chroma for top-k semantic matches
+  4. Query BM25 for lexical matches
+  5. Merge + dedupe using `(path, content_hash)`
+  6. Build context under a character budget and pass to the LLM for answering
+## Determinism and Model Choice
+- Embedding calls are stateless and do not stream.
+- The chosen model `nomic-embed-text` provides a general-purpose English embedding suitable for note-sized chunks.
+- You can swap `notes_chat.emb_model` to another Ollama-compatible embedding model if desired.
+## Error Handling
+- Indexing (`_get_embeddings`):
+  - Non-200 responses cause indexing of the whole file to fail, and the file is skipped.
+  - Connection errors are caught and surfaced via a console message.
+- Retrieval (`_get_query_embedding`):
+  - Returns `None` on error; retrieval will still return BM25-only results or an informative suggestion if nothing is found.
+Common failure modes and fixes:
+- “Failed to get embeddings”: ensure Ollama is running and the model is pulled.
+- Timeouts: usually caused by large models or long prompts—verify that `ollama serve` is running and that local resources are sufficient.
+## Troubleshooting
+- Verify Ollama:
+  - `curl {ollama_url}/api/version`
+  - `curl {ollama_url}/api/tags` (ensure `notes_chat.emb_model` is listed)
+- From project root:
+  - Rebuild index: `uv run chirp notes index --force`
+## Extensibility
+- Add other embedding backends by implementing equivalents of:
+  - Index: `_get_embeddings(texts)`
+  - Retrieval: `_get_query_embedding(query)`
+- Keep `content_hash` unchanged—only the embedding vectors change.
+- Consider adding model-specific normalization or truncation if needed by the target API.
+## Privacy
+- With Ollama running locally, text never leaves your machine.
+- If you later switch to a hosted embedding API, review data policies and redact sensitive content as needed before embedding.

chirp_notes_ai-0.0.1a0/.docs/hybrid-retrieval.md ADDED Viewed

@@ -0,0 +1,48 @@
+# Hybrid Retrieval: Embeddings + BM25
+This note explains why Chirp uses hybrid retrieval (semantic embeddings + BM25 lexical search), how it works, and when you might tune or change it.
+## TL;DR
+- Keep both: embeddings catch semantic matches; BM25 catches exact terms (IDs, names, phrases).
+- Merge the results, dedupe by `(path, content_hash)`, then build a context under a fixed character budget.
+## How it works
+See the retrieval diagram in the Architecture doc: [Architecture → Retrieval (ask)](./architecture.md#retrieval-ask).
+- Embeddings: query and chunks are embedded with the same model; Chroma returns top-k by cosine similarity.
+- BM25: ranks chunks by lexical overlap; strong for exact tokens, IDs, acronyms, and phrases.
+- Merge + Dedupe: combine lists and deduplicate using `(path, content_hash)`.
+- Context: allocate text across top chunks within `ctx_char_budget`, then prompt the LLM and attach sources.
+## Why hybrid?
+- Short or specific queries: BM25 shines on IDs (e.g., `jira-123`), names, codes, dates, and quoted phrases.
+- Paraphrased or fuzzy queries: embeddings retrieve semantically related content even if words differ.
+- Local and fast: both run locally (Ollama + Chroma + BM25 corpus) with low overhead.
+## When to change it
+- Stronger embeddings + re-ranker: if you add a cross-encoder/LLM re-ranking step, embeddings-only can be competitive.
+- Domain-specific tokens: increase BM25 weight if queries often include IDs or exact terms.
+- Minimal needs: if queries are always natural language, embeddings-only may be sufficient.
+## Tuning tips
+- k-values: start with `k=10` for both; keep totals small to avoid noisy merges.
+- Fusion: simple score normalization or Reciprocal Rank Fusion (RRF) keeps logic robust.
+- Routing: detect quotes, ALL-CAPS acronyms, many digits → boost BM25’s influence.
+- Budgeting: round-robin or interleaving across sources when constructing the context.
+## Failure modes and guardrails
+- Empty or low-similarity vectors: backfill from BM25.
+- Duplicate content across notes: dedupe with `(path, content_hash)`.
+- Very long chunks: rely on chunking and overlaps to keep embeddings effective (see [Chunking](./chunking.md)).
+- Index freshness: when `notes_chat.auto_index` is enabled, manual note saves trigger an index update.
+## References
+- Embeddings backend: [Embeddings](./embeddings.md)
+- Architecture overview: [Architecture](./architecture.md)

chirp_notes_ai-0.0.1a0/.docs/imgs/chirp-logo.png ADDED Viewed

Binary file

chirp_notes_ai-0.0.1a0/.github/actions/quality-checks/action.yml ADDED Viewed

@@ -0,0 +1,63 @@
+name: 'Quality Checks'
+description: 'Run formatting, linting, spell check, and type checking'
+inputs:
+  python-version:
+    description: 'Python version to use'
+    required: false
+    default: '3.11'
+  uv-version:
+    description: 'UV version to use'
+    required: false
+    default: 'latest'
+runs:
+  using: composite
+  steps:
+    - name: Set up Python
+      uses: actions/setup-python@v6
+      with:
+        python-version: ${{ inputs.python-version }}
+    - name: Install uv
+      uses: astral-sh/setup-uv@v8.1.0
+      with:
+        version: ${{ inputs.uv-version }}
+        enable-cache: true
+    - name: Install system dependencies (macOS)
+      shell: bash
+      run: |
+        if [[ "$RUNNER_OS" == "macOS" ]]; then
+          brew install portaudio
+        fi
+    - name: Install Python dependencies
+      shell: bash
+      run: |
+        uv sync --all-extras
+    - name: Run formatting check
+      shell: bash
+      run: |
+        uv run ruff format --check .
+    - name: Run linting
+      shell: bash
+      run: |
+        uv run ruff check .
+    - name: Run spell check
+      shell: bash
+      run: |
+        uv run codespell
+    - name: Run type checking
+      shell: bash
+      run: |
+        uv run mypy
+    - name: Validate code compilation
+      shell: bash
+      run: |
+        make validate

chirp_notes_ai-0.0.1a0/.github/workflows/main-build.yml ADDED Viewed

@@ -0,0 +1,40 @@
+name: Main Branch Build
+on:
+  push:
+    branches: [ main ]
+  workflow_dispatch:
+jobs:
+  quality-and-build:
+    runs-on: macos-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v6
+    - name: Run quality checks
+      uses: ./.github/actions/quality-checks
+    - name: Run tests
+      run: |
+        uv run pytest tests/ -v --cov=chirp --cov=config --cov=recorder --cov=transcriber --cov=notes --cov=utils --cov-report=xml
+    - name: Build package
+      run: |
+        uv build
+    - name: Upload build artifacts
+      uses: actions/upload-artifact@v7
+      with:
+        name: chirp-package
+        path: dist/
+        retention-days: 30
+    - name: Upload coverage reports
+      uses: codecov/codecov-action@v6
+      with:
+        file: ./coverage.xml
+        flags: unittests
+        name: codecov-umbrella
+        fail_ci_if_error: false

chirp_notes_ai-0.0.1a0/.github/workflows/pr-checks.yml ADDED Viewed

@@ -0,0 +1,46 @@
+name: PR Quality Checks
+on:
+  pull_request:
+    branches: [ main ]
+    types: [opened, synchronize, reopened, ready_for_review]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  quality-checks:
+    if: github.event.pull_request.draft == false
+    runs-on: macos-latest
+    steps:
+      - name: ⤵️ Checkout code
+        uses: actions/checkout@v6
+      - name: 🔎 Run quality checks (format, lint, spell, type, validate)
+        uses: ./.github/actions/quality-checks
+        with:
+          python-version: '3.11'
+  build-and-test:
+    needs: quality-checks
+    if: github.event.pull_request.draft == false
+    uses: ./.github/workflows/shared-build-and-test.yaml
+    with:
+      python-version: '3.11'
+      os-matrix: '["macos-latest"]'
+      upload-artifacts: true
+      update-version-for-pr: true
+      run-integration-tests: false
+  publish-to-test-pypi:
+    needs: build-and-test
+    if: github.event.pull_request.draft == false
+    uses: ./.github/workflows/shared-publish-package.yaml
+    with:
+      python-version: '3.11'
+      is-test-pypi: true
+    secrets: inherit
+    permissions:
+      pull-requests: write
+      id-token: write