slurpai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ # Required for OpenAI backend
2
+ OPENAI_API_KEY=sk-...
3
+
4
+ # Optional: default backend (openai or faster-whisper)
5
+ # INGESTIBLE_BACKEND=openai
6
+
7
+ # Optional: OpenAI model override
8
+ # OPENAI_WHISPER_MODEL=whisper-1
9
+
10
+ # Optional: local Whisper model size (base, small, medium, large)
11
+ # INGESTIBLE_WHISPER_MODEL=base
@@ -0,0 +1,31 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install ffmpeg
20
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies
28
+ run: pip install -e ".[dev]"
29
+
30
+ - name: Run tests
31
+ run: pytest -v
@@ -0,0 +1,29 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ *.egg
8
+
9
+ # Virtual environments
10
+ .venv/
11
+ venv/
12
+
13
+ # Environment
14
+ .env
15
+
16
+ # IDE
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+ *.swo
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Test
27
+ .pytest_cache/
28
+ .coverage
29
+ htmlcov/
@@ -0,0 +1,37 @@
1
+ # Contributing to ingestible
2
+
3
+ Thanks for wanting to help. Here's how to get set up.
4
+
5
+ ## Development setup
6
+
7
+ ```bash
8
+ git clone https://github.com/grahamrowe82/ingestible.git
9
+ cd ingestible
10
+ python -m venv .venv
11
+ source .venv/bin/activate
12
+ pip install -e ".[dev]"
13
+ ```
14
+
15
+ You'll also need [ffmpeg](https://ffmpeg.org/) installed.
16
+
17
+ ## Running tests
18
+
19
+ ```bash
20
+ pytest -v
21
+ ```
22
+
23
+ Tests use ffmpeg to generate tiny test audio/video files — no API keys needed.
24
+
25
+ ## Submitting changes
26
+
27
+ 1. Fork the repo and create a branch
28
+ 2. Make your changes
29
+ 3. Run `pytest` and make sure everything passes
30
+ 4. Open a pull request
31
+
32
+ ## Reporting bugs
33
+
34
+ Open an issue at https://github.com/grahamrowe82/ingestible/issues with:
35
+ - What you ran (`ingest ...`)
36
+ - What happened (error message or unexpected output)
37
+ - Your OS and Python version
slurpai-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Graham Rowe
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
slurpai-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: slurpai
3
+ Version: 0.1.0
4
+ Summary: Convert voice notes, videos, and audio files into AI-ready text and images
5
+ Project-URL: Repository, https://github.com/grahamrowe82/ingestible
6
+ Project-URL: Issues, https://github.com/grahamrowe82/ingestible/issues
7
+ Author-email: Graham Rowe <graham@phasetransitions.ai>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: ai,audio-to-text,cli,ffmpeg,openai,transcription,video-to-text,voice-notes,whisper
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
20
+ Classifier: Topic :: Text Processing
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: click>=8.0
23
+ Requires-Dist: openai>=1.0
24
+ Requires-Dist: python-dotenv>=1.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0; extra == 'dev'
27
+ Provides-Extra: local
28
+ Requires-Dist: faster-whisper>=0.10; extra == 'local'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # ingestible
32
+
33
+ Convert voice notes, videos, and audio files into AI-ready text and images.
34
+
35
+ Consultants, researchers, and anyone who works with AI tools faces the same problem: clients and colleagues send voice notes, screen recordings, and video walkthroughs — but your AI workflow needs text and images. Ingestible bridges that gap with a single command.
36
+
37
+ ## Quick start
38
+
39
+ ```bash
40
+ pip install slurpai
41
+ export OPENAI_API_KEY=sk-...
42
+ ingest client-feedback.opus
43
+ ```
44
+
45
+ That's it. You get a folder with `transcript.txt` and you're ready to feed it into whatever AI tool you're using.
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install slurpai
51
+ ```
52
+
53
+ You also need [ffmpeg](https://ffmpeg.org/) on your PATH:
54
+
55
+ | OS | Command |
56
+ |----|---------|
57
+ | macOS | `brew install ffmpeg` |
58
+ | Ubuntu/Debian | `sudo apt install ffmpeg` |
59
+ | Windows | `choco install ffmpeg` or download from [ffmpeg.org](https://ffmpeg.org/download.html) |
60
+
61
+ ## Usage
62
+
63
+ ```bash
64
+ # Transcribe a voice note
65
+ ingest recording.opus
66
+
67
+ # Process a video (transcript + frame grabs every 15 seconds)
68
+ ingest feedback.mp4
69
+
70
+ # Batch process everything in a folder
71
+ ingest *.opus *.mp4
72
+
73
+ # Grab frames more frequently
74
+ ingest --frame-interval 5 demo.mp4
75
+
76
+ # Use local Whisper instead of OpenAI API
77
+ pip install "slurpai[local]"
78
+ ingest --backend faster-whisper recording.opus
79
+
80
+ # Preview what would be processed
81
+ ingest --dry-run *.opus
82
+ ```
83
+
84
+ ## Output
85
+
86
+ Each file produces a folder alongside it:
87
+
88
+ ```
89
+ recording/
90
+ ├── transcript.txt # Plain text transcription
91
+ ├── frames/ # Video frame grabs (video only)
92
+ │ ├── frame_001.jpg
93
+ │ ├── frame_002.jpg
94
+ │ └── ...
95
+ └── process.log # Timestamped processing log
96
+ ```
97
+
98
+ Re-running the same command skips already-completed files (idempotent).
99
+
100
+ ## Privacy notice
101
+
102
+ **By default, ingestible sends your audio to [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text) for transcription.** Your audio is transmitted to OpenAI's servers. Review [OpenAI's data usage policy](https://openai.com/policies/api-data-usage-policies) to understand how your data is handled.
103
+
104
+ If you need fully local, private transcription — no data leaves your machine:
105
+
106
+ ```bash
107
+ pip install "slurpai[local]"
108
+ ingest --backend faster-whisper recording.opus
109
+ ```
110
+
111
+ This uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) running entirely on your CPU. It's slower but nothing leaves your computer.
112
+
113
+ ## Configuration
114
+
115
+ Set `OPENAI_API_KEY` in your environment or a `.env` file in the current directory.
116
+
117
+ | Variable | Default | Description |
118
+ |----------|---------|-------------|
119
+ | `OPENAI_API_KEY` | — | Required for OpenAI backend |
120
+ | `INGESTIBLE_BACKEND` | `openai` | Default backend (`openai` or `faster-whisper`) |
121
+ | `OPENAI_WHISPER_MODEL` | `whisper-1` | OpenAI model to use |
122
+ | `INGESTIBLE_WHISPER_MODEL` | `base` | Local Whisper model size (`base`, `small`, `medium`, `large`) |
123
+
124
+ ## Supported formats
125
+
126
+ **Audio:** `.opus`, `.m4a`, `.ogg`, `.mp3`, `.wav`
127
+
128
+ **Video:** `.mp4`, `.mkv`, `.mov`, `.webm`
129
+
130
+ All formats are normalised to MP3 before transcription — this ensures consistent behaviour regardless of input format.
131
+
132
+ ## Requirements
133
+
134
+ - Python 3.10+
135
+ - [ffmpeg](https://ffmpeg.org/) on your PATH
136
+
137
+ ## Contributing
138
+
139
+ Found a bug or want to add a format? See [CONTRIBUTING.md](CONTRIBUTING.md).
140
+
141
+ ## License
142
+
143
+ MIT
@@ -0,0 +1,113 @@
1
+ # ingestible
2
+
3
+ Convert voice notes, videos, and audio files into AI-ready text and images.
4
+
5
+ Consultants, researchers, and anyone who works with AI tools faces the same problem: clients and colleagues send voice notes, screen recordings, and video walkthroughs — but your AI workflow needs text and images. Ingestible bridges that gap with a single command.
6
+
7
+ ## Quick start
8
+
9
+ ```bash
10
+ pip install slurpai
11
+ export OPENAI_API_KEY=sk-...
12
+ ingest client-feedback.opus
13
+ ```
14
+
15
+ That's it. You get a folder with `transcript.txt` and you're ready to feed it into whatever AI tool you're using.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install slurpai
21
+ ```
22
+
23
+ You also need [ffmpeg](https://ffmpeg.org/) on your PATH:
24
+
25
+ | OS | Command |
26
+ |----|---------|
27
+ | macOS | `brew install ffmpeg` |
28
+ | Ubuntu/Debian | `sudo apt install ffmpeg` |
29
+ | Windows | `choco install ffmpeg` or download from [ffmpeg.org](https://ffmpeg.org/download.html) |
30
+
31
+ ## Usage
32
+
33
+ ```bash
34
+ # Transcribe a voice note
35
+ ingest recording.opus
36
+
37
+ # Process a video (transcript + frame grabs every 15 seconds)
38
+ ingest feedback.mp4
39
+
40
+ # Batch process everything in a folder
41
+ ingest *.opus *.mp4
42
+
43
+ # Grab frames more frequently
44
+ ingest --frame-interval 5 demo.mp4
45
+
46
+ # Use local Whisper instead of OpenAI API
47
+ pip install "slurpai[local]"
48
+ ingest --backend faster-whisper recording.opus
49
+
50
+ # Preview what would be processed
51
+ ingest --dry-run *.opus
52
+ ```
53
+
54
+ ## Output
55
+
56
+ Each file produces a folder alongside it:
57
+
58
+ ```
59
+ recording/
60
+ ├── transcript.txt # Plain text transcription
61
+ ├── frames/ # Video frame grabs (video only)
62
+ │ ├── frame_001.jpg
63
+ │ ├── frame_002.jpg
64
+ │ └── ...
65
+ └── process.log # Timestamped processing log
66
+ ```
67
+
68
+ Re-running the same command skips already-completed files (idempotent).
69
+
70
+ ## Privacy notice
71
+
72
+ **By default, ingestible sends your audio to [OpenAI's Whisper API](https://platform.openai.com/docs/guides/speech-to-text) for transcription.** Your audio is transmitted to OpenAI's servers. Review [OpenAI's data usage policy](https://openai.com/policies/api-data-usage-policies) to understand how your data is handled.
73
+
74
+ If you need fully local, private transcription — no data leaves your machine:
75
+
76
+ ```bash
77
+ pip install "slurpai[local]"
78
+ ingest --backend faster-whisper recording.opus
79
+ ```
80
+
81
+ This uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) running entirely on your CPU. It's slower but nothing leaves your computer.
82
+
83
+ ## Configuration
84
+
85
+ Set `OPENAI_API_KEY` in your environment or a `.env` file in the current directory.
86
+
87
+ | Variable | Default | Description |
88
+ |----------|---------|-------------|
89
+ | `OPENAI_API_KEY` | — | Required for OpenAI backend |
90
+ | `INGESTIBLE_BACKEND` | `openai` | Default backend (`openai` or `faster-whisper`) |
91
+ | `OPENAI_WHISPER_MODEL` | `whisper-1` | OpenAI model to use |
92
+ | `INGESTIBLE_WHISPER_MODEL` | `base` | Local Whisper model size (`base`, `small`, `medium`, `large`) |
93
+
94
+ ## Supported formats
95
+
96
+ **Audio:** `.opus`, `.m4a`, `.ogg`, `.mp3`, `.wav`
97
+
98
+ **Video:** `.mp4`, `.mkv`, `.mov`, `.webm`
99
+
100
+ All formats are normalised to MP3 before transcription — this ensures consistent behaviour regardless of input format.
101
+
102
+ ## Requirements
103
+
104
+ - Python 3.10+
105
+ - [ffmpeg](https://ffmpeg.org/) on your PATH
106
+
107
+ ## Contributing
108
+
109
+ Found a bug or want to add a format? See [CONTRIBUTING.md](CONTRIBUTING.md).
110
+
111
+ ## License
112
+
113
+ MIT
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "slurpai"
7
+ version = "0.1.0"
8
+ description = "Convert voice notes, videos, and audio files into AI-ready text and images"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Graham Rowe", email = "graham@phasetransitions.ai" },
14
+ ]
15
+ keywords = [
16
+ "whisper", "transcription", "voice-notes", "audio-to-text",
17
+ "video-to-text", "ffmpeg", "openai", "ai", "cli",
18
+ ]
19
+ classifiers = [
20
+ "Development Status :: 4 - Beta",
21
+ "Environment :: Console",
22
+ "Intended Audience :: Developers",
23
+ "License :: OSI Approved :: MIT License",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Programming Language :: Python :: 3.13",
28
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
29
+ "Topic :: Text Processing",
30
+ ]
31
+
32
+ dependencies = [
33
+ "click>=8.0",
34
+ "python-dotenv>=1.0",
35
+ "openai>=1.0",
36
+ ]
37
+
38
+ [project.urls]
39
+ Repository = "https://github.com/grahamrowe82/ingestible"
40
+ Issues = "https://github.com/grahamrowe82/ingestible/issues"
41
+
42
+ [project.optional-dependencies]
43
+ local = ["faster-whisper>=0.10"]
44
+ dev = [
45
+ "pytest>=7.0",
46
+ ]
47
+
48
+ [project.scripts]
49
+ ingest = "ingestible.cli:ingest"
50
+
51
+ [tool.hatch.build.targets.wheel]
52
+ packages = ["src/ingestible"]
53
+
54
+ [tool.pytest.ini_options]
55
+ testpaths = ["tests"]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,106 @@
1
+ """CLI entry point for ingestible."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import click
10
+ from dotenv import load_dotenv
11
+
12
+ from . import __version__
13
+
14
+
15
@click.command()
@click.argument("files", nargs=-1, required=True, type=click.Path(exists=True))
@click.option(
    "-b",
    "--backend",
    type=click.Choice(["openai", "faster-whisper"]),
    default=None,
    help="Transcription backend (default: env or openai)",
)
@click.option(
    "-f",
    "--frame-interval",
    type=int,
    default=15,
    help="Seconds between video frame grabs (default: 15)",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=None,
    help="Base output directory (default: next to input file)",
)
@click.option(
    "-l",
    "--language",
    type=str,
    default="en",
    help="Language hint for transcription (default: en)",
)
@click.option("--dry-run", is_flag=True, help="Show what would be processed")
@click.version_option(version=__version__)
def ingest(
    files: tuple[str, ...],
    backend: str | None,
    frame_interval: int,
    output_dir: str | None,
    language: str,
    dry_run: bool,
) -> None:
    """Convert voice notes, audio files, and videos into text and images."""
    # Pick up OPENAI_API_KEY / INGESTIBLE_* settings from a local .env file.
    load_dotenv()

    # CLI flag wins, then the environment, then the OpenAI default.
    backend = backend or os.getenv("INGESTIBLE_BACKEND", "openai")

    # Imported lazily so `--help`/`--version` stay fast.
    from .ffmpeg import check_ffmpeg
    from .process import SUPPORTED_EXTENSIONS, process_file

    if not check_ffmpeg():
        click.echo("Error: ffmpeg not found. Install it: brew install ffmpeg", err=True)
        sys.exit(1)

    output_base = Path(output_dir) if output_dir else None
    paths = [Path(f) for f in files]

    # Filter to supported formats
    supported = []
    for p in paths:
        if p.suffix.lower() in SUPPORTED_EXTENSIONS:
            supported.append(p)
        else:
            click.echo(f"Skipping unsupported format: {p.name}")

    if not supported:
        click.echo("No supported files to process.")
        sys.exit(1)

    if dry_run:
        click.echo(f"Would process {len(supported)} file(s) with backend={backend}:")
        for p in supported:
            click.echo(f"  {p}")
        return

    success = 0
    failed = 0
    for p in supported:
        try:
            result = process_file(
                p,
                backend=backend,
                frame_interval=frame_interval,
                output_dir=output_base,
                language=language,
            )
            click.echo(f"Done: {result}")
            success += 1
        except Exception as e:
            click.echo(f"Failed: {p.name} — {e}", err=True)
            failed += 1

    if len(supported) > 1:
        click.echo(f"\n{success} succeeded, {failed} failed out of {len(supported)}")

    # Bug fix: the command previously exited 0 even when files failed to
    # process; surface the failure to the shell so scripts/CI can detect it.
    if failed:
        sys.exit(1)
@@ -0,0 +1,65 @@
1
+ """FFmpeg wrappers for audio extraction and frame capture."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ import subprocess
7
+ from pathlib import Path
8
+
9
+
10
def check_ffmpeg() -> bool:
    """Report whether an ffmpeg executable can be located on PATH."""
    return bool(shutil.which("ffmpeg"))
13
+
14
+
15
def extract_audio(input_path: Path, output_path: Path) -> Path:
    """Extract the audio track of *input_path* into *output_path* as MP3.

    Output is mono, 16 kHz, 64 kbps — compresses a 10-min video from ~60MB
    to ~5MB, staying under the 25MB Whisper API upload limit.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    cmd = [
        "ffmpeg", "-y",
        # Bug fix: -loglevel must come before the output file. Options placed
        # after the last output are "trailing options" that ffmpeg ignores.
        "-loglevel", "warning",
        "-i", str(input_path),
        "-vn", "-ac", "1", "-ar", "16000", "-b:a", "64k",
        str(output_path),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg audio extraction failed: {result.stderr.strip()}")
    return output_path
31
+
32
+
33
def extract_frames(input_path: Path, output_dir: Path, *, interval: int = 15) -> int:
    """Extract one JPEG frame every *interval* seconds into *output_dir*.

    Returns:
        The number of ``frame_*.jpg`` files present in *output_dir* afterwards.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pattern = str(output_dir / "frame_%03d.jpg")

    cmd = [
        # Bug fixes: -y added so ffmpeg never blocks on an overwrite prompt
        # (stdin is not a TTY here), and -loglevel moved before the output —
        # options after the last output are trailing options ffmpeg ignores.
        "ffmpeg", "-y",
        "-loglevel", "warning",
        "-i", str(input_path),
        "-vf", f"fps=1/{interval}",
        "-q:v", "2",
        pattern,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg frame extraction failed: {result.stderr.strip()}")

    return len(list(output_dir.glob("frame_*.jpg")))
53
+
54
+
55
def has_video_stream(input_path: Path) -> bool:
    """Return True when ffprobe reports at least one video stream in the file."""
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "v",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        str(input_path),
    ]
    probe = subprocess.run(probe_cmd, capture_output=True, text=True)
    # ffprobe prints one "video" line per video stream; anything else is audio-only.
    return "video" in probe.stdout.lower()
@@ -0,0 +1,23 @@
1
+ """Simple dual logger — writes to stdout and a process.log file."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+
8
+
9
class ProcessLog:
    """Append-only logger: every message goes to stdout and to a log file."""

    def __init__(self, log_path: Path):
        self.log_path = log_path
        # Make sure the directory holding the log file exists up front.
        self.log_path.parent.mkdir(parents=True, exist_ok=True)

    def log(self, message: str) -> None:
        """Emit *message* with an HH:MM:SS timestamp to stdout and the file."""
        stamp = datetime.now().strftime("%H:%M:%S")
        entry = f"[{stamp}] {message}"
        print(entry)
        with self.log_path.open("a", encoding="utf-8") as fh:
            fh.write(entry + "\n")

    def skip(self, message: str) -> None:
        """Log *message* tagged as a skipped (already-done) step."""
        self.log(f"[skip] {message}")
@@ -0,0 +1,98 @@
1
+ """Core orchestrator — processes a single file through the ingest pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from .ffmpeg import extract_audio, extract_frames, has_video_stream
8
+ from .log import ProcessLog
9
+ from .transcribe import transcribe
10
+
11
+ AUDIO_EXTENSIONS = {".opus", ".m4a", ".ogg", ".mp3", ".wav"}
12
+ VIDEO_EXTENSIONS = {".mp4", ".mkv", ".mov", ".webm"}
13
+ SUPPORTED_EXTENSIONS = AUDIO_EXTENSIONS | VIDEO_EXTENSIONS
14
+
15
+
16
def process_file(
    input_path: Path,
    *,
    backend: str,
    frame_interval: int = 15,
    output_dir: Path | None = None,
    language: str = "en",
) -> Path:
    """Run the full ingest pipeline for one audio/video file.

    Steps: (1) extract/normalise audio to MP3 and transcribe it, (2) for
    videos, grab a frame every *frame_interval* seconds. Both steps are
    idempotent — existing outputs are detected and skipped.

    Args:
        input_path: Source audio or video file.
        backend: Transcription backend name ("openai" or "faster-whisper").
        frame_interval: Seconds between video frame grabs.
        output_dir: Base output directory; defaults to next to the input.
        language: Language hint passed to the transcriber.

    Returns:
        The output directory containing transcript.txt (and frames/ for video).

    Raises:
        ValueError: if the file extension is not supported.
    """
    input_path = input_path.resolve()
    ext = input_path.suffix.lower()

    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported format: {ext}")

    out = _resolve_output_dir(input_path, output_dir)
    out.mkdir(parents=True, exist_ok=True)

    log = ProcessLog(out / "process.log")
    # Single-source the version from the package instead of hard-coding it.
    from . import __version__
    log.log(f"=== Ingestible v{__version__} ===")
    log.log(f"Input: {input_path}")
    log.log(f"Output: {out}/")
    log.log(f"Backend: {backend}")

    transcript_path = out / "transcript.txt"
    is_video = ext in VIDEO_EXTENSIONS and has_video_stream(input_path)

    # --- Step 1: Transcribe ---
    # Always convert to MP3 first — normalises all formats into one known-good
    # path. This matches the proven bash script behaviour: no conditionals,
    # no format-compatibility surprises.
    if transcript_path.exists():
        log.skip(f"Transcript already exists: {transcript_path.name}")
    else:
        audio_tmp = out / "audio.mp3"
        if audio_tmp.exists():
            # Left over from a previous run that crashed after extraction.
            log.skip(f"Audio already extracted: {audio_tmp.name}")
        else:
            log.log("Extracting audio...")
            extract_audio(input_path, audio_tmp)
            log.log(f"Audio extracted: {_file_size(audio_tmp)}")

        log.log(f"Transcribing with {backend}...")
        text = transcribe(audio_tmp, backend=backend, language=language)
        transcript_path.write_text(text, encoding="utf-8")
        word_count = len(text.split())
        log.log(f"Transcript: {word_count} words")

        # Clean up intermediate audio
        audio_tmp.unlink(missing_ok=True)

    # --- Step 2: Extract frames (video only) ---
    frames_dir = out / "frames"
    # Glob once instead of twice (the skip branch previously re-listed).
    existing_frames = list(frames_dir.glob("frame_*.jpg")) if frames_dir.is_dir() else []
    if not is_video:
        log.skip("Audio-only file — no frames to extract")
    elif existing_frames:
        log.skip(f"Frames already exist: {len(existing_frames)} frames")
    else:
        log.log(f"Extracting frames every {frame_interval}s...")
        count = extract_frames(input_path, frames_dir, interval=frame_interval)
        log.log(f"Extracted {count} frames")

    log.log("=== Done ===")
    return out
81
+
82
+
83
+ def _resolve_output_dir(input_path: Path, output_dir: Path | None) -> Path:
84
+ """Derive output directory: <parent>/<stem>/ or <output_dir>/<stem>/."""
85
+ stem = input_path.stem
86
+ if output_dir:
87
+ return output_dir / stem
88
+ return input_path.parent / stem
89
+
90
+
91
+ def _file_size(path: Path) -> str:
92
+ """Human-readable file size."""
93
+ size = path.stat().st_size
94
+ for unit in ("B", "KB", "MB", "GB"):
95
+ if size < 1024:
96
+ return f"{size:.1f} {unit}"
97
+ size /= 1024
98
+ return f"{size:.1f} TB"
@@ -0,0 +1,83 @@
1
+ """Transcription backends — OpenAI Whisper API and faster-whisper local."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
+ def transcribe(audio_path: Path, *, backend: str, language: str = "en") -> str:
11
+ """Transcribe an audio file, returning the text.
12
+
13
+ Routes to the appropriate backend based on the `backend` argument.
14
+ """
15
+ if backend == "openai":
16
+ return _transcribe_openai(audio_path, language=language)
17
+ elif backend == "faster-whisper":
18
+ return _transcribe_faster_whisper(audio_path, language=language)
19
+ else:
20
+ raise ValueError(f"Unknown backend: {backend!r}. Use 'openai' or 'faster-whisper'.")
21
+
22
+
23
+ def _extract_text(payload: Any) -> str:
24
+ """Defensively extract text from an OpenAI transcription response."""
25
+ text = getattr(payload, "text", None)
26
+ if isinstance(text, str) and text.strip():
27
+ return text.strip()
28
+ if isinstance(payload, dict):
29
+ candidate = payload.get("text")
30
+ if isinstance(candidate, str) and candidate.strip():
31
+ return candidate.strip()
32
+ raise ValueError("Transcription response missing text output")
33
+
34
+
35
def _transcribe_openai(audio_path: Path, *, language: str) -> str:
    """Transcribe *audio_path* with the OpenAI audio transcription API.

    Reads OPENAI_API_KEY and (optionally) OPENAI_WHISPER_MODEL from the
    environment.

    Raises:
        ImportError: if the openai SDK is not installed.
        RuntimeError: if OPENAI_API_KEY is not set.
    """
    try:
        from openai import OpenAI
    except ImportError as err:
        # Bug fix: the old hint said `pip install ingestible[openai]`, but no
        # such extra exists — openai is a core dependency of this package.
        raise ImportError(
            "OpenAI backend requires the openai package. "
            "Install with: pip install openai"
        ) from err

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY not set. Add it to your .env file or environment."
        )

    # Pass the key explicitly rather than relying on the SDK re-reading the env.
    client = OpenAI(api_key=api_key)
    model = os.getenv("OPENAI_WHISPER_MODEL", "whisper-1").strip() or "whisper-1"

    with audio_path.open("rb") as f:
        response = client.audio.transcriptions.create(
            file=f,
            model=model,
            language=language,
        )

    return _extract_text(response)
62
+
63
+
64
def _transcribe_faster_whisper(audio_path: Path, *, language: str) -> str:
    """Transcribe *audio_path* locally on CPU with faster-whisper.

    Model size comes from INGESTIBLE_WHISPER_MODEL (default "base").

    Raises:
        ImportError: if faster-whisper is not installed.
        ValueError: if transcription yields no text.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as err:
        # Bug fix: the distribution is published as "slurpai" (see pyproject),
        # so the old `pip install ingestible[local]` hint pointed at the
        # wrong package name.
        raise ImportError(
            "Local backend requires faster-whisper. "
            'Install with: pip install "slurpai[local]"'
        ) from err

    model_size = os.getenv("INGESTIBLE_WHISPER_MODEL", "base").strip() or "base"
    # int8 quantisation keeps CPU memory modest at a small accuracy cost.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    segments, _info = model.transcribe(str(audio_path), language=language)
    text = " ".join(segment.text.strip() for segment in segments)

    if not text.strip():
        raise ValueError("Transcription produced no text")

    return text.strip()
File without changes
@@ -0,0 +1,43 @@
1
+ """Shared test fixtures."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+
11
@pytest.fixture
def sample_audio(tmp_path: Path) -> Path:
    """Generate one second of silent MP3 audio via ffmpeg's anullsrc source."""
    target = tmp_path / "test.mp3"
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi", "-i", "anullsrc=r=16000:cl=mono",
        "-t", "1", "-q:a", "9",
        str(target),
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    return target
26
+
27
+
28
@pytest.fixture
def sample_video(tmp_path: Path) -> Path:
    """Generate a 3-second test-pattern MP4 with a silent audio track."""
    target = tmp_path / "test.mp4"
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi", "-i", "testsrc=duration=3:size=320x240:rate=1",
        "-f", "lavfi", "-i", "anullsrc=r=16000:cl=mono",
        "-t", "3", "-shortest",
        str(target),
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    return target
@@ -0,0 +1,38 @@
1
+ """Tests for the CLI entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from click.testing import CliRunner
8
+
9
+ from ingestible.cli import ingest
10
+
11
+
12
def test_help():
    """--help prints the command summary and exits cleanly."""
    result = CliRunner().invoke(ingest, ["--help"])
    assert result.exit_code == 0
    assert "Convert voice notes" in result.output


def test_version():
    """--version reports the package version."""
    result = CliRunner().invoke(ingest, ["--version"])
    assert result.exit_code == 0
    assert "0.1.0" in result.output
24
+
25
+
26
def test_dry_run(sample_audio: Path):
    """--dry-run lists the files without actually processing them."""
    result = CliRunner().invoke(ingest, ["--dry-run", str(sample_audio)])
    assert result.exit_code == 0
    assert "Would process 1 file(s)" in result.output


def test_unsupported_file_skipped(tmp_path: Path):
    """Files with unknown extensions are reported as skipped."""
    note = tmp_path / "notes.txt"
    note.write_text("hello")
    result = CliRunner().invoke(ingest, [str(note)])
    assert "Skipping unsupported format" in result.output
@@ -0,0 +1,32 @@
1
+ """Tests for the ffmpeg module."""
2
+
3
+ from pathlib import Path
4
+
5
+ from ingestible.ffmpeg import check_ffmpeg, extract_audio, extract_frames, has_video_stream
6
+
7
+
8
def test_check_ffmpeg():
    # The whole suite depends on ffmpeg being installed (see CI workflow).
    assert check_ffmpeg() is True


def test_has_video_stream_with_video(sample_video: Path):
    # A generated MP4 with a test pattern must be detected as video.
    assert has_video_stream(sample_video) is True


def test_has_video_stream_with_audio(sample_audio: Path):
    # A pure-audio MP3 must not be mistaken for video.
    assert has_video_stream(sample_audio) is False
18
+
19
+
20
def test_extract_audio(sample_video: Path, tmp_path: Path):
    """Audio is extracted to the requested path and is non-empty."""
    target = tmp_path / "out.mp3"
    returned = extract_audio(sample_video, target)
    assert returned == target
    assert target.exists()
    assert target.stat().st_size > 0


def test_extract_frames(sample_video: Path, tmp_path: Path):
    """Frame extraction reports exactly the number of JPEGs it wrote."""
    out_dir = tmp_path / "frames"
    n = extract_frames(sample_video, out_dir, interval=1)
    assert n >= 1
    assert len(list(out_dir.glob("frame_*.jpg"))) == n
@@ -0,0 +1,70 @@
1
+ """Tests for the process orchestrator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from unittest.mock import patch
7
+
8
+ from ingestible.process import SUPPORTED_EXTENSIONS, process_file
9
+
10
+
11
def test_supported_extensions_include_common_formats():
    for ext in (".opus", ".m4a", ".mp4", ".mp3"):
        assert ext in SUPPORTED_EXTENSIONS


def test_process_audio_file(sample_audio: Path):
    """Process an audio file with mocked transcription."""
    with patch("ingestible.process.transcribe", return_value="Hello from the test"):
        out = process_file(sample_audio, backend="openai")

    assert out.is_dir()
    transcript = out / "transcript.txt"
    assert transcript.exists()
    assert transcript.read_text() == "Hello from the test"

    log_file = out / "process.log"
    assert log_file.exists()
    assert "Done" in log_file.read_text()

    # Audio-only input: the frames directory must not contain any grabs.
    frames = out / "frames"
    assert not (frames.exists() and list(frames.glob("frame_*.jpg")))
35
+
36
+
37
def test_process_video_file(sample_video: Path):
    """Process a video file with mocked transcription."""
    with patch("ingestible.process.transcribe", return_value="Video transcript here"):
        out = process_file(sample_video, backend="openai", frame_interval=1)

    assert out.is_dir()
    transcript = out / "transcript.txt"
    assert transcript.exists()
    assert transcript.read_text() == "Video transcript here"

    frames = out / "frames"
    assert frames.is_dir()
    assert len(list(frames.glob("frame_*.jpg"))) >= 1


def test_idempotent_skip(sample_audio: Path):
    """A second run finds the transcript and never calls the transcriber."""
    with patch("ingestible.process.transcribe", return_value="First run") as first:
        process_file(sample_audio, backend="openai")
    assert first.call_count == 1

    with patch("ingestible.process.transcribe", return_value="Second run") as second:
        process_file(sample_audio, backend="openai")
    assert second.call_count == 0  # skipped — transcript already on disk


def test_unsupported_format(tmp_path: Path):
    """Unsupported extensions raise ValueError."""
    import pytest

    note = tmp_path / "notes.txt"
    note.write_text("hello")
    with pytest.raises(ValueError, match="Unsupported format"):
        process_file(note, backend="openai")
@@ -0,0 +1,62 @@
1
+ """Tests for the transcription module (mocked — no API calls)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from unittest.mock import MagicMock, patch
7
+
8
+ import pytest
9
+
10
+ from ingestible.transcribe import _extract_text, transcribe
11
+
12
+
13
class TestExtractText:
    """_extract_text handles both attribute-style and dict responses."""

    def test_from_object_attribute(self):
        payload = MagicMock()
        payload.text = "Hello world"
        assert _extract_text(payload) == "Hello world"

    def test_from_dict(self):
        assert _extract_text({"text": "Hello world"}) == "Hello world"

    def test_strips_whitespace(self):
        assert _extract_text({"text": " Hello world "}) == "Hello world"

    def test_raises_on_empty(self):
        with pytest.raises(ValueError, match="missing text"):
            _extract_text({"text": ""})

    def test_raises_on_missing(self):
        with pytest.raises(ValueError, match="missing text"):
            _extract_text({})
32
+
33
+
34
class TestTranscribeOpenAI:
    def test_calls_api(self, sample_audio: Path, monkeypatch):
        """The openai backend calls the SDK and returns the response text.

        NOTE: the previous version of this test patched _transcribe_openai
        itself and asserted on its own mock — it exercised nothing. Instead,
        stub the ``openai`` module in sys.modules so the lazy
        ``from openai import OpenAI`` inside _transcribe_openai resolves to
        our fake client.
        """
        import sys

        monkeypatch.setenv("OPENAI_API_KEY", "sk-test")

        mock_response = MagicMock()
        mock_response.text = "This is the transcript"
        mock_client = MagicMock()
        mock_client.audio.transcriptions.create.return_value = mock_response

        fake_openai = MagicMock()
        fake_openai.OpenAI.return_value = mock_client
        monkeypatch.setitem(sys.modules, "openai", fake_openai)

        result = transcribe(sample_audio, backend="openai")

        assert result == "This is the transcript"
        mock_client.audio.transcriptions.create.assert_called_once()

    def test_raises_without_api_key(self, sample_audio: Path, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(RuntimeError, match="OPENAI_API_KEY"):
            transcribe(sample_audio, backend="openai")
58
+
59
+
60
def test_unknown_backend(sample_audio: Path):
    """An unrecognised backend name is rejected with ValueError."""
    with pytest.raises(ValueError, match="Unknown backend"):
        transcribe(sample_audio, backend="nonexistent")