wispr-lrc 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wispr_lrc-0.1.2/.github/workflows/ci.yml +54 -0
- wispr_lrc-0.1.2/.github/workflows/ml-benchmark.yml +66 -0
- wispr_lrc-0.1.2/.gitignore +10 -0
- wispr_lrc-0.1.2/.redsun/.gitignore +1 -0
- wispr_lrc-0.1.2/.redsun/AGENTS.md +203 -0
- wispr_lrc-0.1.2/.redsun/ARCHITECTURE.md +208 -0
- wispr_lrc-0.1.2/AGENTS.md +1 -0
- wispr_lrc-0.1.2/PKG-INFO +181 -0
- wispr_lrc-0.1.2/README.md +167 -0
- wispr_lrc-0.1.2/pyproject.toml +49 -0
- wispr_lrc-0.1.2/src/wispr/__init__.py +5 -0
- wispr_lrc-0.1.2/src/wispr/align.py +3 -0
- wispr_lrc-0.1.2/src/wispr/audio.py +31 -0
- wispr_lrc-0.1.2/src/wispr/backend_factory.py +81 -0
- wispr_lrc-0.1.2/src/wispr/backends.py +70 -0
- wispr_lrc-0.1.2/src/wispr/batch.py +201 -0
- wispr_lrc-0.1.2/src/wispr/benchmark.py +214 -0
- wispr_lrc-0.1.2/src/wispr/cli.py +227 -0
- wispr_lrc-0.1.2/src/wispr/demucs_backend.py +64 -0
- wispr_lrc-0.1.2/src/wispr/lrc.py +29 -0
- wispr_lrc-0.1.2/src/wispr/lyrics.py +19 -0
- wispr_lrc-0.1.2/src/wispr/metadata.py +43 -0
- wispr_lrc-0.1.2/src/wispr/models.py +196 -0
- wispr_lrc-0.1.2/src/wispr/pipeline.py +295 -0
- wispr_lrc-0.1.2/src/wispr/runtime.py +58 -0
- wispr_lrc-0.1.2/src/wispr/segment.py +378 -0
- wispr_lrc-0.1.2/src/wispr/transcribe.py +12 -0
- wispr_lrc-0.1.2/src/wispr/warnings.py +3 -0
- wispr_lrc-0.1.2/src/wispr/whisperx_backend.py +366 -0
- wispr_lrc-0.1.2/tests/fixtures/README.md +9 -0
- wispr_lrc-0.1.2/tests/fixtures/jingle_bells.m4a +0 -0
- wispr_lrc-0.1.2/tests/fixtures/jingle_bells.txt +71 -0
- wispr_lrc-0.1.2/tests/test_audio.py +56 -0
- wispr_lrc-0.1.2/tests/test_backend_factory.py +92 -0
- wispr_lrc-0.1.2/tests/test_batch.py +136 -0
- wispr_lrc-0.1.2/tests/test_benchmark.py +93 -0
- wispr_lrc-0.1.2/tests/test_cli.py +421 -0
- wispr_lrc-0.1.2/tests/test_demucs.py +77 -0
- wispr_lrc-0.1.2/tests/test_lrc.py +36 -0
- wispr_lrc-0.1.2/tests/test_metadata.py +46 -0
- wispr_lrc-0.1.2/tests/test_ml_smoke.py +47 -0
- wispr_lrc-0.1.2/tests/test_pipeline.py +203 -0
- wispr_lrc-0.1.2/tests/test_runtime.py +45 -0
- wispr_lrc-0.1.2/tests/test_segment.py +171 -0
- wispr_lrc-0.1.2/tests/test_transcribe.py +382 -0
- wispr_lrc-0.1.2/uv.lock +3488 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
name: Python ${{ matrix.python-version }}
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- name: Check out repository
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Install uv
|
|
21
|
+
uses: astral-sh/setup-uv@v5
|
|
22
|
+
with:
|
|
23
|
+
enable-cache: true
|
|
24
|
+
cache-dependency-glob: uv.lock
|
|
25
|
+
|
|
26
|
+
- name: Set up Python
|
|
27
|
+
uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
|
|
31
|
+
- name: Install dependencies
|
|
32
|
+
run: uv sync --dev --locked
|
|
33
|
+
|
|
34
|
+
- name: Lint
|
|
35
|
+
run: uv run ruff check .
|
|
36
|
+
|
|
37
|
+
- name: Test
|
|
38
|
+
run: uv run pytest
|
|
39
|
+
|
|
40
|
+
- name: Benchmark smoke
|
|
41
|
+
run: |
|
|
42
|
+
mkdir -p "$RUNNER_TEMP/wispr-ci"
|
|
43
|
+
uv run wispr benchmark tests/fixtures/jingle_bells.m4a tests/fixtures/jingle_bells.txt \
|
|
44
|
+
--backend mock \
|
|
45
|
+
--debug \
|
|
46
|
+
--force \
|
|
47
|
+
-o "$RUNNER_TEMP/wispr-ci/jingle_bells.lrc" \
|
|
48
|
+
--report "$RUNNER_TEMP/wispr-ci/jingle_bells.benchmark.json"
|
|
49
|
+
test -f "$RUNNER_TEMP/wispr-ci/jingle_bells.lrc"
|
|
50
|
+
test -f "$RUNNER_TEMP/wispr-ci/jingle_bells.benchmark.json"
|
|
51
|
+
test -f "$RUNNER_TEMP/wispr-ci/jingle_bells.debug/timings.json"
|
|
52
|
+
|
|
53
|
+
- name: Build package
|
|
54
|
+
run: uv build
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
name: Manual ML Benchmark
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
inputs:
|
|
6
|
+
model:
|
|
7
|
+
description: WhisperX model name
|
|
8
|
+
required: true
|
|
9
|
+
default: base
|
|
10
|
+
device:
|
|
11
|
+
description: Runtime device
|
|
12
|
+
required: true
|
|
13
|
+
default: cpu
|
|
14
|
+
compute-type:
|
|
15
|
+
description: WhisperX compute type
|
|
16
|
+
required: true
|
|
17
|
+
default: int8
|
|
18
|
+
|
|
19
|
+
jobs:
|
|
20
|
+
whisperx-benchmark:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
timeout-minutes: 60
|
|
23
|
+
|
|
24
|
+
steps:
|
|
25
|
+
- name: Check out repository
|
|
26
|
+
uses: actions/checkout@v4
|
|
27
|
+
|
|
28
|
+
- name: Install uv
|
|
29
|
+
uses: astral-sh/setup-uv@v5
|
|
30
|
+
with:
|
|
31
|
+
enable-cache: true
|
|
32
|
+
cache-dependency-glob: uv.lock
|
|
33
|
+
|
|
34
|
+
- name: Set up Python
|
|
35
|
+
uses: actions/setup-python@v5
|
|
36
|
+
with:
|
|
37
|
+
python-version: "3.12"
|
|
38
|
+
|
|
39
|
+
- name: Install ffmpeg
|
|
40
|
+
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
|
41
|
+
|
|
42
|
+
- name: Install ML dependencies
|
|
43
|
+
run: uv sync --dev --extra ml --locked
|
|
44
|
+
|
|
45
|
+
- name: Run WhisperX benchmark
|
|
46
|
+
run: |
|
|
47
|
+
mkdir -p "$RUNNER_TEMP/wispr-ml"
|
|
48
|
+
uv run wispr benchmark tests/fixtures/jingle_bells.m4a tests/fixtures/jingle_bells.txt \
|
|
49
|
+
--backend whisperx \
|
|
50
|
+
--model "${{ inputs.model }}" \
|
|
51
|
+
--device "${{ inputs.device }}" \
|
|
52
|
+
--compute-type "${{ inputs['compute-type'] }}" \
|
|
53
|
+
--debug \
|
|
54
|
+
--force \
|
|
55
|
+
-o "$RUNNER_TEMP/wispr-ml/jingle_bells.lrc" \
|
|
56
|
+
--report "$RUNNER_TEMP/wispr-ml/jingle_bells.benchmark.json"
|
|
57
|
+
|
|
58
|
+
- name: Upload benchmark artifacts
|
|
59
|
+
uses: actions/upload-artifact@v4
|
|
60
|
+
if: always()
|
|
61
|
+
with:
|
|
62
|
+
name: wispr-ml-benchmark
|
|
63
|
+
path: |
|
|
64
|
+
${{ runner.temp }}/wispr-ml/*.lrc
|
|
65
|
+
${{ runner.temp }}/wispr-ml/*.benchmark.json
|
|
66
|
+
${{ runner.temp }}/wispr-ml/*.debug/**
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
*
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
This repository implements **wispr**, a Python library and CLI for generating synchronized standard line-level `.lrc` files from full-song audio plus a canonical line-by-line lyrics file.
|
|
6
|
+
|
|
7
|
+
The core product goal is not lyric generation. The system should treat the provided lyrics file as the source of truth and use transcription and forced alignment tooling primarily to recover timing information.
|
|
8
|
+
|
|
9
|
+
Agents working in this repository should optimize for **software engineering quality first**: clean module boundaries, deterministic behavior, strong testability, clear failure modes, and a polished command-line user experience.
|
|
10
|
+
|
|
11
|
+
## Product Scope
|
|
12
|
+
|
|
13
|
+
### In scope for v1
|
|
14
|
+
|
|
15
|
+
- English-only alignment.
|
|
16
|
+
- Full-song processing.
|
|
17
|
+
- Standard line-level `.lrc` output.
|
|
18
|
+
- Audio inputs: `.wav`, `.mp3`, `.flac`, `.m4a`.
|
|
19
|
+
- Optional metadata extraction from source audio.
|
|
20
|
+
- Typer-based CLI plus reusable Python library.
|
|
21
|
+
- Debug artifacts for transcript, alignment, and segments.
|
|
22
|
+
- Warnings on weak alignment while continuing output generation.
|
|
23
|
+
|
|
24
|
+
### Out of scope for v1
|
|
25
|
+
|
|
26
|
+
- Word-level karaoke timing.
|
|
27
|
+
- Multilingual support.
|
|
28
|
+
- Docker packaging.
|
|
29
|
+
- Extra subcommands like `wispr doctor`.
|
|
30
|
+
- Replacing the provided lyrics with model-generated text.
|
|
31
|
+
|
|
32
|
+
## Core Principles
|
|
33
|
+
|
|
34
|
+
### 1. Canonical lyrics are the source of truth
|
|
35
|
+
|
|
36
|
+
The contents of `lyrics.txt` must define the emitted lyric text. Do not silently rewrite lyrics to match model output.
|
|
37
|
+
|
|
38
|
+
### 2. Timing over transcription
|
|
39
|
+
|
|
40
|
+
Transcript and alignment stages exist to obtain usable timestamps. They do not define the final textual output.
|
|
41
|
+
|
|
42
|
+
### 3. Library first, CLI second
|
|
43
|
+
|
|
44
|
+
The CLI should be a thin wrapper over reusable internal modules. Business logic should not live directly inside CLI command functions.
|
|
45
|
+
|
|
46
|
+
### 4. Deterministic output where possible
|
|
47
|
+
|
|
48
|
+
Formatting, metadata ordering, path resolution, warning behavior, and line segmentation should be predictable and testable.
|
|
49
|
+
|
|
50
|
+
### 5. Safe defaults
|
|
51
|
+
|
|
52
|
+
The default experience should be optimized for likely real-world success:
|
|
53
|
+
- Vocal separation on by default.
|
|
54
|
+
- Output path derived from input audio filename.
|
|
55
|
+
- Existing output files should not be overwritten unless `--force` is passed.
|
|
56
|
+
- Missing metadata should be skipped, not treated as fatal.
|
|
57
|
+
- Weak alignment should warn and continue.
|
|
58
|
+
|
|
59
|
+
## User Experience Contract
|
|
60
|
+
|
|
61
|
+
The primary demo path should remain:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
wispr song.wav lyrics.txt
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Expected behavior:
|
|
68
|
+
- Produces `song.lrc` next to the audio file unless `-o` is provided.
|
|
69
|
+
- Uses the first aligned word in each lyric line as that line's timestamp.
|
|
70
|
+
- Emits metadata tags before lyric lines when metadata is available.
|
|
71
|
+
- Fails on output collisions unless `--force` is provided.
|
|
72
|
+
- Writes debug artifacts only when `--debug` is enabled.
|
|
73
|
+
|
|
74
|
+
## Expected Output Format
|
|
75
|
+
|
|
76
|
+
wispr should emit standard line-level LRC.
|
|
77
|
+
|
|
78
|
+
If metadata is available, write tags in this fixed order:
|
|
79
|
+
1. `[ar:]`
|
|
80
|
+
2. `[al:]`
|
|
81
|
+
3. `[ti:]`
|
|
82
|
+
4. Timestamped lyric lines
|
|
83
|
+
|
|
84
|
+
Do not emit empty placeholder metadata tags.
|
|
85
|
+
|
|
86
|
+
## Recommended Project Structure
|
|
87
|
+
|
|
88
|
+
```text
|
|
89
|
+
wispr/
|
|
90
|
+
__init__.py
|
|
91
|
+
cli.py
|
|
92
|
+
pipeline.py
|
|
93
|
+
audio.py
|
|
94
|
+
metadata.py
|
|
95
|
+
transcribe.py
|
|
96
|
+
align.py
|
|
97
|
+
segment.py
|
|
98
|
+
lrc.py
|
|
99
|
+
models.py
|
|
100
|
+
warnings.py
|
|
101
|
+
tests/
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Module responsibilities
|
|
105
|
+
|
|
106
|
+
- `cli.py`: Typer commands, user-facing flags, terminal messaging.
|
|
107
|
+
- `pipeline.py`: stage orchestration.
|
|
108
|
+
- `audio.py`: validation, preprocessing, vocal-separation integration.
|
|
109
|
+
- `metadata.py`: source metadata extraction and normalization.
|
|
110
|
+
- `transcribe.py`: transcript and raw timestamp acquisition.
|
|
111
|
+
- `align.py`: forced-alignment backend integration.
|
|
112
|
+
- `segment.py`: map aligned words onto canonical lyric lines.
|
|
113
|
+
- `lrc.py`: timestamp formatting and final file serialization.
|
|
114
|
+
- `models.py`: shared data structures across stages.
|
|
115
|
+
- `warnings.py`: structured warning types and formatting.
|
|
116
|
+
|
|
117
|
+
## Implementation Guidance
|
|
118
|
+
|
|
119
|
+
### Prefer explicit data models
|
|
120
|
+
|
|
121
|
+
Use typed models or dataclasses for stage boundaries instead of large unstructured dictionaries.
|
|
122
|
+
|
|
123
|
+
Suggested entities include:
|
|
124
|
+
- `TrackMetadata`
|
|
125
|
+
- `TranscriptWord`
|
|
126
|
+
- `AlignedWord`
|
|
127
|
+
- `LyricLine`
|
|
128
|
+
- `LrcDocument`
|
|
129
|
+
|
|
130
|
+
### Keep backend boundaries clean
|
|
131
|
+
|
|
132
|
+
The alignment backend should be abstracted behind a narrow interface so the project can begin with WhisperX-centered alignment and later swap or add custom aligners without rewriting the whole pipeline.
|
|
133
|
+
|
|
134
|
+
### Preserve inspectability
|
|
135
|
+
|
|
136
|
+
When adding logic, prefer designs that make it easier to understand how a timestamp was produced. Debuggability matters more than cleverness.
|
|
137
|
+
|
|
138
|
+
### Avoid premature product sprawl
|
|
139
|
+
|
|
140
|
+
Do not add extra commands, frontend layers, web services, or database infrastructure unless explicitly requested. The value of this project comes from doing one CLI workflow well.
|
|
141
|
+
|
|
142
|
+
## Debug Artifacts
|
|
143
|
+
|
|
144
|
+
When `--debug` is enabled, write a folder of intermediate artifacts containing:
|
|
145
|
+
- `transcript.json`
|
|
146
|
+
- `alignment.json`
|
|
147
|
+
- `segments.json`
|
|
148
|
+
|
|
149
|
+
These artifacts should reflect the real internal pipeline state, not post-hoc summaries.
|
|
150
|
+
|
|
151
|
+
## Warning Policy
|
|
152
|
+
|
|
153
|
+
Weak alignment should not abort the entire run.
|
|
154
|
+
|
|
155
|
+
Warnings should include:
|
|
156
|
+
- Line number.
|
|
157
|
+
- Confidence score.
|
|
158
|
+
- Estimated timestamp source.
|
|
159
|
+
|
|
160
|
+
Warnings should be informative enough that a user can inspect the affected line and understand why the output may be imperfect.
|
|
161
|
+
|
|
162
|
+
## Testing Priorities
|
|
163
|
+
|
|
164
|
+
Focus tests on deterministic logic and contract behavior.
|
|
165
|
+
|
|
166
|
+
High-priority tests:
|
|
167
|
+
- LRC timestamp formatting.
|
|
168
|
+
- Metadata ordering and omission.
|
|
169
|
+
- Line segmentation behavior.
|
|
170
|
+
- Output collision handling with and without `--force`.
|
|
171
|
+
- Warning generation for weak alignment.
|
|
172
|
+
- Output filename derivation from input audio.
|
|
173
|
+
|
|
174
|
+
Where possible, use small synthetic fixtures and mocked backend outputs instead of heavyweight full-model runs.
|
|
175
|
+
|
|
176
|
+
## Documentation Expectations
|
|
177
|
+
|
|
178
|
+
When updating the repo:
|
|
179
|
+
- Keep architecture and behavior consistent with `ARCHITECTURE.md`.
|
|
180
|
+
- Document new flags, outputs, or format changes in the README.
|
|
181
|
+
- Call out any deviation from the v1 contract explicitly.
|
|
182
|
+
|
|
183
|
+
## Style Expectations
|
|
184
|
+
|
|
185
|
+
- Prefer small, composable functions over monolithic pipeline code.
|
|
186
|
+
- Keep side effects localized.
|
|
187
|
+
- Make error messages concrete and actionable.
|
|
188
|
+
- Favor clarity over clever abstractions.
|
|
189
|
+
- Add comments where intent is not obvious, but do not narrate trivial code.
|
|
190
|
+
- "wispr" should always be typed in all lowercase. Do not use "Wispr" or "WISPR" or any other formatting
|
|
191
|
+
|
|
192
|
+
## Agent Behavior
|
|
193
|
+
|
|
194
|
+
Agents assisting with this project should:
|
|
195
|
+
- Preserve the library-plus-CLI structure.
|
|
196
|
+
- Respect the canonical-lyrics-first design.
|
|
197
|
+
- Avoid introducing hidden behavior that changes lyric text.
|
|
198
|
+
- Prefer incremental, reviewable changes.
|
|
199
|
+
- Keep recommendations aligned with recruiter-facing SWE value and real usability.
|
|
200
|
+
- Code should be structured in a clean and organized way, and be human readable.
|
|
201
|
+
- Be concise with code generation; focus on writing as few lines of code as possible.
|
|
202
|
+
|
|
203
|
+
When responding, guide and explain clearly when useful, but keep implementations aligned with the repository's architecture and scope.
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# wispr Architecture
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
wispr is a Python library and CLI that converts full-song audio plus a canonical line-by-line lyrics file into a synchronized, standard line-level `.lrc` output. The CLI is the simplest interface over a reusable library, not a one-off script.[cite:58][cite:60]
|
|
6
|
+
|
|
7
|
+
The project is intentionally scoped around timing alignment rather than lyric generation. Standard LRC supports line-level timestamps and optional metadata tags, which makes it a good target for a deterministic v1 that is easy to inspect, test, and demo.[cite:34][cite:65][cite:31]
|
|
8
|
+
|
|
9
|
+
## Product Goals
|
|
10
|
+
|
|
11
|
+
### Primary goals
|
|
12
|
+
|
|
13
|
+
- Accept `.wav`, `.mp3`, `.flac`, and `.m4a` audio inputs.[cite:74][cite:78]
|
|
14
|
+
- Accept a `lyrics.txt` file where each line represents one intended lyric line.
|
|
15
|
+
- Produce a standard line-level `.lrc` file with one timestamp per lyric line.[cite:34][cite:65]
|
|
16
|
+
- Prefer forced alignment for timing accuracy while keeping the provided lyric text as the source of truth.[cite:32][cite:42][cite:8]
|
|
17
|
+
- Ship as both a reusable Python package and an installed `wispr` command via Python entry points.[cite:60]
|
|
18
|
+
|
|
19
|
+
### Non-goals for v1
|
|
20
|
+
|
|
21
|
+
- Word-level karaoke timing.
|
|
22
|
+
- Multilingual alignment.
|
|
23
|
+
- Docker packaging.
|
|
24
|
+
- Additional diagnostic subcommands such as `wispr doctor`.
|
|
25
|
+
|
|
26
|
+
## User Experience
|
|
27
|
+
|
|
28
|
+
The default happy path should be:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
wispr song.wav lyrics.txt
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
When `-o` is omitted, wispr should derive the output path from the audio filename, so `random_song232.wav` becomes `random_song232.lrc`.
|
|
35
|
+
|
|
36
|
+
If the destination file already exists, the command should fail unless `--force` is passed. Alignment problems should produce warnings and continue rather than aborting the entire run.
|
|
37
|
+
|
|
38
|
+
## CLI Contract
|
|
39
|
+
|
|
40
|
+
wispr should use Typer for the command-line interface so the public command surface can stay small now and grow cleanly later through typed arguments and options.[cite:58]
|
|
41
|
+
|
|
42
|
+
### Initial command surface
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
wispr <audio> <lyrics.txt>
|
|
46
|
+
wispr <audio> <lyrics.txt> -o output.lrc
|
|
47
|
+
wispr <audio> <lyrics.txt> --debug
|
|
48
|
+
wispr <audio> <lyrics.txt> --force
|
|
49
|
+
wispr <audio> <lyrics.txt> --no-separate-vocals
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Packaging model
|
|
53
|
+
|
|
54
|
+
The package name and installed command should both be `wispr`. Installation should expose the CLI through a `console_scripts` entry point so users can run `wispr` directly from their shell after installation.[cite:60][cite:79]
|
|
55
|
+
|
|
56
|
+
## Pipeline
|
|
57
|
+
|
|
58
|
+
The default processing path is:
|
|
59
|
+
|
|
60
|
+
1. Audio ingest and validation.
|
|
61
|
+
2. Optional metadata extraction.
|
|
62
|
+
3. Vocal separation, enabled by default.
|
|
63
|
+
4. Transcript and word-timestamp extraction.
|
|
64
|
+
5. Forced alignment against the canonical lyric text.
|
|
65
|
+
6. Line segmentation.
|
|
66
|
+
7. LRC emission.
|
|
67
|
+
|
|
68
|
+
WhisperX is the intended alignment anchor for v1 because it is built around refining Whisper timestamps with forced alignment to improve timestamp accuracy.[cite:32][cite:42][cite:8]
|
|
69
|
+
|
|
70
|
+
### Stage details
|
|
71
|
+
|
|
72
|
+
#### 1. Audio ingest
|
|
73
|
+
|
|
74
|
+
The input layer should validate supported extensions, normalize paths, and prepare audio for downstream tools.
|
|
75
|
+
|
|
76
|
+
#### 2. Metadata extraction
|
|
77
|
+
|
|
78
|
+
wispr should opportunistically extract title, artist, and album metadata from the source file. Mutagen is a good fit because it supports common audio metadata handling across formats including MP3, FLAC, and MP4-family files.[cite:74][cite:78]
|
|
79
|
+
|
|
80
|
+
If metadata is missing or unreadable, wispr should skip those fields without failing the run.
|
|
81
|
+
|
|
82
|
+
#### 3. Vocal separation
|
|
83
|
+
|
|
84
|
+
Vocal separation should be on by default, with a user escape hatch through `--no-separate-vocals`. This keeps the default UX optimized for noisy real songs while preserving a fast path for cleaner inputs.
|
|
85
|
+
|
|
86
|
+
#### 4. Transcript and alignment
|
|
87
|
+
|
|
88
|
+
The transcript stage exists to recover timing signals, not to produce final lyric text. The canonical lyrics file remains the source of truth for emitted lines, while the alignment backend maps timing evidence onto those lines.[cite:32][cite:42]
|
|
89
|
+
|
|
90
|
+
#### 5. Line segmentation
|
|
91
|
+
|
|
92
|
+
Because the lyrics input is already line-structured, segmentation should map aligned word spans onto each original lyric line rather than trying to infer poetic phrasing from paragraph text.
|
|
93
|
+
|
|
94
|
+
#### 6. LRC emission
|
|
95
|
+
|
|
96
|
+
The emitted file should be standard line-level LRC. Each lyric line should use the timestamp of the first aligned word in that line.[cite:34][cite:65]
|
|
97
|
+
|
|
98
|
+
If metadata exists, it should be written before the lyric body in fixed order:
|
|
99
|
+
|
|
100
|
+
1. `[ar:]`
|
|
101
|
+
2. `[al:]`
|
|
102
|
+
3. `[ti:]`
|
|
103
|
+
4. Timestamped lyric lines
|
|
104
|
+
|
|
105
|
+
Metadata tags should be omitted individually when unavailable rather than written as empty placeholders.[cite:34][cite:65]
|
|
106
|
+
|
|
107
|
+
## Internal Architecture
|
|
108
|
+
|
|
109
|
+
wispr should be library-first with a thin CLI wrapper. The CLI exists to parse inputs and display results, while the library owns orchestration, transformation, and file generation.
|
|
110
|
+
|
|
111
|
+
### Proposed package layout
|
|
112
|
+
|
|
113
|
+
```text
|
|
114
|
+
wispr/
|
|
115
|
+
__init__.py
|
|
116
|
+
cli.py
|
|
117
|
+
pipeline.py
|
|
118
|
+
audio.py
|
|
119
|
+
metadata.py
|
|
120
|
+
transcribe.py
|
|
121
|
+
align.py
|
|
122
|
+
segment.py
|
|
123
|
+
lrc.py
|
|
124
|
+
models.py
|
|
125
|
+
warnings.py
|
|
126
|
+
tests/
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Module responsibilities
|
|
130
|
+
|
|
131
|
+
| Module | Responsibility |
|
|
132
|
+
|---|---|
|
|
133
|
+
| `cli.py` | Typer commands, argument parsing, user-facing output.[cite:58] |
|
|
134
|
+
| `pipeline.py` | End-to-end orchestration and stage ordering. |
|
|
135
|
+
| `audio.py` | Input validation, preprocessing, and vocal-separation integration. |
|
|
136
|
+
| `metadata.py` | Audio metadata extraction and normalization via Mutagen-compatible readers.[cite:74][cite:78] |
|
|
137
|
+
| `transcribe.py` | Transcript and raw word-timestamp acquisition. |
|
|
138
|
+
| `align.py` | Forced-alignment adapter and backend boundary.[cite:42][cite:32] |
|
|
139
|
+
| `segment.py` | Map aligned tokens/spans onto lyric lines. |
|
|
140
|
+
| `lrc.py` | Metadata ordering, timestamp formatting, and final `.lrc` serialization.[cite:34][cite:65] |
|
|
141
|
+
| `models.py` | Shared typed data structures between stages. |
|
|
142
|
+
| `warnings.py` | Structured warning types and formatting. |
|
|
143
|
+
|
|
144
|
+
## Data Model
|
|
145
|
+
|
|
146
|
+
The internal pipeline should pass structured models instead of ad hoc dictionaries where possible.
|
|
147
|
+
|
|
148
|
+
### Core objects
|
|
149
|
+
|
|
150
|
+
- `TrackMetadata`: title, artist, album, source path.
|
|
151
|
+
- `TranscriptWord`: text, start time, end time, confidence, source.
|
|
152
|
+
- `AlignedWord`: canonical token, start time, end time, confidence, timestamp source.
|
|
153
|
+
- `LyricLine`: line number, raw text, aligned words, line start time, confidence.
|
|
154
|
+
- `LrcDocument`: metadata tags plus ordered timestamped lyric lines.
|
|
155
|
+
|
|
156
|
+
This keeps the CLI thin, improves unit-test boundaries, and makes it easier to swap alignment backends later.
|
|
157
|
+
|
|
158
|
+
## Debug Artifacts
|
|
159
|
+
|
|
160
|
+
Debug mode should write a folder of intermediate artifacts rather than a single file.
|
|
161
|
+
|
|
162
|
+
### Required debug outputs
|
|
163
|
+
|
|
164
|
+
- `transcript.json`
|
|
165
|
+
- `alignment.json`
|
|
166
|
+
- `segments.json`
|
|
167
|
+
|
|
168
|
+
These files should make it possible to inspect where timing entered the pipeline, how canonical lyrics were aligned, and how final line timestamps were chosen.
|
|
169
|
+
|
|
170
|
+
## Warning Policy
|
|
171
|
+
|
|
172
|
+
wispr should warn and continue when a line is weakly aligned instead of failing the whole song.
|
|
173
|
+
|
|
174
|
+
Each warning should include:
|
|
175
|
+
|
|
176
|
+
- Line number.
|
|
177
|
+
- Confidence score.
|
|
178
|
+
- Estimated timestamp source.
|
|
179
|
+
|
|
180
|
+
This policy preserves a usable output file while still surfacing uncertainty to the user.
|
|
181
|
+
|
|
182
|
+
## Testing Priorities
|
|
183
|
+
|
|
184
|
+
The highest-value unit tests should target deterministic logic rather than heavyweight model execution.
|
|
185
|
+
|
|
186
|
+
### Must-test areas
|
|
187
|
+
|
|
188
|
+
- LRC timestamp formatting and serialization.[cite:34][cite:65]
|
|
189
|
+
- Metadata tag ordering and omission behavior.[cite:34][cite:65]
|
|
190
|
+
- Line-to-span segmentation.
|
|
191
|
+
- Existing-file collision behavior with and without `--force`.
|
|
192
|
+
- Warning generation when confidence falls below threshold.
|
|
193
|
+
|
|
194
|
+
The alignment backend itself should be wrapped behind interfaces so pure logic can be tested with synthetic fixtures instead of depending on full external model runs.
|
|
195
|
+
|
|
196
|
+
## Future Extensions
|
|
197
|
+
|
|
198
|
+
The current architecture should leave room for later additions without forcing a redesign.
|
|
199
|
+
|
|
200
|
+
### Likely future work
|
|
201
|
+
|
|
202
|
+
- Word-level karaoke timing.
|
|
203
|
+
- Multilingual support.
|
|
204
|
+
- Alternative alignment backends.
|
|
205
|
+
- Optional Docker packaging.
|
|
206
|
+
- Richer metadata support.
|
|
207
|
+
|
|
208
|
+
The key design rule is that the CLI should remain the simplest interface to the library: simple for end users, while the library stays modular enough for testing, reuse, and future backends.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
@.redsun/AGENTS.md
|