easy-podcast 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easy_podcast-0.0.1/MANIFEST.in +8 -0
- easy_podcast-0.0.1/PKG-INFO +284 -0
- easy_podcast-0.0.1/README.md +243 -0
- easy_podcast-0.0.1/pyproject.toml +129 -0
- easy_podcast-0.0.1/setup.cfg +4 -0
- easy_podcast-0.0.1/src/easy_podcast/__init__.py +7 -0
- easy_podcast-0.0.1/src/easy_podcast/cli.py +102 -0
- easy_podcast-0.0.1/src/easy_podcast/config.py +163 -0
- easy_podcast-0.0.1/src/easy_podcast/downloader.py +221 -0
- easy_podcast-0.0.1/src/easy_podcast/episode_tracker.py +131 -0
- easy_podcast-0.0.1/src/easy_podcast/manager.py +207 -0
- easy_podcast-0.0.1/src/easy_podcast/models.py +70 -0
- easy_podcast-0.0.1/src/easy_podcast/parser.py +157 -0
- easy_podcast-0.0.1/src/easy_podcast/py.typed +0 -0
- easy_podcast-0.0.1/src/easy_podcast/utils.py +72 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/PKG-INFO +284 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/SOURCES.txt +35 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/dependency_links.txt +1 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/entry_points.txt +2 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/requires.txt +20 -0
- easy_podcast-0.0.1/src/easy_podcast.egg-info/top_level.txt +1 -0
- easy_podcast-0.0.1/tests/__init__.py +3 -0
- easy_podcast-0.0.1/tests/base.py +147 -0
- easy_podcast-0.0.1/tests/test_cli.py +353 -0
- easy_podcast-0.0.1/tests/test_config.py +432 -0
- easy_podcast-0.0.1/tests/test_downloader.py +563 -0
- easy_podcast-0.0.1/tests/test_episode_tracker.py +456 -0
- easy_podcast-0.0.1/tests/test_integration.py +128 -0
- easy_podcast-0.0.1/tests/test_manager_downloads.py +211 -0
- easy_podcast-0.0.1/tests/test_manager_episodes.py +360 -0
- easy_podcast-0.0.1/tests/test_manager_initialization.py +410 -0
- easy_podcast-0.0.1/tests/test_manager_rss.py +153 -0
- easy_podcast-0.0.1/tests/test_models.py +135 -0
- easy_podcast-0.0.1/tests/test_parser.py +614 -0
- easy_podcast-0.0.1/tests/test_parser_class.py +436 -0
- easy_podcast-0.0.1/tests/test_utils.py +108 -0
- easy_podcast-0.0.1/tests/utils.py +82 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easy-podcast
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A modular podcast episode downloader with RSS feed parsing and progress tracking
|
|
5
|
+
Author-email: Aryan Falahtpisheh <aryanfalahat@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/falahat/podcast
|
|
8
|
+
Project-URL: Repository, https://github.com/falahat/podcast.git
|
|
9
|
+
Project-URL: Issues, https://github.com/falahat/podcast/issues
|
|
10
|
+
Keywords: podcast,rss,download,audio,media
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
15
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Requires-Python: <3.13,>=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: feedparser
|
|
24
|
+
Requires-Dist: requests
|
|
25
|
+
Requires-Dist: tqdm
|
|
26
|
+
Requires-Dist: numpy
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=2.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: flake8>=4.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: coverage; extra == "dev"
|
|
34
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
35
|
+
Requires-Dist: types-tqdm; extra == "dev"
|
|
36
|
+
Provides-Extra: notebook
|
|
37
|
+
Requires-Dist: jupyter>=1.0.0; extra == "notebook"
|
|
38
|
+
Requires-Dist: pandas; extra == "notebook"
|
|
39
|
+
Requires-Dist: pandas-stubs; extra == "notebook"
|
|
40
|
+
Requires-Dist: matplotlib; extra == "notebook"
|
|
41
|
+
|
|
42
|
+
# Podcast Tracker
|
|
43
|
+
|
|
44
|
+
A modular Python package for downloading podcast episodes from RSS feeds and transcribing them using AI. Features progress tracking, metadata management, duplicate detection, and WhisperX-powered transcription with speaker diarization.
|
|
45
|
+
|
|
46
|
+
## Python Version Requirements
|
|
47
|
+
|
|
48
|
+
**This package requires Python 3.10, 3.11, or 3.12.** Python 3.13+ is not supported due to dependency limitations with the WhisperX library.
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
- **RSS Feed Parsing**: Download and parse podcast RSS feeds
|
|
53
|
+
- **Episode Management**: Track downloaded episodes with JSONL metadata
|
|
54
|
+
- **Progress Tracking**: Visual progress bars for downloads
|
|
55
|
+
- **AI Transcription**: WhisperX-powered transcription with speaker diarization
|
|
56
|
+
- **Duplicate Detection**: Automatically skip already downloaded episodes
|
|
57
|
+
- **Type Safety**: Comprehensive type hints throughout
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
### Standard Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git clone https://github.com/falahat/podcast.git
|
|
65
|
+
cd podcast
|
|
66
|
+
pip install -e .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Installation with Transcription Support
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/falahat/podcast.git
|
|
73
|
+
cd podcast
|
|
74
|
+
pip install -e .[transcribe]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Development Installation
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
git clone https://github.com/falahat/podcast.git
|
|
81
|
+
cd podcast
|
|
82
|
+
pip install -e .[dev,notebook,transcribe]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Quick Start
|
|
86
|
+
|
|
87
|
+
### Command Line Interface
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Download episodes from an RSS feed
|
|
91
|
+
easy_podcast "https://example.com/podcast/rss.xml"
|
|
92
|
+
|
|
93
|
+
# Specify custom data directory
|
|
94
|
+
easy_podcast "https://example.com/podcast/rss.xml" --data-dir ./my_podcasts
|
|
95
|
+
|
|
96
|
+
# List episodes without downloading
|
|
97
|
+
easy_podcast "https://example.com/podcast/rss.xml" --list-only
|
|
98
|
+
|
|
99
|
+
# Disable progress bars
|
|
100
|
+
easy_podcast "https://example.com/podcast/rss.xml" --no-progress
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Python API
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from easy_podcast.manager import PodcastManager
|
|
107
|
+
|
|
108
|
+
# Create manager from RSS URL (downloads and parses automatically)
|
|
109
|
+
manager = PodcastManager.from_rss_url("https://example.com/podcast/rss.xml")
|
|
110
|
+
|
|
111
|
+
if manager:
|
|
112
|
+
podcast = manager.get_podcast()
|
|
113
|
+
print(f"Podcast: {podcast.title}")
|
|
114
|
+
|
|
115
|
+
# Get new episodes to download
|
|
116
|
+
new_episodes = manager.get_new_episodes()
|
|
117
|
+
print(f"Found {len(new_episodes)} new episodes")
|
|
118
|
+
|
|
119
|
+
# Download episodes with progress tracking
|
|
120
|
+
successful, skipped, failed = manager.download_episodes(new_episodes)
|
|
121
|
+
print(f"Downloaded: {successful}, Skipped: {skipped}, Failed: {failed}")
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Working with Existing Podcast Data
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Load manager from existing podcast folder
|
|
128
|
+
manager = PodcastManager.from_podcast_folder("data/My Podcast/")
|
|
129
|
+
|
|
130
|
+
if manager:
|
|
131
|
+
# Continue downloading new episodes
|
|
132
|
+
new_episodes = manager.get_new_episodes()
|
|
133
|
+
manager.download_episodes(new_episodes)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Audio Transcription
|
|
137
|
+
|
|
138
|
+
The package includes AI-powered audio transcription using WhisperX with GPU acceleration and speaker diarization. Transcription functionality is available as an optional dependency.
|
|
139
|
+
|
|
140
|
+
**Installation**: To use transcription features, install with the `[transcribe]` option:
|
|
141
|
+
```bash
|
|
142
|
+
pip install -e .[transcribe]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Prerequisites for Transcription
|
|
146
|
+
|
|
147
|
+
1. **NVIDIA GPU** with CUDA support
|
|
148
|
+
2. **Hugging Face Token** (for speaker diarization models)
|
|
149
|
+
3. **PyTorch with GPU support** (automatically installed with easy-whisperx)
|
|
150
|
+
|
|
151
|
+
**Note**: PyTorch is automatically installed as part of the `easy-whisperx` dependency. No manual installation required.
|
|
152
|
+
|
|
153
|
+
### Setting up Transcription Environment
|
|
154
|
+
|
|
155
|
+
1. **Get a Hugging Face Token**:
|
|
156
|
+
- Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
|
|
157
|
+
- Create a token with "read" permissions
|
|
158
|
+
- Accept user agreements for segmentation and diarization models
|
|
159
|
+
|
|
160
|
+
2. **Set Environment Variable**:
|
|
161
|
+
|
|
162
|
+
```powershell
|
|
163
|
+
# Windows PowerShell
|
|
164
|
+
$env:HF_TOKEN="your_token_here"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Linux/macOS
|
|
169
|
+
export HF_TOKEN="your_token_here"
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Using Transcription in Python
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from easy_whisperx.transcriber import Transcriber
|
|
176
|
+
|
|
177
|
+
# Initialize transcriber
|
|
178
|
+
transcriber = Transcriber(
|
|
179
|
+
model_size="base",
|
|
180
|
+
device="cuda", # or "cpu"
|
|
181
|
+
compute_type="float16",
|
|
182
|
+
batch_size=16
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
# Transcribe audio file
|
|
186
|
+
with transcriber:
|
|
187
|
+
result = transcriber("path/to/audio.mp3")
|
|
188
|
+
print(result["text"])
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Data Storage Structure
|
|
192
|
+
|
|
193
|
+
Podcast data is organized in a clear directory structure:
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
data/
|
|
197
|
+
└── [Sanitized Podcast Name]/
|
|
198
|
+
├── episodes.jsonl # Episode metadata (one JSON object per line)
|
|
199
|
+
├── rss.xml # Cached RSS feed
|
|
200
|
+
└── downloads/ # Downloaded audio files
|
|
201
|
+
├── episode1.mp3
|
|
202
|
+
├── episode2.mp3
|
|
203
|
+
└── ...
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**Important**: Episode objects store filenames only (e.g., `"727175.mp3"`), not full paths. Use `manager.get_episode_audio_path(episode)` to get complete file paths.
|
|
207
|
+
|
|
208
|
+
## Development
|
|
209
|
+
|
|
210
|
+
### Setting up Development Environment
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
git clone https://github.com/falahat/podcast.git
|
|
214
|
+
cd podcast
|
|
215
|
+
|
|
216
|
+
# Create virtual environment (note the .venv name)
|
|
217
|
+
python -m venv .venv
|
|
218
|
+
|
|
219
|
+
# Activate virtual environment
|
|
220
|
+
# Windows PowerShell:
|
|
221
|
+
.\.venv\Scripts\Activate.ps1
|
|
222
|
+
# Linux/macOS:
|
|
223
|
+
source .venv/bin/activate
|
|
224
|
+
|
|
225
|
+
# Install in development mode
|
|
226
|
+
pip install -e .[dev,notebook]
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### Running Tests
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
# Run all tests
|
|
233
|
+
pytest
|
|
234
|
+
|
|
235
|
+
# Run with coverage report
|
|
236
|
+
pytest --cov=easy_podcast --cov-report=html
|
|
237
|
+
|
|
238
|
+
# Run specific test file
|
|
239
|
+
pytest tests/test_manager_downloads.py -v
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### Code Quality Tools
|
|
243
|
+
|
|
244
|
+
The project uses:
|
|
245
|
+
|
|
246
|
+
- **Black** for code formatting
|
|
247
|
+
- **mypy** for type checking
|
|
248
|
+
- **flake8** for linting
|
|
249
|
+
- **pytest** for testing
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
# Format code
|
|
253
|
+
black src/ tests/
|
|
254
|
+
|
|
255
|
+
# Type checking
|
|
256
|
+
mypy src/easy_podcast/
|
|
257
|
+
|
|
258
|
+
# Linting
|
|
259
|
+
flake8 src/easy_podcast/
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## Core Components
|
|
263
|
+
|
|
264
|
+
The package is built with a modular architecture:
|
|
265
|
+
|
|
266
|
+
- **`PodcastManager`** - Main orchestrator for the complete workflow
|
|
267
|
+
- **`Episode`/`Podcast`** - Data models with computed properties
|
|
268
|
+
- **`EpisodeTracker`** - JSONL-based metadata persistence
|
|
269
|
+
- **`PodcastParser`** - RSS feed parsing with custom episode ID extraction
|
|
270
|
+
- **`PodcastDownloader`** - HTTP downloads with progress tracking
|
|
271
|
+
- **`Transcription`** - WhisperX-based transcription module
|
|
272
|
+
|
|
273
|
+
## Contributing
|
|
274
|
+
|
|
275
|
+
1. Fork the repository
|
|
276
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
277
|
+
3. Make your changes with tests
|
|
278
|
+
4. Ensure all tests pass (`pytest`)
|
|
279
|
+
5. Check code quality (`black src/ tests/` and `mypy src/`)
|
|
280
|
+
6. Submit a pull request
|
|
281
|
+
|
|
282
|
+
## License
|
|
283
|
+
|
|
284
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# Podcast Tracker
|
|
2
|
+
|
|
3
|
+
A modular Python package for downloading podcast episodes from RSS feeds and transcribing them using AI. Features progress tracking, metadata management, duplicate detection, and WhisperX-powered transcription with speaker diarization.
|
|
4
|
+
|
|
5
|
+
## Python Version Requirements
|
|
6
|
+
|
|
7
|
+
**This package requires Python 3.10, 3.11, or 3.12.** Python 3.13+ is not supported due to dependency limitations with the WhisperX library.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **RSS Feed Parsing**: Download and parse podcast RSS feeds
|
|
12
|
+
- **Episode Management**: Track downloaded episodes with JSONL metadata
|
|
13
|
+
- **Progress Tracking**: Visual progress bars for downloads
|
|
14
|
+
- **AI Transcription**: WhisperX-powered transcription with speaker diarization
|
|
15
|
+
- **Duplicate Detection**: Automatically skip already downloaded episodes
|
|
16
|
+
- **Type Safety**: Comprehensive type hints throughout
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
### Standard Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/falahat/podcast.git
|
|
24
|
+
cd podcast
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Installation with Transcription Support
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/falahat/podcast.git
|
|
32
|
+
cd podcast
|
|
33
|
+
pip install -e .[transcribe]
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Development Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone https://github.com/falahat/podcast.git
|
|
40
|
+
cd podcast
|
|
41
|
+
pip install -e .[dev,notebook,transcribe]
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
### Command Line Interface
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Download episodes from an RSS feed
|
|
50
|
+
easy_podcast "https://example.com/podcast/rss.xml"
|
|
51
|
+
|
|
52
|
+
# Specify custom data directory
|
|
53
|
+
easy_podcast "https://example.com/podcast/rss.xml" --data-dir ./my_podcasts
|
|
54
|
+
|
|
55
|
+
# List episodes without downloading
|
|
56
|
+
easy_podcast "https://example.com/podcast/rss.xml" --list-only
|
|
57
|
+
|
|
58
|
+
# Disable progress bars
|
|
59
|
+
easy_podcast "https://example.com/podcast/rss.xml" --no-progress
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Python API
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from easy_podcast.manager import PodcastManager
|
|
66
|
+
|
|
67
|
+
# Create manager from RSS URL (downloads and parses automatically)
|
|
68
|
+
manager = PodcastManager.from_rss_url("https://example.com/podcast/rss.xml")
|
|
69
|
+
|
|
70
|
+
if manager:
|
|
71
|
+
podcast = manager.get_podcast()
|
|
72
|
+
print(f"Podcast: {podcast.title}")
|
|
73
|
+
|
|
74
|
+
# Get new episodes to download
|
|
75
|
+
new_episodes = manager.get_new_episodes()
|
|
76
|
+
print(f"Found {len(new_episodes)} new episodes")
|
|
77
|
+
|
|
78
|
+
# Download episodes with progress tracking
|
|
79
|
+
successful, skipped, failed = manager.download_episodes(new_episodes)
|
|
80
|
+
print(f"Downloaded: {successful}, Skipped: {skipped}, Failed: {failed}")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Working with Existing Podcast Data
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# Load manager from existing podcast folder
|
|
87
|
+
manager = PodcastManager.from_podcast_folder("data/My Podcast/")
|
|
88
|
+
|
|
89
|
+
if manager:
|
|
90
|
+
# Continue downloading new episodes
|
|
91
|
+
new_episodes = manager.get_new_episodes()
|
|
92
|
+
manager.download_episodes(new_episodes)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Audio Transcription
|
|
96
|
+
|
|
97
|
+
The package includes AI-powered audio transcription using WhisperX with GPU acceleration and speaker diarization. Transcription functionality is available as an optional dependency.
|
|
98
|
+
|
|
99
|
+
**Installation**: To use transcription features, install with the `[transcribe]` option:
|
|
100
|
+
```bash
|
|
101
|
+
pip install -e .[transcribe]
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Prerequisites for Transcription
|
|
105
|
+
|
|
106
|
+
1. **NVIDIA GPU** with CUDA support
|
|
107
|
+
2. **Hugging Face Token** (for speaker diarization models)
|
|
108
|
+
3. **PyTorch with GPU support** (automatically installed with easy-whisperx)
|
|
109
|
+
|
|
110
|
+
**Note**: PyTorch is automatically installed as part of the `easy-whisperx` dependency. No manual installation required.
|
|
111
|
+
|
|
112
|
+
### Setting up Transcription Environment
|
|
113
|
+
|
|
114
|
+
1. **Get a Hugging Face Token**:
|
|
115
|
+
- Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
|
|
116
|
+
- Create a token with "read" permissions
|
|
117
|
+
- Accept user agreements for segmentation and diarization models
|
|
118
|
+
|
|
119
|
+
2. **Set Environment Variable**:
|
|
120
|
+
|
|
121
|
+
```powershell
|
|
122
|
+
# Windows PowerShell
|
|
123
|
+
$env:HF_TOKEN="your_token_here"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Linux/macOS
|
|
128
|
+
export HF_TOKEN="your_token_here"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Using Transcription in Python
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from easy_whisperx.transcriber import Transcriber
|
|
135
|
+
|
|
136
|
+
# Initialize transcriber
|
|
137
|
+
transcriber = Transcriber(
|
|
138
|
+
model_size="base",
|
|
139
|
+
device="cuda", # or "cpu"
|
|
140
|
+
compute_type="float16",
|
|
141
|
+
batch_size=16
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
# Transcribe audio file
|
|
145
|
+
with transcriber:
|
|
146
|
+
result = transcriber("path/to/audio.mp3")
|
|
147
|
+
print(result["text"])
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Data Storage Structure
|
|
151
|
+
|
|
152
|
+
Podcast data is organized in a clear directory structure:
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
data/
|
|
156
|
+
└── [Sanitized Podcast Name]/
|
|
157
|
+
├── episodes.jsonl # Episode metadata (one JSON object per line)
|
|
158
|
+
├── rss.xml # Cached RSS feed
|
|
159
|
+
└── downloads/ # Downloaded audio files
|
|
160
|
+
├── episode1.mp3
|
|
161
|
+
├── episode2.mp3
|
|
162
|
+
└── ...
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
**Important**: Episode objects store filenames only (e.g., `"727175.mp3"`), not full paths. Use `manager.get_episode_audio_path(episode)` to get complete file paths.
|
|
166
|
+
|
|
167
|
+
## Development
|
|
168
|
+
|
|
169
|
+
### Setting up Development Environment
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
git clone https://github.com/falahat/podcast.git
|
|
173
|
+
cd podcast
|
|
174
|
+
|
|
175
|
+
# Create virtual environment (note the .venv name)
|
|
176
|
+
python -m venv .venv
|
|
177
|
+
|
|
178
|
+
# Activate virtual environment
|
|
179
|
+
# Windows PowerShell:
|
|
180
|
+
.\.venv\Scripts\Activate.ps1
|
|
181
|
+
# Linux/macOS:
|
|
182
|
+
source .venv/bin/activate
|
|
183
|
+
|
|
184
|
+
# Install in development mode
|
|
185
|
+
pip install -e .[dev,notebook]
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Running Tests
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
# Run all tests
|
|
192
|
+
pytest
|
|
193
|
+
|
|
194
|
+
# Run with coverage report
|
|
195
|
+
pytest --cov=easy_podcast --cov-report=html
|
|
196
|
+
|
|
197
|
+
# Run specific test file
|
|
198
|
+
pytest tests/test_manager_downloads.py -v
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Code Quality Tools
|
|
202
|
+
|
|
203
|
+
The project uses:
|
|
204
|
+
|
|
205
|
+
- **Black** for code formatting
|
|
206
|
+
- **mypy** for type checking
|
|
207
|
+
- **flake8** for linting
|
|
208
|
+
- **pytest** for testing
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
# Format code
|
|
212
|
+
black src/ tests/
|
|
213
|
+
|
|
214
|
+
# Type checking
|
|
215
|
+
mypy src/easy_podcast/
|
|
216
|
+
|
|
217
|
+
# Linting
|
|
218
|
+
flake8 src/easy_podcast/
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Core Components
|
|
222
|
+
|
|
223
|
+
The package is built with a modular architecture:
|
|
224
|
+
|
|
225
|
+
- **`PodcastManager`** - Main orchestrator for the complete workflow
|
|
226
|
+
- **`Episode`/`Podcast`** - Data models with computed properties
|
|
227
|
+
- **`EpisodeTracker`** - JSONL-based metadata persistence
|
|
228
|
+
- **`PodcastParser`** - RSS feed parsing with custom episode ID extraction
|
|
229
|
+
- **`PodcastDownloader`** - HTTP downloads with progress tracking
|
|
230
|
+
- **`Transcription`** - WhisperX-based transcription module
|
|
231
|
+
|
|
232
|
+
## Contributing
|
|
233
|
+
|
|
234
|
+
1. Fork the repository
|
|
235
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
236
|
+
3. Make your changes with tests
|
|
237
|
+
4. Ensure all tests pass (`pytest`)
|
|
238
|
+
5. Check code quality (`black src/ tests/` and `mypy src/`)
|
|
239
|
+
6. Submit a pull request
|
|
240
|
+
|
|
241
|
+
## License
|
|
242
|
+
|
|
243
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=45", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project.scripts]
|
|
6
|
+
easy_podcast = "easy_podcast.cli:main"
|
|
7
|
+
|
|
8
|
+
[project]
|
|
9
|
+
name = "easy-podcast"
|
|
10
|
+
version = "0.0.1"
|
|
11
|
+
description = "A modular podcast episode downloader with RSS feed parsing and progress tracking"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Aryan Falahtpisheh", email = "aryanfalahat@gmail.com"},
|
|
14
|
+
]
|
|
15
|
+
readme = "README.md"
|
|
16
|
+
license = {text = "MIT"}
|
|
17
|
+
keywords = ["podcast", "rss", "download", "audio", "media"]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 4 - Beta",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Intended Audience :: End Users/Desktop",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
23
|
+
"Topic :: Multimedia :: Sound/Audio",
|
|
24
|
+
"License :: OSI Approved :: MIT License",
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
]
|
|
30
|
+
requires-python = ">=3.10,<3.13"
|
|
31
|
+
dependencies = [
|
|
32
|
+
"feedparser",
|
|
33
|
+
"requests",
|
|
34
|
+
"tqdm",
|
|
35
|
+
"numpy",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=6.0.0",
|
|
41
|
+
"pytest-cov>=2.0.0",
|
|
42
|
+
"mypy>=1.0.0",
|
|
43
|
+
"black>=22.0.0",
|
|
44
|
+
"flake8>=4.0.0",
|
|
45
|
+
"coverage",
|
|
46
|
+
"types-requests",
|
|
47
|
+
"types-tqdm",
|
|
48
|
+
]
|
|
49
|
+
notebook = [
|
|
50
|
+
"jupyter>=1.0.0",
|
|
51
|
+
"pandas",
|
|
52
|
+
"pandas-stubs",
|
|
53
|
+
"matplotlib",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
[project.urls]
|
|
58
|
+
Homepage = "https://github.com/falahat/podcast"
|
|
59
|
+
Repository = "https://github.com/falahat/podcast.git"
|
|
60
|
+
Issues = "https://github.com/falahat/podcast/issues"
|
|
61
|
+
|
|
62
|
+
[tool.setuptools.packages.find]
|
|
63
|
+
where = ["src"]
|
|
64
|
+
exclude = ["tests*"]
|
|
65
|
+
|
|
66
|
+
[tool.black]
|
|
67
|
+
line-length = 79
|
|
68
|
+
target-version = ['py311']
|
|
69
|
+
include = '\.pyi?$'
|
|
70
|
+
extend-exclude = '''
|
|
71
|
+
/(
|
|
72
|
+
# directories
|
|
73
|
+
\.eggs
|
|
74
|
+
| \.git
|
|
75
|
+
| \.hg
|
|
76
|
+
| \.mypy_cache
|
|
77
|
+
| \.tox
|
|
78
|
+
| \.venv
|
|
79
|
+
| build
|
|
80
|
+
| dist
|
|
81
|
+
)/
|
|
82
|
+
'''
|
|
83
|
+
|
|
84
|
+
[tool.mypy]
|
|
85
|
+
python_version = "3.11"
|
|
86
|
+
warn_return_any = true
|
|
87
|
+
warn_unused_configs = true
|
|
88
|
+
disallow_untyped_defs = true
|
|
89
|
+
disallow_incomplete_defs = true
|
|
90
|
+
check_untyped_defs = true
|
|
91
|
+
disallow_untyped_decorators = true
|
|
92
|
+
no_implicit_optional = true
|
|
93
|
+
warn_redundant_casts = true
|
|
94
|
+
warn_unused_ignores = true
|
|
95
|
+
warn_no_return = true
|
|
96
|
+
warn_unreachable = true
|
|
97
|
+
strict_equality = true
|
|
98
|
+
mypy_path = "typings"
|
|
99
|
+
explicit_package_bases = true
|
|
100
|
+
follow_untyped_imports = true
|
|
101
|
+
|
|
102
|
+
[tool.pytest.ini_options]
|
|
103
|
+
testpaths = ["tests"]
|
|
104
|
+
python_files = ["test_*.py"]
|
|
105
|
+
addopts = [
|
|
106
|
+
"--strict-markers",
|
|
107
|
+
"--strict-config",
|
|
108
|
+
"--verbose",
|
|
109
|
+
]
|
|
110
|
+
markers = [
|
|
111
|
+
"slow: marks tests as slow",
|
|
112
|
+
"integration: marks tests as integration tests",
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
[tool.coverage.run]
|
|
116
|
+
source = ["./easy_podcast"]
|
|
117
|
+
omit = [
|
|
118
|
+
"*/tests/*",
|
|
119
|
+
"*/test_*",
|
|
120
|
+
]
|
|
121
|
+
|
|
122
|
+
[tool.coverage.report]
|
|
123
|
+
exclude_lines = [
|
|
124
|
+
"pragma: no cover",
|
|
125
|
+
"def __repr__",
|
|
126
|
+
"raise NotImplementedError",
|
|
127
|
+
"if __name__ == .__main__.:",
|
|
128
|
+
"if TYPE_CHECKING:",
|
|
129
|
+
]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Podcast package - Handles downloading RSS data, parsing into episodes,
|
|
3
|
+
downloading episodes, and managing metadata.
|
|
4
|
+
|
|
5
|
+
This package provides a modular approach to podcast management with
|
|
6
|
+
separate components for data models, parsing, downloading, and tracking.
|
|
7
|
+
"""
|