stock-weekly-report 0.2.1 → 0.2.2
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
- package/fetch_episodes.py +11 -6
- package/package.json +1 -1
- package/pipeline.py +53 -8
- package/pyproject.toml +1 -1
- package/send_report.py +1 -1
- package/transcribe.py +26 -11
package/fetch_episodes.py
CHANGED
|
@@ -6,7 +6,10 @@ Fetches RSS feeds from Soundon, finds episodes published within the
|
|
|
6
6
|
configured lookback window, and downloads the audio files.
|
|
7
7
|
|
|
8
8
|
Output structure:
|
|
9
|
-
{parent_folder}/audio/{
|
|
9
|
+
{parent_folder}/audio/{program_name}/{program_name}_{YYYYMMDD}.ext
|
|
10
|
+
|
|
11
|
+
Each speaker has a persistent folder. Downloads are skipped if the file
|
|
12
|
+
already exists (checked by filename / date), so re-runs are safe.
|
|
10
13
|
"""
|
|
11
14
|
|
|
12
15
|
import os
|
|
@@ -144,11 +147,10 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
|
|
|
144
147
|
end_date = date(int(parts[1][:4]), int(parts[1][4:6]), int(parts[1][6:8]))
|
|
145
148
|
|
|
146
149
|
parent_folder = Path(config["parent_folder"])
|
|
147
|
-
|
|
148
|
-
audio_dir.mkdir(parents=True, exist_ok=True)
|
|
150
|
+
audio_root = parent_folder / "audio"
|
|
149
151
|
|
|
150
152
|
print(f"Date range : {start_date} → {end_date}")
|
|
151
|
-
print(f"Audio
|
|
153
|
+
print(f"Audio root : {audio_root}")
|
|
152
154
|
print()
|
|
153
155
|
|
|
154
156
|
total_downloaded = 0
|
|
@@ -157,6 +159,9 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
|
|
|
157
159
|
program_name = feed_cfg["name"]
|
|
158
160
|
feed_url = feed_cfg["url"]
|
|
159
161
|
|
|
162
|
+
speaker_dir = audio_root / program_name
|
|
163
|
+
speaker_dir.mkdir(parents=True, exist_ok=True)
|
|
164
|
+
|
|
160
165
|
print(f"[{program_name}] Fetching feed …")
|
|
161
166
|
parsed = feedparser.parse(feed_url)
|
|
162
167
|
|
|
@@ -180,7 +185,7 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
|
|
|
180
185
|
ext = url_extension(audio_url)
|
|
181
186
|
date_str = pub_date.strftime("%Y%m%d")
|
|
182
187
|
filename = f"{program_name}_{date_str}{ext}"
|
|
183
|
-
dest =
|
|
188
|
+
dest = speaker_dir / filename
|
|
184
189
|
|
|
185
190
|
if dest.exists():
|
|
186
191
|
print(f" SKIP (already downloaded): {filename}")
|
|
@@ -203,7 +208,7 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
|
|
|
203
208
|
print(f" No new episodes in the past {lookback_days} days.")
|
|
204
209
|
print()
|
|
205
210
|
|
|
206
|
-
print(f"Done. {total_downloaded} file(s) newly downloaded → {
|
|
211
|
+
print(f"Done. {total_downloaded} file(s) newly downloaded → {audio_root}")
|
|
207
212
|
|
|
208
213
|
|
|
209
214
|
def run(config_path: str = "config.yaml") -> None:
|
package/package.json
CHANGED
package/pipeline.py
CHANGED
|
@@ -79,15 +79,24 @@ def validate_audio_files(config: dict, folder_name: str) -> bool:
|
|
|
79
79
|
- Returns False (abort) only when zero usable audio files remain after cleanup.
|
|
80
80
|
"""
|
|
81
81
|
banner("GUARD — Audio File Integrity")
|
|
82
|
-
|
|
82
|
+
audio_root = Path(config["parent_folder"]) / "audio"
|
|
83
|
+
parts = folder_name.split("-")
|
|
84
|
+
start_str, end_str = parts[0], parts[1]
|
|
83
85
|
|
|
84
86
|
audio_files = []
|
|
85
|
-
|
|
86
|
-
|
|
87
|
+
if audio_root.exists():
|
|
88
|
+
for speaker_dir in sorted(audio_root.iterdir()):
|
|
89
|
+
if not speaker_dir.is_dir():
|
|
90
|
+
continue
|
|
91
|
+
for ext in SUPPORTED_AUDIO_EXTS:
|
|
92
|
+
for f in speaker_dir.glob(f"*{ext}"):
|
|
93
|
+
date_str = f.stem.split("_")[-1]
|
|
94
|
+
if len(date_str) == 8 and start_str <= date_str <= end_str:
|
|
95
|
+
audio_files.append(f)
|
|
87
96
|
audio_files = sorted(audio_files)
|
|
88
97
|
|
|
89
98
|
if not audio_files:
|
|
90
|
-
print(f" ERROR: No audio files found in {
|
|
99
|
+
print(f" ERROR: No audio files found in {audio_root}")
|
|
91
100
|
return False
|
|
92
101
|
|
|
93
102
|
usable = 0
|
|
@@ -244,15 +253,51 @@ def _cleanup_data_dir(data_root: Path, label: str,
|
|
|
244
253
|
print(f"\n Cleaned up {removed_folders} folder(s).")
|
|
245
254
|
|
|
246
255
|
|
|
256
|
+
def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
|
|
257
|
+
"""Delete audio files older than `months` from per-speaker subdirectories."""
|
|
258
|
+
if months <= 0:
|
|
259
|
+
print(f" Audio: retention = 0 (keep forever), skipping.")
|
|
260
|
+
return
|
|
261
|
+
|
|
262
|
+
banner(f"CLEANUP — Audio (keep {months} month{'s' if months != 1 else ''})")
|
|
263
|
+
|
|
264
|
+
if not audio_root.exists():
|
|
265
|
+
print(f" Directory not found: {audio_root}")
|
|
266
|
+
return
|
|
267
|
+
|
|
268
|
+
cutoff = _cutoff_date(months)
|
|
269
|
+
print(f" Cutoff date : {cutoff} (deleting files published before this date)")
|
|
270
|
+
|
|
271
|
+
removed = 0
|
|
272
|
+
for speaker_dir in sorted(audio_root.iterdir()):
|
|
273
|
+
if not speaker_dir.is_dir():
|
|
274
|
+
continue
|
|
275
|
+
for ext in SUPPORTED_AUDIO_EXTS:
|
|
276
|
+
for f in speaker_dir.glob(f"*{ext}"):
|
|
277
|
+
date_str = f.stem.split("_")[-1]
|
|
278
|
+
try:
|
|
279
|
+
file_date = datetime.strptime(date_str, "%Y%m%d").date()
|
|
280
|
+
except ValueError:
|
|
281
|
+
continue
|
|
282
|
+
if file_date < cutoff:
|
|
283
|
+
size_mb = f.stat().st_size / (1024 * 1024)
|
|
284
|
+
print(f" Deleting ({size_mb:.1f} MB): {speaker_dir.name}/{f.name}")
|
|
285
|
+
f.unlink()
|
|
286
|
+
removed += 1
|
|
287
|
+
|
|
288
|
+
if removed == 0:
|
|
289
|
+
print(" No old audio files to remove.")
|
|
290
|
+
else:
|
|
291
|
+
print(f" Removed {removed} file(s).")
|
|
292
|
+
|
|
293
|
+
|
|
247
294
|
def cleanup_old_data(config: dict) -> bool:
|
|
248
295
|
"""Clean up old audio, transcript, and report files per retention config."""
|
|
249
296
|
retention = config.get("retention", {})
|
|
250
297
|
parent = Path(config["parent_folder"])
|
|
251
298
|
|
|
252
|
-
|
|
299
|
+
_cleanup_audio_by_speaker(
|
|
253
300
|
parent / "audio",
|
|
254
|
-
"Audio",
|
|
255
|
-
SUPPORTED_AUDIO_EXTS,
|
|
256
301
|
int(retention.get("audio_months", 3)),
|
|
257
302
|
)
|
|
258
303
|
_cleanup_data_dir(
|
|
@@ -340,7 +385,7 @@ def main() -> None:
|
|
|
340
385
|
results["fetch"] = "skipped"
|
|
341
386
|
|
|
342
387
|
# ── Guard: Audio integrity ───────────────────────────────────────
|
|
343
|
-
if not args.
|
|
388
|
+
if not args.skip_transcribe:
|
|
344
389
|
ok = validate_audio_files(config, folder_name)
|
|
345
390
|
results["audio_check"] = ok
|
|
346
391
|
if not ok:
|
package/pyproject.toml
CHANGED
package/send_report.py
CHANGED
|
@@ -439,7 +439,7 @@ def save_report(config: dict, folder_name: str, body: str) -> Path:
|
|
|
439
439
|
"""Save the report text to disk and return the file path."""
|
|
440
440
|
report_dir = Path(config["parent_folder"]) / "reports" / folder_name
|
|
441
441
|
report_dir.mkdir(parents=True, exist_ok=True)
|
|
442
|
-
report_path = report_dir / "
|
|
442
|
+
report_path = report_dir / f"weekly_report_{folder_name}.txt"
|
|
443
443
|
report_path.write_text(body, encoding="utf-8")
|
|
444
444
|
print(f" Report saved to: {report_path}")
|
|
445
445
|
return report_path
|
package/transcribe.py
CHANGED
|
@@ -15,6 +15,9 @@ Usage:
|
|
|
15
15
|
python transcribe.py --folder 20260218-20260225
|
|
16
16
|
python transcribe.py --config my_config.yaml --folder 20260218-20260225
|
|
17
17
|
|
|
18
|
+
Input structure:
|
|
19
|
+
{parent_folder}/audio/{program_name}/{program_name}_{YYYYMMDD}.ext
|
|
20
|
+
|
|
18
21
|
Output structure:
|
|
19
22
|
{parent_folder}/transcripts/{YYYYMMDD}-{YYYYMMDD}/{stem}.txt
|
|
20
23
|
"""
|
|
@@ -51,10 +54,26 @@ def default_folder_name(lookback_days: int) -> str:
|
|
|
51
54
|
return f"{start.strftime('%Y%m%d')}-{today.strftime('%Y%m%d')}"
|
|
52
55
|
|
|
53
56
|
|
|
54
|
-
def
|
|
57
|
+
def find_audio_files_for_run(audio_root: Path, folder_name: str) -> list[Path]:
|
|
58
|
+
"""Collect audio files across per-speaker subdirs whose date falls in the run window.
|
|
59
|
+
|
|
60
|
+
Expects filenames of the form {speaker}_{YYYYMMDD}.ext so the date can be
|
|
61
|
+
extracted from the stem suffix.
|
|
62
|
+
"""
|
|
63
|
+
parts = folder_name.split("-")
|
|
64
|
+
start_str, end_str = parts[0], parts[1]
|
|
65
|
+
|
|
55
66
|
files = []
|
|
56
|
-
|
|
57
|
-
files
|
|
67
|
+
if not audio_root.exists():
|
|
68
|
+
return files
|
|
69
|
+
for speaker_dir in sorted(audio_root.iterdir()):
|
|
70
|
+
if not speaker_dir.is_dir():
|
|
71
|
+
continue
|
|
72
|
+
for ext in SUPPORTED_AUDIO_EXTS:
|
|
73
|
+
for f in speaker_dir.glob(f"*{ext}"):
|
|
74
|
+
date_str = f.stem.split("_")[-1]
|
|
75
|
+
if len(date_str) == 8 and start_str <= date_str <= end_str:
|
|
76
|
+
files.append(f)
|
|
58
77
|
return sorted(files)
|
|
59
78
|
|
|
60
79
|
|
|
@@ -150,18 +169,14 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
150
169
|
from faster_whisper import WhisperModel
|
|
151
170
|
|
|
152
171
|
parent_folder = Path(config["parent_folder"])
|
|
153
|
-
|
|
172
|
+
audio_root = parent_folder / "audio"
|
|
154
173
|
transcript_dir = parent_folder / "transcripts" / folder_name
|
|
155
174
|
|
|
156
|
-
if not audio_dir.exists():
|
|
157
|
-
print(f"ERROR: Audio directory not found: {audio_dir}")
|
|
158
|
-
sys.exit(1)
|
|
159
|
-
|
|
160
175
|
transcript_dir.mkdir(parents=True, exist_ok=True)
|
|
161
176
|
|
|
162
|
-
audio_files =
|
|
177
|
+
audio_files = find_audio_files_for_run(audio_root, folder_name)
|
|
163
178
|
if not audio_files:
|
|
164
|
-
print(f"No audio files found in {
|
|
179
|
+
print(f"No audio files found in {audio_root} for run window {folder_name}")
|
|
165
180
|
return
|
|
166
181
|
|
|
167
182
|
model_name = config.get("whisper_model", "medium")
|
|
@@ -171,7 +186,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
171
186
|
print(f"Whisper model : {model_name} (faster-whisper / CTranslate2)")
|
|
172
187
|
print(f"Compute type : {compute_type}")
|
|
173
188
|
print(f"Language hint : {language}")
|
|
174
|
-
print(f"Audio
|
|
189
|
+
print(f"Audio root : {audio_root}")
|
|
175
190
|
print(f"Transcript dir : {transcript_dir}")
|
|
176
191
|
print(f"Files to process: {len(audio_files)}")
|
|
177
192
|
print(f"Max retries : {MAX_RETRIES}")
|