stock-weekly-report 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/fetch_episodes.py CHANGED
@@ -6,7 +6,10 @@ Fetches RSS feeds from Soundon, finds episodes published within the
6
6
  configured lookback window, and downloads the audio files.
7
7
 
8
8
  Output structure:
9
- {parent_folder}/audio/{YYYYMMDD}-{YYYYMMDD}/{program_name}_{YYYYMMDD}.ext
9
+ {parent_folder}/audio/{program_name}/{program_name}_{YYYYMMDD}.ext
10
+
11
+ Each speaker has a persistent folder. Downloads are skipped if the file
12
+ already exists (checked by filename / date), so re-runs are safe.
10
13
  """
11
14
 
12
15
  import os
@@ -144,11 +147,10 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
144
147
  end_date = date(int(parts[1][:4]), int(parts[1][4:6]), int(parts[1][6:8]))
145
148
 
146
149
  parent_folder = Path(config["parent_folder"])
147
- audio_dir = parent_folder / "audio" / run_folder
148
- audio_dir.mkdir(parents=True, exist_ok=True)
150
+ audio_root = parent_folder / "audio"
149
151
 
150
152
  print(f"Date range : {start_date} → {end_date}")
151
- print(f"Audio folder: {audio_dir}")
153
+ print(f"Audio root : {audio_root}")
152
154
  print()
153
155
 
154
156
  total_downloaded = 0
@@ -157,6 +159,9 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
157
159
  program_name = feed_cfg["name"]
158
160
  feed_url = feed_cfg["url"]
159
161
 
162
+ speaker_dir = audio_root / program_name
163
+ speaker_dir.mkdir(parents=True, exist_ok=True)
164
+
160
165
  print(f"[{program_name}] Fetching feed …")
161
166
  parsed = feedparser.parse(feed_url)
162
167
 
@@ -180,7 +185,7 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
180
185
  ext = url_extension(audio_url)
181
186
  date_str = pub_date.strftime("%Y%m%d")
182
187
  filename = f"{program_name}_{date_str}{ext}"
183
- dest = audio_dir / filename
188
+ dest = speaker_dir / filename
184
189
 
185
190
  if dest.exists():
186
191
  print(f" SKIP (already downloaded): {filename}")
@@ -203,7 +208,7 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
203
208
  print(f" No new episodes in the past {lookback_days} days.")
204
209
  print()
205
210
 
206
- print(f"Done. {total_downloaded} file(s) newly downloaded → {audio_dir}")
211
+ print(f"Done. {total_downloaded} file(s) newly downloaded → {audio_root}")
207
212
 
208
213
 
209
214
  def run(config_path: str = "config.yaml") -> None:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "stock-weekly-report",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "description": "Stock weekly podcast report pipeline — CLI and MCP server",
5
5
  "bin": {
6
6
  "swr": "bin/swr.js",
package/pipeline.py CHANGED
@@ -79,15 +79,24 @@ def validate_audio_files(config: dict, folder_name: str) -> bool:
79
79
  - Returns False (abort) only when zero usable audio files remain after cleanup.
80
80
  """
81
81
  banner("GUARD — Audio File Integrity")
82
- audio_dir = Path(config["parent_folder"]) / "audio" / folder_name
82
+ audio_root = Path(config["parent_folder"]) / "audio"
83
+ parts = folder_name.split("-")
84
+ start_str, end_str = parts[0], parts[1]
83
85
 
84
86
  audio_files = []
85
- for ext in SUPPORTED_AUDIO_EXTS:
86
- audio_files.extend(audio_dir.glob(f"*{ext}"))
87
+ if audio_root.exists():
88
+ for speaker_dir in sorted(audio_root.iterdir()):
89
+ if not speaker_dir.is_dir():
90
+ continue
91
+ for ext in SUPPORTED_AUDIO_EXTS:
92
+ for f in speaker_dir.glob(f"*{ext}"):
93
+ date_str = f.stem.split("_")[-1]
94
+ if len(date_str) == 8 and start_str <= date_str <= end_str:
95
+ audio_files.append(f)
87
96
  audio_files = sorted(audio_files)
88
97
 
89
98
  if not audio_files:
90
- print(f" ERROR: No audio files found in {audio_dir}")
99
+ print(f" ERROR: No audio files found in {audio_root}")
91
100
  return False
92
101
 
93
102
  usable = 0
@@ -244,15 +253,51 @@ def _cleanup_data_dir(data_root: Path, label: str,
244
253
  print(f"\n Cleaned up {removed_folders} folder(s).")
245
254
 
246
255
 
256
+ def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
257
+ """Delete audio files older than `months` from per-speaker subdirectories."""
258
+ if months <= 0:
259
+ print(f" Audio: retention = 0 (keep forever), skipping.")
260
+ return
261
+
262
+ banner(f"CLEANUP — Audio (keep {months} month{'s' if months != 1 else ''})")
263
+
264
+ if not audio_root.exists():
265
+ print(f" Directory not found: {audio_root}")
266
+ return
267
+
268
+ cutoff = _cutoff_date(months)
269
+ print(f" Cutoff date : {cutoff} (deleting files published before this date)")
270
+
271
+ removed = 0
272
+ for speaker_dir in sorted(audio_root.iterdir()):
273
+ if not speaker_dir.is_dir():
274
+ continue
275
+ for ext in SUPPORTED_AUDIO_EXTS:
276
+ for f in speaker_dir.glob(f"*{ext}"):
277
+ date_str = f.stem.split("_")[-1]
278
+ try:
279
+ file_date = datetime.strptime(date_str, "%Y%m%d").date()
280
+ except ValueError:
281
+ continue
282
+ if file_date < cutoff:
283
+ size_mb = f.stat().st_size / (1024 * 1024)
284
+ print(f" Deleting ({size_mb:.1f} MB): {speaker_dir.name}/{f.name}")
285
+ f.unlink()
286
+ removed += 1
287
+
288
+ if removed == 0:
289
+ print(" No old audio files to remove.")
290
+ else:
291
+ print(f" Removed {removed} file(s).")
292
+
293
+
247
294
  def cleanup_old_data(config: dict) -> bool:
248
295
  """Clean up old audio, transcript, and report files per retention config."""
249
296
  retention = config.get("retention", {})
250
297
  parent = Path(config["parent_folder"])
251
298
 
252
- _cleanup_data_dir(
299
+ _cleanup_audio_by_speaker(
253
300
  parent / "audio",
254
- "Audio",
255
- SUPPORTED_AUDIO_EXTS,
256
301
  int(retention.get("audio_months", 3)),
257
302
  )
258
303
  _cleanup_data_dir(
@@ -340,7 +385,7 @@ def main() -> None:
340
385
  results["fetch"] = "skipped"
341
386
 
342
387
  # ── Guard: Audio integrity ───────────────────────────────────────
343
- if not args.skip_fetch and not args.skip_transcribe:
388
+ if not args.skip_transcribe:
344
389
  ok = validate_audio_files(config, folder_name)
345
390
  results["audio_check"] = ok
346
391
  if not ok:
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "stock-weekly-report"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  requires-python = ">=3.10"
9
9
  authors = [
10
10
  { name = "Chang Yu Chuan", email = "changyuchuanmicron@gmail.com" },
package/send_report.py CHANGED
@@ -439,7 +439,7 @@ def save_report(config: dict, folder_name: str, body: str) -> Path:
439
439
  """Save the report text to disk and return the file path."""
440
440
  report_dir = Path(config["parent_folder"]) / "reports" / folder_name
441
441
  report_dir.mkdir(parents=True, exist_ok=True)
442
- report_path = report_dir / "weekly_report.txt"
442
+ report_path = report_dir / f"weekly_report_{folder_name}.txt"
443
443
  report_path.write_text(body, encoding="utf-8")
444
444
  print(f" Report saved to: {report_path}")
445
445
  return report_path
package/transcribe.py CHANGED
@@ -15,6 +15,9 @@ Usage:
15
15
  python transcribe.py --folder 20260218-20260225
16
16
  python transcribe.py --config my_config.yaml --folder 20260218-20260225
17
17
 
18
+ Input structure:
19
+ {parent_folder}/audio/{program_name}/{program_name}_{YYYYMMDD}.ext
20
+
18
21
  Output structure:
19
22
  {parent_folder}/transcripts/{YYYYMMDD}-{YYYYMMDD}/{stem}.txt
20
23
  """
@@ -51,10 +54,26 @@ def default_folder_name(lookback_days: int) -> str:
51
54
  return f"{start.strftime('%Y%m%d')}-{today.strftime('%Y%m%d')}"
52
55
 
53
56
 
54
- def find_audio_files(audio_dir: Path) -> list[Path]:
57
+ def find_audio_files_for_run(audio_root: Path, folder_name: str) -> list[Path]:
58
+ """Collect audio files across per-speaker subdirs whose date falls in the run window.
59
+
60
+ Expects filenames of the form {speaker}_{YYYYMMDD}.ext so the date can be
61
+ extracted from the stem suffix.
62
+ """
63
+ parts = folder_name.split("-")
64
+ start_str, end_str = parts[0], parts[1]
65
+
55
66
  files = []
56
- for ext in SUPPORTED_AUDIO_EXTS:
57
- files.extend(audio_dir.glob(f"*{ext}"))
67
+ if not audio_root.exists():
68
+ return files
69
+ for speaker_dir in sorted(audio_root.iterdir()):
70
+ if not speaker_dir.is_dir():
71
+ continue
72
+ for ext in SUPPORTED_AUDIO_EXTS:
73
+ for f in speaker_dir.glob(f"*{ext}"):
74
+ date_str = f.stem.split("_")[-1]
75
+ if len(date_str) == 8 and start_str <= date_str <= end_str:
76
+ files.append(f)
58
77
  return sorted(files)
59
78
 
60
79
 
@@ -150,18 +169,14 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
150
169
  from faster_whisper import WhisperModel
151
170
 
152
171
  parent_folder = Path(config["parent_folder"])
153
- audio_dir = parent_folder / "audio" / folder_name
172
+ audio_root = parent_folder / "audio"
154
173
  transcript_dir = parent_folder / "transcripts" / folder_name
155
174
 
156
- if not audio_dir.exists():
157
- print(f"ERROR: Audio directory not found: {audio_dir}")
158
- sys.exit(1)
159
-
160
175
  transcript_dir.mkdir(parents=True, exist_ok=True)
161
176
 
162
- audio_files = find_audio_files(audio_dir)
177
+ audio_files = find_audio_files_for_run(audio_root, folder_name)
163
178
  if not audio_files:
164
- print(f"No audio files found in {audio_dir}")
179
+ print(f"No audio files found in {audio_root} for run window {folder_name}")
165
180
  return
166
181
 
167
182
  model_name = config.get("whisper_model", "medium")
@@ -171,7 +186,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
171
186
  print(f"Whisper model : {model_name} (faster-whisper / CTranslate2)")
172
187
  print(f"Compute type : {compute_type}")
173
188
  print(f"Language hint : {language}")
174
- print(f"Audio folder : {audio_dir}")
189
+ print(f"Audio root : {audio_root}")
175
190
  print(f"Transcript dir : {transcript_dir}")
176
191
  print(f"Files to process: {len(audio_files)}")
177
192
  print(f"Max retries : {MAX_RETRIES}")