stock-weekly-report 0.2.2 → 0.2.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/fetch_episodes.py CHANGED
@@ -187,11 +187,6 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
187
187
  filename = f"{program_name}_{date_str}{ext}"
188
188
  dest = speaker_dir / filename
189
189
 
190
- if dest.exists():
191
- print(f" SKIP (already downloaded): {filename}")
192
- found += 1
193
- continue
194
-
195
190
  print(f" Downloading: {filename}")
196
191
  try:
197
192
  download_file(audio_url, dest)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "stock-weekly-report",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "description": "Stock weekly podcast report pipeline — CLI and MCP server",
5
5
  "bin": {
6
6
  "swr": "bin/swr.js",
package/pipeline.py CHANGED
@@ -253,26 +253,27 @@ def _cleanup_data_dir(data_root: Path, label: str,
253
253
  print(f"\n Cleaned up {removed_folders} folder(s).")
254
254
 
255
255
 
256
- def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
257
- """Delete audio files older than `months` from per-speaker subdirectories."""
256
+ def _cleanup_by_speaker(data_root: Path, label: str,
257
+ extensions: set[str], months: int) -> None:
258
+ """Delete files matching `extensions` from per-speaker subdirs older than `months`."""
258
259
  if months <= 0:
259
- print(f" Audio: retention = 0 (keep forever), skipping.")
260
+ print(f" {label}: retention = 0 (keep forever), skipping.")
260
261
  return
261
262
 
262
- banner(f"CLEANUP — Audio (keep {months} month{'s' if months != 1 else ''})")
263
+ banner(f"CLEANUP — {label} (keep {months} month{'s' if months != 1 else ''})")
263
264
 
264
- if not audio_root.exists():
265
- print(f" Directory not found: {audio_root}")
265
+ if not data_root.exists():
266
+ print(f" Directory not found: {data_root}")
266
267
  return
267
268
 
268
269
  cutoff = _cutoff_date(months)
269
270
  print(f" Cutoff date : {cutoff} (deleting files published before this date)")
270
271
 
271
272
  removed = 0
272
- for speaker_dir in sorted(audio_root.iterdir()):
273
+ for speaker_dir in sorted(data_root.iterdir()):
273
274
  if not speaker_dir.is_dir():
274
275
  continue
275
- for ext in SUPPORTED_AUDIO_EXTS:
276
+ for ext in extensions:
276
277
  for f in speaker_dir.glob(f"*{ext}"):
277
278
  date_str = f.stem.split("_")[-1]
278
279
  try:
@@ -281,16 +282,20 @@ def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
281
282
  continue
282
283
  if file_date < cutoff:
283
284
  size_mb = f.stat().st_size / (1024 * 1024)
284
- print(f" Deleting ({size_mb:.1f} MB): {speaker_dir.name}/{f.name}")
285
+ print(f" Deleting ({size_mb:.2f} MB): {speaker_dir.name}/{f.name}")
285
286
  f.unlink()
286
287
  removed += 1
287
288
 
288
289
  if removed == 0:
289
- print(" No old audio files to remove.")
290
+ print(" No old files to remove.")
290
291
  else:
291
292
  print(f" Removed {removed} file(s).")
292
293
 
293
294
 
295
+ def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
296
+ _cleanup_by_speaker(audio_root, "Audio", SUPPORTED_AUDIO_EXTS, months)
297
+
298
+
294
299
  def cleanup_old_data(config: dict) -> bool:
295
300
  """Clean up old audio, transcript, and report files per retention config."""
296
301
  retention = config.get("retention", {})
@@ -300,7 +305,7 @@ def cleanup_old_data(config: dict) -> bool:
300
305
  parent / "audio",
301
306
  int(retention.get("audio_months", 3)),
302
307
  )
303
- _cleanup_data_dir(
308
+ _cleanup_by_speaker(
304
309
  parent / "transcripts",
305
310
  "Transcripts",
306
311
  {".txt"},
package/pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "stock-weekly-report"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  requires-python = ">=3.10"
9
9
  authors = [
10
10
  { name = "Chang Yu Chuan", email = "changyuchuanmicron@gmail.com" },
package/transcribe.py CHANGED
@@ -100,11 +100,13 @@ def verify_transcript(transcript_path: Path) -> tuple[bool, str]:
100
100
  return True, ""
101
101
 
102
102
 
103
- def verify_all(audio_files: list[Path], transcript_dir: Path) -> dict[str, tuple[bool, str]]:
103
+ def verify_all(audio_files: list[Path], transcript_root: Path) -> dict[str, tuple[bool, str]]:
104
104
  """Verify every expected transcript and return a {filename: (ok, reason)} map."""
105
105
  results = {}
106
106
  for audio_file in audio_files:
107
- transcript_path = transcript_dir / f"{audio_file.stem}.txt"
107
+ stem = audio_file.stem
108
+ speaker = stem.rsplit("_", 1)[0]
109
+ transcript_path = transcript_root / speaker / f"{stem}.txt"
108
110
  results[audio_file.name] = verify_transcript(transcript_path)
109
111
  return results
110
112
 
@@ -170,9 +172,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
170
172
 
171
173
  parent_folder = Path(config["parent_folder"])
172
174
  audio_root = parent_folder / "audio"
173
- transcript_dir = parent_folder / "transcripts" / folder_name
174
-
175
- transcript_dir.mkdir(parents=True, exist_ok=True)
175
+ transcript_root = parent_folder / "transcripts"
176
176
 
177
177
  audio_files = find_audio_files_for_run(audio_root, folder_name)
178
178
  if not audio_files:
@@ -187,7 +187,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
187
187
  print(f"Compute type : {compute_type}")
188
188
  print(f"Language hint : {language}")
189
189
  print(f"Audio root : {audio_root}")
190
- print(f"Transcript dir : {transcript_dir}")
190
+ print(f"Transcript root : {transcript_root}")
191
191
  print(f"Files to process: {len(audio_files)}")
192
192
  print(f"Max retries : {MAX_RETRIES}")
193
193
  print()
@@ -199,7 +199,11 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
199
199
  succeeded, skipped, failed = [], [], []
200
200
 
201
201
  for idx, audio_file in enumerate(audio_files, start=1):
202
- transcript_path = transcript_dir / f"{audio_file.stem}.txt"
202
+ stem = audio_file.stem # e.g. "股癌_20260225"
203
+ speaker = stem.rsplit("_", 1)[0] # e.g. "股癌"
204
+ speaker_transcript_dir = transcript_root / speaker
205
+ speaker_transcript_dir.mkdir(parents=True, exist_ok=True)
206
+ transcript_path = speaker_transcript_dir / f"{stem}.txt"
203
207
  label = f"[{idx}/{len(audio_files)}]"
204
208
 
205
209
  # Check if an existing transcript already passes verification
@@ -226,7 +230,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
226
230
  # ── Post-run verification ─────────────────────────────────────────────
227
231
  print("─" * 60)
228
232
  print("Verification pass …")
229
- verification = verify_all(audio_files, transcript_dir)
233
+ verification = verify_all(audio_files, transcript_root)
230
234
 
231
235
  all_ok = True
232
236
  for audio_name, (ok, reason) in verification.items():
@@ -49,6 +49,24 @@ def default_folder_name(lookback_days: int) -> str:
49
49
  return f"{start.strftime('%Y%m%d')}-{today.strftime('%Y%m%d')}"
50
50
 
51
51
 
52
+ def find_transcripts_for_run(transcript_root: Path, folder_name: str) -> list[Path]:
53
+ """Collect transcript files across per-speaker subdirs whose date falls in the run window."""
54
+ parts = folder_name.split("-")
55
+ start_str, end_str = parts[0], parts[1]
56
+
57
+ files = []
58
+ if not transcript_root.exists():
59
+ return files
60
+ for speaker_dir in sorted(transcript_root.iterdir()):
61
+ if not speaker_dir.is_dir():
62
+ continue
63
+ for f in speaker_dir.glob("*.txt"):
64
+ date_str = f.stem.split("_")[-1]
65
+ if len(date_str) == 8 and start_str <= date_str <= end_str:
66
+ files.append(f)
67
+ return sorted(files)
68
+
69
+
52
70
  # ---------------------------------------------------------------------------
53
71
  # nlm CLI wrappers
54
72
  # ---------------------------------------------------------------------------
@@ -148,19 +166,15 @@ def run(config: dict, folder_name: str) -> str:
148
166
  """Run the upload stage and return the notebook_id."""
149
167
  nlm_path = config.get("nlm_path", "nlm")
150
168
  parent_folder = Path(config["parent_folder"])
151
- transcript_dir = parent_folder / "transcripts" / folder_name
169
+ transcript_root = parent_folder / "transcripts"
152
170
 
153
- if not transcript_dir.exists():
154
- print(f"ERROR: Transcript directory not found: {transcript_dir}")
171
+ txt_files = find_transcripts_for_run(transcript_root, folder_name)
172
+ if not txt_files:
173
+ print(f"No transcript files found in {transcript_root} for window {folder_name}")
155
174
  print("Run transcribe.py first to generate transcripts.")
156
175
  sys.exit(1)
157
176
 
158
- txt_files = sorted(transcript_dir.glob("*.txt"))
159
- if not txt_files:
160
- print(f"No .txt files found in {transcript_dir}")
161
- sys.exit(0)
162
-
163
- print(f"Transcript folder : {transcript_dir}")
177
+ print(f"Transcript root : {transcript_root}")
164
178
  print(f"Files to upload : {len(txt_files)}")
165
179
  print()
166
180