npm - stock-weekly-report - Versions diffs - 0.2.2 → 0.2.4 - Mend

stock-weekly-report 0.2.2 → 0.2.4

This diff shows the changes between two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Files changed (6)

package/fetch_episodes.py CHANGED Viewed

@@ -187,11 +187,6 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
             filename = f"{program_name}_{date_str}{ext}"
             dest = speaker_dir / filename
-            if dest.exists():
-                print(f"  SKIP (already downloaded): {filename}")
-                found += 1
-                continue
             print(f"  Downloading: {filename}")
             try:
                 download_file(audio_url, dest)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "stock-weekly-report",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "Stock weekly podcast report pipeline — CLI and MCP server",
   "bin": {
     "swr": "bin/swr.js",

package/pipeline.py CHANGED Viewed

@@ -253,26 +253,27 @@ def _cleanup_data_dir(data_root: Path, label: str,
         print(f"\n  Cleaned up {removed_folders} folder(s).")
-def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
-    """Delete audio files older than `months` from per-speaker subdirectories."""
+def _cleanup_by_speaker(data_root: Path, label: str,
+                        extensions: set[str], months: int) -> None:
+    """Delete files matching `extensions` from per-speaker subdirs older than `months`."""
     if months <= 0:
-        print(f"  Audio: retention = 0 (keep forever), skipping.")
+        print(f"  {label}: retention = 0 (keep forever), skipping.")
         return
-    banner(f"CLEANUP — Audio (keep {months} month{'s' if months != 1 else ''})")
+    banner(f"CLEANUP — {label} (keep {months} month{'s' if months != 1 else ''})")
-    if not audio_root.exists():
-        print(f"  Directory not found: {audio_root}")
+    if not data_root.exists():
+        print(f"  Directory not found: {data_root}")
         return
     cutoff = _cutoff_date(months)
     print(f"  Cutoff date : {cutoff}  (deleting files published before this date)")
     removed = 0
-    for speaker_dir in sorted(audio_root.iterdir()):
+    for speaker_dir in sorted(data_root.iterdir()):
         if not speaker_dir.is_dir():
             continue
-        for ext in SUPPORTED_AUDIO_EXTS:
+        for ext in extensions:
             for f in speaker_dir.glob(f"*{ext}"):
                 date_str = f.stem.split("_")[-1]
                 try:
@@ -281,16 +282,20 @@ def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
                     continue
                 if file_date < cutoff:
                     size_mb = f.stat().st_size / (1024 * 1024)
-                    print(f"  Deleting ({size_mb:.1f} MB): {speaker_dir.name}/{f.name}")
+                    print(f"  Deleting ({size_mb:.2f} MB): {speaker_dir.name}/{f.name}")
                     f.unlink()
                     removed += 1
     if removed == 0:
-        print("  No old audio files to remove.")
+        print("  No old files to remove.")
     else:
         print(f"  Removed {removed} file(s).")
+def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
+    _cleanup_by_speaker(audio_root, "Audio", SUPPORTED_AUDIO_EXTS, months)
 def cleanup_old_data(config: dict) -> bool:
     """Clean up old audio, transcript, and report files per retention config."""
     retention = config.get("retention", {})
@@ -300,7 +305,7 @@ def cleanup_old_data(config: dict) -> bool:
         parent / "audio",
         int(retention.get("audio_months", 3)),
     )
-    _cleanup_data_dir(
+    _cleanup_by_speaker(
         parent / "transcripts",
         "Transcripts",
         {".txt"},

package/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "stock-weekly-report"
-version = "0.2.2"
+version = "0.2.4"
 requires-python = ">=3.10"
 authors = [
     { name = "Chang Yu Chuan", email = "changyuchuanmicron@gmail.com" },

package/transcribe.py CHANGED Viewed

@@ -100,11 +100,13 @@ def verify_transcript(transcript_path: Path) -> tuple[bool, str]:
     return True, ""
-def verify_all(audio_files: list[Path], transcript_dir: Path) -> dict[str, tuple[bool, str]]:
+def verify_all(audio_files: list[Path], transcript_root: Path) -> dict[str, tuple[bool, str]]:
     """Verify every expected transcript and return a {filename: (ok, reason)} map."""
     results = {}
     for audio_file in audio_files:
-        transcript_path = transcript_dir / f"{audio_file.stem}.txt"
+        stem = audio_file.stem
+        speaker = stem.rsplit("_", 1)[0]
+        transcript_path = transcript_root / speaker / f"{stem}.txt"
         results[audio_file.name] = verify_transcript(transcript_path)
     return results
@@ -170,9 +172,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
     parent_folder = Path(config["parent_folder"])
     audio_root = parent_folder / "audio"
-    transcript_dir = parent_folder / "transcripts" / folder_name
-    transcript_dir.mkdir(parents=True, exist_ok=True)
+    transcript_root = parent_folder / "transcripts"
     audio_files = find_audio_files_for_run(audio_root, folder_name)
     if not audio_files:
@@ -187,7 +187,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
     print(f"Compute type    : {compute_type}")
     print(f"Language hint   : {language}")
     print(f"Audio root      : {audio_root}")
-    print(f"Transcript dir  : {transcript_dir}")
+    print(f"Transcript root : {transcript_root}")
     print(f"Files to process: {len(audio_files)}")
     print(f"Max retries     : {MAX_RETRIES}")
     print()
@@ -199,7 +199,11 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
     succeeded, skipped, failed = [], [], []
     for idx, audio_file in enumerate(audio_files, start=1):
-        transcript_path = transcript_dir / f"{audio_file.stem}.txt"
+        stem = audio_file.stem                          # e.g. "股癌_20260225"
+        speaker = stem.rsplit("_", 1)[0]               # e.g. "股癌"
+        speaker_transcript_dir = transcript_root / speaker
+        speaker_transcript_dir.mkdir(parents=True, exist_ok=True)
+        transcript_path = speaker_transcript_dir / f"{stem}.txt"
         label = f"[{idx}/{len(audio_files)}]"
         # Check if an existing transcript already passes verification
@@ -226,7 +230,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
     # ── Post-run verification ─────────────────────────────────────────────
     print("─" * 60)
     print("Verification pass …")
-    verification = verify_all(audio_files, transcript_dir)
+    verification = verify_all(audio_files, transcript_root)
     all_ok = True
     for audio_name, (ok, reason) in verification.items():

package/upload_to_notebooklm.py CHANGED Viewed

@@ -49,6 +49,24 @@ def default_folder_name(lookback_days: int) -> str:
     return f"{start.strftime('%Y%m%d')}-{today.strftime('%Y%m%d')}"
+def find_transcripts_for_run(transcript_root: Path, folder_name: str) -> list[Path]:
+    """Collect transcript files across per-speaker subdirs whose date falls in the run window."""
+    parts = folder_name.split("-")
+    start_str, end_str = parts[0], parts[1]
+    files = []
+    if not transcript_root.exists():
+        return files
+    for speaker_dir in sorted(transcript_root.iterdir()):
+        if not speaker_dir.is_dir():
+            continue
+        for f in speaker_dir.glob("*.txt"):
+            date_str = f.stem.split("_")[-1]
+            if len(date_str) == 8 and start_str <= date_str <= end_str:
+                files.append(f)
+    return sorted(files)
 # ---------------------------------------------------------------------------
 # nlm CLI wrappers
 # ---------------------------------------------------------------------------
@@ -148,19 +166,15 @@ def run(config: dict, folder_name: str) -> str:
     """Run the upload stage and return the notebook_id."""
     nlm_path = config.get("nlm_path", "nlm")
     parent_folder = Path(config["parent_folder"])
-    transcript_dir = parent_folder / "transcripts" / folder_name
+    transcript_root = parent_folder / "transcripts"
-    if not transcript_dir.exists():
-        print(f"ERROR: Transcript directory not found: {transcript_dir}")
+    txt_files = find_transcripts_for_run(transcript_root, folder_name)
+    if not txt_files:
+        print(f"No transcript files found in {transcript_root} for window {folder_name}")
         print("Run transcribe.py first to generate transcripts.")
         sys.exit(1)
-    txt_files = sorted(transcript_dir.glob("*.txt"))
-    if not txt_files:
-        print(f"No .txt files found in {transcript_dir}")
-        sys.exit(0)
-    print(f"Transcript folder : {transcript_dir}")
+    print(f"Transcript root   : {transcript_root}")
     print(f"Files to upload   : {len(txt_files)}")
     print()