stock-weekly-report 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/fetch_episodes.py +0 -5
- package/package.json +1 -1
- package/pipeline.py +16 -11
- package/pyproject.toml +1 -1
- package/transcribe.py +12 -8
- package/upload_to_notebooklm.py +23 -9
package/fetch_episodes.py
CHANGED
|
@@ -187,11 +187,6 @@ def fetch_and_download(config: dict, folder_name: str | None = None) -> None:
|
|
|
187
187
|
filename = f"{program_name}_{date_str}{ext}"
|
|
188
188
|
dest = speaker_dir / filename
|
|
189
189
|
|
|
190
|
-
if dest.exists():
|
|
191
|
-
print(f" SKIP (already downloaded): {filename}")
|
|
192
|
-
found += 1
|
|
193
|
-
continue
|
|
194
|
-
|
|
195
190
|
print(f" Downloading: {filename}")
|
|
196
191
|
try:
|
|
197
192
|
download_file(audio_url, dest)
|
package/package.json
CHANGED
package/pipeline.py
CHANGED
|
@@ -253,26 +253,27 @@ def _cleanup_data_dir(data_root: Path, label: str,
|
|
|
253
253
|
print(f"\n Cleaned up {removed_folders} folder(s).")
|
|
254
254
|
|
|
255
255
|
|
|
256
|
-
def
|
|
257
|
-
|
|
256
|
+
def _cleanup_by_speaker(data_root: Path, label: str,
|
|
257
|
+
extensions: set[str], months: int) -> None:
|
|
258
|
+
"""Delete files matching `extensions` from per-speaker subdirs older than `months`."""
|
|
258
259
|
if months <= 0:
|
|
259
|
-
print(f"
|
|
260
|
+
print(f" {label}: retention = 0 (keep forever), skipping.")
|
|
260
261
|
return
|
|
261
262
|
|
|
262
|
-
banner(f"CLEANUP —
|
|
263
|
+
banner(f"CLEANUP — {label} (keep {months} month{'s' if months != 1 else ''})")
|
|
263
264
|
|
|
264
|
-
if not
|
|
265
|
-
print(f" Directory not found: {
|
|
265
|
+
if not data_root.exists():
|
|
266
|
+
print(f" Directory not found: {data_root}")
|
|
266
267
|
return
|
|
267
268
|
|
|
268
269
|
cutoff = _cutoff_date(months)
|
|
269
270
|
print(f" Cutoff date : {cutoff} (deleting files published before this date)")
|
|
270
271
|
|
|
271
272
|
removed = 0
|
|
272
|
-
for speaker_dir in sorted(
|
|
273
|
+
for speaker_dir in sorted(data_root.iterdir()):
|
|
273
274
|
if not speaker_dir.is_dir():
|
|
274
275
|
continue
|
|
275
|
-
for ext in
|
|
276
|
+
for ext in extensions:
|
|
276
277
|
for f in speaker_dir.glob(f"*{ext}"):
|
|
277
278
|
date_str = f.stem.split("_")[-1]
|
|
278
279
|
try:
|
|
@@ -281,16 +282,20 @@ def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
|
|
|
281
282
|
continue
|
|
282
283
|
if file_date < cutoff:
|
|
283
284
|
size_mb = f.stat().st_size / (1024 * 1024)
|
|
284
|
-
print(f" Deleting ({size_mb:.
|
|
285
|
+
print(f" Deleting ({size_mb:.2f} MB): {speaker_dir.name}/{f.name}")
|
|
285
286
|
f.unlink()
|
|
286
287
|
removed += 1
|
|
287
288
|
|
|
288
289
|
if removed == 0:
|
|
289
|
-
print(" No old
|
|
290
|
+
print(" No old files to remove.")
|
|
290
291
|
else:
|
|
291
292
|
print(f" Removed {removed} file(s).")
|
|
292
293
|
|
|
293
294
|
|
|
295
|
+
def _cleanup_audio_by_speaker(audio_root: Path, months: int) -> None:
    """Apply the retention window to the per-speaker audio folders.

    Thin wrapper that delegates to ``_cleanup_by_speaker`` using the
    "Audio" label and the ``SUPPORTED_AUDIO_EXTS`` extension set.

    Args:
        audio_root: Directory containing one sub-directory per speaker.
        months: Retention window in months, forwarded unchanged.
    """
    _cleanup_by_speaker(
        data_root=audio_root,
        label="Audio",
        extensions=SUPPORTED_AUDIO_EXTS,
        months=months,
    )
|
|
297
|
+
|
|
298
|
+
|
|
294
299
|
def cleanup_old_data(config: dict) -> bool:
|
|
295
300
|
"""Clean up old audio, transcript, and report files per retention config."""
|
|
296
301
|
retention = config.get("retention", {})
|
|
@@ -300,7 +305,7 @@ def cleanup_old_data(config: dict) -> bool:
|
|
|
300
305
|
parent / "audio",
|
|
301
306
|
int(retention.get("audio_months", 3)),
|
|
302
307
|
)
|
|
303
|
-
|
|
308
|
+
_cleanup_by_speaker(
|
|
304
309
|
parent / "transcripts",
|
|
305
310
|
"Transcripts",
|
|
306
311
|
{".txt"},
|
package/pyproject.toml
CHANGED
package/transcribe.py
CHANGED
|
@@ -100,11 +100,13 @@ def verify_transcript(transcript_path: Path) -> tuple[bool, str]:
|
|
|
100
100
|
return True, ""
|
|
101
101
|
|
|
102
102
|
|
|
103
|
-
def verify_all(audio_files: list[Path],
|
|
103
|
+
def verify_all(audio_files: list[Path], transcript_root: Path) -> dict[str, tuple[bool, str]]:
    """Verify the transcript expected for each audio file.

    The expected transcript for ``<speaker>_<date>.<ext>`` is
    ``transcript_root/<speaker>/<speaker>_<date>.txt``; the speaker is the
    part of the stem before its last underscore (the whole stem when there
    is no underscore).

    Args:
        audio_files: Audio files whose transcripts should be checked.
        transcript_root: Directory holding one sub-directory per speaker.

    Returns:
        A ``{audio filename: (ok, reason)}`` map with the result of
        ``verify_transcript`` for each expected transcript path.
    """

    def _expected_transcript(audio: Path) -> Path:
        # Mirror the naming scheme: the speaker directory is the stem minus its date suffix.
        base = audio.stem
        speaker_name = base.rsplit("_", 1)[0]
        return transcript_root / speaker_name / f"{base}.txt"

    return {
        audio.name: verify_transcript(_expected_transcript(audio))
        for audio in audio_files
    }
|
|
110
112
|
|
|
@@ -170,9 +172,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
170
172
|
|
|
171
173
|
parent_folder = Path(config["parent_folder"])
|
|
172
174
|
audio_root = parent_folder / "audio"
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
transcript_dir.mkdir(parents=True, exist_ok=True)
|
|
175
|
+
transcript_root = parent_folder / "transcripts"
|
|
176
176
|
|
|
177
177
|
audio_files = find_audio_files_for_run(audio_root, folder_name)
|
|
178
178
|
if not audio_files:
|
|
@@ -187,7 +187,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
187
187
|
print(f"Compute type : {compute_type}")
|
|
188
188
|
print(f"Language hint : {language}")
|
|
189
189
|
print(f"Audio root : {audio_root}")
|
|
190
|
-
print(f"Transcript
|
|
190
|
+
print(f"Transcript root : {transcript_root}")
|
|
191
191
|
print(f"Files to process: {len(audio_files)}")
|
|
192
192
|
print(f"Max retries : {MAX_RETRIES}")
|
|
193
193
|
print()
|
|
@@ -199,7 +199,11 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
199
199
|
succeeded, skipped, failed = [], [], []
|
|
200
200
|
|
|
201
201
|
for idx, audio_file in enumerate(audio_files, start=1):
|
|
202
|
-
|
|
202
|
+
stem = audio_file.stem # e.g. "股癌_20260225"
|
|
203
|
+
speaker = stem.rsplit("_", 1)[0] # e.g. "股癌"
|
|
204
|
+
speaker_transcript_dir = transcript_root / speaker
|
|
205
|
+
speaker_transcript_dir.mkdir(parents=True, exist_ok=True)
|
|
206
|
+
transcript_path = speaker_transcript_dir / f"{stem}.txt"
|
|
203
207
|
label = f"[{idx}/{len(audio_files)}]"
|
|
204
208
|
|
|
205
209
|
# Check if an existing transcript already passes verification
|
|
@@ -226,7 +230,7 @@ def transcribe_folder(config: dict, folder_name: str) -> None:
|
|
|
226
230
|
# ── Post-run verification ─────────────────────────────────────────────
|
|
227
231
|
print("─" * 60)
|
|
228
232
|
print("Verification pass …")
|
|
229
|
-
verification = verify_all(audio_files,
|
|
233
|
+
verification = verify_all(audio_files, transcript_root)
|
|
230
234
|
|
|
231
235
|
all_ok = True
|
|
232
236
|
for audio_name, (ok, reason) in verification.items():
|
package/upload_to_notebooklm.py
CHANGED
|
@@ -49,6 +49,24 @@ def default_folder_name(lookback_days: int) -> str:
|
|
|
49
49
|
return f"{start.strftime('%Y%m%d')}-{today.strftime('%Y%m%d')}"
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
def find_transcripts_for_run(transcript_root: Path, folder_name: str) -> list[Path]:
    """Collect transcripts from per-speaker subdirs whose date is in the run window.

    A transcript qualifies when the part of its stem after the last
    underscore is an 8-digit stamp that lies, inclusively, between the two
    bounds encoded in ``folder_name``. Stamps are compared as strings, which
    orders ``YYYYMMDD`` values chronologically.

    Args:
        transcript_root: Directory holding one sub-directory per speaker.
            ``.txt`` files placed directly in this directory are ignored.
        folder_name: Run window written as ``<start>-<end>``, for example
            ``"20260220-20260301"``.

    Returns:
        The matching transcript paths, sorted. Empty when ``transcript_root``
        is missing or is not a directory.

    Raises:
        IndexError: If ``folder_name`` contains no hyphen.
    """
    parts = folder_name.split("-")
    start_str, end_str = parts[0], parts[1]

    # is_dir() also covers the "path exists but is a regular file" case,
    # where iterdir() would raise NotADirectoryError.
    if not transcript_root.is_dir():
        return []

    matches: list[Path] = []
    for speaker_dir in sorted(transcript_root.iterdir()):
        if not speaker_dir.is_dir():
            continue
        for transcript in speaker_dir.glob("*.txt"):
            date_str = transcript.stem.split("_")[-1]
            # Require eight ASCII digits: a length check alone lets suffixes
            # such as "2026022x" slip through the string range comparison.
            is_date_stamp = (
                len(date_str) == 8 and date_str.isascii() and date_str.isdigit()
            )
            if is_date_stamp and start_str <= date_str <= end_str:
                matches.append(transcript)
    return sorted(matches)
|
|
68
|
+
|
|
69
|
+
|
|
52
70
|
# ---------------------------------------------------------------------------
|
|
53
71
|
# nlm CLI wrappers
|
|
54
72
|
# ---------------------------------------------------------------------------
|
|
@@ -148,19 +166,15 @@ def run(config: dict, folder_name: str) -> str:
|
|
|
148
166
|
"""Run the upload stage and return the notebook_id."""
|
|
149
167
|
nlm_path = config.get("nlm_path", "nlm")
|
|
150
168
|
parent_folder = Path(config["parent_folder"])
|
|
151
|
-
|
|
169
|
+
transcript_root = parent_folder / "transcripts"
|
|
152
170
|
|
|
153
|
-
|
|
154
|
-
|
|
171
|
+
txt_files = find_transcripts_for_run(transcript_root, folder_name)
|
|
172
|
+
if not txt_files:
|
|
173
|
+
print(f"No transcript files found in {transcript_root} for window {folder_name}")
|
|
155
174
|
print("Run transcribe.py first to generate transcripts.")
|
|
156
175
|
sys.exit(1)
|
|
157
176
|
|
|
158
|
-
|
|
159
|
-
if not txt_files:
|
|
160
|
-
print(f"No .txt files found in {transcript_dir}")
|
|
161
|
-
sys.exit(0)
|
|
162
|
-
|
|
163
|
-
print(f"Transcript folder : {transcript_dir}")
|
|
177
|
+
print(f"Transcript root : {transcript_root}")
|
|
164
178
|
print(f"Files to upload : {len(txt_files)}")
|
|
165
179
|
print()
|
|
166
180
|
|