any2summary-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
any2summary/cli.py ADDED
@@ -0,0 +1,3488 @@
1
+ """Command line interface for extracting structured YouTube transcripts.
2
+
3
+ The module provides a CLI entry point `run` that accepts a YouTube URL and
4
+ optionally uses Azure OpenAI diarization to enrich transcripts with speaker
5
+ labels. The default behaviour fetches caption segments with timestamps and
6
+ emits JSON to stdout.
7
+ """
8
+
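+ # Illustrative invocation (assumes the package exposes an `any2summary`
+ # console script wired to `run`; the video id is a placeholder):
+ #
+ #   any2summary --url "https://www.youtube.com/watch?v=<VIDEO_ID>" \
+ #       --language en --fallback-language zh-Hans --azure-summary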
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import concurrent.futures
13
+ import contextlib
14
+ import base64
+ import copy
16
+ import hashlib
17
+ import io
18
+ import json
19
+ import math
20
+ import os
21
+ import re
22
+ import shutil
23
+ import subprocess
24
+ import sys
25
+ import tempfile
26
+ import wave
27
+ from datetime import datetime
28
+ from html.parser import HTMLParser
29
+ from collections import defaultdict
30
+ from importlib import metadata
31
+ from pathlib import Path
32
+ from typing import (
33
+ Any,
34
+ Callable,
35
+ Dict,
36
+ Iterable,
+ Iterator,
37
+ List,
38
+ Mapping,
39
+ MutableMapping,
40
+ Optional,
41
+ Sequence,
42
+ Tuple,
43
+ Union,
44
+ )
45
+ from urllib.parse import ParseResult, parse_qs, urljoin, urlparse
46
+
47
+
48
+ DEFAULT_YTDLP_USER_AGENT = (
49
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
50
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
51
+ "Chrome/123.0.0.0 Safari/537.36"
52
+ )
53
+ ANDROID_YTDLP_USER_AGENT = (
54
+ "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) "
55
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
56
+ "Chrome/123.0.0.0 Mobile Safari/537.36"
57
+ )
58
+
59
+ AZURE_AUDIO_LIMIT_SECONDS = 1500.0
60
+ MAX_WAV_DURATION_SECONDS = AZURE_AUDIO_LIMIT_SECONDS
61
+ MAX_WAV_SIZE_BYTES = 100 * 1024 * 1024
62
+ AUDIO_SEGMENT_SECONDS = 1400.0
63
+ WAV_FRAME_CHUNK_SIZE = 32_768
64
+ ESTIMATED_TOKENS_PER_SECOND = 4.0
65
+ PROGRESS_BAR_WIDTH = 30
66
+ READING_WORDS_PER_MINUTE = 300
67
+ DEFAULT_OUTBOX_DIR = (
68
+ "/Users/clzhang/Library/Mobile Documents/"
69
+ "iCloud~md~obsidian/Documents/Obsidian Vault/010 outbox"
70
+ )
71
+ PROMPTS_ROOT = Path(__file__).resolve().parents[1] / "prompts"
72
+ DEFAULT_SUMMARY_PROMPT_PATH = PROMPTS_ROOT / "summary_prompt.txt"
73
+ DEFAULT_ARTICLE_PROMPT_PATH = PROMPTS_ROOT / "article_prompt.txt"
74
+
75
+ def _getenv(*keys: str) -> Optional[str]:
76
+ """Return the first defined environment variable among keys."""
77
+
78
+ for key in keys:
79
+ value = os.getenv(key)
80
+ if value is not None:
81
+ return value
82
+ return None
83
+
84
+
85
+ try:
86
+ __CLI_VERSION = metadata.version("any2summary")
87
+ except metadata.PackageNotFoundError:
88
+ __CLI_VERSION = "0.0.0"
89
+
90
+
91
+ _TIME_START_KEYS = (
92
+ "start",
93
+ "start_time",
94
+ "startTime",
95
+ "start_seconds",
96
+ "start_second",
97
+ "offset",
98
+ "begin",
99
+ "from",
100
+ "start_time_ms",
101
+ "startTimeMs",
102
+ "start_timeMillis",
103
+ "start_ms",
104
+ "startMillis",
105
+ "startMilliseconds",
106
+ "startMs",
107
+ )
108
+ _TIME_END_KEYS = (
109
+ "end",
110
+ "end_time",
111
+ "endTime",
112
+ "end_seconds",
113
+ "end_second",
114
+ "stop",
115
+ "to",
116
+ "finish",
117
+ "offset_end",
118
+ "end_time_ms",
119
+ "endTimeMs",
120
+ "end_timeMillis",
121
+ "end_ms",
122
+ "endMillis",
123
+ "endMilliseconds",
124
+ "endMs",
125
+ )
126
+ _TIME_DURATION_KEYS = (
127
+ "duration",
128
+ "duration_s",
129
+ "duration_seconds",
130
+ "duration_ms",
131
+ "durationMillis",
132
+ "durationMilliseconds",
133
+ "durationMs",
134
+ "length",
135
+ )
136
+ _TIME_CONTAINER_KEYS = (
137
+ "timestamp",
138
+ "timestamps",
139
+ "time",
140
+ "timing",
141
+ "time_range",
142
+ "timeRange",
143
+ "range",
144
+ "span",
145
+ "offsets",
146
+ "offset",
147
+ "start_end",
148
+ "startEnd",
149
+ )
150
+ _TEXT_VALUE_KEYS = (
151
+ "text",
152
+ "content",
153
+ "value",
154
+ "utterance",
155
+ "sentence",
156
+ "caption",
157
+ )
158
+ _TEXT_COLLECTION_KEYS = (
159
+ "alternatives",
160
+ "parts",
161
+ "lines",
162
+ "tokens",
163
+ "elements",
164
+ "chunks",
165
+ "segments",
166
+ "items",
167
+ "words",
168
+ "content",
169
+ "sentences",
170
+ )
171
+ _SPEAKER_KEYS = (
172
+ "speaker",
173
+ "speaker_label",
174
+ "speakerId",
175
+ "speaker_id",
176
+ "speaker_tag",
177
+ "speakerTag",
178
+ "label",
179
+ "name",
180
+ )
181
+
182
+ _MEDIA_HOST_SUFFIXES = (
183
+ "youtu.be",
184
+ "youtube.com",
185
+ "bilibili.com",
186
+ "soundcloud.com",
187
+ "music.apple.com",
188
+ "podcasts.apple.com",
189
+ "podcast.apple.com",
190
+ "spotify.com",
191
+ "open.spotify.com",
192
+ "podcasters.spotify.com",
193
+ )
194
+ _FORCED_AUDIO_HOST_SUFFIXES = (
195
+ "podcasts.apple.com",
196
+ "podcast.apple.com",
197
+ )
198
+ _MEDIA_PATH_EXTENSIONS = (".mp3", ".m4a", ".aac", ".wav", ".flac")
199
+
200
+
201
+ SUMMARY_PROMPT = '''
202
+ 你是一个可以帮助用户完成AI相关文章翻译和总结的助手。
203
+
204
+ 你需要完成如下任务:
205
+ 1. 如果原始内容是youtube资源或播客音频资源,要保留视频或音频中的时间线,可以大概每5分钟合并成一段,或将一个主题合并成一段。将非中文字幕先翻译成中文,千万不要省略或遗漏任何信息,仅可以删掉一些无意义的口语表达,比如uh、yeah等。
206
+ 2. 如果内容很长,可以先给出Abstract和Keypoints,同时根据“主题”做分段,每段的标题即为提取的“主题”,每段最好不要超过300字。如果是多人对话,每段以`说话人`的姓名开始,按不同的`说话人`分段。
207
+ 3. 将你认为重要的、insightful、非共识的内容markdown加粗标识,以便阅读,但加粗内容不宜太多。
208
+
209
+
210
+ 注意:
211
+ 1. 始终用第一人称翻译,不要用转述的方式。
212
+ 2. 专业词汇和人名可以不翻译,例如`agent`、`llm`、`Sam`可以不翻译,或后面加上原始词,比如费曼图(Feynman diagram)。
213
+ 3. 输出格式可参考(但不必一定遵守):`摘要:1-3句话,可包含一些非共识的insight。\n<主题1>(00:00:00 - 00:05:03)\n<说话人名1>:<xxx>。\n<说话人名2>:<xxx>。\n<主题2>(00:05:03 - 00:09:52)\n<说话人名2>:<xxx>。\n<说话人名1>:<xxx>。......`。`<>`中是需要填充的内容。
214
+ '''
215
+
216
+
217
+ ARTICLE_SUMMARY_PROMPT = '''
218
+ 你是一个英文文章翻译和总结智能体。你的任务是分析用户提供的文章,自动抓取原文(包括关键图表等)。
219
+
220
+ 你的输出包含:
221
+ ## 总结
222
+ 最好不超过5句话总结文章的核心观点。
223
+ ## 要点
224
+ 1. 要点最好以层次化、结构化的方式展现。
225
+ 2. 每一个要点必须是一个观点/结论/事实。
226
+ 3. 要点之间最好有逻辑关系,当然要以忠实于原文为基础。
227
+ ## 翻译要求
228
+ 1. 如果是非中文,先将非关键信息翻译成中文,关键表达最好不翻译,若要翻译务必要附带上原始表达,以保留最准确的原始语言所传达的信息。
229
+ 2. 专业词汇和人名不要翻译,例如`agent`、`llm`、`Sam`不要翻译,或后面加上原始词,比如费曼图(Feynman diagram)。
230
+ 3. 翻译过程中千万不要压缩、省略或遗漏任何原始语言传达的信息,仅可以删掉一些无意义的口语表达或广告内容,比如uh、yeah等。
231
+ 4. 将你认为重要的、insightful、非共识的内容markdown加粗标识,以便阅读,但加粗内容不宜太多。
232
+ '''
233
+
234
+
235
+ DOMAIN_PROMPT = (
236
+ "你将收到一段中文摘要,请根据内容判断最贴切的领域标签。"
237
+ "直接输出一个中文标签,避免解释或补充说明。"
238
+ )
239
+
240
+
241
+ def _load_dotenv_if_present(explicit_path: Optional[str] = None) -> None:
242
+ """Load environment variables from a dotenv file when available."""
243
+
244
+ candidates: List[str] = []
245
+ if explicit_path and explicit_path.strip():
246
+ candidates.append(explicit_path.strip())
247
+ else:
248
+ candidates.append(".env")
249
+
250
+ for candidate in candidates:
251
+ path = candidate
252
+ if not os.path.isabs(path):
253
+ path = os.path.abspath(path)
254
+ if not os.path.isfile(path):
255
+ continue
256
+ try:
257
+ with open(path, "r", encoding="utf-8") as handle:
258
+ for line in handle:
259
+ stripped = line.strip()
260
+ if not stripped or stripped.startswith("#"):
261
+ continue
262
+ if "=" not in stripped:
263
+ continue
264
+ key, value = stripped.split("=", 1)
265
+ key = key.strip()
266
+ if not key or key in os.environ:
267
+ continue
268
+ cleaned = value.strip().strip('"').strip("'")
269
+ os.environ[key] = cleaned
270
+ except OSError as exc: # pragma: no cover - filesystem failure
271
+ raise RuntimeError(f"读取 dotenv 文件失败:{path}") from exc
272
+ break
273
+
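+ # A minimal .env accepted by the parser above (illustrative values; blank
+ # lines and `#` comments are skipped, surrounding quotes are stripped, and
+ # variables already present in the environment are never overwritten):
+ #
+ #   AZURE_OPENAI_API_KEY="<your-key>"
+ #   AZURE_OPENAI_ENDPOINT=https://example.openai.azure.com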
274
+
275
+ def run(argv: Optional[Sequence[str]] = None) -> int:
276
+ """Entrypoint for the CLI.
277
+
278
+ Args:
279
+ argv: Sequence of command line arguments excluding the program name.
280
+
281
+ Returns:
282
+ Process exit code, 0 on success, non-zero on failure.
283
+ """
284
+
285
+ parser = argparse.ArgumentParser(
286
+ description=(
287
+ "Download YouTube captions, enrich with optional Azure OpenAI "
288
+ "diarization, and emit structured JSON output."
289
+ )
290
+ )
291
+ parser.add_argument("--url", required=True, help="YouTube video URL")
292
+ parser.add_argument(
293
+ "--language",
294
+ default="en",
295
+ help="Primary language code for caption retrieval (default: en)",
296
+ )
297
+ parser.add_argument(
298
+ "--fallback-language",
299
+ dest="fallback_languages",
300
+ action="append",
301
+ help=(
302
+ "Additional language codes to try if the primary language is not "
303
+ "available. Can be supplied multiple times."
304
+ ),
305
+ )
306
+ parser.add_argument(
307
+ "-V",
308
+ "--version",
309
+ action="version",
310
+ version=f"any2summary {__CLI_VERSION}",
311
+ help="显示版本信息并退出。",
312
+ )
313
+ parser.add_argument(
314
+ "--azure-streaming",
315
+ action=argparse.BooleanOptionalAction,
316
+ default=True,
317
+ help="是否以流式模式调用 Azure 转写(默认启用,可用 --no-azure-streaming 关闭)",
318
+ )
319
+ parser.add_argument(
320
+ "--force-azure-diarization",
321
+ action="store_true",
322
+ help="即使字幕可用也强制调用 Azure 说话人分离(文章链接不支持)。",
323
+ )
324
+ parser.add_argument(
325
+ "--azure-summary",
326
+ action="store_true",
327
+ help="调用 Azure GPT-5 对 ASR 结果进行翻译与总结。",
328
+ )
329
+ parser.add_argument(
330
+ "--summary-prompt-file",
331
+ type=str,
332
+ dest="summary_prompt_file",
333
+ help="自定义 Azure 摘要系统 Prompt 的配置文件路径。",
334
+ )
335
+ parser.add_argument(
336
+ "--article-summary-prompt-file",
337
+ type=str,
338
+ dest="article_summary_prompt_file",
339
+ help="针对网页文章的专用 Prompt 文件路径,仅在 --azure-summary 时生效。",
340
+ )
341
+ parser.add_argument(
342
+ "--max-speakers",
343
+ type=int,
344
+ default=None,
345
+ help="Optional upper bound for speaker count during diarization.",
346
+ )
347
+ parser.add_argument(
348
+ "--known-speaker",
349
+ dest="known_speakers",
350
+ action="append",
351
+ help=(
352
+ "Known speaker reference in the form name=path/to/audio.wav. "
353
+ "Can be supplied multiple times to improve diarization labeling."
354
+ ),
355
+ )
356
+ parser.add_argument(
357
+ "--known-speaker-name",
358
+ dest="known_speaker_names",
359
+ action="append",
360
+ help=(
361
+ "Known speaker name without reference audio. Can be supplied "
362
+ "multiple times to hint Azure diarization results."
363
+ ),
364
+ )
365
+ parser.add_argument(
366
+ "--clean-cache",
367
+ action="store_true",
368
+ help="Remove cached artifacts for the provided URL before processing.",
369
+ )
370
+
371
+ args = parser.parse_args(argv)
372
+
373
+ _load_dotenv_if_present(
374
+ _getenv("ANY2SUMMARY_DOTENV", "PODCAST_TRANSFORMER_DOTENV")
375
+ )
376
+
377
+ raw_urls = [item.strip() for item in args.url.split(",") if item and item.strip()]
378
+ if not raw_urls:
379
+ raise RuntimeError("--url 参数不能为空。")
380
+
381
+ if len(raw_urls) == 1:
382
+ args.url = raw_urls[0]
383
+ return _run_single(args)
384
+
385
+ return _run_multiple(args, raw_urls)
386
+
387
+
388
+ def _run_multiple(args: argparse.Namespace, urls: Sequence[str]) -> int:
389
+ clones = [_clone_args(args, url) for url in urls]
390
+ max_workers = max(1, min(len(clones), os.cpu_count() or len(clones)))
391
+
392
+ results: List[Tuple[int, str, int, str, Optional[str]]] = []
393
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
394
+ future_map: Dict[concurrent.futures.Future[Tuple[int, str, Optional[BaseException]]], Tuple[int, str]] = {}
395
+ for index, clone in enumerate(clones):
396
+ future = executor.submit(_run_single_with_capture, clone)
397
+ future_map[future] = (index, clone.url)
398
+
399
+ for future in concurrent.futures.as_completed(future_map):
400
+ index, url = future_map[future]
401
+ try:
402
+ exit_code, output, error = future.result()
403
+ except Exception as exc: # pragma: no cover - defensive
404
+ exit_code = 1
405
+ output = ""
406
+ error = str(exc)
407
+ error_message = str(error) if error else None
408
+ results.append((index, url, exit_code, output, error_message))
409
+
410
+ results.sort(key=lambda item: item[0])
411
+ final_exit_code = 0
412
+ for _, url, exit_code, output, error_message in results:
413
+ if output:
414
+ if output.endswith("\n"):
415
+ sys.stdout.write(output)
416
+ else:
417
+ sys.stdout.write(output + "\n")
418
+ if error_message:
419
+ sys.stderr.write(f"[{url}] {error_message}\n")
420
+ if exit_code != 0:
421
+ final_exit_code = 1
422
+
423
+ return final_exit_code
424
+
425
+
426
+ def _run_single_with_capture(args: argparse.Namespace) -> Tuple[int, str, Optional[str]]:
427
+ buffer = io.StringIO()
428
+ error_message: Optional[str] = None
429
+ try:
430
+ with contextlib.redirect_stdout(buffer):
431
+ exit_code = _run_single(args)
432
+ except Exception as exc: # pragma: no cover - error path
433
+ exit_code = 1
434
+ error_message = str(exc)
435
+ return exit_code, buffer.getvalue(), error_message
436
+
437
+
438
+ def _clone_args(args: argparse.Namespace, url: str) -> argparse.Namespace:
439
+ cloned = copy.deepcopy(args)
440
+ cloned.url = url
441
+ return cloned
442
+
443
+
444
+ def _run_single(args: argparse.Namespace) -> int:
445
+ auto_force_azure = _should_force_azure_transcription(args.url)
446
+
447
+ force_azure = bool(args.force_azure_diarization or auto_force_azure)
448
+
449
+ if args.summary_prompt_file and not args.azure_summary:
450
+ raise RuntimeError(
451
+ "--summary-prompt-file 仅能与 --azure-summary 搭配使用。"
452
+ )
453
+
454
+ if args.article_summary_prompt_file and not args.azure_summary:
455
+ raise RuntimeError(
456
+ "--article-summary-prompt-file 仅能与 --azure-summary 搭配使用。"
457
+ )
458
+
459
+ if args.clean_cache:
460
+ cache_directory = _resolve_video_cache_dir(args.url)
461
+ if os.path.isdir(cache_directory):
462
+ shutil.rmtree(cache_directory)
463
+
464
+ fallback_languages = args.fallback_languages or [args.language]
465
+ known_speaker_pairs = _parse_known_speakers(args.known_speakers)
466
+ known_speaker_names = args.known_speaker_names or None
467
+
468
+ transcript_segments: Optional[
469
+ List[MutableMapping[str, float | str]]
470
+ ] = None
471
+ transcript_error: Optional[RuntimeError] = None
472
+ article_bundle: Optional[Mapping[str, Any]] = None
473
+
474
+ try:
475
+ transcript_segments = fetch_transcript_with_metadata(
476
+ video_url=args.url,
477
+ language=args.language,
478
+ fallback_languages=fallback_languages,
479
+ )
480
+ except RuntimeError as exc:
481
+ transcript_error = exc
482
+
483
+ if transcript_segments is None and _is_probable_article_url(args.url):
484
+ article_bundle = _maybe_fetch_article_assets(args.url)
485
+ if article_bundle is not None:
486
+ transcript_segments = [
487
+ dict(segment)
488
+ for segment in article_bundle.get("segments", [])
489
+ if isinstance(segment, Mapping)
490
+ ]
491
+ if transcript_segments:
492
+ transcript_error = None
493
+ else:
494
+ article_bundle = None
495
+
496
+ diarization_segments: Optional[List[MutableMapping[str, float | str]]] = None
497
+ azure_payload: Optional[MutableMapping[str, Any]] = None
498
+
499
+ should_use_azure = False
500
+ if article_bundle is not None:
501
+ should_use_azure = False
502
+ else:
503
+ transcripts_available = bool(transcript_segments)
504
+ has_speaker_constraints = bool(
505
+ args.max_speakers is not None
506
+ or known_speaker_pairs
507
+ or known_speaker_names
508
+ )
509
+ if force_azure:
510
+ should_use_azure = True
511
+ elif not transcripts_available:
512
+ should_use_azure = True
513
+ elif has_speaker_constraints:
514
+ should_use_azure = True
515
+
516
+ if should_use_azure:
517
+ try:
518
+ azure_payload = perform_azure_diarization(
519
+ video_url=args.url,
520
+ language=args.language,
521
+ max_speakers=args.max_speakers,
522
+ known_speakers=known_speaker_pairs,
523
+ known_speaker_names=known_speaker_names,
524
+ streaming=args.azure_streaming,
525
+ )
526
+ except RuntimeError:
527
+ if _is_probable_article_url(args.url):
528
+ article_bundle = _maybe_fetch_article_assets(args.url)
529
+ if article_bundle is not None:
530
+ transcript_segments = [
531
+ dict(segment)
532
+ for segment in article_bundle.get("segments", [])
533
+ if isinstance(segment, Mapping)
534
+ ]
535
+ if transcript_segments:
536
+ transcript_error = None
537
+ should_use_azure = False
538
+ else:
539
+ article_bundle = None
540
+ if should_use_azure:
541
+ if transcript_error is not None:
542
+ raise transcript_error
543
+ raise
544
+ else:
545
+ diarization_segments = azure_payload.get("speakers") or []
546
+ if not transcript_segments:
547
+ transcript_segments = azure_payload.get("transcript")
548
+ if not transcript_segments:
549
+ if transcript_error is not None:
550
+ raise transcript_error
551
+ raise RuntimeError("Azure OpenAI 未返回可用的转写结果。")
552
+
553
+ if (
554
+ not transcript_segments
555
+ and article_bundle is None
556
+ and _is_probable_article_url(args.url)
557
+ ):
558
+ article_bundle = _maybe_fetch_article_assets(args.url)
559
+ if article_bundle is not None:
560
+ transcript_segments = [
561
+ dict(segment)
562
+ for segment in article_bundle.get("segments", [])
563
+ if isinstance(segment, Mapping)
564
+ ]
565
+ if transcript_segments:
566
+ transcript_error = None
567
+ else:
568
+ article_bundle = None
569
+
570
+ if not transcript_segments:
571
+ if transcript_error is not None:
572
+ raise transcript_error
573
+ raise RuntimeError(
574
+ "未能获取字幕数据。请确认视频是否启用字幕,或配置 Azure OpenAI 凭据(可搭配 --force-azure-diarization 强制调用 Azure 转写)。"
575
+ )
576
+
577
+ merged_segments = merge_segments_with_speakers(
578
+ transcript_segments, diarization_segments
579
+ )
580
+
581
+ summary_bundle: Optional[MutableMapping[str, Any]] = None
582
+ summary_paths: Optional[Mapping[str, str]] = None
583
+ article_metadata = article_bundle.get("metadata") if article_bundle else None
584
+ if args.azure_summary:
585
+ custom_prompt: Optional[str] = None
586
+ if article_bundle is not None:
587
+ if args.article_summary_prompt_file:
588
+ custom_prompt = _load_summary_prompt_file(
589
+ args.article_summary_prompt_file
590
+ )
591
+ else:
592
+ custom_prompt = _load_default_article_prompt()
593
+ elif args.summary_prompt_file:
594
+ custom_prompt = _load_summary_prompt_file(args.summary_prompt_file)
595
+ else:
596
+ custom_prompt = _load_default_summary_prompt()
597
+ summary_bundle = generate_translation_summary(
598
+ merged_segments,
599
+ args.url,
600
+ prompt=custom_prompt,
601
+ metadata=article_metadata,
602
+ )
603
+ summary_paths = _write_summary_documents(
604
+ args.url,
605
+ summary_bundle.get("summary_markdown", ""),
606
+ summary_bundle.get("timeline_markdown", ""),
607
+ summary_bundle.get("file_base", "summary"),
608
+ )
609
+
610
+ payload: Union[List[MutableMapping[str, float | str]], MutableMapping[str, Any]]
611
+ if summary_bundle is not None:
612
+ payload = {
613
+ "segments": merged_segments,
614
+ "summary": summary_bundle.get("summary_markdown"),
615
+ "timeline": summary_bundle.get("timeline_markdown"),
616
+ "summary_metadata": summary_bundle.get("metadata"),
617
+ }
618
+ if summary_paths is not None:
619
+ payload["summary_path"] = summary_paths.get("summary")
620
+ payload["timeline_path"] = summary_paths.get("timeline")
621
+ payload["summary_paths"] = summary_paths
622
+ if "total_words" in summary_bundle:
623
+ payload["total_words"] = summary_bundle["total_words"]
624
+ if "estimated_minutes" in summary_bundle:
625
+ payload["estimated_minutes"] = summary_bundle["estimated_minutes"]
626
+ else:
627
+ payload = merged_segments
628
+
629
+ if isinstance(payload, MutableMapping) and article_bundle is not None:
630
+ payload["article_metadata"] = article_bundle.get("metadata")
631
+ payload["article_assets"] = {
632
+ "raw_html_path": article_bundle.get("raw_html_path"),
633
+ "content_path": article_bundle.get("content_path"),
634
+ "metadata_path": article_bundle.get("metadata_path"),
635
+ "icon_path": article_bundle.get("icon_path"),
636
+ }
637
+
638
+ json.dump(payload, sys.stdout, indent=2, ensure_ascii=False)
639
+ sys.stdout.write("\n")
640
+
641
+ return 0
642
+
643
+
644
+ def fetch_transcript_with_metadata(
645
+ video_url: str, language: str, fallback_languages: Iterable[str]
646
+ ) -> List[MutableMapping[str, float | str]]:
647
+ """Fetch caption segments for a YouTube video.
648
+
649
+ Args:
650
+ video_url: URL of the YouTube video.
651
+ language: Preferred language code.
652
+ fallback_languages: Iterable of language codes to try sequentially.
653
+
654
+ Returns:
655
+ List of dictionaries containing `start`, `end`, and `text` keys.
656
+
657
+ Raises:
658
+ RuntimeError: If captions cannot be retrieved.
659
+ """
660
+
661
+ try:
662
+ from youtube_transcript_api import YouTubeTranscriptApi
663
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
664
+ raise RuntimeError(
665
+ "youtube_transcript_api is required to fetch captions. Install it "
666
+ "via `pip install youtube-transcript-api`."
667
+ ) from exc
668
+
669
+ video_id = extract_video_id(video_url)
670
+ if not video_id:
671
+ raise RuntimeError(f"Unable to parse video id from URL: {video_url}")
672
+
673
+ language_preferences: List[str] = []
674
+ seen_codes = set()
675
+ for code in [language, *fallback_languages]:
676
+ if code and code not in seen_codes:
677
+ language_preferences.append(code)
678
+ seen_codes.add(code)
679
+
680
+ segments = None
681
+ transcript = None
682
+ available_languages: List[str] = []
683
+
684
+ if hasattr(YouTubeTranscriptApi, "list_transcripts"):
685
+ try:
686
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
687
+ except Exception as exc: # pragma: no cover - network failure paths
688
+ raise RuntimeError(f"Failed to list transcripts: {exc}")
689
+
690
+ for code in language_preferences:
691
+ try:
692
+ transcript = transcript_list.find_transcript([code])
693
+ break
694
+ except Exception:
695
+ continue
696
+
697
+ available_languages = [
698
+ item.language_code for item in transcript_list if hasattr(item, "language_code")
699
+ ]
700
+
701
+ if transcript is None and language_preferences:
702
+ target_language = language_preferences[0]
703
+ for candidate in transcript_list:
704
+ try:
705
+ translated = candidate.translate(target_language)
706
+ segments = translated.fetch()
707
+ transcript = translated
708
+ break
709
+ except Exception:
710
+ continue
711
+
712
+ if transcript is None:
713
+ for candidate in transcript_list:
714
+ try:
715
+ segments = candidate.fetch()
716
+ transcript = candidate
717
+ break
718
+ except Exception:
719
+ continue
720
+
721
+ if transcript is None:
722
+ maybe_plain_segments = []
723
+ for candidate in transcript_list:
724
+ try:
725
+ maybe_plain_segments = candidate.fetch()
726
+ transcript = candidate
727
+ break
728
+ except Exception:
729
+ continue
730
+ if transcript is not None and maybe_plain_segments:
731
+ segments = maybe_plain_segments
732
+
733
+ if segments is None and transcript is not None:
734
+ try:
735
+ segments = transcript.fetch()
736
+ except Exception as exc: # pragma: no cover - network failure paths
737
+ raise RuntimeError(f"Failed to fetch transcript: {exc}")
738
+
739
+ if segments is None:
740
+ message = (
741
+ "No transcript available after attempting preferences: "
742
+ f"{language_preferences}. Available languages: {available_languages}."
743
+ )
744
+ raise RuntimeError(
745
+ message
746
+ + " 请使用 --fallback-language 指定可用语言,或确认视频未限制字幕访问。"
747
+ )
748
+ else: # pragma: no cover - compatibility path for older versions
749
+ for code in language_preferences:
750
+ try:
751
+ segments = YouTubeTranscriptApi.get_transcript(
752
+ video_id, languages=[code]
753
+ )
754
+ break
755
+ except Exception:
756
+ continue
757
+
758
+ if segments is None:
759
+ raise RuntimeError(
760
+ "No transcript available in requested languages: "
761
+ f"{language_preferences}"
762
+ )
763
+
764
+ normalized_segments: List[MutableMapping[str, float | str]] = []
765
+ for segment in segments:
766
+ start = float(segment.get("start", 0.0))
767
+ duration = float(segment.get("duration", 0.0))
768
+ end = start + duration
769
+ normalized_segments.append(
770
+ {
771
+ "start": start,
772
+ "end": end,
773
+ "text": segment.get("text", ""),
774
+ }
775
+ )
776
+
777
+ return normalized_segments
778
+
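+ # Each normalized caption segment is a plain mapping; an illustrative entry:
+ #   {"start": 12.0, "end": 15.5, "text": "hello world"}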
779
+
780
+ def _segments_have_timestamps(
781
+ segments: Sequence[MutableMapping[str, float | str]]
782
+ ) -> bool:
783
+ """Return True when transcript segments contain usable timeline data."""
784
+
785
+ if not segments:
786
+ return False
787
+
788
+ for segment in segments:
789
+ start = segment.get("start")
790
+ end = segment.get("end")
791
+ try:
792
+ float(start) # type: ignore[arg-type]
793
+ float(end) # type: ignore[arg-type]
794
+ except (TypeError, ValueError):
795
+ return False
796
+ if float(end) < float(start):
797
+ return False
798
+ return True
799
+
800
+
801
+ def perform_azure_diarization(
802
+ video_url: str,
803
+ language: str,
804
+ max_speakers: Optional[int] = None,
805
+ known_speakers: Optional[List[Tuple[str, str]]] = None,
806
+ known_speaker_names: Optional[Sequence[str]] = None,
807
+ streaming: bool = True,
808
+ ) -> MutableMapping[str, Any]:
809
+ """Use Azure OpenAI GPT-4o diarization to identify speaker segments.
810
+
811
+ Args:
812
+ video_url: Source video URL.
813
+ language: Preferred language code.
814
+ max_speakers: Optional limit on distinct speaker labels.
815
+ known_speakers: Optional list of (name, audio_path) tuples for voice hints.
816
+ known_speaker_names: Optional speaker name hints without audio samples.
817
+ streaming: Whether to enable Azure streaming responses for progress updates.
818
+
819
+ Returns:
820
+ A mapping containing `speakers` and `transcript` lists.
821
+ """
822
+
823
+ azure_key = os.getenv("AZURE_OPENAI_API_KEY")
824
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
825
+ azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2025-03-01-preview"
826
+ deployment = (
827
+ os.getenv("AZURE_OPENAI_TRANSCRIBE_DEPLOYMENT")
828
+ or "gpt-4o-transcribe-diarize"
829
+ )
830
+ if not azure_key or not azure_endpoint:
831
+ raise RuntimeError(
832
+ "Azure OpenAI 凭据缺失。请设置 AZURE_OPENAI_API_KEY与 AZURE_OPENAI_ENDPOINT。"
833
+ )
834
+
835
+ cache_directory = _resolve_video_cache_dir(video_url)
836
+ cache_path = _diarization_cache_path(cache_directory)
837
+ cached_payload = _load_cached_diarization(cache_path)
838
+ if cached_payload is not None:
839
+ return cached_payload
840
+
841
+ try:
842
+ from openai import AzureOpenAI
843
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
844
+ raise RuntimeError(
845
+ "openai 库未安装。请执行 `pip install openai`."
846
+ ) from exc
847
+
848
+ wav_path = _prepare_audio_cache(video_url)
849
+ segment_paths = _ensure_audio_segments(wav_path)
850
+ if not segment_paths:
851
+ raise RuntimeError(
852
+ "音频缓存文件不存在或生成失败,请确认 ffmpeg 可用。"
853
+ )
854
+
855
+ segment_durations: List[float] = []
856
+ for path in segment_paths:
857
+ duration = max(_get_wav_duration(path), 0.0)
858
+ if duration <= 0.0:
859
+ try:
860
+ file_size = os.path.getsize(path)
861
+ except OSError:
862
+ file_size = 0
863
+ if file_size > 0:
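+ # Assumed fallback rate: 16 kHz, 16-bit mono PCM is 32,000 bytes per second.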
864
+ duration = file_size / 32_000.0
865
+ if duration <= 0.0:
866
+ duration = 1.0
867
+ segment_durations.append(duration)
868
+
869
+ total_audio_duration = sum(segment_durations)
870
+ if total_audio_duration <= 0.0:
871
+ total_audio_duration = float(len(segment_paths))
872
+
873
+ total_estimated_tokens = _estimate_total_tokens(
874
+ segment_paths, segment_durations
875
+ )
876
+ processed_duration = 0.0
877
+ produced_tokens = 0.0
878
+ segments_done = 0
879
+ total_segments = len(segment_paths)
880
+
881
+ client = AzureOpenAI(
882
+ api_key=azure_key,
883
+ api_version=azure_api_version,
884
+ azure_endpoint=azure_endpoint,
885
+ )
886
+ openai_module = sys.modules.get("openai")
887
+ bad_request_error: Optional[type[BaseException]]
888
+ if openai_module is not None:
889
+ bad_request_error = getattr(openai_module, "BadRequestError", None)
890
+ else:
891
+ bad_request_error = None
892
+
893
+ extra_body = _build_extra_body(known_speakers)
894
897
+
898
+ request_known_names: List[str] = []
899
+ extra_names = extra_body.get("known_speaker_names")
900
+ if isinstance(extra_names, list):
901
+ request_known_names.extend(str(name) for name in extra_names)
902
+ if known_speaker_names:
903
+ for name in known_speaker_names:
904
+ if not isinstance(name, str):
905
+ continue
906
+ stripped = name.strip()
907
+ if not stripped:
908
+ continue
909
+ if stripped not in request_known_names:
910
+ request_known_names.append(stripped)
911
+
912
+ if total_segments > 0:
913
+ _update_progress_bar(
914
+ 0.0,
915
+ _format_progress_detail(
916
+ processed_duration,
917
+ total_audio_duration,
918
+ produced_tokens,
919
+ total_estimated_tokens,
920
+ segments_done,
921
+ total_segments,
922
+ ),
923
+ )
924
+
925
+ aggregated_diarization: List[MutableMapping[str, float | str]] = []
926
+ aggregated_transcript: List[MutableMapping[str, float | str]] = []
927
+ segment_offset = 0.0
928
+
929
+ for index, segment_path in enumerate(segment_paths, start=1):
930
+ segment_duration = (
931
+ segment_durations[index - 1]
932
+ if 0 <= index - 1 < len(segment_durations)
933
+ else 0.0
934
+ )
935
+ stream_tokens = 0.0
936
+
937
+ try:
938
+ with open(segment_path, "rb") as audio_file:
939
+ request_kwargs: MutableMapping[str, Any] = {
940
+ "model": deployment,
941
+ "file": audio_file,
942
+ "response_format": "diarized_json",
943
+ "language": language,
944
+ "chunking_strategy": "auto"
945
+ }
946
+ if request_known_names:
947
+ request_kwargs["known_speaker_names"] = request_known_names
948
+
949
+ def _handle_stream_chunk(payload: MutableMapping[str, Any]) -> None:
950
+ nonlocal stream_tokens
951
+ tokens = _extract_usage_tokens(payload)
952
+ if tokens is None:
953
+ return
954
+ stream_tokens = max(stream_tokens, float(tokens))
955
+ ratio = _compute_progress_ratio(
956
+ processed_duration,
957
+ total_audio_duration,
958
+ produced_tokens + stream_tokens,
959
+ total_estimated_tokens,
960
+ segments_done,
961
+ total_segments,
962
+ )
963
+ _update_progress_bar(
964
+ ratio,
965
+ _format_progress_detail(
966
+ processed_duration,
967
+ total_audio_duration,
968
+ produced_tokens + stream_tokens,
969
+ total_estimated_tokens,
970
+ segments_done,
971
+ total_segments,
972
+ ),
973
+ )
974
+
975
+ response = client.audio.transcriptions.create(
976
+ **request_kwargs, stream=streaming
977
+ )
978
+ except Exception as exc: # pragma: no cover - depends on API behaviour
979
+ if (
980
+ bad_request_error is not None
981
+ and isinstance(exc, bad_request_error)
982
+ ):
983
+ message = _extract_openai_error_message(exc)
984
+ raise RuntimeError(
985
+ "Azure OpenAI 调用失败:"
986
+ f"{message}。请尝试使用 --clean-cache 重新生成音频,并确认 ffmpeg 可用。"
987
+ ) from exc
988
+ raise
989
+
990
+ response_payload = _consume_transcription_response(
991
+ response,
992
+ on_chunk=_handle_stream_chunk if streaming else None,
993
+ )
994
+ if not streaming:
995
+ ratio = _compute_progress_ratio(
996
+ processed_duration,
997
+ total_audio_duration,
998
+ produced_tokens,
999
+ total_estimated_tokens,
1000
+ segments_done,
1001
+ total_segments,
1002
+ )
1003
+ _update_progress_bar(
1004
+ ratio,
1005
+ _format_progress_detail(
1006
+ processed_duration,
1007
+ total_audio_duration,
1008
+ produced_tokens,
1009
+ total_estimated_tokens,
1010
+ segments_done,
1011
+ total_segments,
1012
+ ),
1013
+ )
1014
+ if _getenv("ANY2SUMMARY_DEBUG_PAYLOAD", "PODCAST_TRANSFORMER_DEBUG_PAYLOAD"):
1015
+ debug_name = f"debug_payload_{index}.json"
1016
+ debug_path = os.path.join(cache_directory, debug_name)
1017
+ try:
1018
+ with open(debug_path, "w", encoding="utf-8") as handle:
1019
+ json.dump(response_payload, handle, ensure_ascii=False, indent=2)
1020
+ except OSError:
1021
+ pass
1022
+ diarization_segments = _extract_diarization_segments(response_payload)
1023
+ transcript_segments = _extract_transcript_segments(response_payload)
1024
+
1025
+ if not diarization_segments:
1026
+ if transcript_segments:
1027
+ diarization_segments = [
1028
+ {
1029
+ "start": float(item.get("start", 0.0)),
1030
+ "end": float(item.get("end", item.get("start", 0.0))),
1031
+ "speaker": item.get("speaker", "Unknown"),
1032
+ }
1033
+ for item in transcript_segments
1034
+ ]
1035
+ else:
1036
+ processed_duration = min(
1037
+ total_audio_duration, processed_duration + max(segment_duration, 0.0)
1038
+ )
1039
+ segments_done += 1
1040
+ ratio = _compute_progress_ratio(
1041
+ processed_duration,
1042
+ total_audio_duration,
1043
+ produced_tokens,
1044
+ total_estimated_tokens,
1045
+ segments_done,
1046
+ total_segments,
1047
+ )
1048
+ _update_progress_bar(
1049
+ ratio,
1050
+ _format_progress_detail(
1051
+ processed_duration,
1052
+ total_audio_duration,
1053
+ produced_tokens,
1054
+ total_estimated_tokens,
1055
+ segments_done,
1056
+ total_segments,
1057
+ ),
1058
+ )
1059
+ continue
1060
+
1061
+ diarization_with_offset = _offset_segments(diarization_segments, segment_offset)
1062
+ transcript_with_offset = _offset_segments(transcript_segments, segment_offset)
1063
+
1064
+ aggregated_diarization.extend(diarization_with_offset)
1065
+ aggregated_transcript.extend(transcript_with_offset)
1066
+
1067
+ max_end = _max_segment_end(diarization_with_offset, transcript_with_offset)
1068
+ if segment_duration <= 0.0:
1069
+ segment_duration = max(_get_wav_duration(segment_path), 0.0)
1070
+ if segment_duration > 0:
1071
+ segment_offset = max(segment_offset + segment_duration, max_end)
1072
+ else:
1073
+ segment_offset = max(segment_offset, max_end)
1074
+
1075
+ processed_duration = min(
1076
+ total_audio_duration, processed_duration + max(segment_duration, 0.0)
1077
+ )
1078
+ segment_tokens = _estimate_tokens_from_transcript(transcript_segments)
1079
+ if stream_tokens > 0:
1080
+ segment_tokens = max(segment_tokens, stream_tokens)
1081
+ produced_tokens += segment_tokens
1082
+ segments_done += 1
1083
+
1084
+ ratio = _compute_progress_ratio(
1085
+ processed_duration,
1086
+ total_audio_duration,
1087
+ produced_tokens,
1088
+ total_estimated_tokens,
1089
+ segments_done,
1090
+ total_segments,
1091
+ )
1092
+ _update_progress_bar(
1093
+ ratio,
1094
+ _format_progress_detail(
1095
+ processed_duration,
1096
+ total_audio_duration,
1097
+ produced_tokens,
1098
+ total_estimated_tokens,
1099
+ segments_done,
1100
+ total_segments,
1101
+ ),
1102
+ )
1103
+
1104
+ if total_segments > 0:
1105
+ final_ratio = _compute_progress_ratio(
1106
+ processed_duration,
1107
+ total_audio_duration,
1108
+ produced_tokens,
1109
+ total_estimated_tokens,
1110
+ segments_done,
1111
+ total_segments,
1112
+ )
1113
+ if final_ratio < 1.0:
1114
+ _update_progress_bar(
1115
+ 1.0,
1116
+ _format_progress_detail(
1117
+ total_audio_duration,
1118
+ total_audio_duration,
1119
+ max(produced_tokens, total_estimated_tokens),
1120
+ total_estimated_tokens,
1121
+ total_segments,
1122
+ total_segments,
1123
+ ),
1124
+ )
1125
+
1126
+ if not aggregated_diarization:
1127
+ if aggregated_transcript:
1128
+ aggregated_diarization = [
1129
+ {
1130
+ "start": float(item.get("start", 0.0)),
1131
+ "end": float(item.get("end", item.get("start", 0.0))),
1132
+ "speaker": item.get("speaker", "Unknown"),
1133
+ }
1134
+ for item in aggregated_transcript
1135
+ ]
1136
+ else:
1137
+ empty_payload: MutableMapping[str, Any] = {
1138
+ "speakers": [],
1139
+ "transcript": [],
1140
+ }
1141
+ _write_diarization_cache(cache_path, empty_payload)
1142
+ return empty_payload
1143
+
1144
+ if max_speakers and max_speakers > 0:
1145
+ aggregated_diarization = _limit_speaker_count(
1146
+ aggregated_diarization, max_speakers
1147
+ )
1148
+
1149
+ aggregated_diarization.sort(key=lambda item: item["start"])
1150
+ aggregated_transcript.sort(key=lambda item: item.get("start", 0.0))
1151
+ transcript_segments = aggregated_transcript
1152
+ diarization_segments = aggregated_diarization
1153
+ merged_entries: List[MutableMapping[str, float | str]] = []
1154
+ for entry in diarization_segments:
1155
+ if not merged_entries:
1156
+ merged_entries.append(dict(entry))
1157
+ continue
1158
+ previous = merged_entries[-1]
1159
+ if (
1160
+ previous.get("speaker") == entry.get("speaker")
1161
+ and abs(float(previous.get("end", 0.0)) - float(entry.get("start", 0.0))) < 0.2
1162
+ ):
1163
+ previous["end"] = max(
1164
+ float(previous.get("end", 0.0)), float(entry.get("end", 0.0))
1165
+ )
1166
+ else:
1167
+ merged_entries.append(dict(entry))
1168
+
1169
+ result_payload: MutableMapping[str, Any] = {
1170
+ "speakers": merged_entries,
1171
+ "transcript": transcript_segments,
1172
+ }
1173
+ _write_diarization_cache(cache_path, result_payload)
1174
+
1175
+ return result_payload
1176
+
1177
+
1178
+ def _load_cached_diarization(
1179
+ cache_path: str,
1180
+ ) -> Optional[MutableMapping[str, Any]]:
1181
+ """Load diarization results from cache when available."""
1182
+
1183
+ if not os.path.exists(cache_path):
1184
+ return None
1185
+
1186
+ try:
1187
+ with open(cache_path, "r", encoding="utf-8") as handle:
1188
+ payload = json.load(handle)
1189
+ except (OSError, json.JSONDecodeError):
1190
+ return None
1191
+
1192
+ if not isinstance(payload, MutableMapping):
1193
+ return None
1194
+
1195
+ speakers = payload.get("speakers")
1196
+ transcript = payload.get("transcript")
1197
+ if not isinstance(speakers, list) or not isinstance(transcript, list):
1198
+ return None
1199
+
1200
+ return payload
1201
+
1202
+
1203
+ def _write_diarization_cache(
1204
+ cache_path: str, payload: MutableMapping[str, Any]
1205
+ ) -> None:
1206
+ """Persist diarization payload to cache, ignoring failures."""
1207
+
1208
+ directory = os.path.dirname(cache_path)
1209
+ try:
1210
+ os.makedirs(directory, exist_ok=True)
1211
+ with tempfile.NamedTemporaryFile(
1212
+ mode="w",
1213
+ encoding="utf-8",
1214
+ delete=False,
1215
+ dir=directory,
1216
+ ) as handle:
1217
+ json.dump(payload, handle, ensure_ascii=False)
1218
+ temp_path = handle.name
1219
+ os.replace(temp_path, cache_path)
1220
+ except OSError:
1221
+ try:
1222
+ if "temp_path" in locals() and os.path.exists(temp_path):
1223
+ os.unlink(temp_path)
1224
+ except OSError:
1225
+ pass
1226
+
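+ # Design note: writing to a NamedTemporaryFile in the target directory and
+ # then os.replace()-ing it over the cache path keeps the update atomic, so a
+ # concurrent reader sees either the old payload or the complete new one.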
1227
+
1228
+ def _coerce_response_to_dict(response: object) -> MutableMapping[str, Any]:
1229
+ """Convert Azure OpenAI response to a dictionary."""
1230
+
1231
+ if response is None:
1232
+ return {}
1233
+ if isinstance(response, MutableMapping):
1234
+ return response
1235
+ if hasattr(response, "model_dump"):
1236
+ return response.model_dump() # type: ignore[return-value]
1237
+ if hasattr(response, "to_dict"):
1238
+ return response.to_dict() # type: ignore[call-arg]
1239
+ if hasattr(response, "__dict__"):
1240
+ return {
1241
+ key: value
1242
+ for key, value in response.__dict__.items()
1243
+ if not key.startswith("_")
1244
+ }
1245
+ return {}
1246
+
1247
+
1248
+ def _iter_nested_mappings(
1249
+ payload: Mapping[str, Any]
1250
+ ) -> Iterator[MutableMapping[str, Any]]:
1251
+ """Yield nested mappings discovered within the payload."""
1252
+
1253
+ if not isinstance(payload, MutableMapping):
1254
+ return
1255
+
1256
+ stack: List[MutableMapping[str, Any]] = [payload]
1257
+ seen_ids: set[int] = set()
1258
+
1259
+ while stack:
1260
+ current = stack.pop()
1261
+ identifier = id(current)
1262
+ if identifier in seen_ids:
1263
+ continue
1264
+ seen_ids.add(identifier)
1265
+ yield current
1266
+ for value in current.values():
1267
+ if isinstance(value, MutableMapping):
1268
+ stack.append(value)
1269
+ elif isinstance(value, list):
1270
+ for item in value:
1271
+ if isinstance(item, MutableMapping):
1272
+ stack.append(item)
1273
+
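+ # Illustrative traversal: for {"a": {"b": 1}, "c": [{"d": 2}]} the iterator
+ # yields the outer mapping first, then the nested mappings {"b": 1} and
+ # {"d": 2} in stack (LIFO) order, with the id() set guarding against cycles.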
1274
+
1275
+ _ISO_8601_DURATION_RE = re.compile(
1276
+ r"^P(?:(?P<days>\d+(?:\.\d+)?)D)?"
1277
+ r"(?:T(?:(?P<hours>\d+(?:\.\d+)?)H)?(?:(?P<minutes>\d+(?:\.\d+)?)M)?"
1278
+ r"(?:(?P<seconds>\d+(?:\.\d+)?)S)?)?$",
1279
+ re.IGNORECASE,
1280
+ )
1281
+
1282
+
1283
+ def _parse_time_string(value: str) -> Optional[float]:
1284
+ """Parse textual time representation into seconds."""
1285
+
1286
+ text = value.strip()
1287
+ if not text:
1288
+ return None
1289
+
1290
+ lower = text.lower()
1291
+
1292
+ match = _ISO_8601_DURATION_RE.match(text)
1293
+ if match:
1294
+ total = 0.0
1295
+ days = match.group("days")
1296
+ hours = match.group("hours")
1297
+ minutes = match.group("minutes")
1298
+ seconds = match.group("seconds")
1299
+ if days:
1300
+ total += float(days) * 86_400.0
1301
+ if hours:
1302
+ total += float(hours) * 3_600.0
1303
+ if minutes:
1304
+ total += float(minutes) * 60.0
1305
+ if seconds:
1306
+ total += float(seconds)
1307
+ return total if total > 0.0 else 0.0
1308
+
1309
+ if ":" in text:
1310
+ parts = text.split(":")
1311
+ try:
1312
+ parts = [float(part) for part in parts]
1313
+ except ValueError:
1314
+ return None
1315
+ seconds = 0.0
1316
+ for part in parts:
1317
+ seconds = seconds * 60.0 + part
1318
+ return seconds
1319
+
1320
+ if lower.endswith("ms"):
1321
+ numeric = lower[:-2].strip()
1322
+ try:
1323
+ return float(numeric) / 1000.0
1324
+ except ValueError:
1325
+ return None
1326
+ if lower.endswith("s") and lower[-2:] != "ms":
1327
+ numeric = lower[:-1].strip()
1328
+ try:
1329
+ return float(numeric)
1330
+ except ValueError:
1331
+ return None
1332
+
1333
+ try:
1334
+ return float(text)
1335
+ except ValueError:
1336
+ return None
1337
+
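+ # Illustrative conversions performed above (results in seconds):
+ #   "PT1H2M3S" -> 3723.0   (ISO 8601 duration)
+ #   "01:02:03" -> 3723.0   (colon-separated clock time)
+ #   "1500ms"   -> 1.5      (millisecond suffix)
+ #   "90s"      -> 90.0     (second suffix)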
1338
+
1339
+ def _coerce_time_value(value: Any, key_hint: Optional[str] = None) -> Optional[float]:
1340
+ """Convert a raw value into seconds, respecting millisecond hints."""
1341
+
1342
+ if isinstance(value, (list, tuple, set)):
1343
+ for item in value:
1344
+ coerced = _coerce_time_value(item, key_hint)
1345
+ if coerced is not None:
1346
+ return coerced
1347
+ return None
1348
+
1349
+ if isinstance(value, Mapping):
1350
+ seconds_component = None
1351
+ nanos_component = None
1352
+
1353
+ if "seconds" in value:
1354
+ seconds_component = _coerce_time_value(value.get("seconds"), "seconds")
1355
+ if "nanos" in value:
1356
+ nanos_component = _coerce_time_value(value.get("nanos"), "nanos")
1357
+ if "nanoSeconds" in value:
1358
+ nanos_component = _coerce_time_value(value.get("nanoSeconds"), "nanos")
1359
+ if seconds_component is not None or nanos_component is not None:
1360
+ total = seconds_component or 0.0
1361
+ if nanos_component is not None:
1362
+ total += nanos_component  # already converted to seconds via the "nanos" hint
1363
+ return total
1364
+
1365
+ units = value.get("unit") or value.get("units")
1366
+ unit_value = value.get("value")
1367
+ if unit_value is not None:
1368
+ numeric = _coerce_time_value(unit_value, units or key_hint)
1369
+ if numeric is not None:
1370
+ # the unit hint passed above already scaled ms/millis values,
+ # so return the result without dividing a second time
+ return numeric
1375
+
1376
+ candidate_keys = (
1377
+ "start",
1378
+ "end",
1379
+ "offset",
1380
+ "offset_seconds",
1381
+ "offsetSeconds",
1382
+ "offset_ms",
1383
+ "offsetMs",
1384
+ "offset_millis",
1385
+ "offsetMillis",
1386
+ "time",
1387
+ "time_seconds",
1388
+ "timeMillis",
1389
+ "time_ms",
1390
+ "milliseconds",
1391
+ "millis",
1392
+ "ms",
1393
+ )
1394
+ for candidate in candidate_keys:
1395
+ if candidate in value:
1396
+ nested = _coerce_time_value(value.get(candidate), candidate)
1397
+ if nested is not None:
1398
+ return nested
1399
+
1400
+ # Explore nested dictionaries to find any numeric field.
1401
+ for nested_value in value.values():
1402
+ nested = _coerce_time_value(nested_value, key_hint)
1403
+ if nested is not None:
1404
+ return nested
1405
+
1406
+ return None
1407
+
1408
+ if isinstance(value, str):
1409
+ return _parse_time_string(value)
1410
+
1411
+ try:
1412
+ numeric = float(value)
1413
+ except (TypeError, ValueError):
1414
+ return None
1415
+
1416
+ if key_hint:
1417
+ lower = str(key_hint).lower()
1418
+ if "nano" in lower:
1419
+ return numeric / 1_000_000_000
1420
+ if "ms" in lower or "millis" in lower:
1421
+ numeric /= 1000.0
1422
+
1423
+ return _to_seconds(numeric)
1424
+
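+ # Illustrative mapping inputs for _coerce_time_value (results in seconds,
+ # given that each unit hint is applied exactly once):
+ #   {"seconds": 1, "nanos": 500_000_000}     -> 1.5
+ #   {"value": 2500, "unit": "milliseconds"}  -> 2.5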
1425
+
1426
+ def _find_time_value(obj: Any, keys: Sequence[str], visited: Optional[set[int]] = None) -> Optional[float]:
1427
+ """Recursively search for a time value using candidate keys."""
1428
+
1429
+ if visited is None:
1430
+ visited = set()
1431
+
1432
+ if isinstance(obj, Mapping):
1433
+ identifier = id(obj)
1434
+ if identifier in visited:
1435
+ return None
1436
+ visited.add(identifier)
1437
+
1438
+ for key in keys:
1439
+ if key not in obj:
1440
+ continue
1441
+ value = obj[key]
1442
+ if isinstance(value, Mapping):
1443
+ nested = _find_time_value(value, keys, visited)
1444
+ if nested is not None:
1445
+ return nested
1446
+ else:
1447
+ coerced = _coerce_time_value(value, key)
1448
+ if coerced is not None:
1449
+ return coerced
1450
+
1451
+ for container_key in _TIME_CONTAINER_KEYS:
1452
+ value = obj.get(container_key)
1453
+ if value is None:
1454
+ continue
1455
+ nested = _find_time_value(value, keys, visited)
1456
+ if nested is not None:
1457
+ return nested
1458
+
1459
+ elif isinstance(obj, list):
1460
+ for item in obj:
1461
+ nested = _find_time_value(item, keys, visited)
1462
+ if nested is not None:
1463
+ return nested
1464
+
1465
+ return None
1466
+
1467
+
1468
+ def _extract_time_range(segment: Mapping[str, Any]) -> Tuple[Optional[float], Optional[float]]:
1469
+ """Extract start and end seconds from a diarization/transcript segment."""
1470
+
1471
+ start = _find_time_value(segment, _TIME_START_KEYS)
1472
+ end = _find_time_value(segment, _TIME_END_KEYS)
1473
+
1474
+ if start is not None and end is None:
1475
+ duration = _find_time_value(segment, _TIME_DURATION_KEYS)
1476
+ if duration is not None:
1477
+ end = start + duration
1478
+
1479
+ if start is None and end is not None:
1480
+ duration = _find_time_value(segment, _TIME_DURATION_KEYS)
1481
+ if duration is not None:
1482
+ start = end - duration
1483
+
1484
+ if start is None:
1485
+ duration = _find_time_value(segment, _TIME_DURATION_KEYS)
1486
+ if duration is not None:
1487
+ start = 0.0
1488
+ end = duration if end is None else end
1489
+
1490
+ if start is not None and end is not None and end < start:
1491
+ end = start
1492
+
1493
+ return start, end
1494
+
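+ # Example: {"startMs": 1000, "duration_ms": 2000} resolves to (1.0, 3.0);
+ # when only a duration is present, the range is anchored at 0.0.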
1495
+
1496
+ def _extract_speaker_label(segment: Mapping[str, Any]) -> Optional[str]:
1497
+ """Derive speaker label from various field aliases."""
1498
+
1499
+ for key in _SPEAKER_KEYS:
1500
+ if key not in segment:
1501
+ continue
1502
+ value = segment.get(key)
1503
+ if isinstance(value, str):
1504
+ stripped = value.strip()
1505
+ if stripped:
1506
+ return stripped
1507
+ elif isinstance(value, Mapping):
1508
+ nested = _extract_speaker_label(value)
1509
+ if nested:
1510
+ return nested
1511
+
1512
+ speaker_info = segment.get("speaker_info")
1513
+ if isinstance(speaker_info, Mapping):
1514
+ nested = _extract_speaker_label(speaker_info)
1515
+ if nested:
1516
+ return nested
1517
+
1518
+ for alias in ("info", "metadata", "details", "properties"):
1519
+ nested_container = segment.get(alias)
1520
+ if isinstance(nested_container, Mapping):
1521
+ nested = _extract_speaker_label(nested_container)
1522
+ if nested:
1523
+ return nested
1524
+ elif isinstance(nested_container, list):
1525
+ for item in nested_container:
1526
+ if isinstance(item, Mapping):
1527
+ nested = _extract_speaker_label(item)
1528
+ if nested:
1529
+ return nested
1530
+
1531
+ return None
1532
+
1533
+
1534
+ def _extract_text_value(obj: Any, visited: Optional[set[int]] = None) -> Optional[str]:
1535
+ """Collect textual content from nested structures."""
1536
+
1537
+ if visited is None:
1538
+ visited = set()
1539
+
1540
+ if isinstance(obj, str):
1541
+ stripped = obj.strip()
1542
+ return stripped or None
1543
+
1544
+ if isinstance(obj, Mapping):
1545
+ identifier = id(obj)
1546
+ if identifier in visited:
1547
+ return None
1548
+ visited.add(identifier)
1549
+
1550
+ for key in _TEXT_VALUE_KEYS:
1551
+ if key not in obj:
1552
+ continue
1553
+ candidate = _extract_text_value(obj[key], visited)
1554
+ if candidate:
1555
+ return candidate
1556
+
1557
+ for key in _TEXT_COLLECTION_KEYS:
1558
+ if key not in obj:
1559
+ continue
1560
+ candidate = _extract_text_value(obj[key], visited)
1561
+ if candidate:
1562
+ return candidate
1563
+
1564
+ if isinstance(obj, list):
1565
+ pieces: List[str] = []
1566
+ for item in obj:
1567
+ candidate = _extract_text_value(item, visited)
1568
+ if candidate:
1569
+ pieces.append(candidate)
1570
+ if pieces:
1571
+ return " ".join(pieces).strip()
1572
+
1573
+ return None
1574
+
1575
+
1576
+ def _extract_diarization_segments(
1577
+ payload: MutableMapping[str, Any]
1578
+ ) -> List[MutableMapping[str, float | str]]:
1579
+ """Extract diarization segments from verbose JSON payload."""
1580
+
1581
+ normalized: List[MutableMapping[str, float | str]] = []
1582
+ seen = set()
1583
+
1584
+ for mapping in _iter_nested_mappings(payload):
1585
+ segment = _normalize_segment_entry(mapping)
1586
+ if segment is None:
1587
+ continue
1588
+ fingerprint = (
1589
+ round(segment.get("start", 0.0), 3),
1590
+ round(segment.get("end", 0.0), 3),
1591
+ segment.get("speaker"),
1592
+ )
1593
+ if fingerprint in seen:
1594
+ continue
1595
+ seen.add(fingerprint)
1596
+ normalized.append(segment)
1597
+
1598
+ return normalized
1602
+
1603
+
1604
+ def _extract_transcript_segments(
1605
+ payload: MutableMapping[str, Any]
1606
+ ) -> List[MutableMapping[str, float | str]]:
1607
+ """Extract transcript segments with text from payload."""
1608
+
1609
+ transcript_segments: List[MutableMapping[str, float | str]] = []
1610
+ seen = set()
1611
+
1612
+ for mapping in _iter_nested_mappings(payload):
1613
+ normalized = _normalize_transcript_entry(mapping)
1614
+ if normalized is None:
1615
+ continue
1616
+ fingerprint = (
1617
+ round(normalized.get("start", 0.0), 3),
1618
+ round(normalized.get("end", 0.0), 3),
1619
+ normalized.get("text"),
1620
+ )
1621
+ if fingerprint in seen:
1622
+ continue
1623
+ seen.add(fingerprint)
1624
+ transcript_segments.append(normalized)
1625
+
1626
+ transcript_segments.sort(key=lambda item: item.get("start", 0.0))
1627
+ return transcript_segments
1628
+
1629
+
1630
+ def _normalize_segment_entry(
1631
+ segment: MutableMapping[str, Any]
1632
+ ) -> Optional[MutableMapping[str, float | str]]:
1633
+ """Normalize a diarization segment into start/end/speaker fields."""
1634
+
1635
+ if not isinstance(segment, Mapping):
1636
+ return None
1637
+
1638
+ start, end = _extract_time_range(segment)
1639
+ if start is None:
1640
+ return None
1641
+
1642
+ if end is None:
1643
+ end = start
1644
+
1645
+ speaker = _extract_speaker_label(segment)
1646
+ if speaker is None:
1647
+ return None
1648
+
1649
+ return {
1650
+ "start": float(start),
1651
+ "end": float(end),
1652
+ "speaker": str(speaker),
1653
+ }
1654
+
1655
+
1656
+ def _normalize_transcript_entry(
1657
+ segment: MutableMapping[str, Any]
1658
+ ) -> Optional[MutableMapping[str, float | str]]:
1659
+ """Normalize transcript segment ensuring text exists."""
1660
+
1661
+ if not isinstance(segment, Mapping):
1662
+ return None
1663
+
1664
+ text = _extract_text_value(segment)
1665
+ if not text:
1666
+ return None
1667
+
1668
+ start, end = _extract_time_range(segment)
1669
+ if start is None:
1670
+ return None
1671
+
1672
+ if end is None:
1673
+ end = start
1674
+
1675
+ entry: MutableMapping[str, float | str] = {
1676
+ "start": float(start),
1677
+ "end": float(end),
1678
+ "text": text.strip(),
1679
+ }
1680
+
1681
+ speaker = _extract_speaker_label(segment)
1682
+ if speaker is not None:
1683
+ entry["speaker"] = str(speaker)
1684
+
1685
+ return entry
1686
+
1687
+
1688
+ def _to_seconds(value: Any) -> float:
1689
+ """Convert a value that may be in seconds or ticks to seconds."""
1690
+
1691
+ try:
1692
+ numeric = float(value)
1693
+ except (TypeError, ValueError): # pragma: no cover - defensive
1694
+ return 0.0
1695
+
1696
+ if numeric > 1_000_000: # heuristically treat as 100-ns ticks
1697
+ return numeric / 10_000_000
1698
+ return numeric
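+ # Example: 15_000_000_000 exceeds the 1_000_000 threshold, so it is read as
+ # 100-ns ticks and becomes 1500.0 seconds; 90.0 is returned unchanged.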
1699
+
1700
+
1701
+ def _limit_speaker_count(
1702
+ segments: List[MutableMapping[str, float | str]], max_speakers: int
1703
+ ) -> List[MutableMapping[str, float | str]]:
1704
+ """Remap segments to ensure the number of speakers does not exceed limit."""
1705
+
1706
+ identified = [
1707
+ segment.get("speaker")
1708
+ for segment in segments
1709
+ if isinstance(segment.get("speaker"), str)
1710
+ ]
1711
+ unique_speakers = {speaker for speaker in identified if speaker is not None}
1712
+ if len(unique_speakers) <= max_speakers:
1713
+ return segments
1714
+
1715
+ durations: defaultdict[str, float] = defaultdict(float)
1716
+ for segment in segments:
1717
+ speaker = segment.get("speaker")
1718
+ if not isinstance(speaker, str):
1719
+ continue
1720
+ duration = max(
1721
+ 0.0, float(segment.get("end", 0.0)) - float(segment.get("start", 0.0))
1722
+ )
1723
+ durations[speaker] += duration
1724
+
1725
+ ranked_speakers = [
1726
+ speaker for speaker, _ in sorted(durations.items(), key=lambda item: item[1], reverse=True)
1727
+ ]
1728
+ allowed = ranked_speakers[:max_speakers]
1729
+ if not allowed:
1730
+ return segments
1731
+
1732
+ totals = {speaker: durations.get(speaker, 0.0) for speaker in allowed}
1733
+ mapping: MutableMapping[str, str] = {}
1734
+ remapped: List[MutableMapping[str, float | str]] = []
1735
+
1736
+ for segment in segments:
1737
+ speaker = segment.get("speaker")
1738
+ if speaker is None or speaker in allowed:
1739
+ remapped.append(segment)
1740
+ continue
1741
+ if speaker not in mapping:
1742
+ target = min(allowed, key=lambda value: totals.get(value, 0.0))
1743
+ mapping[speaker] = target
1744
+ target = mapping[speaker]
1745
+ updated = dict(segment)
1746
+ updated["speaker"] = target
1747
+ duration = max(
1748
+ 0.0, float(updated.get("end", 0.0)) - float(updated.get("start", 0.0))
1749
+ )
1750
+ totals[target] = totals.get(target, 0.0) + duration
1751
+ remapped.append(updated)
1752
+
1753
+ return remapped
1754
+
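+ # Sketch of the remapping: with max_speakers=2 and accumulated durations
+ # {"A": 300.0, "B": 200.0, "C": 10.0}, only "A" and "B" survive; each "C"
+ # segment is folded into whichever survivor has the smaller running total,
+ # here "B".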
1755
+
1756
+
1757
+ def _extract_openai_error_message(exc: Exception) -> str:
1758
+ """Extract a user-friendly message from an OpenAI exception."""
1759
+
1760
+ body = getattr(exc, "body", None)
1761
+ if isinstance(body, Mapping):
1762
+ error = body.get("error")
1763
+ if isinstance(error, Mapping):
1764
+ message = error.get("message")
1765
+ if isinstance(message, str) and message.strip():
1766
+ return message.strip()
1767
+ text = str(exc)
1768
+ return text.strip() or "Unknown Azure OpenAI error"
1769
+
1770
+
1771
+ def _load_summary_prompt_file(path: str) -> str:
1772
+ """Load custom summary prompt content from a file."""
1773
+
1774
+ absolute_path = os.path.abspath(path)
1775
+ if not os.path.isfile(absolute_path):
1776
+ raise RuntimeError(f"摘要 Prompt 配置文件不存在:{path}")
1777
+
1778
+ try:
1779
+ with open(absolute_path, "r", encoding="utf-8") as handle:
1780
+ content = handle.read()
1781
+ except OSError as exc:
1782
+ raise RuntimeError(f"读取摘要 Prompt 配置文件失败:{path}") from exc
1783
+
1784
+ if not content.strip():
1785
+ raise RuntimeError(f"摘要 Prompt 配置文件内容为空:{path}")
1786
+
1787
+ return content
1788
+
1789
+
1790
+ def _load_prompt_with_fallback(path: Path | str, fallback: str) -> str:
1791
+ try:
1792
+ return Path(path).read_text(encoding="utf-8")
1793
+ except OSError:
1794
+ return fallback
1795
+
1796
+
1797
+ def _load_default_summary_prompt() -> str:
1798
+ return _load_prompt_with_fallback(DEFAULT_SUMMARY_PROMPT_PATH, SUMMARY_PROMPT)
1799
+
1800
+
1801
+ def _load_default_article_prompt() -> str:
1802
+ return _load_prompt_with_fallback(
1803
+ DEFAULT_ARTICLE_PROMPT_PATH, ARTICLE_SUMMARY_PROMPT
1804
+ )
1805
+
1806
+
1807
+ def generate_translation_summary(
1808
+ segments: Sequence[MutableMapping[str, Any]],
1809
+ video_url: str,
1810
+ prompt: Optional[str] = None,
1811
+ metadata: Optional[Mapping[str, Any]] = None,
1812
+ ) -> MutableMapping[str, Any]:
1813
+ """Call Azure GPT-5 to translate and summarize ASR segments."""
1814
+
1815
+ if not segments:
1816
+ raise RuntimeError("无法生成翻译摘要:缺少 ASR 结果。")
1817
+
1818
+ azure_key = os.getenv("AZURE_OPENAI_API_KEY")
1819
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
1820
+ if not azure_key or not azure_endpoint:
1821
+ raise RuntimeError(
1822
+ "Azure OpenAI 凭据缺失。请设置 AZURE_OPENAI_API_KEY与 AZURE_OPENAI_ENDPOINT。"
1823
+ )
1824
+
1825
+ deployment = os.getenv("AZURE_OPENAI_SUMMARY_DEPLOYMENT") or "llab-gpt-5-pro"
1826
+
1827
+ instruction = prompt or _load_default_summary_prompt()
1828
+ timeline = _format_segments_for_summary(segments)
1829
+ user_message = "原始 ASR 片段如下:\n" + timeline
1830
+
1831
+ use_responses = deployment.endswith("-pro") or str(
1832
+ os.getenv("AZURE_OPENAI_USE_RESPONSES", "")
1833
+ ).lower() in {"1", "true", "yes"}
1834
+
1835
+ if use_responses:
1836
+ base_url = os.getenv("AZURE_OPENAI_RESPONSES_BASE_URL")
1837
+ if not base_url:
1838
+ base_url = _build_responses_base_url(azure_endpoint)
1839
+
1840
+ try:
1841
+ from openai import OpenAI
1842
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
1843
+ raise RuntimeError(
1844
+ "openai 库未安装。请执行 `pip install openai`."
1845
+ ) from exc
1846
+
1847
+ client = OpenAI(base_url=base_url.rstrip("/"), api_key=azure_key)
1848
+ response = client.responses.create(
1849
+ model=deployment,
1850
+ input=[
1851
+ {
1852
+ "role": "system",
1853
+ "content": [{"type": "input_text", "text": instruction}],
1854
+ },
1855
+ {
1856
+ "role": "user",
1857
+ "content": [{"type": "input_text", "text": user_message}],
1858
+ },
1859
+ ],
1860
+ max_output_tokens=16384,
1861
+ )
1862
+ raw_summary = _extract_responses_text(response)
1863
+ else:
1864
+ summary_api_version = (
1865
+ os.getenv("AZURE_OPENAI_SUMMARY_API_VERSION") or "2025-01-01-preview"
1866
+ )
1867
+
1868
+ try:
1869
+ from openai import AzureOpenAI
1870
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
1871
+ raise RuntimeError(
1872
+ "openai 库未安装。请执行 `pip install openai`."
1873
+ ) from exc
1874
+
1875
+ client = AzureOpenAI(
1876
+ api_key=azure_key,
1877
+ api_version=summary_api_version,
1878
+ azure_endpoint=azure_endpoint,
1879
+ )
1880
+
1881
+ response = client.chat.completions.create(
1882
+ model=deployment,
1883
+ messages=[
1884
+ {"role": "system", "content": instruction},
1885
+ {"role": "user", "content": user_message},
1886
+ ],
1887
+ max_completion_tokens=16384,
1888
+ )
1889
+ raw_summary = _extract_summary_text(response)
1890
+
1891
+ if metadata is not None:
1892
+ video_metadata = dict(metadata)
1893
+ else:
1894
+ video_metadata = _fetch_video_metadata(video_url)
1895
+ return _compose_summary_documents(segments, raw_summary, video_metadata, video_url)
1896
+
1897
+
1898
+ def _format_segments_for_summary(
1899
+ segments: Sequence[MutableMapping[str, Any]]
1900
+ ) -> str:
1901
+ """Format segments into timeline text for summarization prompt."""
1902
+
1903
+ lines = []
1904
+ for segment in sorted(
1905
+ segments, key=lambda item: float(item.get("start", 0.0))
1906
+ ):
1907
+ text = segment.get("text")
1908
+ if not isinstance(text, str) or not text.strip():
1909
+ continue
1910
+ start = _format_timestamp(segment.get("start", 0.0))
1911
+ end = _format_timestamp(segment.get("end", segment.get("start", 0.0)))
1912
+ speaker = segment.get("speaker")
1913
+ if isinstance(speaker, str) and speaker.strip():
1914
+ line = f"{start} - {end} | {speaker.strip()}: {text.strip()}"
1915
+ else:
1916
+ line = f"{start} - {end} | {text.strip()}"
1917
+ lines.append(line)
1918
+
1919
+ if not lines:
1920
+ raise RuntimeError("无法生成翻译摘要:ASR 结果中缺少有效文本。")
1921
+
1922
+ return "\n".join(lines)
1923
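+
+ # The timeline text handed to the summarization prompt looks like this
+ # (content is illustrative only):
+ #
+ #   00:00:00 - 00:00:05 | SPEAKER_1: Welcome to the show.
+ #   00:00:05 - 00:00:12 | SPEAKER_2: Thanks for having me.
+ #
+ # Segments without a speaker label drop the "SPEAKER_x: " part.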
+
1924
+
1925
+ def _normalize_domain_tag(raw: str) -> str:
1926
+ text = str(raw or "").strip()
1927
+ if not text:
1928
+ return ""
1929
+
1930
+ first_line = text.splitlines()[0].strip()
1931
+ first_line = re.sub(r"^[\s\-\*\u2022\d\.、]+", "", first_line)
1932
+
1933
+ if ":" in first_line or ":" in first_line:
1934
+ parts = re.split(r"[::]", first_line, maxsplit=1)
1935
+ if len(parts) == 2 and parts[1].strip():
1936
+ first_line = parts[1].strip()
1937
+
1938
+ for separator in (",", ",", "。", ";", ";"):
1939
+ if separator in first_line:
1940
+ first_line = first_line.split(separator, 1)[0].strip()
1941
+
1942
+ if " " in first_line:
1943
+ first_line = first_line.split(" ", 1)[0].strip()
1944
+
1945
+ return first_line[:32].strip()
1946
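+
+ # Normalization sketch (hypothetical model output): leading list markers are
+ # stripped, a "领域:" style prefix is removed, and the value is cut at the
+ # first separator, e.g. "领域:人工智能, 机器学习" -> "人工智能".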
+
1947
+
1948
+ def _infer_domain_from_summary(raw_summary: str) -> Optional[str]:
1949
+ text = str(raw_summary or "").strip()
1950
+ if not text:
1951
+ return None
1952
+
1953
+ azure_key = os.getenv("AZURE_OPENAI_API_KEY")
1954
+ azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
1955
+ if not azure_key or not azure_endpoint:
1956
+ return None
1957
+
1958
+ deployment = (
1959
+ os.getenv("AZURE_OPENAI_DOMAIN_DEPLOYMENT")
1960
+ or os.getenv("AZURE_OPENAI_SUMMARY_DEPLOYMENT")
1961
+ or "llab-gpt-5-pro"
1962
+ )
1963
+
1964
+ use_responses = deployment.endswith("-pro") or str(
1965
+ os.getenv("AZURE_OPENAI_USE_RESPONSES", "")
1966
+ ).lower() in {"1", "true", "yes"}
1967
+
1968
+ try:
1969
+ if use_responses:
1970
+ base_url = os.getenv("AZURE_OPENAI_RESPONSES_BASE_URL")
1971
+ if not base_url:
1972
+ base_url = _build_responses_base_url(azure_endpoint)
1973
+
1974
+ from openai import OpenAI
1975
+
1976
+ client = OpenAI(base_url=base_url.rstrip("/"), api_key=azure_key)
1977
+ response = client.responses.create(
1978
+ model=deployment,
1979
+ input=[
1980
+ {
1981
+ "role": "system",
1982
+ "content": [{"type": "input_text", "text": DOMAIN_PROMPT}],
1983
+ },
1984
+ {
1985
+ "role": "user",
1986
+ "content": [{"type": "input_text", "text": text}],
1987
+ },
1988
+ ],
1989
+ max_output_tokens=1024,
1990
+ )
1991
+ content = _extract_responses_text(response)
1992
+ else:
1993
+ summary_api_version = (
1994
+ os.getenv("AZURE_OPENAI_SUMMARY_API_VERSION") or "2025-01-01-preview"
1995
+ )
1996
+
1997
+ from openai import AzureOpenAI
1998
+
1999
+ client = AzureOpenAI(
2000
+ api_key=azure_key,
2001
+ api_version=summary_api_version,
2002
+ azure_endpoint=azure_endpoint,
2003
+ )
2004
+ response = client.chat.completions.create(
2005
+ model=deployment,
2006
+ messages=[
2007
+ {"role": "system", "content": DOMAIN_PROMPT},
2008
+ {"role": "user", "content": text},
2009
+ ],
2010
+ max_completion_tokens=1024,
2011
+ )
2012
+ content = _extract_summary_text(response)
2013
+ except Exception: # pragma: no cover - depends on runtime environment
2014
+ return None
2015
+
2016
+ candidate = _normalize_domain_tag(content)
2017
+ if not candidate:
2018
+ return None
2019
+ return candidate
2020
+
2021
+
2022
+ def _compose_summary_documents(
2023
+ segments: Sequence[MutableMapping[str, Any]],
2024
+ raw_summary: str,
2025
+ video_metadata: Mapping[str, Any],
2026
+ video_url: str,
2027
+ ) -> MutableMapping[str, Any]:
2028
+ if not raw_summary or not raw_summary.strip():
2029
+ raise RuntimeError("Azure GPT-5 摘要结果为空。")
2030
+
2031
+ sorted_segments = sorted(
2032
+ (dict(segment) for segment in segments),
2033
+ key=lambda item: float(item.get("start", 0.0)),
2034
+ )
2035
+
2036
+ generated_at = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
2037
+ if sorted_segments:
2038
+ starts = [float(item.get("start", 0.0)) for item in sorted_segments]
2039
+ ends = [float(item.get("end", item.get("start", 0.0))) for item in sorted_segments]
2040
+ earliest = min(starts)
2041
+ latest = max(ends)
2042
+ total_duration = max(0.0, latest - earliest)
2043
+ else:
2044
+ total_duration = 0.0
2045
+
2046
+ formatted_duration = _format_timestamp(total_duration)
2047
+ speakers = sorted(
2048
+ {
2049
+ str(item.get("speaker", "")).strip()
2050
+ for item in sorted_segments
2051
+ if str(item.get("speaker", "")).strip()
2052
+ }
2053
+ )
2054
+
2055
+ title_raw = re.sub(r"[:\/\\`]", "", str(video_metadata.get("title") or ""))
2056
+ title = str(title_raw or "未知标题").strip() or "未知标题"
2057
+ publish_date_raw = str(video_metadata.get("upload_date") or "").strip()
2058
+ publish_date = _format_publish_date(publish_date_raw)
2059
+
2060
+ source_url = str(video_metadata.get("webpage_url") or video_url)
2061
+ domain = _extract_domain(video_metadata)
2062
+ if domain == "通用":
2063
+ generated_domain = _infer_domain_from_summary(raw_summary)
2064
+ if generated_domain:
2065
+ domain = generated_domain
2066
+
2067
+ total_words = _count_words(raw_summary)
2068
+ estimated_minutes = max(1, math.ceil(total_words / READING_WORDS_PER_MINUTE))
2069
+
2070
+ year, month = _derive_year_month(publish_date_raw, generated_at)
2071
+ year_month_code = f"{year}-M{month}"
2072
+ heading = f"【{domain}】{title}-{year_month_code}"
2073
+ file_base = _sanitize_filename_base(heading)
2074
+
2075
+ summary_lines: List[str] = []
2076
+ summary_lines.append(f"# {heading}")
2077
+ summary_lines.append("")
2078
+ summary_lines.append("## 封面")
2079
+ summary_lines.append(f"- 标题:{title}")
2080
+ summary_lines.append(f"- 链接:{source_url}")
2081
+ summary_lines.append(f"- 发布日期:{publish_date}")
2082
+ summary_lines.append(f"- 总字数:{total_words}")
2083
+ summary_lines.append(f"- 预估阅读时长:约 {estimated_minutes} 分钟")
2084
+ summary_lines.append(f"- 生成时间:{generated_at}")
2085
+ summary_lines.append(f"- 覆盖时长:{formatted_duration}")
2086
+
2087
+ summary_lines.append("")
2088
+ summary_lines.append(raw_summary.strip())
2089
+ summary_lines.append("")
2090
+ summary_lines.extend(_build_exchange_footer())
2091
+
2092
+ timeline_lines: List[str] = []
2093
+ timeline_lines.append(f"# {heading}")
2094
+ timeline_lines.append("")
2095
+ timeline_lines.append("## 封面")
2096
+ timeline_lines.append(f"- 标题:{title}")
2097
+ timeline_lines.append(f"- 链接:{source_url}")
2098
+ timeline_lines.append(f"- 发布日期:{publish_date}")
2099
+ timeline_lines.append(f"- 总字数:{total_words}")
2100
+ timeline_lines.append(f"- 预估阅读时长:约 {estimated_minutes} 分钟")
2101
+ timeline_lines.append(f"- 生成时间:{generated_at}")
2102
+ timeline_lines.append(f"- 覆盖时长:{formatted_duration}")
2103
+ if speakers:
2104
+ timeline_lines.append(f"- 识别说话人:{', '.join(speakers)}")
2105
+
2106
+ timeline_lines.append("")
2107
+ timeline_lines.append("## 时间轴")
2108
+ timeline_lines.append("| 序号 | 起始 | 结束 | 时长 | 说话人 | 文本 |")
2109
+ timeline_lines.append("| --- | --- | --- | --- | --- | --- |")
2110
+
2111
+ for index, segment in enumerate(sorted_segments, start=1):
2112
+ start_seconds = float(segment.get("start", 0.0))
2113
+ end_seconds = float(segment.get("end", start_seconds))
2114
+ duration_seconds = max(0.0, end_seconds - start_seconds)
2115
+ speaker = str(segment.get("speaker", "")).strip() or "-"
2116
+ text = str(segment.get("text", ""))
2117
+ cell_text = _sanitize_markdown_cell(text)
2118
+ timeline_lines.append(
2119
+ "| {idx} | {start} | {end} | {duration} | {speaker} | {text} |".format(
2120
+ idx=index,
2121
+ start=_format_timestamp(start_seconds),
2122
+ end=_format_timestamp(end_seconds),
2123
+ duration=_format_timestamp(duration_seconds),
2124
+ speaker=_sanitize_markdown_cell(speaker),
2125
+ text=cell_text,
2126
+ )
2127
+ )
2128
+
2129
+ metadata = {
2130
+ "generated_at": generated_at,
2131
+ "title": title,
2132
+ "url": source_url,
2133
+ "publish_date": publish_date,
2134
+ "total_words": total_words,
2135
+ "estimated_minutes": estimated_minutes,
2136
+ "duration": formatted_duration,
2137
+ "speakers": speakers,
2138
+ "domain": domain,
2139
+ "year": year,
2140
+ "month": month,
2141
+ }
2142
+
2143
+ return {
2144
+ "summary_markdown": "\n".join(summary_lines),
2145
+ "timeline_markdown": "\n".join(timeline_lines),
2146
+ "metadata": metadata,
2147
+ "total_words": total_words,
2148
+ "estimated_minutes": estimated_minutes,
2149
+ "file_base": file_base,
2150
+ "heading": heading,
2151
+ }
2152
+
2153
+
2154
+ def _build_exchange_footer() -> List[str]:
2155
+ weekday_map = {
2156
+ 0: "周一",
2157
+ 1: "周二",
2158
+ 2: "周三",
2159
+ 3: "周四",
2160
+ 4: "周五",
2161
+ 5: "周六",
2162
+ 6: "周日",
2163
+ }
2164
+
2165
+ now = datetime.now()
2166
+ zh_weekday = weekday_map.get(now.weekday(), "周?")
2167
+ formatted_date = f"{now.strftime('%Y-%m-%d')}_{zh_weekday}"
2168
+
2169
+ return [
2170
+ "## 欢迎交流与合作",
2171
+ "目前主要兴趣是探索agent的落地,想进一步交流可加微信(cleezhang),一些[自我介绍](https://lee-agi.github.io/85ed64eda0/)。",
2172
+ "",
2173
+ f"> 本文发表于 {formatted_date}。",
2174
+ ]
2175
+
2176
+
2177
+ def _build_responses_base_url(endpoint: str) -> str:
2178
+ normalized = endpoint.rstrip("/")
2179
+ if normalized.endswith("/openai/v1"):
2180
+ return normalized
2181
+ return f"{normalized}/openai/v1"
2182
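+
+ # Illustrative mapping (hypothetical resource name):
+ #   "https://myres.openai.azure.com"            -> "https://myres.openai.azure.com/openai/v1"
+ #   "https://myres.openai.azure.com/openai/v1/" -> "https://myres.openai.azure.com/openai/v1"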
+
2183
+
2184
+ def _extract_responses_text(response: object) -> str:
2185
+ candidate = getattr(response, "output_text", None)
2186
+ if isinstance(candidate, str) and candidate.strip():
2187
+ return candidate.strip()
2188
+
2189
+ data = None
2190
+ if isinstance(response, MutableMapping):
2191
+ data = response.get("output") or response.get("data") or response.get("choices")
2192
+ else:
2193
+ data = getattr(response, "output", None) or getattr(response, "data", None)
2194
+
2195
+ texts: List[str] = []
2196
+
2197
+ if isinstance(data, list):
2198
+ for item in data:
2199
+ content = None
2200
+ if isinstance(item, MutableMapping):
2201
+ content = item.get("content")
2202
+ else:
2203
+ content = getattr(item, "content", None)
2204
+
2205
+ if isinstance(content, list):
2206
+ for block in content:
2207
+ text_value = None
2208
+ if isinstance(block, MutableMapping):
2209
+ text_value = block.get("text")
2210
+ else:
2211
+ text_value = getattr(block, "text", None)
2212
+ if isinstance(text_value, str) and text_value.strip():
2213
+ texts.append(text_value.strip())
2214
+ elif isinstance(content, str) and content.strip():
2215
+ texts.append(content.strip())
2216
+
2217
+ if texts:
2218
+ return "\n".join(texts).strip()
2219
+
2220
+ raise RuntimeError("Azure GPT-5 未返回可用的摘要结果。")
2221
+
2222
+
2223
+ def _sanitize_markdown_cell(value: str) -> str:
2224
+ sanitized = value.replace("|", "\\|").replace("\n", " ").strip()
2225
+ return sanitized or "-"
2226
+
2227
+
2228
+ def _extract_summary_text(response: object) -> str:
2229
+ """Extract summary content from Azure chat completion response."""
2230
+
2231
+ choices = None
2232
+ if isinstance(response, MutableMapping):
2233
+ choices = response.get("choices")
2234
+ elif hasattr(response, "choices"):
2235
+ choices = getattr(response, "choices")
2236
+
2237
+ if not isinstance(choices, list) or not choices:
2238
+ raise RuntimeError("Azure GPT-5 未返回可用的摘要结果。")
2239
+
2240
+ first = choices[0]
2241
+ if isinstance(first, MutableMapping):
2242
+ message = first.get("message")
2243
+ else:
2244
+ message = getattr(first, "message", None)
2245
+
2246
+ if isinstance(message, MutableMapping):
2247
+ content = message.get("content")
2248
+ else:
2249
+ content = getattr(message, "content", None)
2250
+
2251
+ if isinstance(content, str) and content.strip():
2252
+ return content.strip()
2253
+
2254
+ raise RuntimeError("Azure GPT-5 摘要结果为空。")
2255
+
2256
+
2257
+ def _write_summary_documents(
2258
+ video_url: str,
2259
+ summary_markdown: str,
2260
+ timeline_markdown: str,
2261
+ file_base: str,
2262
+ ) -> Mapping[str, str]:
2263
+ if not summary_markdown or not summary_markdown.strip():
2264
+ raise RuntimeError("无法写入空的摘要 Markdown 内容。")
2265
+ if not timeline_markdown or not timeline_markdown.strip():
2266
+ raise RuntimeError("无法写入空的时间轴 Markdown 内容。")
2267
+
2268
+ directory = _resolve_video_cache_dir(video_url)
2269
+ summary_filename = f"{file_base}_summary.md"
2270
+ timeline_filename = f"{file_base}_timeline.md"
2271
+ summary_path = _ensure_unique_markdown_path(directory, summary_filename)
2272
+ timeline_path = _ensure_unique_markdown_path(directory, timeline_filename)
2273
+
2274
+ try:
2275
+ with open(summary_path, "w", encoding="utf-8") as summary_file:
2276
+ summary_file.write(summary_markdown)
2277
+ with open(timeline_path, "w", encoding="utf-8") as timeline_file:
2278
+ timeline_file.write(timeline_markdown)
2279
+ except OSError as exc: # pragma: no cover - filesystem failure
2280
+ raise RuntimeError(
2281
+ f"写入摘要/时间轴 Markdown 文件失败:{summary_path}, {timeline_path}"
2282
+ ) from exc
2283
+
2284
+ result: Dict[str, str] = {
2285
+ "summary": summary_path,
2286
+ "timeline": timeline_path,
2287
+ }
2288
+
2289
+ outbox_summary = _copy_file_to_outbox(summary_path)
2290
+ if outbox_summary is not None:
2291
+ result["outbox_summary"] = outbox_summary
2292
+
2293
+ return result
2294
+
2295
+
2296
+ def _ensure_unique_markdown_path(directory: str, filename: str) -> str:
2297
+ base_path = Path(directory)
2298
+ base_path.mkdir(parents=True, exist_ok=True)
2299
+ candidate = base_path / filename
2300
+ if not candidate.exists():
2301
+ return str(candidate)
2302
+
2303
+ stem = candidate.stem
2304
+ suffix = candidate.suffix or ".md"
2305
+ timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
2306
+ attempt = 1
2307
+ while True:
2308
+ if attempt == 1:
2309
+ new_name = f"{stem}_{timestamp}{suffix}"
2310
+ else:
2311
+ new_name = f"{stem}_{timestamp}_{attempt}{suffix}"
2312
+ candidate = base_path / new_name
2313
+ if not candidate.exists():
2314
+ return str(candidate)
2315
+ attempt += 1
2316
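+
+ # Naming sketch (hypothetical file and timestamp): if "foo_summary.md" exists,
+ # the next candidates are "foo_summary_20240315-101500.md" and then
+ # "foo_summary_20240315-101500_2.md", until an unused name is found.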
+
2317
+
2318
+ def _copy_file_to_outbox(source_path: str) -> Optional[str]:
2319
+ """Copy a generated file to the configured outbox directory."""
2320
+
2321
+ outbox_dir = _getenv("ANY2SUMMARY_OUTBOX_DIR", "PODCAST_TRANSFORMER_OUTBOX_DIR")
2322
+ if not outbox_dir:
2323
+ outbox_dir = DEFAULT_OUTBOX_DIR
2324
+ if not outbox_dir:
2325
+ return None
2326
+
2327
+ try:
2328
+ resolved_dir = os.path.expanduser(outbox_dir)
2329
+ os.makedirs(resolved_dir, exist_ok=True)
2330
+ target_path = os.path.join(resolved_dir, os.path.basename(source_path))
2331
+ shutil.copyfile(source_path, target_path)
2332
+ return target_path
2333
+ except OSError:
2334
+ return None
2335
+
2336
+
2337
+ def _fetch_video_metadata(video_url: str) -> Mapping[str, Any]:
2338
+ metadata: Dict[str, Any] = {
2339
+ "title": None,
2340
+ "webpage_url": video_url,
2341
+ "upload_date": "",
2342
+ }
2343
+
2344
+ try:
2345
+ import yt_dlp
2346
+ except ModuleNotFoundError:
2347
+ return metadata
2348
+
2349
+ ydl_opts = {
2350
+ "quiet": True,
2351
+ "skip_download": True,
2352
+ "nocheckcertificate": True,
2353
+ }
2354
+
2355
+ try: # pragma: no cover - depends on network availability
2356
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
2357
+ info = ydl.extract_info(video_url, download=False)
2358
+ except Exception:
2359
+ return metadata
2360
+
2361
+ if isinstance(info, Mapping):
2362
+ for key in ("title", "webpage_url", "upload_date"):
2363
+ if key in info:
2364
+ metadata[key] = info.get(key)
2365
+
2366
+ return metadata
2367
+
2368
+
2369
+ def _count_words(text: str) -> int:
2370
+ if not isinstance(text, str):
2371
+ return 0
2372
+
2373
+ stripped = text.strip()
2374
+ if not stripped:
2375
+ return 0
2376
+
2377
+ latin_tokens = re.findall(r"[A-Za-z0-9]+", stripped)
2378
+ chinese_tokens = re.findall(r"[\u4e00-\u9fff]", stripped)
2379
+
2380
+ total = len(latin_tokens) + len(chinese_tokens)
2381
+ if total == 0:
2382
+ compact = re.sub(r"\s", "", stripped)
2383
+ total = len(compact)
2384
+
2385
+ return max(total, 0)
2386
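+
+ # Counting sketch (hypothetical text): "GPT-5 模型很强" yields two Latin/digit
+ # tokens ("GPT", "5") plus four CJK characters, so _count_words returns 6;
+ # reading time is then estimated from this count via READING_WORDS_PER_MINUTE.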
+
2387
+
2388
+ def _format_timestamp(value: Any) -> str:
2389
+ """Format numeric seconds into HH:MM:SS.mmm."""
2390
+
2391
+ try:
2392
+ seconds = float(value)
2393
+ except (TypeError, ValueError):
2394
+ seconds = 0.0
2395
+
2396
+ if seconds < 0:
2397
+ seconds = 0.0
2398
+
2399
+ total_milliseconds = int(round(seconds * 1000))
2400
+ hours = total_milliseconds // 3_600_000
2401
+ minutes = (total_milliseconds // 60_000) % 60
2402
+ secs = (total_milliseconds // 1000) % 60
2403
+ # sub-second precision is intentionally dropped from the rendered timestamp
2404
+ return f"{hours:02d}:{minutes:02d}:{secs:02d}"
2405
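+
+ # Illustrative values:
+ #   _format_timestamp(75.5) -> "00:01:15" (sub-second part dropped)
+ #   _format_timestamp(3725) -> "01:02:05"
+ #   _format_timestamp(-3)   -> "00:00:00" (negative input clamps to zero)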
+
2406
+
2407
+ def _format_publish_date(raw: str) -> str:
2408
+ if raw and len(raw) == 8 and raw.isdigit():
2409
+ return f"{raw[0:4]}-{raw[4:6]}-{raw[6:8]}"
2410
+ if raw and re.match(r"\d{4}-\d{2}-\d{2}", raw):
2411
+ return raw
2412
+ return "未知日期"
2413
+
2414
+
2415
+ def _derive_year_month(publish_date_raw: str, generated_at: str) -> Tuple[str, str]:
2416
+ if publish_date_raw and publish_date_raw.isdigit() and len(publish_date_raw) == 8:
2417
+ return publish_date_raw[:4], publish_date_raw[4:6]
2418
+
2419
+ match = re.match(r"(\d{4})-(\d{2})", publish_date_raw)
2420
+ if match:
2421
+ return match.group(1), match.group(2)
2422
+
2423
+ if re.match(r"\d{4}-\d{2}-\d{2}", generated_at):
2424
+ return generated_at[:4], generated_at[5:7]
2425
+
2426
+ return "1970", "01"
2427
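+
+ # Illustrative fallbacks (hypothetical dates):
+ #   ("20240315", ...)           -> ("2024", "03")  # yt-dlp style YYYYMMDD
+ #   ("2024-03-15", ...)         -> ("2024", "03")
+ #   ("", "2024-03-15 10:00:00") -> ("2024", "03")  # generation timestamp
+ #   ("", "n/a")                 -> ("1970", "01")  # last resort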
+
2428
+
2429
+ def _extract_domain(video_metadata: Mapping[str, Any]) -> str:
2430
+ domain = ""
2431
+ candidates = []
2432
+ categories = video_metadata.get("categories")
2433
+ if isinstance(categories, list):
2434
+ candidates.extend(categories)
2435
+ category = video_metadata.get("category")
2436
+ if category is not None:
2437
+ candidates.append(category)
2438
+ tags = video_metadata.get("tags")
2439
+ if isinstance(tags, list):
2440
+ candidates.extend(tags)
2441
+
2442
+ for candidate in candidates:
2443
+ if isinstance(candidate, str):
2444
+ name = candidate.strip()
2445
+ if name:
2446
+ domain = name
2447
+ break
2448
+
2449
+ domain = domain or "通用"
2450
+ return domain
2451
+
2452
+
2453
+ def _sanitize_filename_base(text: str) -> str:
2454
+ sanitized = re.sub(r"[\\/:*?\"<>|]", "", text)
2455
+ sanitized = sanitized.replace(" ", "")
2456
+ return sanitized or "summary"
2457
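+
+ # Sanitization sketch (hypothetical heading):
+ #   "【AI】Demo: a/b?*" -> "【AI】Demoab"  # reserved characters and spaces removed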
+
2458
+
2459
+ def _build_extra_body(
2460
+ known_speakers: Optional[List[Tuple[str, str]]]
2461
+ ) -> MutableMapping[str, Any]:
2462
+ if not known_speakers:
2463
+ return {}
2464
+
2465
+ names: List[str] = []
2466
+ references: List[str] = []
2467
+
2468
+ for name, path in known_speakers:
2469
+ data_url = _to_data_url(path)
2470
+ names.append(name)
2471
+ references.append(data_url)
2472
+
2473
+ return {
2474
+ "known_speaker_names": names,
2475
+ "known_speaker_references": references,
2476
+ }
2477
+
2478
+
2479
+ def _to_data_url(path: str) -> str:
2480
+ if not os.path.exists(path):
2481
+ raise RuntimeError(f"Known speaker reference file not found: {path}")
2482
+ with open(path, "rb") as fh:
2483
+ encoded = base64.b64encode(fh.read()).decode("utf-8")
2484
+ return "data:audio/wav;base64," + encoded
2485
+
2486
+
2487
+ def _parse_known_speakers(
2488
+ raw_values: Optional[Iterable[str]]
2489
+ ) -> Optional[List[Tuple[str, str]]]:
2490
+ if not raw_values:
2491
+ return None
2492
+
2493
+ parsed: List[Tuple[str, str]] = []
2494
+ for item in raw_values:
2495
+ if not item:
2496
+ continue
2497
+ if "=" not in item:
2498
+ raise RuntimeError(
2499
+ f"Known speaker entry '{item}' must follow name=path format."
2500
+ )
2501
+ name, path = item.split("=", 1)
2502
+ name = name.strip()
2503
+ path = path.strip()
2504
+ if not name or not path:
2505
+ raise RuntimeError(
2506
+ f"Known speaker entry '{item}' is invalid; name/path cannot be empty."
2507
+ )
2508
+ parsed.append((name, path))
2509
+ return parsed or None
2510
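+
+ # Parsing sketch (hypothetical names and paths):
+ #   ["Alice=/tmp/alice.wav", "Bob=/tmp/bob.wav"]
+ #     -> [("Alice", "/tmp/alice.wav"), ("Bob", "/tmp/bob.wav")]
+ # Entries without "=" or with an empty name/path raise RuntimeError.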
+
2511
+
2512
+ def _resolve_video_cache_dir(video_url: str) -> str:
2513
+ """Resolve the cache directory for a given video URL."""
2514
+
2515
+ parsed_url = urlparse(video_url)
2516
+ hostname = (parsed_url.hostname or "unknown-host").lower()
2517
+ host_segment = hostname.replace(".", "_") or "unknown-host"
2518
+
2519
+ video_id = extract_video_id(video_url)
2520
+ if video_id:
2521
+ cache_key = os.path.join("youtube", video_id)
2522
+ else:
2523
+ hasher = hashlib.sha256()
2524
+ hasher.update(video_url.strip().encode("utf-8"))
2525
+ digest = hasher.hexdigest()[:16]
2526
+ cache_key = os.path.join(host_segment, digest)
2527
+
2528
+ base_dir = _getenv("ANY2SUMMARY_CACHE_DIR", "PODCAST_TRANSFORMER_CACHE_DIR")
2529
+ if base_dir:
2530
+ cache_base = base_dir
2531
+ else:
2532
+ home_dir = os.path.expanduser("~")
2533
+ cache_base = os.path.join(home_dir, ".cache", "any2summary")
2534
+
2535
+ video_dir = os.path.join(cache_base, cache_key)
2536
+ os.makedirs(video_dir, exist_ok=True)
2537
+ return video_dir
2538
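+
+ # Layout sketch (hypothetical IDs; default base is ~/.cache/any2summary):
+ #   https://www.youtube.com/watch?v=abc123 -> ~/.cache/any2summary/youtube/abc123
+ #   https://example.com/ep1.mp3            -> ~/.cache/any2summary/example_com/<sha256[:16]>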
+
2539
+
2540
+ def _create_http_client():
2541
+ """Return an httpx.Client instance for fetching web content."""
2542
+
2543
+ try:
2544
+ import httpx
2545
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
2546
+ raise RuntimeError("httpx 库未安装。请执行 `pip install httpx`." ) from exc
2547
+
2548
+ return httpx.Client(follow_redirects=True, timeout=30.0)
2549
+
2550
+
2551
+ def _normalize_article_text(text: str) -> str:
2552
+ normalized = re.sub(r"\s+", " ", text or "")
2553
+ return normalized.strip()
2554
+
2555
+
2556
+ class _ArticleHTMLParser(HTMLParser):
2557
+ """Lightweight HTML parser to extract article metadata."""
2558
+
2559
+ def __init__(self) -> None:
2560
+ super().__init__(convert_charrefs=True)
2561
+ self.in_title = False
2562
+ self.title_parts: List[str] = []
2563
+ self.in_heading = False
2564
+ self.heading_parts: List[str] = []
2565
+ self.current_paragraph_tag: Optional[str] = None
2566
+ self.current_paragraph_parts: List[str] = []
2567
+ self.paragraphs: List[str] = []
2568
+ self.description: Optional[str] = None
2569
+ self.icon_href: Optional[str] = None
2570
+ self._ignored_depth = 0
2571
+
2572
+ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
2573
+ tag_lower = tag.lower()
2574
+ if tag_lower in {"script", "style"}:
2575
+ self._ignored_depth += 1
2576
+ return
2577
+ if self._ignored_depth:
2578
+ return
2579
+
2580
+ if tag_lower == "title":
2581
+ self.in_title = True
2582
+ return
2583
+
2584
+ if tag_lower in {"p", "li"}:
2585
+ if self.current_paragraph_tag is None:
2586
+ self.current_paragraph_tag = tag_lower
2587
+ self.current_paragraph_parts = []
2588
+ return
2589
+
2590
+ if tag_lower == "h1":
2591
+ self.in_heading = True
2592
+ self.heading_parts = []
2593
+ return
2594
+
2595
+ if tag_lower == "meta":
2596
+ attr_dict = {key.lower(): value for key, value in attrs if value is not None}
2597
+ content = attr_dict.get("content")
2598
+ if not content:
2599
+ return
2600
+ name = (attr_dict.get("name") or "").lower()
2601
+ prop = (attr_dict.get("property") or "").lower()
2602
+ if name == "description" or prop in {"og:description", "twitter:description"}:
2603
+ self.description = content
2604
+ return
2605
+
2606
+ if tag_lower == "link":
2607
+ attr_dict = {key.lower(): value for key, value in attrs if value is not None}
2608
+ rel_value = (attr_dict.get("rel") or "").lower()
2609
+ href = attr_dict.get("href")
2610
+ if href and "icon" in rel_value:
2611
+ self.icon_href = href
2612
+ return
2613
+
2614
+ def handle_startendtag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
2615
+ self.handle_starttag(tag, attrs)
2616
+
2617
+ def handle_endtag(self, tag: str) -> None:
2618
+ tag_lower = tag.lower()
2619
+ if tag_lower in {"script", "style"}:
2620
+ if self._ignored_depth:
2621
+ self._ignored_depth -= 1
2622
+ return
2623
+ if self._ignored_depth:
2624
+ return
2625
+
2626
+ if tag_lower == "title":
2627
+ self.in_title = False
2628
+ return
2629
+
2630
+ if tag_lower == self.current_paragraph_tag:
2631
+ text = _normalize_article_text("".join(self.current_paragraph_parts))
2632
+ if text:
2633
+ self.paragraphs.append(text)
2634
+ self.current_paragraph_tag = None
2635
+ self.current_paragraph_parts = []
2636
+ return
2637
+
2638
+ if tag_lower == "h1":
2639
+ self.in_heading = False
2640
+
2641
+ def handle_data(self, data: str) -> None:
2642
+ if self._ignored_depth:
2643
+ return
2644
+ if self.in_title:
2645
+ self.title_parts.append(data)
2646
+ if self.in_heading:
2647
+ self.heading_parts.append(data)
2648
+ if self.current_paragraph_tag is not None:
2649
+ self.current_paragraph_parts.append(data)
2650
+
2651
+
2652
+ def _parse_article_html(html_text: str) -> Mapping[str, Any]:
2653
+ parser = _ArticleHTMLParser()
2654
+ parser.feed(html_text)
2655
+ parser.close()
2656
+
2657
+ title = _normalize_article_text("".join(parser.title_parts))
2658
+ if not title and parser.heading_parts:
2659
+ title = _normalize_article_text("".join(parser.heading_parts))
2660
+
2661
+ paragraphs: List[str] = []
2662
+ for text in parser.paragraphs:
2663
+ normalized = _normalize_article_text(text)
2664
+ if normalized:
2665
+ paragraphs.append(normalized)
2666
+
2667
+ return {
2668
+ "title": title,
2669
+ "description": _normalize_article_text(parser.description or ""),
2670
+ "icon_href": parser.icon_href,
2671
+ "paragraphs": paragraphs,
2672
+ }
2673
+
2674
+
2675
+ def _infer_icon_extension(icon_url: str, content_type: Optional[str]) -> str:
2676
+ path = urlparse(icon_url).path
2677
+ extension = os.path.splitext(path)[1]
2678
+ if extension:
2679
+ return extension
2680
+
2681
+ if content_type:
2682
+ lowered = content_type.lower()
2683
+ if "png" in lowered:
2684
+ return ".png"
2685
+ if "jpeg" in lowered or "jpg" in lowered:
2686
+ return ".jpg"
2687
+ if "svg" in lowered:
2688
+ return ".svg"
2689
+ if "gif" in lowered:
2690
+ return ".gif"
2691
+ return ".ico"
2692
+
2693
+
2694
+ def _download_article_icon(
2695
+ client: Any, icon_href: Optional[str], page_url: str, cache_dir: str
2696
+ ) -> Optional[str]:
2697
+ if not icon_href:
2698
+ return None
2699
+
2700
+ icon_url = urljoin(page_url, icon_href)
2701
+ try:
2702
+ response = client.get(
2703
+ icon_url,
2704
+ headers={
2705
+ "User-Agent": DEFAULT_YTDLP_USER_AGENT,
2706
+ "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
2707
+ "Referer": page_url,
2708
+ },
2709
+ )
2710
+ response.raise_for_status()
2711
+ except Exception: # pragma: no cover - network failure
2712
+ return None
2713
+
2714
+ extension = _infer_icon_extension(icon_url, response.headers.get("Content-Type"))
2715
+ filename = f"favicon{extension}"
2716
+ icon_path = os.path.join(cache_dir, filename)
2717
+ try:
2718
+ with open(icon_path, "wb") as icon_file:
2719
+ icon_file.write(response.content)
2720
+ except OSError: # pragma: no cover - filesystem failure
2721
+ return None
2722
+
2723
+ return icon_path
2724
+
2725
+
2726
+ def fetch_article_assets(video_url: str) -> MutableMapping[str, Any]:
2727
+ """Fetch article content and metadata for non-audio webpages."""
2728
+
2729
+ if not video_url.lower().startswith(("http://", "https://")):
2730
+ raise RuntimeError("仅支持通过 HTTP/HTTPS 访问的网页链接。")
2731
+
2732
+ cache_dir = _resolve_video_cache_dir(video_url)
2733
+ os.makedirs(cache_dir, exist_ok=True)
2734
+
2735
+ with _create_http_client() as client:
2736
+ response = client.get(
2737
+ video_url,
2738
+ headers={
2739
+ "User-Agent": DEFAULT_YTDLP_USER_AGENT,
2740
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
2741
+ "Accept-Language": "en-US,en;q=0.9",
2742
+ },
2743
+ )
2744
+ response.raise_for_status()
2745
+ html_text = response.text
2746
+
2747
+ if not html_text or not html_text.strip():
2748
+ raise RuntimeError("网页内容为空,无法解析正文。")
2749
+ parsed = _parse_article_html(html_text)
2750
+ icon_path = _download_article_icon(
2751
+ client, parsed.get("icon_href"), video_url, cache_dir
2752
+ )
2753
+
2754
+ raw_html_path = os.path.join(cache_dir, "article_raw.html")
2755
+ with open(raw_html_path, "w", encoding="utf-8") as raw_file:
2756
+ raw_file.write(html_text)
2757
+
2758
+ paragraph_texts = list(parsed.get("paragraphs", []))
2759
+
2760
+ if not paragraph_texts:
2761
+ raise RuntimeError("未能从网页中提取正文文本。")
2762
+
2763
+ segments: List[MutableMapping[str, float | str]] = []
2764
+ for index, paragraph in enumerate(paragraph_texts):
2765
+ start = float(index)
2766
+ end = float(index + 1)
2767
+ segments.append({"start": start, "end": end, "text": paragraph})
2768
+
2769
+ content_path = os.path.join(cache_dir, "article_content.txt")
2770
+ with open(content_path, "w", encoding="utf-8") as content_file:
2771
+ content_file.write("\n\n".join(paragraph_texts))
2772
+
2773
+ metadata: Dict[str, Any] = {
2774
+ "title": parsed.get("title") or paragraph_texts[0][:80],
2775
+ "description": parsed.get("description", ""),
2776
+ "webpage_url": video_url,
2777
+ "source_type": "article",
2778
+ }
2779
+
2780
+ metadata_path = os.path.join(cache_dir, "article_metadata.json")
2781
+ with open(metadata_path, "w", encoding="utf-8") as metadata_file:
2782
+ json.dump(metadata, metadata_file, ensure_ascii=False, indent=2)
2783
+
2784
+ return {
2785
+ "segments": segments,
2786
+ "metadata": metadata,
2787
+ "raw_html_path": raw_html_path,
2788
+ "content_path": content_path,
2789
+ "metadata_path": metadata_path,
2790
+ "icon_path": icon_path,
2791
+ "cache_dir": cache_dir,
2792
+ }
2793
+
2794
+
2795
+ def _maybe_fetch_article_assets(video_url: str) -> Optional[Mapping[str, Any]]:
2796
+ try:
2797
+ bundle = fetch_article_assets(video_url)
2798
+ except Exception: # network or parsing failures fall back to the audio pipeline
2799
+ return None
2800
+
2801
+ segments = bundle.get("segments")
2802
+ if not isinstance(segments, list) or not segments:
2803
+ return None
2804
+
2805
+ return bundle
2806
+
2807
+
2808
+ def _diarization_cache_path(directory: str) -> str:
2809
+ """Return the cache file path for diarization payload."""
2810
+
2811
+ return os.path.join(directory, "diarization.json")
2812
+
2813
+
2814
+ def _prepare_audio_cache(video_url: str) -> str:
2815
+ """Prepare cached audio WAV for a video, avoiding redundant downloads."""
2816
+
2817
+ video_dir = _resolve_video_cache_dir(video_url)
2818
+
2819
+ wav_path = os.path.join(video_dir, "audio.wav")
2820
+ if os.path.exists(wav_path):
2821
+ return wav_path
2822
+
2823
+ raw_path = _find_cached_raw_audio(video_dir)
2824
+ if raw_path is None:
2825
+ raw_path = download_audio_stream(video_url, video_dir)
2826
+
2827
+ if raw_path.endswith(".wav"):
2828
+ if os.path.abspath(raw_path) == os.path.abspath(wav_path):
2829
+ return wav_path
2830
+ shutil.copyfile(raw_path, wav_path)
2831
+ return wav_path
2832
+
2833
+ convert_audio_to_wav(raw_path, wav_path)
2834
+ return wav_path
2835
+
2836
+
2837
+ def _find_cached_raw_audio(directory: str) -> Optional[str]:
2838
+ """Locate previously downloaded audio file in directory (non-WAV)."""
2839
+
2840
+ if not os.path.isdir(directory):
2841
+ return None
2842
+
2843
+ for name in os.listdir(directory):
2844
+ lower = name.lower()
2845
+ if lower.startswith("audio.") and not lower.endswith(".wav"):
2846
+ return os.path.join(directory, name)
2847
+ return None
2848
+
2849
+
2850
+ def _ensure_audio_segments(wav_path: str) -> List[str]:
2851
+ """Ensure large WAV files are split into manageable segments."""
2852
+
2853
+ if not os.path.exists(wav_path):
2854
+ return []
2855
+
2856
+ file_size = os.path.getsize(wav_path)
2857
+ duration = _get_wav_duration(wav_path)
2858
+ needs_split = (
2859
+ duration > float(MAX_WAV_DURATION_SECONDS)
2860
+ or file_size > int(MAX_WAV_SIZE_BYTES)
2861
+ )
2862
+
2863
+ directory = os.path.dirname(wav_path)
2864
+ base_name = os.path.splitext(os.path.basename(wav_path))[0]
2865
+ existing = _list_existing_segments(directory, base_name)
2866
+
2867
+ if existing:
2868
+ requires_resplit = False
2869
+ for segment_path in existing:
2870
+ if _get_wav_duration(segment_path) > AUDIO_SEGMENT_SECONDS:
2871
+ requires_resplit = True
2872
+ break
2873
+ if requires_resplit:
2874
+ for segment_path in existing:
2875
+ with contextlib.suppress(OSError):
2876
+ os.remove(segment_path)
2877
+ existing = []
2878
+ else:
2879
+ return existing
2880
+
2881
+ if not needs_split:
2882
+ return [wav_path]
2883
+
2884
+ return _split_wav_file(wav_path, directory, base_name)
2885
+
2886
+
2887
+ def _list_existing_segments(directory: str, base_name: str) -> List[str]:
2888
+ """Return sorted list of previously split WAV segments."""
2889
+
2890
+ if not os.path.isdir(directory):
2891
+ return []
2892
+
2893
+ prefix = f"{base_name}_part"
2894
+ segments: List[str] = []
2895
+ for name in sorted(os.listdir(directory)):
2896
+ if not name.startswith(prefix):
2897
+ continue
2898
+ if not name.lower().endswith(".wav"):
2899
+ continue
2900
+ path = os.path.join(directory, name)
2901
+ if os.path.isfile(path):
2902
+ segments.append(path)
2903
+ return segments
2904
+
2905
+
2906
+ def _split_wav_file(
2907
+ wav_path: str, directory: str, base_name: str
2908
+ ) -> List[str]:
2909
+ """Split WAV file into multiple segments using wave module."""
2910
+
2911
+ segment_paths: List[str] = []
2912
+
2913
+ try:
2914
+ with wave.open(wav_path, "rb") as source:
2915
+ params = source.getparams()
2916
+ frame_rate = source.getframerate() or 16000
2917
+ frames_per_segment = int(AUDIO_SEGMENT_SECONDS * frame_rate)
2918
+ if frames_per_segment <= 0:
2919
+ frames_per_segment = frame_rate
2920
+ frames_per_chunk = max(WAV_FRAME_CHUNK_SIZE, frame_rate)
2921
+
2922
+ total_frames = source.getnframes()
2923
+ frames_remaining = total_frames
2924
+ segment_index = 0
2925
+
2926
+ while frames_remaining > 0:
2927
+ segment_index += 1
2928
+ segment_path = os.path.join(
2929
+ directory, f"{base_name}_part{segment_index:03d}.wav"
2930
+ )
2931
+ with wave.open(segment_path, "wb") as destination:
2932
+ destination.setparams(params)
2933
+ frames_to_write = min(frames_per_segment, frames_remaining)
2934
+ written = 0
2935
+
2936
+ while written < frames_to_write:
2937
+ frames_to_read = min(
2938
+ frames_per_chunk, frames_to_write - written
2939
+ )
2940
+ frame_bytes = source.readframes(frames_to_read)
2941
+ if not frame_bytes:
2942
+ break
2943
+ destination.writeframes(frame_bytes)
2944
+ written += frames_to_read
2945
+
2946
+ frames_remaining -= written
2947
+
2948
+ if os.path.exists(segment_path) and os.path.getsize(segment_path) > 0:
2949
+ segment_paths.append(segment_path)
2950
+
2951
+ if not segment_paths:
2952
+ return [wav_path]
2953
+
2954
+ except (OSError, wave.Error):
2955
+ return [wav_path]
2956
+
2957
+ return segment_paths
2958
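+
+ # Splitting sketch (hypothetical length): a 3000-second audio.wav becomes
+ # audio_part001.wav (1400 s), audio_part002.wav (1400 s) and
+ # audio_part003.wav (200 s), per AUDIO_SEGMENT_SECONDS.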
+
2959
+
2960
+ def _get_wav_duration(wav_path: str) -> float:
2961
+ """Return duration in seconds for given WAV file."""
2962
+
2963
+ try:
2964
+ with wave.open(wav_path, "rb") as handle:
2965
+ frames = handle.getnframes()
2966
+ frame_rate = handle.getframerate()
2967
+ except (OSError, wave.Error):
2968
+ return 0.0
2969
+
2970
+ if frame_rate <= 0:
2971
+ return 0.0
2972
+
2973
+ return frames / float(frame_rate)
2974
+
2975
+
2976
+ def _estimate_total_tokens(
2977
+ segment_paths: Sequence[str],
2978
+ durations: Optional[Sequence[float]] = None,
2979
+ ) -> float:
2980
+ """Estimate expected tokens based on segment durations."""
2981
+
2982
+ total_tokens = 0.0
2983
+ for index, path in enumerate(segment_paths):
2984
+ if durations is not None and index < len(durations):
2985
+ duration = durations[index]
2986
+ else:
2987
+ duration = max(_get_wav_duration(path), 0.0)
2988
+ if duration <= 0.0:
2989
+ try:
2990
+ file_size = os.path.getsize(path)
2991
+ except OSError:
2992
+ file_size = 0
2993
+ if file_size > 0:
2994
+ duration = file_size / 32_000.0  # 16 kHz mono 16-bit PCM ≈ 32,000 bytes/sec
2995
+ total_tokens += max(duration, 0.0) * ESTIMATED_TOKENS_PER_SECOND
2996
+ return max(total_tokens, 1.0)
2997
+
2998
+
2999
+ def _estimate_tokens_from_transcript(
3000
+ segments: Iterable[MutableMapping[str, float | str]]
3001
+ ) -> float:
3002
+ """Approximate token count from transcript segments."""
3003
+
3004
+ total_chars = 0
3005
+ segment_count = 0
3006
+ for segment in segments:
3007
+ text = segment.get("text", "")
3008
+ if isinstance(text, str):
3009
+ total_chars += len(text)
3010
+ segment_count += 1
3011
+ if total_chars == 0 and segment_count == 0:
3012
+ return 0.0
3013
+ if total_chars == 0:
3014
+ total_chars = segment_count * 16
3015
+ return max(total_chars / 4.0, float(segment_count))
3016
+
3017
+
3018
+ def _update_progress_bar(ratio: float, detail: str) -> None:
3019
+ """Render a simple textual progress bar to stdout."""
3020
+
3021
+ ratio = min(max(ratio, 0.0), 1.0)
3022
+ filled = int(PROGRESS_BAR_WIDTH * ratio)
3023
+ bar = "#" * filled + "-" * (PROGRESS_BAR_WIDTH - filled)
3024
+ sys.stdout.write(
3025
+ f"\r[{bar}] {ratio * 100:5.1f}% {detail[:80]}"
3026
+ )
3027
+ sys.stdout.flush()
3028
+ if ratio >= 1.0:
3029
+ sys.stdout.write("\n")
3030
+ sys.stdout.flush()
3031
+
3032
+
3033
+ def _compute_progress_ratio(
3034
+ processed_duration: float,
3035
+ total_duration: float,
3036
+ produced_tokens: float,
3037
+ total_tokens: float,
3038
+ segments_done: int,
3039
+ total_segments: int,
3040
+ ) -> float:
3041
+ """Combine duration、token与片段比值,得到整体进度。"""
3042
+
3043
+ if total_duration <= 0:
3044
+ duration_ratio = 0.0
3045
+ else:
3046
+ duration_ratio = min(max(processed_duration / total_duration, 0.0), 1.0)
3047
+
3048
+ if total_tokens <= 0:
3049
+ token_ratio = duration_ratio
3050
+ else:
3051
+ token_ratio = min(max(produced_tokens / total_tokens, 0.0), 1.0)
3052
+
3053
+ if total_segments <= 0:
3054
+ segment_ratio = duration_ratio
3055
+ else:
3056
+ segment_ratio = min(max(segments_done / total_segments, 0.0), 1.0)
3057
+
3058
+ combined = 0.5 * duration_ratio + 0.3 * token_ratio + 0.2 * segment_ratio
3059
+ return min(max(combined, 0.0), 1.0)
3060
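+
+ # Worked example (hypothetical figures): duration_ratio=0.5, token_ratio=0.4
+ # and segment_ratio=0.5 combine to 0.5*0.5 + 0.3*0.4 + 0.2*0.5 = 0.47.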
+
3061
+
3062
+ def _format_progress_detail(
3063
+ processed_duration: float,
3064
+ total_duration: float,
3065
+ produced_tokens: float,
3066
+ total_tokens: float,
3067
+ segments_done: int,
3068
+ total_segments: int,
3069
+ ) -> str:
3070
+ """Return user-friendly progress detail string."""
3071
+
3072
+ total_minutes = total_duration / 60.0 if total_duration > 0 else 0.0
3073
+ processed_minutes = processed_duration / 60.0
3074
+ return (
3075
+ f"Azure diarization {segments_done}/{total_segments} "
3076
+ f"{processed_minutes:.1f}m/{total_minutes:.1f}m "
3077
+ f"tokens≈{int(produced_tokens)}/{int(max(total_tokens, 1.0))}"
3078
+ )
3079
+
3080
+
3081
+ def _offset_segments(
3082
+ segments: Iterable[MutableMapping[str, float | str]], offset: float
3083
+ ) -> List[MutableMapping[str, float | str]]:
3084
+ """Return new segment list with applied time offset."""
3085
+
3086
+ adjusted: List[MutableMapping[str, float | str]] = []
3087
+ for segment in segments:
3088
+ start = float(segment.get("start", 0.0))
3089
+ end = float(segment.get("end", start))
3090
+ shifted = dict(segment)
3091
+ shifted["start"] = start + offset
3092
+ shifted["end"] = end + offset
3093
+ adjusted.append(shifted)
3094
+ return adjusted
3095
+
3096
+
3097
+ def _max_segment_end(
3098
+ diarization: Iterable[MutableMapping[str, float | str]],
3099
+ transcript: Iterable[MutableMapping[str, float | str]],
3100
+ ) -> float:
3101
+ """Return maximum end timestamp across provided segments."""
3102
+
3103
+ max_end = 0.0
3104
+ for collection in (diarization, transcript):
3105
+ for segment in collection:
3106
+ end = float(segment.get("end", segment.get("start", 0.0)))
3107
+ if end > max_end:
3108
+ max_end = end
3109
+ return max_end
3110
+
3111
+
3112
+ def _consume_transcription_response(
3113
+ response: Any,
3114
+ on_chunk: Optional[Callable[[MutableMapping[str, Any]], None]] = None,
3115
+ ) -> MutableMapping[str, Any]:
3116
+ collected: List[MutableMapping[str, Any]] = []
3117
+
3118
+ def _record(payload: MutableMapping[str, Any]) -> None:
3119
+ if not payload:
3120
+ return
3121
+ collected.append(payload)
3122
+ if on_chunk:
3123
+ on_chunk(payload)
3124
+
3125
+ if isinstance(response, MutableMapping):
3126
+ payload = _coerce_response_to_dict(response)
3127
+ if payload:
3128
+ _record(payload)
3129
+ if len(collected) > 1:
3130
+ enriched = dict(payload)
3131
+ enriched.setdefault("data", collected)
3132
+ return enriched
3133
+ return payload
3134
+
3135
+ if isinstance(response, Iterable) and not isinstance(response, (str, bytes)):
3136
+ final_payload: MutableMapping[str, Any] = {}
3137
+ for item in response:
3138
+ payload = _coerce_response_to_dict(item)
3139
+ if not payload:
3140
+ continue
3141
+ final_payload = payload
3142
+ _record(payload)
3143
+
3144
+ if not collected:
3145
+ return final_payload
3146
+
3147
+ result = dict(final_payload) if final_payload else {}
3148
+ result.setdefault("data", collected)
3149
+ return result
3150
+
3151
+ payload = _coerce_response_to_dict(response)
3152
+ if payload:
3153
+ _record(payload)
3154
+ return payload
3155
+
3156
+
3157
+ def _extract_usage_tokens(payload: Mapping[str, Any]) -> Optional[float]:
3158
+ candidates: List[Mapping[str, Any]] = []
3159
+ if isinstance(payload, Mapping):
3160
+ candidates.append(payload)
3161
+ response_obj = payload.get("response")
3162
+ if isinstance(response_obj, Mapping):
3163
+ candidates.append(response_obj)
3164
+
3165
+ for candidate in candidates:
3166
+ usage = candidate.get("usage") if isinstance(candidate, Mapping) else None
3167
+ if not isinstance(usage, Mapping):
3168
+ continue
3169
+ for key in ("output_tokens", "total_tokens", "completion_tokens"):
3170
+ tokens = usage.get(key)
3171
+ if tokens is None:
3172
+ continue
3173
+ try:
3174
+ return float(tokens)
3175
+ except (TypeError, ValueError):
3176
+ continue
3177
+ return None
3178
+
3179
+
3180
+ def _is_youtube_hostname(hostname: str) -> bool:
3181
+ if not hostname:
3182
+ return False
3183
+ if hostname in {"youtu.be", "youtube.com"}:
3184
+ return True
3185
+ if hostname.endswith(".youtube.com"):
3186
+ return True
3187
+ return False # other hosts; www./m. subdomains already match the suffix check
3188
+
3189
+
3190
+ def _matches_host_suffix(hostname: str, suffix: str) -> bool:
3191
+ hostname = hostname.lower()
3192
+ check = suffix.lower().lstrip(".")
3193
+ if not check:
3194
+ return False
3195
+ return hostname == check or hostname.endswith(f".{check}")
3196
+
3197
+
3198
+ def _is_media_source_url(video_url: str) -> bool:
3199
+ parsed = urlparse(video_url)
3200
+ hostname = (parsed.hostname or "").lower()
3201
+ if hostname:
3202
+ for suffix in _MEDIA_HOST_SUFFIXES:
3203
+ if _matches_host_suffix(hostname, suffix):
3204
+ return True
3205
+ path = parsed.path.lower()
3206
+ for extension in _MEDIA_PATH_EXTENSIONS:
3207
+ if path.endswith(extension):
3208
+ return True
3209
+ return False
3210
+
3211
+
3212
+ def _is_probable_article_url(video_url: str) -> bool:
3213
+ """Heuristically determine whether a URL points to a webpage article."""
3214
+
3215
+ return not _is_media_source_url(video_url)
3216
+
3217
+
3218
+ def _should_force_azure_transcription(video_url: str) -> bool:
3219
+ """Return True when URL belongs to audio sources that require Azure pipeline."""
3220
+
3221
+ parsed = urlparse(video_url)
3222
+ hostname = (parsed.hostname or "").lower()
3223
+ if not hostname:
3224
+ return False
3225
+ for suffix in _FORCED_AUDIO_HOST_SUFFIXES:
3226
+ if _matches_host_suffix(hostname, suffix):
3227
+ return True
3228
+ return False
3229
+
3230
+
3231
+ def _default_referer_for_url(parsed: ParseResult, is_youtube: bool) -> str:
3232
+ if is_youtube:
3233
+ return "https://www.youtube.com/"
3234
+ scheme = parsed.scheme or "https"
3235
+ hostname = parsed.hostname or ""
3236
+ if hostname:
3237
+ return f"{scheme}://{hostname}/"
3238
+ return parsed.geturl() or "https://www.youtube.com/"
3239
+
3240
+
3241
+ def download_audio_stream(video_url: str, directory: str) -> str:
3242
+ """Download the best available audio stream using yt_dlp."""
3243
+
3244
+ try:
3245
+ import yt_dlp
3246
+ except ModuleNotFoundError as exc: # pragma: no cover - depends on env
3247
+ raise RuntimeError(
3248
+ "yt_dlp is required to download audio. Install it via "
3249
+ "`pip install yt-dlp`."
3250
+ ) from exc
3251
+
3252
+ os.makedirs(directory, exist_ok=True)
3253
+
3254
+ user_agent = _getenv(
3255
+ "ANY2SUMMARY_YTDLP_UA",
3256
+ "PODCAST_TRANSFORMER_YTDLP_UA",
3257
+ ) or DEFAULT_YTDLP_USER_AGENT
3258
+ parsed_url = urlparse(video_url)
3259
+ hostname = (parsed_url.hostname or "").lower()
3260
+ is_youtube = _is_youtube_hostname(hostname)
3261
+ referer = _default_referer_for_url(parsed_url, is_youtube)
3262
+
3263
+ try:
3264
+ from yt_dlp.utils import std_headers # type: ignore[import-error]
3265
+ except Exception: # pragma: no cover - fallback when utils missing
3266
+ std_headers = {}
3267
+
3268
+ http_headers: MutableMapping[str, str] = dict(std_headers or {})
3269
+ http_headers["User-Agent"] = user_agent
3270
+ http_headers.setdefault("Accept-Language", "en-US,en;q=0.9")
3271
+ http_headers.setdefault("Referer", referer)
3272
+
3273
+ ydl_opts: Dict[str, Any] = {
3274
+ "format": "bestaudio/best",
3275
+ "outtmpl": os.path.join(directory, "audio.%(ext)s"),
3276
+ "quiet": True,
3277
+ "no_warnings": True,
3278
+ "http_headers": http_headers,
3279
+ }
3280
+
3281
+ cookie_path = _getenv("ANY2SUMMARY_YTDLP_COOKIES", "PODCAST_TRANSFORMER_YTDLP_COOKIES")
3282
+ if cookie_path is not None:
3283
+ cookie_path = cookie_path.strip()
3284
+ if cookie_path:
3285
+ ydl_opts["cookiefile"] = cookie_path
3286
+
3287
+ try:
3288
+ from yt_dlp.utils import DownloadError # type: ignore[import-error]
3289
+ except Exception: # pragma: no cover - defensive fallback
3290
+ DownloadError = getattr(yt_dlp, "DownloadError", RuntimeError)
3291
+
3292
+ try:
3293
+ audio_path = _download_with_ytdlp(yt_dlp, video_url, ydl_opts)
3294
+ except DownloadError as exc: # pragma: no cover - depends on network
3295
+ if is_youtube and _should_try_android_fallback(exc, ydl_opts.get("cookiefile")):
3296
+ fallback_opts = _build_android_fallback_options(ydl_opts)
3297
+ try:
3298
+ audio_path = _download_with_ytdlp(yt_dlp, video_url, fallback_opts)
3299
+ except DownloadError as fallback_exc: # pragma: no cover - depends on network
3300
+ raise RuntimeError(
3301
+ "yt_dlp 无法下载音频,请确认 URL 可访问,"
3302
+ "或提供有效的 cookie(设置 ANY2SUMMARY_YTDLP_COOKIES"
3303
+ " 或 PODCAST_TRANSFORMER_YTDLP_COOKIES)。"
3304
+ ) from fallback_exc
3305
+ else:
3306
+ raise RuntimeError(
3307
+ "yt_dlp 无法下载音频,请确认 URL 可访问,"
3308
+ "或提供有效的 cookie(设置 ANY2SUMMARY_YTDLP_COOKIES"
3309
+ " 或 PODCAST_TRANSFORMER_YTDLP_COOKIES)。"
3310
+ ) from exc
3311
+
3312
+ if not os.path.exists(audio_path):
3313
+ raise RuntimeError("Audio download failed; file not found.")
3314
+
3315
+ return audio_path
3316
+
3317
+
3318
+ def _download_with_ytdlp(
3319
+ yt_dlp_module: Any, video_url: str, options: Mapping[str, Any]
3320
+ ) -> str:
3321
+ """Execute yt_dlp with supplied options and return downloaded path."""
3322
+
3323
+ with yt_dlp_module.YoutubeDL(options) as ydl:
3324
+ info = ydl.extract_info(video_url, download=True)
3325
+ audio_path = ydl.prepare_filename(info)
3326
+ return audio_path
3327
+
3328
+
3329
+ def _should_try_android_fallback(
3330
+ exc: BaseException, cookiefile: Optional[str]
3331
+ ) -> bool:
3332
+ """Return True when 403 occurs; cookies no longer prevent fallback."""
3333
+
3334
+ _ = cookiefile # legacy parameter retained for compatibility
3335
+ message = str(exc)
3336
+ if message and ("403" in message or "Forbidden" in message):
3337
+ return True
3338
+
3339
+ exc_info = getattr(exc, "exc_info", None)
3340
+ if exc_info and len(exc_info) > 1 and exc_info[1] is not None:
3341
+ nested_message = str(exc_info[1])
3342
+ if "403" in nested_message or "Forbidden" in nested_message:
3343
+ return True
3344
+
3345
+ return False
3346
+
3347
+
3348
+ def _build_android_fallback_options(base_options: Mapping[str, Any]) -> Dict[str, Any]:
3349
+ """Clone yt_dlp options and inject Android headers and args."""
3350
+
3351
+ fallback_options = dict(base_options)
3352
+
3353
+ headers = dict(base_options.get("http_headers", {}))
3354
+ headers["User-Agent"] = ANDROID_YTDLP_USER_AGENT
3355
+ headers.setdefault("Accept-Language", "en-US,en;q=0.9")
3356
+ headers.setdefault("Referer", "https://www.youtube.com/")
3357
+ fallback_options["http_headers"] = headers
3358
+
3359
+ extractor_args: Dict[str, Any] = {}
3360
+ youtube_args: Dict[str, Any] = {}
3361
+
3362
+ if "extractor_args" in base_options:
3363
+ original = base_options["extractor_args"]
3364
+ if isinstance(original, Mapping):
3365
+ extractor_args.update(original)
3366
+ youtube_original = original.get("youtube")
3367
+ if isinstance(youtube_original, Mapping):
3368
+ youtube_args.update(youtube_original)
3369
+
3370
+ youtube_args["player_client"] = ["android"]
3371
+ youtube_args.setdefault("player_skip", ["configs"])
3372
+ extractor_args["youtube"] = youtube_args
3373
+ fallback_options["extractor_args"] = extractor_args
3374
+
3375
+ return fallback_options
3376
+
3377
+
3378
+ def convert_audio_to_wav(source_path: str, target_path: str) -> None:
3379
+ """Convert audio to single-channel 16kHz WAV using ffmpeg."""
3380
+
3381
+ command = [
3382
+ "ffmpeg",
3383
+ "-y",
3384
+ "-i",
3385
+ source_path,
3386
+ "-ac",
3387
+ "1",
3388
+ "-ar",
3389
+ "16000",
3390
+ "-acodec",
3391
+ "pcm_s16le",
3392
+ "-f",
3393
+ "wav",
3394
+ target_path,
3395
+ ]
3396
+ try:
3397
+ subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3398
+ except FileNotFoundError as exc: # pragma: no cover - depends on env
3399
+ raise RuntimeError("ffmpeg is required to convert audio; install it first.") from exc
3400
+ except subprocess.CalledProcessError as exc: # pragma: no cover
3401
+ raise RuntimeError(f"ffmpeg failed to convert audio: {exc.stderr}")
3402
+
3403
+
3404
+ def merge_segments_with_speakers(
3405
+ transcript_segments: Iterable[MutableMapping[str, float | str]],
3406
+ diarization_segments: Optional[Iterable[MutableMapping[str, float | str]]],
3407
+ ) -> List[MutableMapping[str, float | str]]:
3408
+ """Merge transcript segments with diarization metadata.
3409
+
3410
+ Args:
3411
+ transcript_segments: Iterable of transcript dictionaries containing
3412
+ `start`, `end`, and `text` keys.
3413
+ diarization_segments: Iterable of diarization dictionaries containing
3414
+ `start`, `end`, and `speaker` keys or None.
3415
+
3416
+ Returns:
3417
+ List of transcript dictionaries enhanced with a `speaker` key when
3418
+ diarization data is supplied.
3419
+ """
3420
+
3421
+ diarization_list = list(diarization_segments or [])
3422
+ merged: List[MutableMapping[str, float | str]] = []
3423
+
3424
+ for segment in transcript_segments:
3425
+ start = float(segment.get("start", 0.0))
3426
+ end = float(segment.get("end", start))
3427
+ best_speaker = determine_best_speaker(start, end, diarization_list)
3428
+
3429
+ enriched = dict(segment)
3430
+ if best_speaker is not None:
3431
+ enriched["speaker"] = best_speaker
3432
+ merged.append(enriched)
3433
+
3434
+ return merged
3435
+
3436
+
3437
+ def determine_best_speaker(
3438
+ start: float,
3439
+ end: float,
3440
+ diarization_segments: Sequence[MutableMapping[str, float | str]],
3441
+ ) -> Optional[str]:
3442
+ """Determine the speaker label with the greatest overlap."""
3443
+
3444
+ best_label: Optional[str] = None
3445
+ best_overlap = 0.0
3446
+
3447
+ for segment in diarization_segments:
3448
+ diar_start = float(segment.get("start", 0.0))
3449
+ diar_end = float(segment.get("end", diar_start))
3450
+ label = segment.get("speaker")
3451
+ if diar_end <= start or diar_start >= end:
3452
+ continue
3453
+ overlap = min(diar_end, end) - max(diar_start, start)
3454
+ if overlap > best_overlap:
3455
+ best_overlap = overlap
3456
+ if isinstance(label, str):
3457
+ best_label = label
3458
+
3459
+ return best_label
3460
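+
+ # Overlap sketch (hypothetical times): for a transcript span [1.0, 3.0],
+ # speaker A over [0.0, 2.5] overlaps 1.5 s while B over [2.5, 5.0] overlaps
+ # 0.5 s, so "A" wins; exact ties keep the first label seen (strict >).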
+
3461
+
3462
+ def extract_video_id(video_url: str) -> Optional[str]:
3463
+ """Extract YouTube video identifier from a URL."""
3464
+
3465
+ parsed = urlparse(video_url)
3466
+ if parsed.hostname == "youtu.be":
3467
+ return parsed.path.lstrip("/") or None
3468
+
3469
+ if parsed.hostname in {"www.youtube.com", "youtube.com", "m.youtube.com"}:
3470
+ query = parse_qs(parsed.query)
3471
+ video_ids = query.get("v")
3472
+ if video_ids:
3473
+ return video_ids[0]
3474
+
3475
+ if parsed.path.startswith("/embed/"):
3476
+ return parsed.path.split("/", maxsplit=2)[2]
3477
+
3478
+ return None
3479
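+
+ # Recognized URL shapes (hypothetical ID):
+ #   https://youtu.be/abc123                -> "abc123"
+ #   https://www.youtube.com/watch?v=abc123 -> "abc123"
+ #   https://www.youtube.com/embed/abc123   -> "abc123"
+ # Anything else returns None, and caching falls back to a host/hash key.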
+
3480
+
3481
+ def main() -> int: # pragma: no cover - convenience wrapper
3482
+ """Console script entry point."""
3483
+
3484
+ return run()
3485
+
3486
+
3487
+ if __name__ == "__main__": # pragma: no cover - manual execution
3488
+ sys.exit(main())