digest-generator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. digest_generator/__init__.py +0 -0
  2. digest_generator/api.py +631 -0
  3. digest_generator/cli.py +907 -0
  4. digest_generator/core/__init__.py +0 -0
  5. digest_generator/core/audio/__init__.py +12 -0
  6. digest_generator/core/audio/io.py +128 -0
  7. digest_generator/core/audio/narration.py +347 -0
  8. digest_generator/core/audio/narration_overrides.yaml +210 -0
  9. digest_generator/core/audio/renderer.py +156 -0
  10. digest_generator/core/audio/types.py +50 -0
  11. digest_generator/core/categories.py +91 -0
  12. digest_generator/core/digest/__init__.py +4 -0
  13. digest_generator/core/digest/io.py +255 -0
  14. digest_generator/core/digest/orchestrator.py +519 -0
  15. digest_generator/core/digest/prompts/__init__.py +23 -0
  16. digest_generator/core/digest/prompts/templates/cluster_system.md +73 -0
  17. digest_generator/core/digest/prompts/templates/editorial_pass_system.md +80 -0
  18. digest_generator/core/digest/prompts/templates/intro_system.md +70 -0
  19. digest_generator/core/digest/prompts/templates/section_merge_system.md +53 -0
  20. digest_generator/core/digest/prompts/templates/section_system.md +80 -0
  21. digest_generator/core/digest/prompts/templates/title_system.md +38 -0
  22. digest_generator/core/digest/prompts/templates/watch_system.md +71 -0
  23. digest_generator/core/digest/stages/__init__.py +0 -0
  24. digest_generator/core/digest/stages/clusterer.py +513 -0
  25. digest_generator/core/digest/stages/composer.py +111 -0
  26. digest_generator/core/digest/stages/editorial.py +233 -0
  27. digest_generator/core/digest/stages/framer.py +256 -0
  28. digest_generator/core/digest/stages/watcher.py +237 -0
  29. digest_generator/core/digest/stages/writer.py +375 -0
  30. digest_generator/core/digest/types.py +94 -0
  31. digest_generator/core/label/__init__.py +16 -0
  32. digest_generator/core/label/io.py +140 -0
  33. digest_generator/core/label/stages/__init__.py +8 -0
  34. digest_generator/core/label/stages/topic.py +174 -0
  35. digest_generator/core/prompt_loader.py +58 -0
  36. digest_generator/core/style.py +187 -0
  37. digest_generator/core/summary/__init__.py +11 -0
  38. digest_generator/core/summary/io.py +123 -0
  39. digest_generator/core/summary/prompts/__init__.py +25 -0
  40. digest_generator/core/summary/prompts/templates/article_summary_system.md +38 -0
  41. digest_generator/core/summary/stages/__init__.py +8 -0
  42. digest_generator/core/summary/stages/summarizer.py +175 -0
  43. digest_generator/core/types.py +205 -0
  44. digest_generator/feeds.example.yaml +41 -0
  45. digest_generator/py.typed +0 -0
  46. digest_generator/shared/__init__.py +0 -0
  47. digest_generator/shared/hf_hub.py +31 -0
  48. digest_generator/shared/llm/__init__.py +1 -0
  49. digest_generator/shared/llm/clients.py +83 -0
  50. digest_generator/shared/llm/sampling.py +256 -0
  51. digest_generator/shared/llm/telemetry.py +317 -0
  52. digest_generator/shared/llm/typography.py +27 -0
  53. digest_generator/shared/logging.py +403 -0
  54. digest_generator/shared/runtime/__init__.py +18 -0
  55. digest_generator/shared/runtime/dirs.py +45 -0
  56. digest_generator/shared/runtime/meta.py +349 -0
  57. digest_generator/shared/settings.py +205 -0
  58. digest_generator/shared/transformers/__init__.py +22 -0
  59. digest_generator/shared/transformers/registry.py +40 -0
  60. digest_generator/shared/transformers/types.py +84 -0
  61. digest_generator/shared/tts/__init__.py +19 -0
  62. digest_generator/shared/tts/engine.py +233 -0
  63. digest_generator/shared/tts/registry.py +107 -0
  64. digest_generator/shared/tts/types.py +39 -0
  65. digest_generator/sources/__init__.py +9 -0
  66. digest_generator/sources/rss/config.py +267 -0
  67. digest_generator/sources/rss/fetcher.py +263 -0
  68. digest_generator/sources/rss/io.py +154 -0
  69. digest_generator/sources/rss/types.py +71 -0
  70. digest_generator-0.1.0.dist-info/METADATA +148 -0
  71. digest_generator-0.1.0.dist-info/RECORD +75 -0
  72. digest_generator-0.1.0.dist-info/WHEEL +4 -0
  73. digest_generator-0.1.0.dist-info/entry_points.txt +2 -0
  74. digest_generator-0.1.0.dist-info/licenses/LICENSE.md +202 -0
  75. digest_generator-0.1.0.dist-info/licenses/NOTICE.md +7 -0
File without changes
@@ -0,0 +1,631 @@
1
+ """Public programmatic API for the digest-generator pipeline.
2
+
3
+ Six entry points: three async per-stage primitives (``fetch``,
4
+ ``summarize``, ``label``), the sync digest pipeline (``digest``), the
5
+ ``run`` composition, and the audio renderer (``render_audio``). Each
6
+ per-stage primitive is independently invocable and cache-aware: re-running on
7
+ the same ``run_dir`` skips work whose batch file already exists.
8
+
9
+ Run setup (``run_dir`` creation, ``run_context``, ``log_stage``,
10
+ ``llm_telemetry``, ``meta.json``) is the caller's responsibility.
11
+ ``digest_generator.cli`` handles all of that for command-line use; programmatic
12
+ callers stitch together what they need.
13
+
14
+ Usage::
15
+
16
+ import asyncio
17
+ from pathlib import Path
18
+
19
+ from digest_generator.api import resolve_feeds, run, digest
20
+ from digest_generator.core.types import Filter
21
+
22
+ feeds = resolve_feeds(content_types=["ai"])
23
+ filter = Filter.resolve(days_back=7)
24
+ run_dir = Path("output/myrun")
25
+ run_dir.mkdir(parents=True, exist_ok=True)
26
+
27
+ run(feeds, filter, run_dir=run_dir, with_digest=True)
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ from pathlib import Path
33
+ from typing import TYPE_CHECKING, Any
34
+
35
+ if TYPE_CHECKING:
36
+ from digest_generator.core.audio.renderer import AudioRenderer
37
+ from digest_generator.core.digest.types import DigestResult
38
+ from digest_generator.core.label import TopicClassifier
39
+ from digest_generator.core.summary import ContentSummarizer
40
+ from digest_generator.core.types import Filter
41
+ from digest_generator.shared.llm.sampling import SamplingConfig
42
+ from digest_generator.sources.rss.fetcher import FeedFetcher
43
+ from digest_generator.sources.rss.types import Feed
44
+
45
+
46
def resolve_feeds(
    content_types: list[str] | None = None,
    feed_names: list[str] | None = None,
    *,
    feeds_file: str | None = None,
    config_dir: str | None = None,
) -> list[Feed]:
    """Load the configured feeds and narrow them by content type and/or name.

    Both filters are optional. When both are given they are applied in
    sequence (content type first, then name), so a feed must satisfy each
    of them. Every requested value is validated before it is used to
    filter, so a typo raises instead of silently shrinking the result.

    Args:
        content_types: Content type values to include (e.g., ``["ai", "security"]``).
        feed_names: Feed names to include (e.g., ``["openai-news"]``).
        feeds_file: Explicit ``feeds.yaml`` path; overrides discovery.
        config_dir: Config directory holding ``feeds.yaml``; overrides discovery.

    Returns:
        Filtered list of ``Feed`` objects. With no filters this is the full
        configured list as returned by the loader.

    Raises:
        FeedsConfigError: If no feeds file is found or it is invalid.
        ValueError: If a content type or feed name is invalid, or no feeds match.
    """
    from digest_generator.sources.rss.config import (
        load_configured_categories,
        load_configured_feeds,
    )

    all_feeds = load_configured_feeds(feeds_file=feeds_file, config_dir=config_dir)
    result = all_feeds

    if content_types:
        valid = load_configured_categories(feeds_file=feeds_file, config_dir=config_dir).id_set()
        for ct in content_types:
            if ct not in valid:
                msg = f"Unknown content type '{ct}'. Valid: {', '.join(sorted(valid))}"
                raise ValueError(msg)
        result = [f for f in result if f.content_type in content_types]

    if feed_names:
        # Names are validated against the *unfiltered* feed list: a name that
        # exists but was excluded by ``content_types`` is not "unknown", it
        # simply falls out of the result (and may trigger the no-match error).
        known = {f.name for f in all_feeds}
        for name in feed_names:
            if name not in known:
                msg = f"Unknown feed '{name}'."
                raise ValueError(msg)
        # Build the lookup set once rather than once per feed.
        wanted_names = set(feed_names)
        result = [f for f in result if f.name in wanted_names]

    if not result:
        msg = "No feeds matched the given filters."
        raise ValueError(msg)

    return result
97
+
98
+
99
async def fetch(
    feeds: list[Feed],
    filter: Filter,
    *,
    run_dir: Path,
    fetcher: FeedFetcher | None = None,
) -> None:
    """Fetch every feed concurrently and store each batch in the run directory.

    One task per feed is spawned inside an ``asyncio.TaskGroup``. The number
    of fetches in flight at once is bounded by a semaphore sized from
    ``settings.fetch_concurrency``. A feed for which ``load_entries`` already
    returns a batch is skipped, so calling this again on the same ``run_dir``
    only fetches the feeds that have no saved batch yet.

    A failing feed is logged as a warning and skipped; it does not abort the
    other feeds.

    Args:
        feeds: Feeds to fetch.
        filter: Resolved date / limit filter, passed through to the fetcher.
        run_dir: Run root for cache files.
        fetcher: Injected fetcher (mostly for tests). Defaults to a fresh
            ``FeedFetcher()``.
    """
    import asyncio

    from digest_generator.shared.settings import settings
    from digest_generator.sources.rss.fetcher import FeedFetcher
    from digest_generator.sources.rss.io import load_entries, save_entries

    active_fetcher = fetcher if fetcher else FeedFetcher()
    gate = asyncio.Semaphore(settings.fetch_concurrency)

    async def _fetch_feed(feed: Feed) -> None:
        # Cache hit: a batch for this feed is already on disk.
        if load_entries(run_dir, feed.name) is not None:
            return

        try:
            async with gate:
                batch = await active_fetcher.fetch_entries(feed, filter)
        except Exception as exc:
            # An exception escaping a TaskGroup task cancels its siblings, so
            # per-feed failures are contained here and downgraded to a
            # warning. Nothing is saved for the feed, which keeps it a cache
            # miss: the next call on this run_dir will try it again.
            from digest_generator.shared.logging import logger

            logger.warning(
                "Fetch failed for feed '{}' ({}): {} — skipping; cache miss preserved for retry",
                feed.name,
                type(exc).__name__,
                exc,
            )
        else:
            save_entries(run_dir, feed.name, batch)

    async with asyncio.TaskGroup() as group:
        for feed in feeds:
            group.create_task(_fetch_feed(feed))
157
+
158
+
159
async def summarize(
    *,
    run_dir: Path,
    summarizer: ContentSummarizer | None = None,
) -> None:
    """Summarize every fetched batch and store the result per feed.

    Walks ``iter_fetched(run_dir)`` and, for each feed that has no
    summarized batch yet, awaits ``summarize_entries`` and saves the output
    via ``save_summarized``. Feeds are handled one after another at this
    level.

    Topic labels are not produced here; the label stage writes its own
    artifact and ``api.digest`` joins the two at read time.

    Sampling is materialized once, before the loop, and written back onto
    the summarizer so every per-feed call uses the same configuration. After
    the loop, the collected ``summarizer`` stage telemetry and the sampling
    metadata are written to ``meta.json`` in a single update. That update is
    skipped when the run directory has no ``meta.json``.

    Args:
        run_dir: Run root with ``source-fetched/`` populated.
        summarizer: Injected summarizer (mostly for tests). Defaults to
            a fresh ``ContentSummarizer()``.
    """
    from digest_generator.core.digest.orchestrator import _build_sampling_state
    from digest_generator.core.summary import ContentSummarizer
    from digest_generator.core.summary.io import load_summarized, save_summarized
    from digest_generator.shared.logging import collect_stage_telemetry
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.settings import settings
    from digest_generator.sources.rss.io import iter_fetched

    active = summarizer if summarizer else ContentSummarizer()

    # Resolve sampling here (not in the stage class) so the effective values
    # can be recorded in meta.json even when the caller supplied none, and so
    # all per-feed calls below share one resolved configuration.
    resolved_sampling, sampling_meta = _build_sampling_state(
        user=getattr(active, "_sampling", None),
        model=active.model,
        default_temperature=settings.summarizer_temperature,
        default_top_p=settings.summarizer_top_p,
        default_repetition_penalty=settings.summarizer_repetition_penalty,
        default_seed=settings.summarizer_seed,
    )
    active._sampling = resolved_sampling

    with collect_stage_telemetry() as sink:
        for _content_type, feed_name, entries in iter_fetched(run_dir):
            already_done = load_summarized(run_dir, feed_name) is not None
            if not already_done:
                summaries = await active.summarize_entries(entries, feed=feed_name)
                save_summarized(run_dir, feed_name, summaries)

    if "summarizer" in sink:
        fields = sink["summarizer"]
        # Integer counters that map onto StageMeta fields of the same name.
        counter_names = (
            "duration_ms",
            "llm_calls",
            "prompt_tokens",
            "completion_tokens",
            "llm_duration_ms",
        )
        counters = {name: int(fields.get(name, 0)) for name in counter_names}
        # Everything that is not a first-class StageMeta field goes to extras.
        first_class = {*counter_names, "model"}
        stage_meta = StageMeta(
            **counters,
            model=fields.get("model"),
            extras={key: value for key, value in fields.items() if key not in first_class},
        )
        if (run_dir / "meta.json").exists():
            update_run_meta_telemetry(
                run_dir,
                stages={"summarizer": stage_meta},
                sampling={"summarizer": sampling_meta},
            )
243
+
244
+
245
async def label(
    *,
    run_dir: Path,
    classifier: TopicClassifier | None = None,
) -> None:
    """Classify every fetched batch and store URL-keyed labels per feed.

    Walks ``iter_fetched(run_dir)`` and, for each feed that has no labeled
    batch yet, runs ``classify_entries`` in a worker thread via
    ``asyncio.to_thread`` (the classifier call is blocking) and saves the
    result together with the entry URLs via ``save_labeled``.

    This stage reads only the fetched corpus, so it does not depend on the
    summarizer output and can run alongside it. ``api.digest`` joins the
    labels to the summaries by URL at read time.

    After the loop, the collected ``topic`` stage telemetry is written to
    ``meta.json``. No sampling block is written for this stage, and the
    update is skipped when the run directory has no ``meta.json``.

    Args:
        run_dir: Run root with ``source-fetched/`` populated.
        classifier: Injected classifier (mostly for tests). Defaults to
            a fresh ``TopicClassifier`` from ``model_registry``.
    """
    import asyncio

    from digest_generator.core.label import TopicClassifier
    from digest_generator.core.label.io import load_labeled, save_labeled
    from digest_generator.shared.logging import collect_stage_telemetry
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.transformers.registry import model_registry
    from digest_generator.sources.rss.io import iter_fetched

    active = classifier if classifier else TopicClassifier(model_config=model_registry.topic)

    with collect_stage_telemetry() as sink:
        for _content_type, feed_name, entries in iter_fetched(run_dir):
            already_done = load_labeled(run_dir, feed_name) is not None
            if not already_done:
                # Blocking inference: keep it off the event loop thread.
                labels_per_entry = await asyncio.to_thread(
                    active.classify_entries, entries, feed=feed_name
                )
                save_labeled(
                    run_dir,
                    feed_name,
                    urls=[entry.url for entry in entries],
                    labels_per_entry=labels_per_entry,
                )

    if "topic" in sink:
        fields = sink["topic"]
        # Integer counters that map onto StageMeta fields of the same name.
        counter_names = (
            "duration_ms",
            "llm_calls",
            "prompt_tokens",
            "completion_tokens",
            "llm_duration_ms",
        )
        counters = {name: int(fields.get(name, 0)) for name in counter_names}
        # Everything that is not a first-class StageMeta field goes to extras.
        first_class = {*counter_names, "model"}
        stage_meta = StageMeta(
            **counters,
            model=fields.get("model"),
            extras={key: value for key, value in fields.items() if key not in first_class},
        )
        if (run_dir / "meta.json").exists():
            update_run_meta_telemetry(run_dir, stages={"topic": stage_meta})
317
+
318
+
319
def run(
    feeds: list[Feed],
    filter: Filter,
    *,
    run_dir: Path,
    fetcher: FeedFetcher | None = None,
    summarizer: ContentSummarizer | None = None,
    classifier: TopicClassifier | None = None,
    with_digest: bool = True,
    with_audio: bool = False,
    writer_model: str | None = None,
    editorial_model: str | None = None,
    framer_model: str | None = None,
    watcher_model: str | None = None,
    writer_sampling: SamplingConfig | None = None,
    editorial_sampling: SamplingConfig | None = None,
    framer_sampling: SamplingConfig | None = None,
    watcher_sampling: SamplingConfig | None = None,
    date_range: tuple[str, str] | None = None,
) -> Path:
    """Build the corpus and (optionally) the digest and its audio rendering.

    Composition: ``fetch``, then ``summarize`` and ``label`` concurrently
    (``asyncio.gather``), then ``digest`` (if ``with_digest``), then
    ``render_audio`` (if ``with_audio``). The summarize and label branches
    read the same fetched corpus and write independent on-disk artifacts;
    ``api.digest`` joins them in memory at read time, with no persistent
    merged artifact.

    Run setup (``run_dir`` creation, ``meta.json``, ``run_context``,
    ``llm_telemetry()``, feed resolution) is the caller's responsibility.

    This function drives the async stages with ``asyncio.run``, so it must
    be called from synchronous code, not from inside a running event loop.

    Args:
        feeds: Feeds to fetch / summarize / label.
        filter: Resolved date / limit filter.
        run_dir: Run root for cache files.
        fetcher: Injected fetcher (mostly for tests).
        summarizer: Injected summarizer (mostly for tests).
        classifier: Injected classifier (mostly for tests).
        with_digest: When ``True`` (default), generate the markdown digest
            after the corpus build. Maps to today's ``--no-digest`` CLI flag.
        with_audio: When ``True``, render the digest to audio after the
            digest stage. Default ``False`` so dev iteration stays cheap.
            Requires ``with_digest=True``; the combination
            ``with_audio=True, with_digest=False`` is rejected.
        writer_model / editorial_model / framer_model / watcher_model:
            Override Ollama models for the digest stages.
        writer_sampling / editorial_sampling / framer_sampling / watcher_sampling:
            Sampling overrides for digest stages.
        date_range: Optional ``(since, until)`` date strings for digest period.

    Returns:
        ``run_dir`` (data lives on disk; ``api.digest`` is the canonical
        consumer that joins ``source-summarized/`` + ``source-labeled/``).

    Raises:
        ValueError: If ``with_audio`` is set while ``with_digest`` is not.
    """
    # Validate the flag combination up front: there is no point fetching,
    # summarizing and labeling the whole corpus only to fail afterwards on
    # an argument error that is knowable before any work starts.
    if with_audio and not with_digest:
        msg = "with_audio=True requires with_digest=True (no digest, nothing to narrate)"
        raise ValueError(msg)

    import asyncio

    async def _build_corpus() -> None:
        # Fetch must finish first: both downstream stages read its output.
        await fetch(feeds, filter, run_dir=run_dir, fetcher=fetcher)
        await asyncio.gather(
            summarize(run_dir=run_dir, summarizer=summarizer),
            label(run_dir=run_dir, classifier=classifier),
        )

    asyncio.run(_build_corpus())

    if with_digest:
        digest(
            run_dir=run_dir,
            writer_model=writer_model,
            editorial_model=editorial_model,
            framer_model=framer_model,
            watcher_model=watcher_model,
            writer_sampling=writer_sampling,
            editorial_sampling=editorial_sampling,
            framer_sampling=framer_sampling,
            watcher_sampling=watcher_sampling,
            date_range=date_range,
        )

    if with_audio:
        render_audio(run_dir=run_dir)

    return run_dir
406
+
407
+
408
def _load_digest_input(run_dir: Path) -> dict[str, list[dict[str, Any]]]:
    """Assemble the per-feed merged article rows that ``api.digest`` consumes.

    Three on-disk sources are combined:

    * ``iter_summarized`` supplies the article dicts (the base of each row);
    * ``load_labeled`` supplies URL-keyed topic labels for the feed;
    * ``iter_fetched`` supplies the content type recorded for the feed.

    Each returned row is the summarized article plus two keys:
    ``content_type`` (``None`` when the feed has no fetched batch) and
    ``topics``, a mapping of label value to confidence (empty when the feed
    has no labeled batch or the article's URL has no labels).

    Returns:
        ``{feed_name: [merged_dict, ...]}`` with one key per summarized feed.
    """
    from digest_generator.core.label.io import load_labeled
    from digest_generator.core.summary.io import iter_summarized
    from digest_generator.sources.rss.io import iter_fetched

    # Feed name -> content type (stringified), taken from the fetched batches.
    tier_by_feed: dict[str, str] = {}
    for tier, fetched_feed, _entries in iter_fetched(run_dir):
        tier_by_feed[fetched_feed] = str(tier)

    merged_by_feed: dict[str, list[dict[str, Any]]] = {}
    for _src, feed_name, summarized_articles in iter_summarized(run_dir):
        # A missing labeled batch degrades to "no topics", not to a dropped feed.
        labels_by_url = load_labeled(run_dir, feed_name) or {}
        content_type = tier_by_feed.get(feed_name)
        merged_by_feed[feed_name] = [
            {
                **article,
                "content_type": content_type,
                "topics": {
                    lbl.value: lbl.confidence
                    for lbl in labels_by_url.get(article.get("url", ""), [])
                },
            }
            for article in summarized_articles
        ]
    return merged_by_feed
447
+
448
+
449
def digest(
    *,
    run_dir: Path,
    writer_model: str | None = None,
    editorial_model: str | None = None,
    framer_model: str | None = None,
    watcher_model: str | None = None,
    writer_sampling: SamplingConfig | None = None,
    editorial_sampling: SamplingConfig | None = None,
    framer_sampling: SamplingConfig | None = None,
    watcher_sampling: SamplingConfig | None = None,
    date_range: tuple[str, str] | None = None,
) -> DigestResult | None:
    """Generate the markdown digest for the articles stored in a run dir.

    Loads the merged article view with ``_load_digest_input`` (summaries
    joined with labels by URL) and hands it to ``run_digest_from_json``
    together with the model / sampling overrides. The whole operation runs
    inside ``run_context(run_dir.name, run_dir)``.

    Args:
        run_dir: Run directory containing ``source-summarized/<feed>.json``
            and ``source-labeled/<feed>.json``.
        writer_model / editorial_model / framer_model / watcher_model:
            Override Ollama models for digest stages.
        writer_sampling / editorial_sampling / framer_sampling / watcher_sampling:
            Sampling overrides.
        date_range: Optional ``(since, until)`` strings for the digest period.

    Returns:
        The result of ``run_digest_from_json``, or ``None`` (after logging a
        warning) when no summarized articles were found.
    """
    from digest_generator.core.digest.orchestrator import run_digest_from_json
    from digest_generator.shared.logging import logger, run_context

    with run_context(run_dir.name, run_dir):
        articles_by_feed = _load_digest_input(run_dir)
        if not articles_by_feed:
            logger.warning("No summarized files found in {}", run_dir / "source-summarized")
            return None

        article_count = sum(map(len, articles_by_feed.values()))
        logger.info("Loaded {} articles from {} feeds", article_count, len(articles_by_feed))

        # Per-stage overrides are forwarded untouched; ``None`` lets the
        # orchestrator fall back to its own defaults.
        stage_overrides: dict[str, Any] = {
            "writer_model": writer_model,
            "editorial_model": editorial_model,
            "framer_model": framer_model,
            "watcher_model": watcher_model,
            "writer_sampling": writer_sampling,
            "editorial_sampling": editorial_sampling,
            "framer_sampling": framer_sampling,
            "watcher_sampling": watcher_sampling,
            "date_range": date_range,
        }
        return run_digest_from_json(articles_by_feed, run_dir=run_dir, **stage_overrides)
503
+
504
+
505
def render_audio(
    *,
    run_dir: Path,
    bitrate_kbps: int | None = None,
    renderer: AudioRenderer | None = None,
) -> Path:
    """Render the digest found in ``run_dir`` to an audio artifact.

    The digest markdown is located with ``find_digest_md`` and passed to the
    renderer's ``render`` method inside a ``log_stage("audio")`` span. The
    artifact's voice, bitrate, narration size, audio size, duration and
    cache flag are recorded on that span.

    When the run has a ``meta.json``, the stage telemetry is written to its
    ``stages.audio`` block together with ``models.audio =
    "piper:<voice_id>"``. A ``real_time_factor`` (seconds of audio per
    second of wall time) is added unless the artifact came from cache.
    Without a ``meta.json`` the telemetry write is skipped.

    Unless a ``renderer`` is injected, one is built from
    ``voice_registry.default`` and the audio settings, with ``bitrate_kbps``
    taking precedence over ``settings.audio_bitrate_kbps``. An injected
    renderer is used as-is, so ``bitrate_kbps`` has no effect on it.

    Args:
        run_dir: Run directory holding the digest ``.md`` deliverable.
        bitrate_kbps: Override for the configured bitrate.
        renderer: Injected renderer (mostly for tests).

    Returns:
        The artifact's ``opus_path``.

    Raises:
        FileNotFoundError: No ``.md`` deliverable at the run root.
        ValueError: Multiple ``.md`` files at the run root (ambiguous).
    """
    from digest_generator.core.audio.io import find_digest_md
    from digest_generator.core.audio.renderer import AudioRenderer as _AudioRenderer
    from digest_generator.shared.logging import collect_stage_telemetry, log_stage
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.settings import settings
    from digest_generator.shared.tts.registry import voice_registry

    digest_md = find_digest_md(run_dir)

    if bitrate_kbps is None:
        effective_bitrate = settings.audio_bitrate_kbps
    else:
        effective_bitrate = bitrate_kbps

    engine = renderer
    if engine is None:
        engine = _AudioRenderer(
            voice=voice_registry.default,
            bitrate_kbps=effective_bitrate,
            sentence_silence_s=settings.audio_sentence_silence_s,
            ffmpeg_path=settings.audio_ffmpeg_path,
        )

    with collect_stage_telemetry() as sink:
        with log_stage("audio") as span:
            artifact = engine.render(run_dir, digest_md)
            span.set(
                voice=artifact.voice_id,
                bitrate_kbps=artifact.bitrate_kbps,
                narration_chars=artifact.narration_chars,
                audio_bytes=artifact.audio_bytes,
                audio_duration_s=round(artifact.audio_duration_s, 1),
                cached=artifact.cached,
            )

    # No telemetry captured, or no meta.json to write it into: nothing to do.
    if "audio" not in sink or not (run_dir / "meta.json").exists():
        return artifact.opus_path

    fields = dict(sink["audio"])
    elapsed_ms = int(fields.get("duration_ms", 0))

    # real_time_factor > 1 means synthesis ran faster than playback. On a
    # cache hit no synthesis happened and the elapsed time only reflects the
    # cache lookup, so the ratio would be meaningless and is left out.
    if not artifact.cached and elapsed_ms > 0:
        wall_seconds = elapsed_ms / 1000.0
        fields["real_time_factor"] = round(artifact.audio_duration_s / wall_seconds, 1)

    # Keys that are first-class StageMeta fields stay out of ``extras``.
    first_class = {
        "duration_ms",
        "llm_calls",
        "prompt_tokens",
        "completion_tokens",
        "llm_duration_ms",
        "model",
    }
    extras = {key: value for key, value in fields.items() if key not in first_class}
    update_run_meta_telemetry(
        run_dir,
        models={"audio": f"piper:{artifact.voice_id}"},
        stages={"audio": StageMeta(duration_ms=elapsed_ms, extras=extras)},
    )

    return artifact.opus_path
603
+
604
+
605
def list_feeds(
    content_types: list[str] | None = None,
    *,
    feeds_file: str | None = None,
    config_dir: str | None = None,
) -> list[Feed]:
    """Return the configured feeds, optionally restricted to content types.

    Without ``content_types`` (``None`` or empty) the full configured list
    is returned straight from the loader. With ``content_types`` the call is
    delegated to ``resolve_feeds``, which also validates the given values.

    Args:
        content_types: Content type values to filter by.
        feeds_file: Explicit ``feeds.yaml`` path; overrides discovery.
        config_dir: Config directory holding ``feeds.yaml``; overrides discovery.

    Returns:
        List of ``Feed`` objects.

    Raises:
        FeedsConfigError: If no feeds file is found or it is invalid.
    """
    if not content_types:
        from digest_generator.sources.rss.config import load_configured_feeds

        return load_configured_feeds(feeds_file=feeds_file, config_dir=config_dir)

    return resolve_feeds(
        content_types=content_types,
        feeds_file=feeds_file,
        config_dir=config_dir,
    )