digest-generator 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- digest_generator/__init__.py +0 -0
- digest_generator/api.py +631 -0
- digest_generator/cli.py +907 -0
- digest_generator/core/__init__.py +0 -0
- digest_generator/core/audio/__init__.py +12 -0
- digest_generator/core/audio/io.py +128 -0
- digest_generator/core/audio/narration.py +347 -0
- digest_generator/core/audio/narration_overrides.yaml +210 -0
- digest_generator/core/audio/renderer.py +156 -0
- digest_generator/core/audio/types.py +50 -0
- digest_generator/core/categories.py +91 -0
- digest_generator/core/digest/__init__.py +4 -0
- digest_generator/core/digest/io.py +255 -0
- digest_generator/core/digest/orchestrator.py +519 -0
- digest_generator/core/digest/prompts/__init__.py +23 -0
- digest_generator/core/digest/prompts/templates/cluster_system.md +73 -0
- digest_generator/core/digest/prompts/templates/editorial_pass_system.md +80 -0
- digest_generator/core/digest/prompts/templates/intro_system.md +70 -0
- digest_generator/core/digest/prompts/templates/section_merge_system.md +53 -0
- digest_generator/core/digest/prompts/templates/section_system.md +80 -0
- digest_generator/core/digest/prompts/templates/title_system.md +38 -0
- digest_generator/core/digest/prompts/templates/watch_system.md +71 -0
- digest_generator/core/digest/stages/__init__.py +0 -0
- digest_generator/core/digest/stages/clusterer.py +513 -0
- digest_generator/core/digest/stages/composer.py +111 -0
- digest_generator/core/digest/stages/editorial.py +233 -0
- digest_generator/core/digest/stages/framer.py +256 -0
- digest_generator/core/digest/stages/watcher.py +237 -0
- digest_generator/core/digest/stages/writer.py +375 -0
- digest_generator/core/digest/types.py +94 -0
- digest_generator/core/label/__init__.py +16 -0
- digest_generator/core/label/io.py +140 -0
- digest_generator/core/label/stages/__init__.py +8 -0
- digest_generator/core/label/stages/topic.py +174 -0
- digest_generator/core/prompt_loader.py +58 -0
- digest_generator/core/style.py +187 -0
- digest_generator/core/summary/__init__.py +11 -0
- digest_generator/core/summary/io.py +123 -0
- digest_generator/core/summary/prompts/__init__.py +25 -0
- digest_generator/core/summary/prompts/templates/article_summary_system.md +38 -0
- digest_generator/core/summary/stages/__init__.py +8 -0
- digest_generator/core/summary/stages/summarizer.py +175 -0
- digest_generator/core/types.py +205 -0
- digest_generator/feeds.example.yaml +41 -0
- digest_generator/py.typed +0 -0
- digest_generator/shared/__init__.py +0 -0
- digest_generator/shared/hf_hub.py +31 -0
- digest_generator/shared/llm/__init__.py +1 -0
- digest_generator/shared/llm/clients.py +83 -0
- digest_generator/shared/llm/sampling.py +256 -0
- digest_generator/shared/llm/telemetry.py +317 -0
- digest_generator/shared/llm/typography.py +27 -0
- digest_generator/shared/logging.py +403 -0
- digest_generator/shared/runtime/__init__.py +18 -0
- digest_generator/shared/runtime/dirs.py +45 -0
- digest_generator/shared/runtime/meta.py +349 -0
- digest_generator/shared/settings.py +205 -0
- digest_generator/shared/transformers/__init__.py +22 -0
- digest_generator/shared/transformers/registry.py +40 -0
- digest_generator/shared/transformers/types.py +84 -0
- digest_generator/shared/tts/__init__.py +19 -0
- digest_generator/shared/tts/engine.py +233 -0
- digest_generator/shared/tts/registry.py +107 -0
- digest_generator/shared/tts/types.py +39 -0
- digest_generator/sources/__init__.py +9 -0
- digest_generator/sources/rss/config.py +267 -0
- digest_generator/sources/rss/fetcher.py +263 -0
- digest_generator/sources/rss/io.py +154 -0
- digest_generator/sources/rss/types.py +71 -0
- digest_generator-0.1.0.dist-info/METADATA +148 -0
- digest_generator-0.1.0.dist-info/RECORD +75 -0
- digest_generator-0.1.0.dist-info/WHEEL +4 -0
- digest_generator-0.1.0.dist-info/entry_points.txt +2 -0
- digest_generator-0.1.0.dist-info/licenses/LICENSE.md +202 -0
- digest_generator-0.1.0.dist-info/licenses/NOTICE.md +7 -0
|
File without changes
|
digest_generator/api.py
ADDED
|
@@ -0,0 +1,631 @@
|
|
|
1
|
+
"""Public programmatic API for the digest-generator pipeline.
|
|
2
|
+
|
|
3
|
+
Six entry points: three async per-stage primitives (``fetch``,
|
|
4
|
+
``summarize``, ``label``), the sync digest pipeline (``digest``), the
|
|
5
|
+
``run`` composition, and the audio renderer (``render_audio``). Each
|
|
6
|
+
per-stage primitive is independently invocable and cache-aware: re-running on
|
|
7
|
+
the same ``run_dir`` skips work whose batch file already exists.
|
|
8
|
+
|
|
9
|
+
Run setup (``run_dir`` creation, ``run_context``, ``log_stage``,
|
|
10
|
+
``llm_telemetry``, ``meta.json``) is the caller's responsibility.
|
|
11
|
+
``digest_generator.cli`` handles all of that for command-line use; programmatic
|
|
12
|
+
callers stitch together what they need.
|
|
13
|
+
|
|
14
|
+
Usage::
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from digest_generator.api import resolve_feeds, run, digest
|
|
20
|
+
from digest_generator.core.types import Filter
|
|
21
|
+
|
|
22
|
+
feeds = resolve_feeds(content_types=["ai"])
|
|
23
|
+
filter = Filter.resolve(days_back=7)
|
|
24
|
+
run_dir = Path("output/myrun")
|
|
25
|
+
run_dir.mkdir(parents=True, exist_ok=True)
|
|
26
|
+
|
|
27
|
+
run(feeds, filter, run_dir=run_dir, with_digest=True)
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import TYPE_CHECKING, Any
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from digest_generator.core.audio.renderer import AudioRenderer
|
|
37
|
+
from digest_generator.core.digest.types import DigestResult
|
|
38
|
+
from digest_generator.core.label import TopicClassifier
|
|
39
|
+
from digest_generator.core.summary import ContentSummarizer
|
|
40
|
+
from digest_generator.core.types import Filter
|
|
41
|
+
from digest_generator.shared.llm.sampling import SamplingConfig
|
|
42
|
+
from digest_generator.sources.rss.fetcher import FeedFetcher
|
|
43
|
+
from digest_generator.sources.rss.types import Feed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def resolve_feeds(
    content_types: list[str] | None = None,
    feed_names: list[str] | None = None,
    *,
    feeds_file: str | None = None,
    config_dir: str | None = None,
) -> list[Feed]:
    """Load the configured feeds and narrow them by content type and/or name.

    Both filters are optional and combine with AND semantics: a feed must
    pass every filter that was supplied. A filter that is ``None`` or empty
    is not applied at all.

    Args:
        content_types: Content type ids to keep (e.g., ``["ai", "security"]``).
            Every value must be one of the configured category ids.
        feed_names: Feed names to keep (e.g., ``["openai-news"]``). Every
            value must name a configured feed; names are validated against
            the full configured set, not against the content-type-filtered
            subset.
        feeds_file: Explicit ``feeds.yaml`` path; overrides discovery.
        config_dir: Config directory holding ``feeds.yaml``; overrides discovery.

    Returns:
        The ``Feed`` objects that pass every supplied filter, in the order
        the config loader returned them.

    Raises:
        FeedsConfigError: If no feeds file is found or it is invalid
            (raised by the config loaders, not by this function directly).
        ValueError: If a content type or feed name is unknown, or if the
            filters leave no feed at all.
    """
    from digest_generator.sources.rss.config import (
        load_configured_categories,
        load_configured_feeds,
    )

    all_feeds = load_configured_feeds(feeds_file=feeds_file, config_dir=config_dir)
    result = all_feeds

    if content_types:
        valid = load_configured_categories(feeds_file=feeds_file, config_dir=config_dir).id_set()
        for ct in content_types:
            if ct not in valid:
                msg = f"Unknown content type '{ct}'. Valid: {', '.join(sorted(valid))}"
                raise ValueError(msg)
        result = [f for f in result if f.content_type in content_types]

    if feed_names:
        # Validate against every configured feed so a typo is reported as
        # "unknown" rather than as an empty match after type filtering.
        known = {f.name for f in all_feeds}
        for name in feed_names:
            if name not in known:
                msg = f"Unknown feed '{name}'."
                raise ValueError(msg)
        # Build the lookup set once instead of once per candidate feed.
        wanted = set(feed_names)
        result = [f for f in result if f.name in wanted]

    if not result:
        msg = "No feeds matched the given filters."
        raise ValueError(msg)

    return result
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def fetch(
    feeds: list[Feed],
    filter: Filter,
    *,
    run_dir: Path,
    fetcher: FeedFetcher | None = None,
) -> None:
    """Fetch entries from each feed and persist one batch per feed.

    One task per feed is spawned in an ``asyncio.TaskGroup``; the number of
    fetches in flight is capped by a semaphore sized from
    ``settings.fetch_concurrency``. A feed for which ``load_entries`` already
    returns a batch is skipped, so re-running against the same ``run_dir``
    only fetches feeds that have not landed yet.

    A fetch that raises is logged as a warning and skipped; no batch is
    saved for it, so the next run sees a cache miss and retries that feed.

    Args:
        feeds: Feeds to fetch.
        filter: Resolved date / limit filter, passed through to the fetcher.
        run_dir: Run root under which batches are loaded and saved.
        fetcher: Injected fetcher (mostly for tests). Defaults to a fresh
            ``FeedFetcher()`` when ``None``.
    """
    import asyncio

    from digest_generator.shared.settings import settings
    from digest_generator.sources.rss.fetcher import FeedFetcher
    from digest_generator.sources.rss.io import load_entries, save_entries

    # Explicit ``None`` check: an injected fetcher that happens to be falsy
    # (e.g. a test double defining ``__len__`` or ``__bool__``) must not be
    # silently replaced by a real, network-hitting ``FeedFetcher``.
    fetcher_obj = fetcher if fetcher is not None else FeedFetcher()
    semaphore = asyncio.Semaphore(settings.fetch_concurrency)

    async def _one(feed: Feed) -> None:
        """Fetch and save a single feed unless its batch is already cached."""
        if load_entries(run_dir, feed.name) is not None:
            return  # cache hit, skip
        try:
            async with semaphore:
                entries = await fetcher_obj.fetch_entries(feed, filter)
        except Exception as e:
            # One bad feed must not kill the run. ``asyncio.TaskGroup`` cancels
            # every sibling task on an unhandled exception, so per-feed failures
            # (truncated responses, SSL handshake errors, malformed XML) get
            # swallowed here and surface as a logged warning instead. The lack
            # of a saved batch leaves the feed cache-miss-able on the next
            # run, so a transient failure is automatically retryable.
            from digest_generator.shared.logging import logger

            logger.warning(
                "Fetch failed for feed '{}' ({}): {} — skipping; cache miss preserved for retry",
                feed.name,
                type(e).__name__,
                e,
            )
            return
        # Saving happens outside the ``try`` on purpose: a persistence error
        # is not a per-feed network failure and is allowed to propagate.
        save_entries(run_dir, feed.name, entries)

    async with asyncio.TaskGroup() as tg:
        for feed in feeds:
            tg.create_task(_one(feed))
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
async def summarize(
    *,
    run_dir: Path,
    summarizer: ContentSummarizer | None = None,
) -> None:
    """Summarize every fetched batch that has no summarized batch yet.

    Walks ``iter_fetched(run_dir)`` feed by feed, skips feeds for which
    ``load_summarized`` already returns data, and awaits
    ``summarize_entries`` for the rest, saving each result with
    ``save_summarized``. Feeds are processed sequentially at this layer;
    per the original design notes, in-flight LLM concurrency is bounded
    inside the summarizer itself (``summarizer_concurrency``).

    Topic labels are not produced here; the label stage writes its own
    artifacts and ``api.digest`` joins the two at read time.

    Sampling parameters are materialized once per call and written onto the
    summarizer, so every per-feed call shares them. Stage telemetry and the
    materialized sampling are written to ``meta.json`` once, after the loop,
    and only if ``meta.json`` already exists and the ``"summarizer"`` stage
    reported telemetry.

    Args:
        run_dir: Run root holding the fetched batches.
        summarizer: Injected summarizer (mostly for tests). Defaults to a
            fresh ``ContentSummarizer()`` when ``None``. Note that its
            ``_sampling`` attribute is overwritten by this call.
    """
    from digest_generator.core.digest.orchestrator import _build_sampling_state
    from digest_generator.core.summary import ContentSummarizer
    from digest_generator.core.summary.io import load_summarized, save_summarized
    from digest_generator.shared.logging import collect_stage_telemetry
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.settings import settings
    from digest_generator.sources.rss.io import iter_fetched

    # Explicit ``None`` check so a falsy-but-valid injected summarizer is not
    # silently swapped for a real one that would call the LLM backend.
    summarizer_obj = summarizer if summarizer is not None else ContentSummarizer()

    # Materialize sampling once for the run so repeated ``summarize_entries``
    # calls (one per feed) all share the same seed, and that seed lands in
    # ``meta.json`` even if the user didn't pass one. Materialization here
    # rather than inside ``ContentSummarizer.__init__`` keeps the stage
    # class agnostic to meta-file plumbing.
    user_sampling = getattr(summarizer_obj, "_sampling", None)
    materialized_sampling, sampling_meta = _build_sampling_state(
        user=user_sampling,
        model=summarizer_obj.model,
        default_temperature=settings.summarizer_temperature,
        default_top_p=settings.summarizer_top_p,
        default_repetition_penalty=settings.summarizer_repetition_penalty,
        default_seed=settings.summarizer_seed,
    )
    summarizer_obj._sampling = materialized_sampling

    with collect_stage_telemetry() as sink:
        for _content_type, feed_name, entries in iter_fetched(run_dir):
            if load_summarized(run_dir, feed_name) is not None:
                continue  # cache hit, skip
            summaries = await summarizer_obj.summarize_entries(entries, feed=feed_name)
            save_summarized(run_dir, feed_name, summaries)

    # No telemetry for this stage (e.g. every feed was a cache hit).
    if "summarizer" not in sink:
        return

    fields = sink["summarizer"]
    # Keys with a dedicated ``StageMeta`` field; everything else goes to extras.
    common_keys = {
        "duration_ms",
        "llm_calls",
        "prompt_tokens",
        "completion_tokens",
        "llm_duration_ms",
        "model",
    }
    stage_meta = StageMeta(
        duration_ms=int(fields.get("duration_ms", 0)),
        llm_calls=int(fields.get("llm_calls", 0)),
        prompt_tokens=int(fields.get("prompt_tokens", 0)),
        completion_tokens=int(fields.get("completion_tokens", 0)),
        llm_duration_ms=int(fields.get("llm_duration_ms", 0)),
        model=fields.get("model"),
        extras={k: v for k, v in fields.items() if k not in common_keys},
    )
    # Programmatic callers may run without a ``meta.json``; skip quietly then.
    if (run_dir / "meta.json").exists():
        update_run_meta_telemetry(
            run_dir,
            stages={"summarizer": stage_meta},
            sampling={"summarizer": sampling_meta},
        )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
async def label(
    *,
    run_dir: Path,
    classifier: TopicClassifier | None = None,
) -> None:
    """Classify every fetched batch that has no labeled batch yet.

    Walks ``iter_fetched(run_dir)`` feed by feed, skips feeds for which
    ``load_labeled`` already returns data, and runs
    ``classifier.classify_entries`` in a worker thread via
    ``asyncio.to_thread`` so the blocking inference does not stall the event
    loop. Results are saved together with the entry URLs so that
    ``api.digest`` can join labels to summaries by URL at read time.

    This stage reads only the fetched batches, so it can run concurrently
    with ``summarize`` against the same ``run_dir``.

    Stage telemetry is written to ``meta.json`` under the ``"topic"`` stage,
    once, after the loop, and only if ``meta.json`` already exists and the
    stage reported telemetry. No sampling block is written for this stage.

    Args:
        run_dir: Run root holding the fetched batches.
        classifier: Injected classifier (mostly for tests). Defaults to a
            fresh ``TopicClassifier`` built from ``model_registry.topic``
            when ``None``.
    """
    import asyncio

    from digest_generator.core.label import TopicClassifier
    from digest_generator.core.label.io import load_labeled, save_labeled
    from digest_generator.shared.logging import collect_stage_telemetry
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.transformers.registry import model_registry
    from digest_generator.sources.rss.io import iter_fetched

    # Explicit ``None`` check so a falsy-but-valid injected classifier is not
    # silently swapped for a real model-backed one.
    if classifier is not None:
        classifier_obj = classifier
    else:
        classifier_obj = TopicClassifier(model_config=model_registry.topic)

    with collect_stage_telemetry() as sink:
        for _content_type, feed_name, entries in iter_fetched(run_dir):
            if load_labeled(run_dir, feed_name) is not None:
                continue  # cache hit, skip
            labels_per_entry = await asyncio.to_thread(
                classifier_obj.classify_entries, entries, feed=feed_name
            )
            urls = [e.url for e in entries]
            save_labeled(
                run_dir,
                feed_name,
                urls=urls,
                labels_per_entry=labels_per_entry,
            )

    # No telemetry for this stage (e.g. every feed was a cache hit).
    if "topic" not in sink:
        return

    fields = sink["topic"]
    # Keys with a dedicated ``StageMeta`` field; everything else goes to extras.
    common_keys = {
        "duration_ms",
        "llm_calls",
        "prompt_tokens",
        "completion_tokens",
        "llm_duration_ms",
        "model",
    }
    stage_meta = StageMeta(
        duration_ms=int(fields.get("duration_ms", 0)),
        llm_calls=int(fields.get("llm_calls", 0)),
        prompt_tokens=int(fields.get("prompt_tokens", 0)),
        completion_tokens=int(fields.get("completion_tokens", 0)),
        llm_duration_ms=int(fields.get("llm_duration_ms", 0)),
        model=fields.get("model"),
        extras={k: v for k, v in fields.items() if k not in common_keys},
    )
    # Programmatic callers may run without a ``meta.json``; skip quietly then.
    if (run_dir / "meta.json").exists():
        update_run_meta_telemetry(run_dir, stages={"topic": stage_meta})
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def run(
    feeds: list[Feed],
    filter: Filter,
    *,
    run_dir: Path,
    fetcher: FeedFetcher | None = None,
    summarizer: ContentSummarizer | None = None,
    classifier: TopicClassifier | None = None,
    with_digest: bool = True,
    with_audio: bool = False,
    writer_model: str | None = None,
    editorial_model: str | None = None,
    framer_model: str | None = None,
    watcher_model: str | None = None,
    writer_sampling: SamplingConfig | None = None,
    editorial_sampling: SamplingConfig | None = None,
    framer_sampling: SamplingConfig | None = None,
    watcher_sampling: SamplingConfig | None = None,
    date_range: tuple[str, str] | None = None,
) -> Path:
    """Build the corpus and, optionally, the digest and its audio rendering.

    Composition: ``fetch`` first, then ``summarize`` and ``label``
    concurrently via ``asyncio.gather``, then ``digest`` (when
    ``with_digest``), then ``render_audio`` (when ``with_audio``). The
    summarize and label branches write independent on-disk artifacts;
    ``api.digest`` joins them in memory at read time.

    The corpus build is driven by ``asyncio.run``, so this function must be
    called from synchronous code; calling it from inside a running event
    loop raises ``RuntimeError``.

    Run setup (``run_dir`` creation, ``meta.json``, ``run_context``,
    ``llm_telemetry()``, feed resolution) is the caller's responsibility.

    Args:
        feeds: Feeds to fetch / summarize / label.
        filter: Resolved date / limit filter.
        run_dir: Run root for cache files.
        fetcher: Injected fetcher (mostly for tests).
        summarizer: Injected summarizer (mostly for tests).
        classifier: Injected classifier (mostly for tests).
        with_digest: When ``True`` (default), generate the markdown digest
            after the corpus build.
        with_audio: When ``True``, render the digest to audio after the
            digest stage. Defaults to ``False``. Requires
            ``with_digest=True``; the combination ``with_audio=True,
            with_digest=False`` is rejected.
        writer_model / editorial_model / framer_model / watcher_model:
            Model overrides forwarded to the digest stages.
        writer_sampling / editorial_sampling / framer_sampling / watcher_sampling:
            Sampling overrides forwarded to the digest stages.
        date_range: Optional ``(since, until)`` date strings for the digest
            period.

    Returns:
        ``run_dir``; all produced data lives on disk beneath it.

    Raises:
        ValueError: If ``with_audio`` is ``True`` while ``with_digest`` is
            ``False``. Raised before any fetching or LLM work starts.
    """
    # Fail fast: reject the impossible flag combination before spending time
    # on the fetch / summarize / label build, instead of after it.
    if with_audio and not with_digest:
        msg = "with_audio=True requires with_digest=True (no digest, nothing to narrate)"
        raise ValueError(msg)

    import asyncio

    async def _build_corpus() -> None:
        """Fetch, then summarize and label concurrently over the same corpus."""
        await fetch(feeds, filter, run_dir=run_dir, fetcher=fetcher)
        await asyncio.gather(
            summarize(run_dir=run_dir, summarizer=summarizer),
            label(run_dir=run_dir, classifier=classifier),
        )

    asyncio.run(_build_corpus())

    if with_digest:
        digest(
            run_dir=run_dir,
            writer_model=writer_model,
            editorial_model=editorial_model,
            framer_model=framer_model,
            watcher_model=watcher_model,
            writer_sampling=writer_sampling,
            editorial_sampling=editorial_sampling,
            framer_sampling=framer_sampling,
            watcher_sampling=watcher_sampling,
            date_range=date_range,
        )

    if with_audio:
        render_audio(run_dir=run_dir)

    return run_dir
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _load_digest_input(run_dir: Path) -> dict[str, list[dict[str, Any]]]:
    """Assemble the per-feed merged article dicts that ``api.digest`` consumes.

    For every feed yielded by ``iter_summarized``, each summarized article
    dict is copied and extended with two keys:

    * ``"content_type"``: the stringified content type that ``iter_fetched``
      reported for that feed, or ``None`` if the feed has no fetched batch.
    * ``"topics"``: a ``{label.value: label.confidence}`` mapping built from
      the labels stored for the article's URL, or ``{}`` when the feed has
      no labeled batch or the URL is not in it.

    Feeds lacking a labeled or fetched batch are therefore still included
    rather than dropped.

    Args:
        run_dir: Run root holding the fetched, summarized and labeled batches.

    Returns:
        ``{feed_name: [merged_article_dict, ...]}``, one key per summarized feed.
    """
    from digest_generator.core.label.io import load_labeled
    from digest_generator.core.summary.io import iter_summarized
    from digest_generator.sources.rss.io import iter_fetched

    # Feed name -> content type, taken from the fetched batches.
    content_type_by_feed: dict[str, str] = {}
    for fetched_type, fetched_feed, _entries in iter_fetched(run_dir):
        content_type_by_feed[fetched_feed] = str(fetched_type)

    merged_by_feed: dict[str, list[dict[str, Any]]] = {}
    for _src, feed_name, articles in iter_summarized(run_dir):
        url_to_labels = load_labeled(run_dir, feed_name) or {}
        feed_type = content_type_by_feed.get(feed_name)

        merged_by_feed[feed_name] = [
            {
                **article,
                "content_type": feed_type,
                "topics": {
                    topic.value: topic.confidence
                    for topic in url_to_labels.get(article.get("url", ""), [])
                },
            }
            for article in articles
        ]

    return merged_by_feed
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def digest(
    *,
    run_dir: Path,
    writer_model: str | None = None,
    editorial_model: str | None = None,
    framer_model: str | None = None,
    watcher_model: str | None = None,
    writer_sampling: SamplingConfig | None = None,
    editorial_sampling: SamplingConfig | None = None,
    framer_sampling: SamplingConfig | None = None,
    watcher_sampling: SamplingConfig | None = None,
    date_range: tuple[str, str] | None = None,
) -> DigestResult | None:
    """Generate the markdown digest for the run stored at ``run_dir``.

    Builds the merged per-feed article view with ``_load_digest_input``
    (summaries joined to labels by URL, in memory) and hands it to
    ``run_digest_from_json`` together with all model / sampling overrides.
    The whole call runs inside ``run_context(run_dir.name, run_dir)``.

    Args:
        run_dir: Run directory holding the summarized and labeled batches.
        writer_model / editorial_model / framer_model / watcher_model:
            Model overrides forwarded unchanged to the digest stages.
        writer_sampling / editorial_sampling / framer_sampling / watcher_sampling:
            Sampling overrides forwarded unchanged to the digest stages.
        date_range: Optional ``(since, until)`` strings for the digest period.

    Returns:
        Whatever ``run_digest_from_json`` returns, or ``None`` (after logging
        a warning) when there are no summarized feeds to digest.
    """
    from digest_generator.core.digest.orchestrator import run_digest_from_json
    from digest_generator.shared.logging import logger, run_context

    with run_context(run_dir.name, run_dir):
        articles_by_feed = _load_digest_input(run_dir)

        if articles_by_feed:
            article_count = sum(map(len, articles_by_feed.values()))
            logger.info("Loaded {} articles from {} feeds", article_count, len(articles_by_feed))

            return run_digest_from_json(
                articles_by_feed,
                run_dir=run_dir,
                writer_model=writer_model,
                editorial_model=editorial_model,
                framer_model=framer_model,
                watcher_model=watcher_model,
                writer_sampling=writer_sampling,
                editorial_sampling=editorial_sampling,
                framer_sampling=framer_sampling,
                watcher_sampling=watcher_sampling,
                date_range=date_range,
            )

        # Nothing summarized: there is nothing to digest.
        logger.warning("No summarized files found in {}", run_dir / "source-summarized")
        return None
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def render_audio(
    *,
    run_dir: Path,
    bitrate_kbps: int | None = None,
    renderer: AudioRenderer | None = None,
) -> Path:
    """Render the digest found in ``run_dir`` to an audio artifact.

    Locates the digest markdown with ``find_digest_md``, renders it with the
    given (or a default-built) ``AudioRenderer`` inside a ``log_stage("audio")``
    span, and returns the path of the resulting Opus file. Per the original
    design notes the renderer itself short-circuits on a cache hit; this
    function only reads the ``cached`` flag from the returned artifact.

    When no renderer is injected, one is built from ``voice_registry.default``
    and the audio settings (``audio_sentence_silence_s``, ``audio_ffmpeg_path``),
    using ``bitrate_kbps`` if given and ``settings.audio_bitrate_kbps``
    otherwise. When a renderer is injected, ``bitrate_kbps`` is not applied
    to it.

    If the ``"audio"`` stage reported telemetry and ``meta.json`` exists, the
    stage block and ``models.audio = "piper:<voice_id>"`` are written to
    ``meta.json``; otherwise telemetry is skipped silently.

    Args:
        run_dir: Run directory holding the digest ``.md`` deliverable.
        bitrate_kbps: Bitrate override for the default-built renderer.
        renderer: Injected renderer (mostly for tests).

    Returns:
        ``artifact.opus_path`` as reported by the renderer.

    Raises:
        FileNotFoundError: No ``.md`` deliverable at the run root
            (presumably raised by ``find_digest_md``; confirm there).
        ValueError: Multiple ``.md`` files at the run root
            (presumably raised by ``find_digest_md``; confirm there).
    """
    from digest_generator.core.audio.io import find_digest_md
    from digest_generator.core.audio.renderer import AudioRenderer as _AudioRenderer
    from digest_generator.shared.logging import collect_stage_telemetry, log_stage
    from digest_generator.shared.runtime.meta import StageMeta, update_run_meta_telemetry
    from digest_generator.shared.settings import settings
    from digest_generator.shared.tts.registry import voice_registry

    digest_md = find_digest_md(run_dir)

    if bitrate_kbps is None:
        effective_bitrate = settings.audio_bitrate_kbps
    else:
        effective_bitrate = bitrate_kbps

    active_renderer = renderer
    if active_renderer is None:
        active_renderer = _AudioRenderer(
            voice=voice_registry.default,
            bitrate_kbps=effective_bitrate,
            sentence_silence_s=settings.audio_sentence_silence_s,
            ffmpeg_path=settings.audio_ffmpeg_path,
        )

    with collect_stage_telemetry() as sink, log_stage("audio") as span:
        artifact = active_renderer.render(run_dir, digest_md)
        span.set(
            voice=artifact.voice_id,
            bitrate_kbps=artifact.bitrate_kbps,
            narration_chars=artifact.narration_chars,
            audio_bytes=artifact.audio_bytes,
            audio_duration_s=round(artifact.audio_duration_s, 1),
            cached=artifact.cached,
        )

    # Nothing to persist without stage telemetry or without a meta file.
    if "audio" not in sink or not (run_dir / "meta.json").exists():
        return artifact.opus_path

    audio_fields = dict(sink["audio"])
    elapsed_ms = int(audio_fields.get("duration_ms", 0))

    # real_time_factor = seconds of audio per second of wall time; above 1
    # means synthesis ran faster than real time. It is skipped on a cache hit
    # because ``duration_ms`` then only reflects the cache-lookup overhead.
    if not artifact.cached and elapsed_ms > 0:
        wall_seconds = elapsed_ms / 1000.0
        audio_fields["real_time_factor"] = round(artifact.audio_duration_s / wall_seconds, 1)

    # Keys with a dedicated ``StageMeta`` field; the rest is stored as extras.
    reserved_keys = {
        "duration_ms",
        "llm_calls",
        "prompt_tokens",
        "completion_tokens",
        "llm_duration_ms",
        "model",
    }
    extras = {key: value for key, value in audio_fields.items() if key not in reserved_keys}

    update_run_meta_telemetry(
        run_dir,
        models={"audio": f"piper:{artifact.voice_id}"},
        stages={"audio": StageMeta(duration_ms=elapsed_ms, extras=extras)},
    )

    return artifact.opus_path
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def list_feeds(
    content_types: list[str] | None = None,
    *,
    feeds_file: str | None = None,
    config_dir: str | None = None,
) -> list[Feed]:
    """Return the configured feeds, optionally restricted to some content types.

    Without ``content_types`` (``None`` or empty) every configured feed is
    returned as loaded. With ``content_types`` the call is delegated to
    ``resolve_feeds``, so its validation applies.

    Args:
        content_types: Content type values to filter by.
        feeds_file: Explicit ``feeds.yaml`` path; overrides discovery.
        config_dir: Config directory holding ``feeds.yaml``; overrides discovery.

    Returns:
        List of ``Feed`` objects.

    Raises:
        FeedsConfigError: If no feeds file is found or it is invalid
            (raised by the config loader).
        ValueError: Propagated from ``resolve_feeds`` when a content type is
            unknown or no feed matches.
    """
    if not content_types:
        from digest_generator.sources.rss.config import load_configured_feeds

        return load_configured_feeds(feeds_file=feeds_file, config_dir=config_dir)

    return resolve_feeds(
        content_types=content_types,
        feeds_file=feeds_file,
        config_dir=config_dir,
    )
|