runtime-narrative 0.2.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. runtime_narrative-1.0.0/PKG-INFO +1124 -0
  2. runtime_narrative-1.0.0/README.MD +1068 -0
  3. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/pyproject.toml +32 -2
  4. runtime_narrative-1.0.0/runtime_narrative/__init__.py +116 -0
  5. runtime_narrative-1.0.0/runtime_narrative/analyzers/__init__.py +16 -0
  6. runtime_narrative-1.0.0/runtime_narrative/analyzers/anthropic.py +154 -0
  7. runtime_narrative-1.0.0/runtime_narrative/analyzers/base.py +20 -0
  8. runtime_narrative-1.0.0/runtime_narrative/analyzers/deduplication.py +100 -0
  9. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/analyzers/ollama.py +268 -220
  10. runtime_narrative-1.0.0/runtime_narrative/celery.py +87 -0
  11. runtime_narrative-1.0.0/runtime_narrative/cli.py +199 -0
  12. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/diagnostics.py +70 -9
  13. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/events.py +4 -0
  14. runtime_narrative-1.0.0/runtime_narrative/grpc_interceptor.py +147 -0
  15. runtime_narrative-1.0.0/runtime_narrative/instrumentation.py +254 -0
  16. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/middleware.py +31 -14
  17. runtime_narrative-1.0.0/runtime_narrative/middleware_django.py +122 -0
  18. runtime_narrative-1.0.0/runtime_narrative/renderer/alert_renderer.py +184 -0
  19. runtime_narrative-1.0.0/runtime_narrative/renderer/html_renderer.py +166 -0
  20. runtime_narrative-1.0.0/runtime_narrative/renderer/otel_log_renderer.py +134 -0
  21. runtime_narrative-1.0.0/runtime_narrative/renderer/otel_metrics_renderer.py +93 -0
  22. runtime_narrative-1.0.0/runtime_narrative/renderer/otel_renderer.py +148 -0
  23. runtime_narrative-1.0.0/runtime_narrative/renderer/persistence_renderer.py +200 -0
  24. runtime_narrative-1.0.0/runtime_narrative/renderer/prometheus_renderer.py +100 -0
  25. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/stage.py +106 -93
  26. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/story.py +21 -2
  27. runtime_narrative-1.0.0/runtime_narrative/task_group.py +70 -0
  28. runtime_narrative-1.0.0/runtime_narrative/testing.py +142 -0
  29. runtime_narrative-1.0.0/runtime_narrative.egg-info/PKG-INFO +1124 -0
  30. runtime_narrative-1.0.0/runtime_narrative.egg-info/SOURCES.txt +71 -0
  31. runtime_narrative-1.0.0/runtime_narrative.egg-info/entry_points.txt +2 -0
  32. runtime_narrative-1.0.0/runtime_narrative.egg-info/requires.txt +37 -0
  33. runtime_narrative-1.0.0/tests/test_alert_renderer.py +279 -0
  34. runtime_narrative-1.0.0/tests/test_analyzers.py +133 -0
  35. runtime_narrative-1.0.0/tests/test_anthropic_analyzer.py +134 -0
  36. runtime_narrative-1.0.0/tests/test_async_renderer.py +56 -0
  37. runtime_narrative-1.0.0/tests/test_celery.py +120 -0
  38. runtime_narrative-1.0.0/tests/test_console_renderer.py +120 -0
  39. runtime_narrative-1.0.0/tests/test_deduplication.py +145 -0
  40. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_diagnostics.py +48 -0
  41. runtime_narrative-1.0.0/tests/test_dry_run.py +115 -0
  42. runtime_narrative-1.0.0/tests/test_grpc_interceptor.py +155 -0
  43. runtime_narrative-1.0.0/tests/test_html_renderer.py +92 -0
  44. runtime_narrative-1.0.0/tests/test_instrumentation.py +330 -0
  45. runtime_narrative-1.0.0/tests/test_instrumentation_phase2.py +333 -0
  46. runtime_narrative-1.0.0/tests/test_json_renderer.py +110 -0
  47. runtime_narrative-1.0.0/tests/test_middleware_django.py +110 -0
  48. runtime_narrative-1.0.0/tests/test_middleware_propagation.py +99 -0
  49. runtime_narrative-1.0.0/tests/test_otel_log_renderer.py +204 -0
  50. runtime_narrative-1.0.0/tests/test_otel_metrics_renderer.py +201 -0
  51. runtime_narrative-1.0.0/tests/test_otel_renderer.py +306 -0
  52. runtime_narrative-1.0.0/tests/test_persistence_renderer.py +368 -0
  53. runtime_narrative-1.0.0/tests/test_prometheus_renderer.py +193 -0
  54. runtime_narrative-1.0.0/tests/test_redaction_extended.py +248 -0
  55. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_stage.py +13 -0
  56. runtime_narrative-1.0.0/tests/test_stage_metadata.py +123 -0
  57. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_story.py +87 -0
  58. runtime_narrative-1.0.0/tests/test_structured_analysis.py +186 -0
  59. runtime_narrative-1.0.0/tests/test_task_group.py +141 -0
  60. runtime_narrative-1.0.0/tests/test_testing_utils.py +134 -0
  61. runtime_narrative-0.2.0/PKG-INFO +0 -408
  62. runtime_narrative-0.2.0/README.MD +0 -373
  63. runtime_narrative-0.2.0/runtime_narrative/__init__.py +0 -31
  64. runtime_narrative-0.2.0/runtime_narrative/analyzers/__init__.py +0 -3
  65. runtime_narrative-0.2.0/runtime_narrative.egg-info/PKG-INFO +0 -408
  66. runtime_narrative-0.2.0/runtime_narrative.egg-info/SOURCES.txt +0 -30
  67. runtime_narrative-0.2.0/runtime_narrative.egg-info/requires.txt +0 -10
  68. runtime_narrative-0.2.0/tests/test_async_renderer.py +0 -28
  69. runtime_narrative-0.2.0/tests/test_json_renderer.py +0 -50
  70. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/LICENSE +0 -0
  71. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/context.py +0 -0
  72. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/decorators.py +0 -0
  73. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/failure.py +0 -0
  74. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/renderer/__init__.py +0 -0
  75. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/renderer/console.py +0 -0
  76. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative/renderer/json_renderer.py +0 -0
  77. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative.egg-info/dependency_links.txt +0 -0
  78. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/runtime_narrative.egg-info/top_level.txt +0 -0
  79. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/setup.cfg +0 -0
  80. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_decorators.py +0 -0
  81. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_failure.py +0 -0
  82. {runtime_narrative-0.2.0 → runtime_narrative-1.0.0}/tests/test_middleware.py +0 -0
@@ -0,0 +1,1124 @@
1
+ Metadata-Version: 2.4
2
+ Name: runtime-narrative
3
+ Version: 1.0.0
4
+ Summary: Model execution as human-readable stories with lean/rich failure diagnostics and optional LLM analysis
5
+ Author-email: Shashank Raj <shashank.raj28@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/sraj0501/runtime_narrative
8
+ Project-URL: Repository, https://github.com/sraj0501/runtime_narrative
9
+ Project-URL: Bug Tracker, https://github.com/sraj0501/runtime_narrative/issues
10
+ Keywords: logging,observability,tracing,fastapi,debugging,diagnostics,runtime_narrative
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: System :: Logging
22
+ Classifier: Topic :: System :: Monitoring
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: python-dotenv>=1.2.1
28
+ Provides-Extra: console
29
+ Requires-Dist: typer>=0.9.0; extra == "console"
30
+ Provides-Extra: fastapi
31
+ Requires-Dist: starlette>=0.27.0; extra == "fastapi"
32
+ Provides-Extra: otel
33
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == "otel"
34
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == "otel"
35
+ Provides-Extra: prometheus
36
+ Requires-Dist: prometheus-client>=0.19.0; extra == "prometheus"
37
+ Provides-Extra: anthropic
38
+ Requires-Dist: anthropic>=0.25.0; extra == "anthropic"
39
+ Provides-Extra: django
40
+ Requires-Dist: django>=3.2; extra == "django"
41
+ Provides-Extra: celery
42
+ Requires-Dist: celery>=5.0; extra == "celery"
43
+ Provides-Extra: grpc
44
+ Requires-Dist: grpcio>=1.50.0; extra == "grpc"
45
+ Provides-Extra: all
46
+ Requires-Dist: typer>=0.9.0; extra == "all"
47
+ Requires-Dist: starlette>=0.27.0; extra == "all"
48
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == "all"
49
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == "all"
50
+ Requires-Dist: prometheus-client>=0.19.0; extra == "all"
51
+ Requires-Dist: anthropic>=0.25.0; extra == "all"
52
+ Requires-Dist: django>=3.2; extra == "all"
53
+ Requires-Dist: celery>=5.0; extra == "all"
54
+ Requires-Dist: grpcio>=1.50.0; extra == "all"
55
+ Dynamic: license-file
56
+
57
+ # runtime-narrative
58
+
59
+ **Turn any Python application into a traceable story. Get minimal logs when everything works — and surgical, LLM-powered diagnostics the moment something breaks.**
60
+
61
+ ---
62
+
63
+ ## The idea
64
+
65
+ Most logging tells you *that* something failed. `runtime-narrative` tells you *why* — with full awareness of every step that succeeded before the failure, what was supposed to happen next, and (optionally) a plain-English suggestion for how to fix it.
66
+
67
+ You model your application's execution as a **story** made up of **stages**. Each function or logical unit of work becomes a stage. The library watches everything:
68
+
69
+ - **When a stage passes:** one line — `✔ Stage completed: Validate Input (0.003s)`. No noise.
70
+ - **When anything fails:** a structured failure report with the exact file, line number, failing statement, the full timeline of what succeeded before it, and — if you plug in an LLM — a concrete logical fix suggestion.
71
+
72
+ This combines debugging and logging into a single mechanism: logs are minimal until something breaks, then they are explicit and actionable.
73
+
74
+ ---
75
+
76
+ ## Install
77
+
78
+ Minimal dependencies at the core (only `python-dotenv` is required):
79
+
80
+ ```bash
81
+ pip install runtime-narrative
82
+ ```
83
+
84
+ Optional extras:
85
+
86
+ ```bash
87
+ pip install "runtime-narrative[console]" # colored terminal output (typer)
88
+ pip install "runtime-narrative[fastapi]" # FastAPI/Starlette middleware
89
+ pip install "runtime-narrative[otel]" # OpenTelemetry trace renderer
90
+ pip install "runtime-narrative[prometheus]" # Prometheus metrics renderer
91
+ pip install "runtime-narrative[anthropic]" # Anthropic Claude failure analyzer
92
+ pip install "runtime-narrative[django]" # Django WSGI/ASGI middleware
93
+ pip install "runtime-narrative[celery]" # Celery task integration
94
+ pip install "runtime-narrative[grpc]" # gRPC server interceptors
95
+ pip install "runtime-narrative[all]" # everything above
96
+ ```
97
+
98
+ ---
99
+
100
+ ## Quick start
101
+
102
+ ```python
103
+ from runtime_narrative import story, stage, StoryRuntime # StoryRuntime for type hints
104
+
105
+ with story("Import Customers"):
106
+ with stage("Load CSV"):
107
+ rows = load_csv("customers.csv")
108
+
109
+ with stage("Validate Data"):
110
+ validate(rows)
111
+
112
+ with stage("Insert Records"):
113
+ db.insert(rows)
114
+ ```
115
+
116
+ **Everything works — minimal output:**
117
+
118
+ ```
119
+ ▶ Story started: Import Customers
120
+ ✔ Stage completed: Load CSV (0.012s)
121
+ ✔ Stage completed: Validate Data (0.004s)
122
+ ✔ Stage completed: Insert Records (0.089s)
123
+ ▶ Story ended: SUCCESS
124
+ ```
125
+
126
+ **Something fails — full context, no guessing:**
127
+
128
+ ```
129
+ ▶ Story started: Import Customers
130
+ ✔ Stage completed: Load CSV (0.012s)
131
+ ✔ Stage completed: Validate Data (0.004s)
132
+
133
+ ❌ Failure detected
134
+ Story: Import Customers
135
+ Stage: Insert Records
136
+ Error: ValueError - duplicate customer id
137
+ Location: app/db.py:47 (insert_row)
138
+ Code: raise ValueError("duplicate customer id")
139
+ Recent stages: Load CSV=completed (0.012s) | Validate Data=completed (0.004s) | Insert Records=failed (0.001s)
140
+ Progress: 66% (2 / 3)
141
+ ```
142
+
143
+ The library knows what succeeded before the failure. That context is always part of the report.
144
+
145
+ Async code uses identical syntax with `async with`:
146
+
147
+ ```python
148
+ async with story("Import Customers"):
149
+ async with stage("Load CSV"):
150
+ rows = await load_csv("customers.csv")
151
+
152
+ async with stage("Insert Records"):
153
+ await db.insert(rows)
154
+ ```
155
+
156
+ ---
157
+
158
+ ## LLM-powered failure analysis (optional)
159
+
160
+ Plug in any local or remote LLM. When a failure occurs, the library packages the story name, stage name, error type, exact failing line, exception chain, and traceback — and asks the LLM for a targeted diagnostic.
161
+
162
+ ```python
163
+ from runtime_narrative import story, stage, OllamaFailureAnalyzer
164
+
165
+ analyzer = OllamaFailureAnalyzer(model="llama3")
166
+
167
+ with story("Import Customers", failure_analyzer=analyzer):
168
+ with stage("Load CSV"):
169
+ rows = load_csv("customers.csv")
170
+ with stage("Insert Records"):
171
+ db.insert(rows)
172
+ ```
173
+
174
+ The LLM response is structured and rendered inline:
175
+
176
+ ```
177
+ +-- LLM Debug -----------------------------------------------------------+
178
+ | Exact Why |
179
+ | The INSERT fails because customer_id already exists in the customers |
180
+ | table (UNIQUE constraint). The error is raised at db.py:47. |
181
+ | |
182
+ | Evidence |
183
+ | ValueError: duplicate customer id — raised after catching a |
184
+ | sqlite3.IntegrityError from the underlying INSERT call. |
185
+ | |
186
+ | Targeted Fix |
187
+ | Use INSERT OR IGNORE, or check for existence before inserting. |
188
+ | Alternatively, catch the duplicate and return the existing record. |
189
+ | |
190
+ | Code Changes |
191
+ | db.py:47 — wrap the insert in try/except IntegrityError and handle |
192
+ | the duplicate case explicitly rather than re-raising ValueError. |
193
+ +------------------------------------------------------------------------+
194
+ ```
195
+
196
+ > **Note:** The LLM suggests logical fixes only — it does not rewrite your code. The suggestion names the exact location, explains what went wrong mechanically, and tells you what to change. What you change is up to you.
197
+
198
+ ### Analyzer options
199
+
200
+ | Class | API | Use case |
201
+ |---|---|---|
202
+ | `OllamaFailureAnalyzer` | Ollama native `/api/generate` | Local Ollama |
203
+ | `LLMFailureAnalyzer` | OpenAI-compatible `/v1/chat/completions` | vLLM, llama.cpp, LM Studio, Ollama OpenAI mode, any hosted API |
204
+ | `AnthropicFailureAnalyzer` | Anthropic API | Claude Haiku / Sonnet / Opus (`[anthropic]` extra required) |
205
+
206
+ ```python
207
+ from runtime_narrative import LLMFailureAnalyzer
208
+
209
+ analyzer = LLMFailureAnalyzer(
210
+ model="llama3",
211
+ endpoint="http://localhost:8000/v1/chat/completions",
212
+ )
213
+ ```
214
+
215
+ All analyzers fall back silently if the endpoint is unreachable — your application's exception still propagates normally.
216
+
217
+ All analyzers request structured JSON (`exact_why`, `evidence`, `targeted_fix`, `code_changes`) from the model and render it into guaranteed `## Header` sections. Responses that are not valid JSON fall back to raw text.
218
+
219
+ ### Anthropic Claude analyzer
220
+
221
+ `AnthropicFailureAnalyzer` requires the `[anthropic]` extra and an `ANTHROPIC_API_KEY` environment variable. Defaults to `claude-haiku-4-5-20251001`; override via `model=` or the `RUNTIME_NARRATIVE_MODEL` env var:
222
+
223
+ ```python
224
+ from runtime_narrative import story, stage, AnthropicFailureAnalyzer
225
+
226
+ analyzer = AnthropicFailureAnalyzer() # reads ANTHROPIC_API_KEY from env
227
+ # or explicitly:
228
+ analyzer = AnthropicFailureAnalyzer(
229
+ api_key="sk-ant-...",
230
+ model="claude-sonnet-4-6",
231
+ max_tokens=1024,
232
+ timeout_seconds=30.0,
233
+ )
234
+
235
+ async with story("Import Customers", failure_analyzer=analyzer):
236
+ async with stage("Insert Records"):
237
+ db.insert(rows)
238
+ ```
239
+
240
+ ### Context budget
241
+
242
+ All analyzers accept `max_context_chars: int = 8000`. When the traceback would push the prompt over budget, it is trimmed from the top (keeping the most recent frames). If the budget is exhausted entirely, a `<traceback omitted>` marker is used instead:
243
+
244
+ ```python
245
+ analyzer = LLMFailureAnalyzer(model="llama3", max_context_chars=4000)
246
+ ```
247
+
248
+ ### Failure deduplication
249
+
250
+ `DeduplicatingAnalyzer` wraps any analyzer with an LRU cache. Repeated failures at the same location return the cached suggestion immediately — no redundant LLM calls:
251
+
252
+ ```python
253
+ from runtime_narrative import DeduplicatingAnalyzer, OllamaFailureAnalyzer
254
+
255
+ analyzer = DeduplicatingAnalyzer(
256
+ OllamaFailureAnalyzer(model="llama3"),
257
+ max_cache_size=256, # LRU eviction above this count
258
+ )
259
+ ```
260
+
261
+ The cache key is a SHA-256 hash of `(error_type, filename, lineno, exception_chain)`. `None` results (network errors, timeouts) are never cached — the next call retries the model. The cache is thread-safe and works with both sync and async analysis paths.
262
+
263
+ ### Background analysis
264
+
265
+ For latency-sensitive services, use `background_analysis=True`. The `FailureOccurred` event is emitted immediately (so your error response is not delayed), and the LLM runs as a background task. When it finishes, an `LLMAnalysisReady` event is emitted:
266
+
267
+ ```python
268
+ async with story("Process Order", failure_analyzer=analyzer, background_analysis=True):
269
+ async with stage("Charge Payment"):
270
+ await charge(order)
271
+ ```
272
+
273
+ ---
274
+
275
+ ## Diagnostics depth
276
+
277
+ The library operates in two modes, controlled by environment variable or per-story kwargs:
278
+
279
+ | Mode | What you get |
280
+ |---|---|
281
+ | `lean` (default) | Error type, message, exact location, source line, exception chain, compressed stack summary |
282
+ | `rich` | Everything above + source code snippet (±2 lines around the error) + local variable values at the failing frame, with automatic redaction of secrets (`password`, `token`, `api_key`, etc.) |
283
+
284
+ ```bash
285
+ # Enable rich diagnostics for a run
286
+ RUNTIME_NARRATIVE_FAILURE_DIAGNOSTICS=rich python myapp.py
287
+ ```
288
+
289
+ Rich mode is automatically downgraded to lean in production unless explicitly allowed:
290
+
291
+ ```bash
292
+ RUNTIME_NARRATIVE_ENV=production
293
+ RUNTIME_NARRATIVE_ALLOW_RICH_IN_PRODUCTION=true # override when needed
294
+ ```
295
+
296
+ Per-story configuration:
297
+
298
+ ```python
299
+ from runtime_narrative import story, FailureDiagnosticsConfig
300
+
301
+ async with story(
302
+ "Import Customers",
303
+ runtime_environment="development",
304
+ failure_diagnostics="rich",
305
+ app_roots=("/path/to/my/app",), # optional; default uses cwd
306
+ redact_extra=("internal_id", "org_token"), # extend built-in secret list
307
+ ):
308
+ ...
309
+
310
+ # Or pass a fully built config
311
+ cfg = FailureDiagnosticsConfig(
312
+ failure_diagnostics="rich",
313
+ app_roots=("/app",),
314
+ redact_extra=("internal_id",),
315
+ )
316
+ async with story("Import Customers", diagnostics_config=cfg):
317
+ ...
318
+ ```
319
+
320
+ ### Custom redaction
321
+
322
+ Rich mode captures local variables at the failing frame and automatically redacts keys containing `password`, `secret`, `token`, `api_key`, `authorization`, `cookie`, `session`, and `credential`. Pass `redact_extra` to extend this list with project-specific names:
323
+
324
+ ```python
325
+ async with story("Sync Users", failure_diagnostics="rich", redact_extra=("org_id", "internal_key")):
326
+ ...
327
+ ```
328
+
329
+ The same kwarg is accepted by `RuntimeNarrativeMiddleware` and `FailureDiagnosticsConfig`.
330
+
331
+ ---
332
+
333
+ ## Server deployments — structured JSON logs
334
+
335
+ For production or any environment where you need machine-readable output, swap `ConsoleRenderer` for `JsonRenderer`. It emits one JSON object per lifecycle event — compatible with any structured log collector (Datadog, CloudWatch, Loki, OpenTelemetry log exporters):
336
+
337
+ ```python
338
+ from runtime_narrative import story, stage, JsonRenderer
339
+
340
+ async with story("Process Payment", renderers=[JsonRenderer()]):
341
+ async with stage("Validate Card"):
342
+ ...
343
+ async with stage("Charge"):
344
+ ...
345
+ ```
346
+
347
+ On success, output is minimal — one object per event:
348
+
349
+ ```json
350
+ {"event": "StoryStarted", "story_id": "abc-123", "story_name": "Process Payment", "timestamp": "..."}
351
+ {"event": "StageCompleted", "story_id": "abc-123", "stage_name": "Validate Card", "duration_seconds": 0.003, "timestamp": "..."}
352
+ {"event": "StoryCompleted", "story_id": "abc-123", "success": true, "progress": {"percent": 100, ...}, "timestamp": "..."}
353
+ ```
354
+
355
+ On failure, `FailureOccurred` carries the full diagnostics payload — exact location, stack frame classification, source snippet, local variables (rich mode), traceback — all in a structured, queryable form:
356
+
357
+ ```json
358
+ {
359
+ "event": "FailureOccurred",
360
+ "story_id": "abc-123",
361
+ "stage_name": "Charge",
362
+ "error_type": "TimeoutError",
363
+ "location": {"filename": "payment.py", "lineno": 82, "function": "charge_card", "source_line": "..."},
364
+ "llm_analysis": "...",
365
+ "diagnostics_mode": "lean",
366
+ "stack_frames": [...],
367
+ "compressed_stack_summary": "2 app frame(s), 4 other/hidden in full stack (6 total)",
368
+ "stage_timeline": "Validate Card=completed (0.003s) | Charge=failed (0.012s)"
369
+ }
370
+ ```
371
+
372
+ Write to a file instead of stdout:
373
+
374
+ ```python
375
+ JsonRenderer(output=open("narrative.log", "a"))
376
+ ```
377
+
378
+ ### Rotating log files
379
+
380
+ Use `RotatingJsonRenderer` to cap log file size automatically. When the active file reaches `max_bytes` it is renamed to `narrative.log.1` (shifting older backups) and a new file is opened — no external dependencies, no cron job required:
381
+
382
+ ```python
383
+ from runtime_narrative import story, stage, RotatingJsonRenderer
384
+
385
+ async with story("Process Payment", renderers=[RotatingJsonRenderer("narrative.log")]):
386
+ async with stage("Charge"):
387
+ ...
388
+ ```
389
+
390
+ ```python
391
+ RotatingJsonRenderer(
392
+ "narrative.log",
393
+ max_bytes=10 * 1024 * 1024, # rotate at 10 MB (default)
394
+ backup_count=5, # keep narrative.log.1 … narrative.log.5 (default)
395
+ indent=None, # compact single-line output (default)
396
+ )
397
+ ```
398
+
399
+ ---
400
+
401
+ ## FastAPI / Starlette middleware
402
+
403
+ Add the middleware once and every request becomes a story automatically. Route handlers only need to declare stages:
404
+
405
+ ```python
406
+ from fastapi import FastAPI
407
+ from runtime_narrative import RuntimeNarrativeMiddleware, JsonRenderer, OllamaFailureAnalyzer
408
+
409
+ app = FastAPI()
410
+ app.add_middleware(
411
+ RuntimeNarrativeMiddleware,
412
+ renderers=[JsonRenderer()], # structured logs for prod
413
+ failure_analyzer=OllamaFailureAnalyzer(model="llama3"),
414
+ runtime_environment="production", # enforces lean + traceback cap
415
+ )
416
+
417
+ @app.post("/orders")
418
+ async def create_order(payload: OrderIn):
419
+ with stage("Validate Input"):
420
+ validate(payload)
421
+
422
+ with stage("Persist Order"):
423
+ order = await db.insert(payload)
424
+
425
+ return {"id": order.id}
426
+ ```
427
+
428
+ Each request becomes a story named `"POST /orders"`. If the handler raises, the middleware captures the full failure context before returning the error response.
429
+
430
+ When no `renderers` are provided, the middleware auto-selects: `ConsoleRenderer` when `sys.stdout` is a real TTY (local `uvicorn` dev server), `JsonRenderer` otherwise (Docker, CI, any non-interactive environment).
431
+
432
+ When `opentelemetry-api` is installed, the middleware automatically extracts incoming W3C `traceparent` / `tracestate` headers and attaches the upstream trace context before entering the story. This means `OtelRenderer` story spans become children of the upstream trace — not orphaned roots — so distributed traces are connected end-to-end. Pass `propagate_trace_context=False` to disable this behavior.
433
+
434
+ ### Progress tracking
435
+
436
+ Declare the expected stage count upfront so `progress_percent` is accurate at every stage boundary — not just at story end:
437
+
438
+ ```python
439
+ from runtime_narrative import story, stage, StoryRuntime
440
+
441
+ with story("Import Customers", total_stages=3) as runtime:
442
+ with stage("Load CSV"):
443
+ rows = load_csv("customers.csv")
444
+ # progress_percent is now 33%
445
+
446
+ with stage("Validate Data"):
447
+ validate(rows)
448
+ # progress_percent is now 66%
449
+
450
+ with stage("Insert Records"):
451
+ db.insert(rows)
452
+ # progress_percent is now 100%
453
+ ```
454
+
455
+ You can also set the count dynamically after the story starts:
456
+
457
+ ```python
458
+ with story("Process Batch") as runtime:
459
+ items = fetch_items()
460
+ runtime.set_total_stages(len(items))
461
+ for item in items:
462
+ with stage(f"Process {item.id}"):
463
+ process(item)
464
+ ```
465
+
466
+ ---
467
+
468
+ ## Auto-instrumentation
469
+
470
+ Instrument an entire class or module without touching every function individually.
471
+
472
+ ### `@narrative_class`
473
+
474
+ Decorate a class and every public instance method becomes a stage automatically. The stage name is `ClassName.method_name`.
475
+
476
+ ```python
477
+ from runtime_narrative import narrative_class, no_stage
478
+
479
+ @narrative_class
480
+ class OrderService:
481
+ def validate(self, order): ... # → stage "OrderService.validate"
482
+ def charge(self, order): ... # → stage "OrderService.charge"
483
+ def fulfill(self, order): ... # → stage "OrderService.fulfill"
484
+
485
+ @no_stage
486
+ def _log(self, msg): ... # excluded — opt-out marker
487
+ ```
488
+
489
+ Equivalent to manually wrapping each method in `with stage("OrderService.validate")`. The decorator handles both sync and async methods; use `async with story(...)` to fully await async renderers.
490
+
491
+ **What is skipped:** names starting with `_`, `@no_stage`-marked methods, `@property`, and inherited methods (apply `@narrative_class` to the base class separately). `@classmethod` and `@staticmethod` are skipped by default — see below.
492
+
493
+ ### `@narrative_stage`
494
+
495
+ Override the auto-generated stage name for a specific method, or use it standalone on any function:
496
+
497
+ ```python
498
+ from runtime_narrative import narrative_class, narrative_stage
499
+
500
+ @narrative_class
501
+ class OrderService:
502
+ @narrative_stage("Validate Order") # custom name overrides "OrderService.validate"
503
+ def validate(self, order): ...
504
+
505
+ def charge(self, order): ... # → "OrderService.charge" (default)
506
+ ```
507
+
508
+ Standalone — any function, any depth, sync or async:
509
+
510
+ ```python
511
+ @narrative_stage("Process Order")
512
+ async def process(order):
513
+ ...
514
+ ```
515
+
516
+ When `name` is omitted (`@narrative_stage()`), the function name is title-cased: `validate_order` → `"Validate Order"`.
517
+
518
+ ### Classmethods and staticmethods
519
+
520
+ `@narrative_class` skips classmethods and staticmethods by default. Enable them explicitly:
521
+
522
+ ```python
523
+ @narrative_class(instrument_classmethods=True, instrument_staticmethods=True)
524
+ class Factory:
525
+ @classmethod
526
+ def create(cls): ... # → "Factory.create"
527
+
528
+ @staticmethod
529
+ def validate(data): ... # → "Factory.validate"
530
+
531
+ @classmethod
532
+ @no_stage
533
+ def _internal(cls): ... # excluded by @no_stage
534
+
535
+ @classmethod
536
+ @narrative_stage("Build Widget")
537
+ def build(cls): ... # → "Build Widget" (custom name)
538
+ ```
539
+
540
+ ### `@no_stage`
541
+
542
+ Opt-out marker. Apply to any method or function to exclude it from auto-instrumentation:
543
+
544
+ ```python
545
+ @no_stage
546
+ def _internal_helper(self): ...
547
+ ```
548
+
549
+ ### `instrument_module()`
550
+
551
+ Instrument all public callables in an existing module in one call. Classes get the full `@narrative_class` treatment; top-level functions are wrapped directly. Symbols imported from other modules are not touched.
552
+
553
+ ```python
554
+ import runtime_narrative
555
+ import myapp.services
556
+
557
+ runtime_narrative.instrument_module(myapp.services)
558
+ ```
559
+
560
+ Call this once at startup, after the module has been imported.
561
+
562
+ ### `auto_instrument()`
563
+
564
+ Zero-config option. Register a `sys.meta_path` import hook that instruments every app module as it is imported — no changes to application code required:
565
+
566
+ ```python
567
+ # Entry point (main.py or app factory) — one line:
568
+ import runtime_narrative
569
+ runtime_narrative.auto_instrument()
570
+
571
+ # Everything imported from this point on is instrumented automatically:
572
+ from myapp.services import OrderService
573
+ from myapp.pipeline import run_pipeline
574
+ ```
575
+
576
+ Only modules whose source file is under the current working directory (or `app_roots`) are instrumented — stdlib and installed packages are unaffected.
577
+
578
+ ```python
579
+ # Pin to specific directories instead of cwd:
580
+ runtime_narrative.auto_instrument(app_roots=["/app/src", "/app/workers"])
581
+ ```
582
+
583
+ The hook is removable:
584
+
585
+ ```python
586
+ finder = runtime_narrative.auto_instrument()
587
+ # ... later ...
588
+ import sys
589
+ sys.meta_path.remove(finder)
590
+ ```
591
+
592
+ ---
593
+
594
+ ## Decorators
595
+
596
+ Wrap entire functions without changing their call sites. The library detects `async def` automatically:
597
+
598
+ ```python
599
+ from runtime_narrative import runtime_narrative_story, runtime_narrative_stage
600
+
601
+ @runtime_narrative_story(failure_analyzer=analyzer)
602
+ async def run_pipeline():
603
+ await load_data()
604
+ await transform()
605
+ await export()
606
+
607
+ @runtime_narrative_stage("Load Source Data")
608
+ async def load_data():
609
+ ...
610
+ ```
611
+
612
+ All `story()` kwargs — `failure_analyzer`, `failure_diagnostics`, `runtime_environment`, `background_analysis`, `renderers`, etc. — are forwarded from `@runtime_narrative_story`.
613
+
614
+ ---
615
+
616
+ ## OpenTelemetry integration
617
+
618
+ `OtelRenderer` maps narrative events to OpenTelemetry spans. Requires the `[otel]` extra.
619
+
620
+ ```python
621
+ from opentelemetry.sdk.trace import TracerProvider
622
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
623
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
624
+
625
+ from runtime_narrative import story, stage, OtelRenderer
626
+
627
+ provider = TracerProvider()
628
+ provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
629
+
630
+ async with story("Process Order", renderers=[OtelRenderer(tracer_provider=provider)]):
631
+ async with stage("Validate"):
632
+ ...
633
+ async with stage("Charge"):
634
+ ...
635
+ ```
636
+
637
+ | Narrative event | OTel concept |
638
+ |---|---|
639
+ | `StoryStarted` → `StoryCompleted` (success) | Root span, status `OK` |
640
+ | `StoryStarted` → `StoryCompleted` (failure) | Root span, status `ERROR` + error attributes |
641
+ | `StageStarted` → `StageCompleted` | Child span of the story root |
642
+ | `FailureOccurred` | Sets `ERROR` status + attributes on root span; ends failing stage span as `ERROR` |
643
+ | `LLMAnalysisReady` | Span event on root with `narrative.llm_analysis` attribute |
644
+
645
+ Attributes on failure spans include `error.type`, `error.message`, `code.filepath`, `code.lineno`, `code.function`, `error.stack_trace`, `narrative.stage_name`, `narrative.exception_chain`.
646
+
647
+ If no `tracer_provider` is passed, the globally configured provider is used (`trace.get_tracer_provider()`).
648
+
649
+ ### Filtering
650
+
651
+ Skip low-value spans to reduce trace noise:
652
+
653
+ ```python
654
+ OtelRenderer(
655
+ tracer_provider=provider,
656
+ exclude_stages={"health_check", "cache_lookup"}, # never create spans for these
657
+ min_duration_ms=5.0, # suppress stage spans shorter than 5 ms
658
+ max_attribute_length=4096, # truncate long string attributes (default 8192)
659
+ )
660
+ ```
661
+
662
+ Stages listed in `exclude_stages` still mark the root span `ERROR` when they fail — only the child span is suppressed. Stages shorter than `min_duration_ms` are not filtered when they fail (failures always produce a span).
663
+
664
+ ### OTel log renderer
665
+
666
+ `OtelLogRenderer` emits all six lifecycle events as OpenTelemetry log records via the `opentelemetry._logs` API. Combine it with `OtelRenderer` to get both traces and logs in your observability backend:
667
+
668
+ ```python
669
+ from runtime_narrative import story, stage, OtelRenderer, OtelLogRenderer
670
+ from opentelemetry.sdk._logs import LoggerProvider
671
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
672
+ from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter
673
+
674
+ log_provider = LoggerProvider()
675
+ log_provider.add_log_record_processor(BatchLogRecordProcessor(OTLPLogExporter()))
676
+
677
+ async with story("Process Order", renderers=[
678
+ OtelRenderer(tracer_provider=trace_provider),
679
+ OtelLogRenderer(logger_provider=log_provider),
680
+ ]):
681
+ async with stage("Validate"):
682
+ ...
683
+ ```
684
+
685
+ | Event | OTel severity |
686
+ |---|---|
687
+ | `StoryStarted`, `StoryCompleted`, `LLMAnalysisReady` | `INFO` |
688
+ | `StageStarted`, `StageCompleted` | `DEBUG` |
689
+ | `FailureOccurred` | `ERROR` with `error.type`, `error.message`, `code.filepath`, `code.lineno`, `code.function`, `error.stack_trace`, `narrative.exception_chain` attributes |
690
+
691
+ Log records are automatically correlated with the ambient OTel span context (`trace_id` / `span_id`) so logs link to their enclosing traces in your backend.
692
+
693
+ ### OTel metrics renderer
694
+
695
+ `OtelMetricsRenderer` emits four instruments via the OpenTelemetry Metrics API:
696
+
697
+ ```python
698
+ from runtime_narrative import story, stage, OtelMetricsRenderer
699
+ from opentelemetry.sdk.metrics import MeterProvider
700
+ from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
701
+ from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
702
+
703
+ reader = PeriodicExportingMetricReader(OTLPMetricExporter(), export_interval_millis=60_000)
704
+ meter_provider = MeterProvider(metric_readers=[reader])
705
+
706
+ async with story("Nightly Batch", renderers=[OtelMetricsRenderer(meter_provider=meter_provider)]):
707
+ async with stage("Load"):
708
+ ...
709
+ ```
710
+
711
+ | Instrument | Type | Labels |
712
+ |---|---|---|
713
+ | `narrative.stage.duration` | Histogram (unit `s`) | `story_name`, `stage_name` |
714
+ | `narrative.story.duration` | Histogram (unit `s`) | `story_name`, `success` (`"true"` / `"false"`) |
715
+ | `narrative.story.failures` | Counter | `story_name`, `error_type` |
716
+ | `narrative.llm.analysis_latency` | Histogram (unit `s`) | `story_name` |
717
+
718
+ `narrative.llm.analysis_latency` measures the time between `FailureOccurred` and `LLMAnalysisReady` — only recorded when background LLM analysis is enabled.
719
+
720
+ ---
721
+
722
+ ## Prometheus metrics
723
+
724
+ `PrometheusRenderer` records four metrics via `prometheus-client`. Requires the `[prometheus]` extra.
725
+
726
+ ```python
727
+ from runtime_narrative import story, stage, PrometheusRenderer
728
+
729
+ async with story("Nightly Batch", renderers=[PrometheusRenderer()]):
730
+ async with stage("Load"):
731
+ ...
732
+ async with stage("Transform"):
733
+ ...
734
+ ```
735
+
736
+ | Metric | Type | Labels |
737
+ |---|---|---|
738
+ | `narrative_story_duration_seconds` | Histogram | `story_name`, `success` (`"true"` / `"false"`) |
739
+ | `narrative_stage_duration_seconds` | Histogram | `story_name`, `stage_name` |
740
+ | `narrative_story_failures_total` | Counter | `story_name`, `error_type` |
741
+ | `narrative_story_total` | Counter | `story_name`, `success` |
742
+
743
+ Use a custom registry to isolate metrics across services or in tests:
744
+
745
+ ```python
746
+ from prometheus_client import CollectorRegistry, start_http_server
747
+
748
+ registry = CollectorRegistry()
749
+ renderer = PrometheusRenderer(registry=registry)
750
+ start_http_server(8000, registry=registry)
751
+ ```
752
+
753
+ ---
754
+
755
+ ## Django middleware
756
+
757
+ `RuntimeNarrativeDjangoMiddleware` wraps every ASGI Django request in a story. `RuntimeNarrativeDjangoSyncMiddleware` does the same for WSGI (sync). Requires the `[django]` extra.
758
+
759
+ ```python
760
+ # settings.py
761
+ MIDDLEWARE = [
762
+ "runtime_narrative.middleware_django.RuntimeNarrativeDjangoMiddleware",
763
+ # ... other middleware
764
+ ]
765
+ ```
766
+
767
+ Or with explicit options in an ASGI entry point:
768
+
769
+ ```python
770
+ from runtime_narrative import RuntimeNarrativeDjangoMiddleware, JsonRenderer, OllamaFailureAnalyzer
771
+
772
+ application = RuntimeNarrativeDjangoMiddleware(
773
+ get_response=django_asgi_app,
774
+ renderers=[JsonRenderer()],
775
+ failure_analyzer=OllamaFailureAnalyzer(model="llama3"),
776
+ runtime_environment="production",
777
+ )
778
+ ```
779
+
780
+ Story name is `"METHOD /path"` (e.g. `"POST /api/orders"`).
781
+
782
+ ---
783
+
784
+ ## Celery integration
785
+
786
+ `NarrativeTask` is a Celery `Task` base class that wraps each task execution in a story. Requires the `[celery]` extra.
787
+
788
+ ```python
789
+ from celery import Celery
790
+ from runtime_narrative import NarrativeTask, OllamaFailureAnalyzer, stage
791
+
792
+ app = Celery("myapp")
793
+
794
+ @app.task(base=NarrativeTask)
795
+ def process_order(order_id):
796
+ with stage("Validate"):
797
+ validate(order_id)
798
+ with stage("Charge"):
799
+ charge(order_id)
800
+ ```
801
+
802
+ To set defaults for all tasks in an app:
803
+
804
+ ```python
805
+ from runtime_narrative import connect_narrative, JsonRenderer
806
+
807
+ connect_narrative(
808
+ app,
809
+ renderers=[JsonRenderer()],
810
+ failure_analyzer=OllamaFailureAnalyzer(model="llama3"),
811
+ runtime_environment="production",
812
+ )
813
+ ```
814
+
815
+ Story name is `"<task.name> [task_id=<id>]"` (e.g. `"myapp.tasks.process_order [task_id=abc-123]"`). Override any option per-task by setting the `narrative_*` class attribute directly.
816
+
817
+ ---
818
+
819
+ ## Concurrent tasks — `NarrativeTaskGroup`
820
+
821
+ Run multiple async tasks under a single story and track all their stages together. No extra dependencies required.
822
+
823
+ ```python
824
+ from runtime_narrative import story, NarrativeTaskGroup
825
+
826
+ async with story("Parallel Pipeline", renderers=[...]):
827
+ async with NarrativeTaskGroup() as tg:
828
+ tg.create_task(load_data(), name="Load Data")
829
+ tg.create_task(load_config(), name="Load Config")
830
+ # both completed — stages from both appear in the story timeline
831
+ ```
832
+
833
+ Each task inherits the parent story's `ContextVar` context automatically, so `stage()` calls inside tasks are tracked normally. If any task fails, `NarrativeTaskGroupError` is raised with a `failed_tasks: dict[str, BaseException]` mapping of task name to exception:
834
+
835
+ ```python
836
+ from runtime_narrative import NarrativeTaskGroupError
837
+
838
+ try:
839
+ async with NarrativeTaskGroup() as tg:
840
+ tg.create_task(risky_job(), name="Risky Job")
841
+ except NarrativeTaskGroupError as e:
842
+ for task_name, exc in e.failed_tasks.items():
843
+ print(f"{task_name} failed: {exc}")
844
+ ```
845
+
846
+ ---
847
+
848
+ ## gRPC interceptors
849
+
850
+ `RuntimeNarrativeInterceptor` (sync) and `RuntimeNarrativeAsyncInterceptor` (async) wrap each RPC in a story. Requires the `[grpc]` extra.
851
+
852
+ ```python
853
+ import grpc
854
+ from runtime_narrative import RuntimeNarrativeAsyncInterceptor, JsonRenderer
855
+
856
+ interceptor = RuntimeNarrativeAsyncInterceptor(renderers=[JsonRenderer()])
857
+
858
+ server = grpc.aio.server(interceptors=[interceptor])
859
+ ```
860
+
861
+ Story name is the full gRPC method path, e.g. `"/mypackage.MyService/DoThing"`.
862
+
863
+ For sync (non-async) gRPC servers:
864
+
865
+ ```python
866
+ import grpc
867
+ from runtime_narrative import RuntimeNarrativeInterceptor, JsonRenderer
868
+
869
+ interceptor = RuntimeNarrativeInterceptor(renderers=[JsonRenderer()])
870
+ server = grpc.server(
871
+ futures.ThreadPoolExecutor(),
872
+ interceptors=[interceptor],
873
+ )
874
+ ```
875
+
876
+ Both interceptors accept the same `renderers`, `failure_analyzer`, and diagnostic kwargs as all other integration points.
877
+
878
+ ---
879
+
880
+ ## Persistence and CLI
881
+
882
+ `SqliteStoryRenderer` records every story and failure to a local SQLite database with no external dependencies:
883
+
884
+ ```python
885
+ from runtime_narrative import story, stage
886
+ from runtime_narrative.renderer.persistence_renderer import SqliteStoryRenderer
887
+
888
+ async with story("Nightly ETL", renderers=[SqliteStoryRenderer("narrative.db")]):
889
+ async with stage("Load"):
890
+ pass
891
+ async with stage("Transform"):
892
+ pass
893
+ ```
894
+
895
+ Then query from the terminal:
896
+
897
+ ```bash
898
+ # List the 10 most recent failures
899
+ runtime-narrative failures --db narrative.db
900
+
901
+ # Filter by stage or story name
902
+ runtime-narrative failures --stage "Load" --story "Nightly ETL" --last 20
903
+
904
+ # Inspect a specific story
905
+ runtime-narrative story abc12345 --db narrative.db
906
+ ```
907
+
908
+ ---
909
+
910
+ ## Alert routing
911
+
912
+ `AlertRoutingRenderer` dispatches `FailureOccurred` events to HTTP webhooks and Slack. Destination errors are suppressed — they never crash your story:
913
+
914
+ ```python
915
+ from runtime_narrative import story
916
+ from runtime_narrative.renderer.alert_renderer import (
917
+ AlertRoutingRenderer, SlackWebhookDestination, HttpWebhookDestination,
918
+ )
919
+
920
+ renderer = AlertRoutingRenderer(
921
+ [
922
+ SlackWebhookDestination("https://hooks.slack.com/services/..."),
923
+ HttpWebhookDestination("https://alerts.internal/webhook"),
924
+ ],
925
+ only_stories={"Nightly ETL", "Import Pipeline"}, # None = all stories
926
+ only_error_types={"ValueError", "RuntimeError"}, # None = all errors
927
+ )
928
+
929
+ async with story("Nightly ETL", renderers=[renderer]):
930
+ ...
931
+ ```
932
+
933
+ ---
934
+
935
+ ## Custom redaction rules
936
+
937
+ Beyond the built-in keyword list (`password`, `token`, `secret`, …), you can add regex patterns and a custom callback:
938
+
939
+ ```python
940
+ from runtime_narrative import story
941
+ from runtime_narrative import FailureDiagnosticsConfig
942
+
943
+ config = FailureDiagnosticsConfig(
944
+ failure_diagnostics="rich",
945
+ redact_patterns=(r"^internal_.*", r"\bpii\b"), # regex, case-insensitive
946
+ redact_callback=lambda key: key.startswith("corp_"),
947
+ )
948
+
949
+ with story("Pipeline", diagnostics_config=config):
950
+ ...
951
+ # local vars matching the patterns or callback show as <redacted> in diagnostics
952
+ ```
953
+
954
+ ---
955
+
956
+ ## Testing utilities
957
+
958
+ `StoryRecorder` is a drop-in context manager that starts a story with a built-in capturing renderer and exposes assertion methods:
959
+
960
+ ```python
961
+ from runtime_narrative import stage
962
+ from runtime_narrative.testing import StoryRecorder
963
+
964
+ def test_etl_stages():
965
+ with StoryRecorder("ETL") as r:
966
+ with stage("Load"):
967
+ pass
968
+ with stage("Validate"):
969
+ pass
970
+ with stage("Export"):
971
+ pass
972
+
973
+ r.assert_stages_completed(["Load", "Validate", "Export"])
974
+ r.assert_no_failure()
975
+
976
+ def test_invalid_input_fails_at_validate():
977
+ with pytest.raises(ValueError):
978
+ with StoryRecorder("ETL") as r:
979
+ with stage("Load"):
980
+ pass
981
+ with stage("Validate"):
982
+ raise ValueError("bad schema")
983
+
984
+ r.assert_stage_failed("Validate", error_type="ValueError")
985
+ r.assert_story_completed(success=False)
986
+ ```
987
+
988
+ Works as `async with StoryRecorder(...)` too — pass any `**story_kwargs` (including `dry_run=True`).
989
+
990
+ ---
991
+
992
+ ## `dry_run` mode
993
+
994
+ Pass `dry_run=True` to `story()` to suppress all stage-body exceptions and still emit `StageStarted` / `StageCompleted` for every stage. The story always completes as `success=True`. Useful for verifying instrumentation wiring before running expensive operations:
995
+
996
+ ```python
997
+ with story("Nightly ETL", dry_run=True):
998
+ with stage("Load Warehouse"):
999
+ raise IOError("would connect to DB in production")
1000
+ with stage("Transform"):
1001
+ raise RuntimeError("would run transforms in production")
1002
+ with stage("Export"):
1003
+ raise IOError("would upload in production")
1004
+ # → StageCompleted emitted for all 3 stages, StoryCompleted(success=True)
1005
+ ```
1006
+
1007
+ Combine with `StoryRecorder` to assert your stage wiring without side effects:
1008
+
1009
+ ```python
1010
+ with StoryRecorder("Nightly ETL", dry_run=True) as r:
1011
+ run_pipeline()
1012
+
1013
+ r.assert_stages_completed(["Load Warehouse", "Transform", "Export"])
1014
+ r.assert_no_failure()
1015
+ ```
1016
+
1017
+ ---
1018
+
1019
+ ## HTML report
1020
+
1021
+ `HtmlReportRenderer` writes a self-contained HTML file when the story completes:
1022
+
1023
+ ```python
1024
+ from runtime_narrative import story, stage
1025
+ from runtime_narrative.renderer.html_renderer import HtmlReportRenderer
1026
+
1027
+ with story("Batch Job", renderers=[HtmlReportRenderer("report.html", open_browser=True)]):
1028
+ with stage("Load"):
1029
+ pass
1030
+ with stage("Process"):
1031
+ pass
1032
+ # → report.html written; browser opens automatically if open_browser=True
1033
+ ```
1034
+
1035
+ The report includes: story name, duration, success/failure badge, a per-stage duration bar chart, and a failure detail section with traceback and LLM analysis (if any).
1036
+
1037
+ ---
1038
+
1039
+ ## Custom renderer
1040
+
1041
+ Any object with a `handle(event)` method is a valid renderer. Async renderers (`async def handle`) are awaited automatically inside `async with story(...)`, including for `StageStarted` and `StageCompleted` events:
1042
+
1043
+ ```python
1044
+ class SlackRenderer:
1045
+ async def handle(self, event):
1046
+ if event.__class__.__name__ == "FailureOccurred":
1047
+ await slack.post(
1048
+ f"*{event.story_name}* failed at *{event.stage_name}*\n"
1049
+ f"`{event.error_type}: {event.error_message}`"
1050
+ )
1051
+
1052
+ async with story("Nightly ETL", renderers=[SlackRenderer()]):
1053
+ ...
1054
+ ```
1055
+
1056
+ Events you will receive:
1057
+
1058
+ | Event | Key fields |
1059
+ |---|---|
1060
+ | `StoryStarted` | `story_id`, `story_name`, `timestamp` |
1061
+ | `StageStarted` | `story_id`, `stage_name`, `timestamp`, `stage_index` (0-based), `parent_stage_name` (for nested stages) |
1062
+ | `StageCompleted` | `story_id`, `stage_name`, `timestamp`, `duration_seconds`, `stage_index`, `parent_stage_name` |
1063
+ | `FailureOccurred` | `story_id`, `story_name`, `stage_name`, `error_type`, `error_message`, `filename`, `lineno`, `function`, `traceback_text`, `exception_chain`, `stage_timeline`, `llm_analysis`, … |
1064
+ | `StoryCompleted` | `story_id`, `story_name`, `success`, `progress_percent`, `completed_stages`, `total_stages`, `timestamp` |
1065
+ | `LLMAnalysisReady` | `story_id`, `story_name`, `stage_name`, `llm_analysis`, `timestamp` — only emitted when `background_analysis=True` |
1066
+
1067
+ `stage_index` is the 0-based position of the stage in the story's stage list. `parent_stage_name` is `None` for top-level stages and set to the enclosing stage's name for nested stages.
1068
+
1069
+ ---
1070
+
1071
+ ## Custom failure analyzer
1072
+
1073
+ Any object with an `analyze_failure(...)` method works. Add `analyze_failure_async(...)` for native async — otherwise the sync version is called via `asyncio.to_thread` so it never blocks the event loop:
1074
+
1075
+ ```python
1076
+ class MyAnalyzer:
1077
+ async def analyze_failure_async(
1078
+ self, *, story_name, stage_name, failure, stage_timeline, progress_percent
1079
+ ):
1080
+ # failure is a FailureSummary:
1081
+ # .error_type, .error_message, .filename, .lineno,
1082
+ # .function, .source_line, .traceback_text, .exception_chain
1083
+ result = await my_llm_client.complete(build_prompt(failure))
1084
+ return result.text
1085
+
1086
+ async with story("Import", failure_analyzer=MyAnalyzer()):
1087
+ ...
1088
+ ```
1089
+
1090
+ Verify at runtime that your custom analyzer satisfies the `FailureAnalyzer` protocol (all built-in analyzers already satisfy it):
1091
+
1092
+ ```python
1093
+ from runtime_narrative import FailureAnalyzer
1094
+ assert isinstance(MyAnalyzer(), FailureAnalyzer)
1095
+ ```
1096
+
1097
+ ---
1098
+
1099
+ ## Environment variables
1100
+
1101
+ | Variable | Values | Default | Effect |
1102
+ |---|---|---|---|
1103
+ | `RUNTIME_NARRATIVE_ENV` | `development`, `production` | `development` | Production caps traceback length and forces lean mode |
1104
+ | `RUNTIME_NARRATIVE_FAILURE_DIAGNOSTICS` | `lean`, `rich` | `lean` | `rich` captures local variables at the failing frames. Invalid values raise `ValueError` at story construction. |
1105
+ | `RUNTIME_NARRATIVE_ALLOW_RICH_IN_PRODUCTION` | `1`, `true` | off | Bypass production safeguard for rich diagnostics |
1106
+ | `RUNTIME_NARRATIVE_MODEL` | model name string | — | Default model for `AnthropicFailureAnalyzer`, `LLMFailureAnalyzer`, and `OllamaFailureAnalyzer` when `model=` is not passed explicitly |
1107
+ | `ANTHROPIC_API_KEY` | API key string | — | Required by `AnthropicFailureAnalyzer`; read automatically if not passed as `api_key=` |
1108
+
1109
+ ---
1110
+
1111
+ ## Philosophy
1112
+
1113
+ - **Zero noise on success.** One line per stage. No log spam when things work.
1114
+ - **Full context on failure.** The library already knows what succeeded, what failed, and where. It uses that to give you an actionable report, not a raw stacktrace dropped into a log file.
1115
+ - **LLM is optional, never required.** Every feature works without an LLM. The analyzer is purely additive. If it fails to respond, your exception still propagates normally.
1116
+ - **Logical fixes, not code rewrites.** The LLM suggestion names the exact mechanism and location of the failure, and tells you what logic to change. It does not generate code diffs.
1117
+ - **Async-first, sync-compatible.** Both `with story()` and `async with story()` work. The library never blocks the event loop — failure diagnostics and LLM calls both run via `asyncio.to_thread`.
1118
+ - **No framework lock-in.** Use it in a script, a FastAPI app, a Celery worker, a CLI, or a data pipeline. The only required hook is wrapping your code in `story()` / `stage()`.
1119
+
1120
+ ---
1121
+
1122
+ ## License
1123
+
1124
+ MIT