agent-panorama 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. agent_panorama-0.1.0/.github/workflows/ci.yml +40 -0
  2. agent_panorama-0.1.0/.github/workflows/publish.yml +29 -0
  3. agent_panorama-0.1.0/.gitignore +32 -0
  4. agent_panorama-0.1.0/LICENSE +21 -0
  5. agent_panorama-0.1.0/PKG-INFO +211 -0
  6. agent_panorama-0.1.0/README.md +182 -0
  7. agent_panorama-0.1.0/assets/logo.png +0 -0
  8. agent_panorama-0.1.0/config.example.yaml +27 -0
  9. agent_panorama-0.1.0/examples/langfuse_traces.json +143 -0
  10. agent_panorama-0.1.0/examples/langsmith_runs.json +56 -0
  11. agent_panorama-0.1.0/pyproject.toml +63 -0
  12. agent_panorama-0.1.0/src/agent_panorama/__init__.py +32 -0
  13. agent_panorama-0.1.0/src/agent_panorama/analysis.py +173 -0
  14. agent_panorama-0.1.0/src/agent_panorama/cli.py +84 -0
  15. agent_panorama-0.1.0/src/agent_panorama/config.py +90 -0
  16. agent_panorama-0.1.0/src/agent_panorama/core.py +76 -0
  17. agent_panorama-0.1.0/src/agent_panorama/models.py +146 -0
  18. agent_panorama-0.1.0/src/agent_panorama/parsers/__init__.py +34 -0
  19. agent_panorama-0.1.0/src/agent_panorama/parsers/common.py +240 -0
  20. agent_panorama-0.1.0/src/agent_panorama/parsers/langfuse.py +290 -0
  21. agent_panorama-0.1.0/src/agent_panorama/parsers/langsmith.py +163 -0
  22. agent_panorama-0.1.0/src/agent_panorama/render.py +82 -0
  23. agent_panorama-0.1.0/src/agent_panorama/templates/report.html.j2 +172 -0
  24. agent_panorama-0.1.0/src/agent_panorama/templates/report.md.j2 +65 -0
  25. agent_panorama-0.1.0/tests/_bootstrap.py +12 -0
  26. agent_panorama-0.1.0/tests/conftest.py +10 -0
  27. agent_panorama-0.1.0/tests/run_all_tests.py +24 -0
  28. agent_panorama-0.1.0/tests/test_analysis.py +84 -0
  29. agent_panorama-0.1.0/tests/test_common.py +73 -0
  30. agent_panorama-0.1.0/tests/test_core_cli.py +64 -0
  31. agent_panorama-0.1.0/tests/test_parsers.py +67 -0
  32. agent_panorama-0.1.0/tests/test_render.py +66 -0
  33. agent_panorama-0.1.0/uv.lock +584 -0
@@ -0,0 +1,40 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ test:
14
+ name: Lint & test (py${{ matrix.python-version }})
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.10", "3.11", "3.12"]
20
+
21
+ steps:
22
+ - name: Check out repository
23
+ uses: actions/checkout@v4
24
+
25
+ - name: Install uv
26
+ uses: astral-sh/setup-uv@v5
27
+ with:
28
+ enable-cache: true
29
+
30
+ - name: Sync dependencies
31
+ run: uv sync --extra dev --python ${{ matrix.python-version }}
32
+
33
+ - name: Ruff lint
34
+ run: uv run ruff check src tests
35
+
36
+ - name: Ruff format check
37
+ run: uv run ruff format --check src tests
38
+
39
+ - name: Run tests
40
+ run: uv run python tests/run_all_tests.py
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ publish:
12
+ name: Build and publish to PyPI
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ permissions:
16
+ id-token: write # required for PyPI Trusted Publishing (OIDC)
17
+
18
+ steps:
19
+ - name: Check out repository
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v5
24
+
25
+ - name: Build sdist and wheel
26
+ run: uv build
27
+
28
+ - name: Publish to PyPI (trusted publishing)
29
+ run: uv publish
@@ -0,0 +1,32 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ venv/
10
+
11
+ # Tooling
12
+ .mypy_cache/
13
+ .ruff_cache/
14
+ .pytest_cache/
15
+
16
+ # Generated reports
17
+ report.md
18
+ report.html
19
+ /report/
20
+
21
+ # Real trace data dropped in for local validation (keep the instructions file)
22
+ /traces/*
23
+ !/traces/PUT_YOUR_TRACE_HERE.md
24
+
25
+ # Never commit raw trace exports anywhere in the tree (may contain private data)
26
+ trace-*.json
27
+ *trace_export*.json
28
+
29
+ # OS / editors
30
+ .DS_Store
31
+ .idea/
32
+ .vscode/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 agent-panorama contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,211 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-panorama
3
+ Version: 0.1.0
4
+ Summary: Turn Langfuse/LangSmith agent traces into human-readable Agent Activity Reports (Markdown + HTML).
5
+ Project-URL: Homepage, https://github.com/Idank96/agent-panorama
6
+ Project-URL: Repository, https://github.com/Idank96/agent-panorama
7
+ Project-URL: Issues, https://github.com/Idank96/agent-panorama/issues
8
+ Author: agent-panorama contributors
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: agents,langfuse,langsmith,llm,observability,reporting
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: click>=8.1
21
+ Requires-Dist: jinja2>=3.1
22
+ Requires-Dist: pyyaml>=6.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: mypy>=1.10; extra == 'dev'
25
+ Requires-Dist: pytest>=8.0; extra == 'dev'
26
+ Requires-Dist: ruff>=0.6; extra == 'dev'
27
+ Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ <p align="center">
31
+ <img src="https://raw.githubusercontent.com/Idank96/agent-panorama/main/assets/logo.png" alt="agent-panorama" width="320">
32
+ </p>
33
+
34
+ <h1 align="center">agent-panorama</h1>
35
+
36
+ <p align="center">
37
+ <a href="https://github.com/Idank96/agent-panorama/actions/workflows/ci.yml"><img src="https://github.com/Idank96/agent-panorama/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
38
+ </p>
39
+
40
+ Turn raw LLM agent traces into a **human-readable Agent Activity Report** that a
41
+ non-engineer can actually read. Point it at a Langfuse (or LangSmith) trace
42
+ export and get clean Markdown + a self-contained HTML report that explains, in
43
+ business language, what your agents did, what they decided, and anything that
44
+ looks off.
45
+
46
+ ## Why
47
+
48
+ Traces are great for engineers and terrible for everyone else. `agent-panorama`
49
+ translates tool calls, retries, token usage, and errors into plain English. It
50
+ also pulls the real user request and final answer out of LangGraph/LangChain
51
+ `messages` payloads, so the report reads like a story, not a JSON dump:
52
+
53
+ - `get_weather({"city": "Paris"})` → **"Looked up the weather"**
54
+ - 3 failed model calls → **"High retry count: 3 failed attempts before completing."**
55
+ - `human_handoff(...)` → run outcome **human-escalated**
56
+
57
+ > Cost/USD estimation is intentionally out of scope for now — the report shows
58
+ > token usage, not dollars.
59
+
60
+ ## Install
61
+
62
+ ```bash
63
+ pip install agent-panorama
64
+ # or, for local development:
65
+ uv pip install -e ".[dev]"
66
+ ```
67
+
68
+ Requires Python 3.10+. Dependencies are intentionally minimal: `click`,
69
+ `jinja2`, `pyyaml`.
70
+
71
+ ## CLI usage
72
+
73
+ ```bash
74
+ agent-panorama generate --input traces.json --output ./report --format html
75
+ ```
76
+
77
+ Options:
78
+
79
+ | Option | Description |
80
+ | --- | --- |
81
+ | `--input` | Path to the Langfuse/LangSmith JSON export (required). |
82
+ | `--output` | Output directory (default `./report`). |
83
+ | `--format` | `md`, `html`, or `both` (default `both`). |
84
+ | `--input-type` | `langfuse` or `langsmith` (default `langfuse`). |
85
+ | `--config` | Optional YAML config (tool naming, thresholds). |
86
+
87
+ Try it on the bundled example:
88
+
89
+ ```bash
90
+ agent-panorama generate --input examples/langfuse_traces.json --output ./report
91
+ ```
92
+
93
+ ## Library usage
94
+
95
+ ```python
96
+ from agent_panorama import generate_report
97
+
98
+ report = generate_report(
99
+ "traces.json",
100
+ output_dir="./report",
101
+ formats=["md", "html"],
102
+ input_type="langfuse",
103
+ config="config.yaml", # optional
104
+ )
105
+
106
+ print(report.total_runs, report.total_tokens)
107
+ ```
108
+
109
+ `generate_report` returns the in-memory `Report`, so you can also inspect runs,
110
+ the decision log, and anomalies programmatically without re-reading the written output (use
111
+ `build_report_from_file` if you want the report without writing files).
112
+
113
+ ## What's in a report
114
+
115
+ - **Summary** — time range, total runs, total actions, total tokens.
116
+ - **Per-agent section** — what it was asked to do, what it decided/did (tool calls
117
+ in plain English), final outcome, and a confidence signal (retries / fallback).
118
+ - **Decision log** — a sortable table of every consequential action: timestamp,
119
+ agent, action, parameters summarized in plain English, outcome.
120
+ - **Anomalies** — high retry counts, slow runs, high activity, errors, fallbacks.
121
+
122
+ ## Configuration
123
+
124
+ All configuration is optional. See [`config.example.yaml`](config.example.yaml)
125
+ for the full set. Highlights:
126
+
127
+ ```yaml
128
+ tool_descriptions:
129
+ get_weather: "Looked up the weather"
130
+
131
+ consequential_tools: [send_email, human_handoff]
132
+ escalation_tools: [human_handoff, handoff_to_agent]
133
+
134
+ anomaly_thresholds:
135
+ max_retries: 2
136
+ max_latency_seconds: 30
137
+ max_tool_calls: 15
138
+ ```
139
+
140
+ ## Supported inputs
141
+
142
+ - **Langfuse** trace exports — a single trace dict, the single-trace
143
+ `{"trace": {...}, "observations": [...]}` shape, a list of traces, or the
144
+ `{"data": [...]}` list-API shape. Tool calls are read from `TOOL`
145
+ observations (falling back to tool spans), and from `toolCalls` / OpenAI-style
146
+ `tool_calls` declared on generations.
147
+ - **LangSmith** run exports — a flat list (or `{"runs": [...]}`) of run nodes;
148
+ each root run is flattened into one agent run.
149
+
150
+ Token usage is read from the trace (`inputUsage`/`outputUsage` or
151
+ `usage`/`usage_metadata`). Dollar-cost estimation is intentionally out of scope.
152
+
153
+ ## Roadmap
154
+
155
+ `agent-panorama` starts as a report generator and is growing into an **oversight
156
+ layer for fleets of agents** — a single pane of glass for everything your agents
157
+ did, decided, and got wrong. More than logs, across more than one agent.
158
+
159
+ **✅ v0.1 — Read one run clearly _(today)_**
160
+ - Langfuse + LangSmith trace ingestion
161
+ - Plain-language per-agent summaries, decision log, anomalies
162
+ - Markdown + self-contained HTML output; CLI and library API
163
+
164
+ **🔜 v0.2 — See the whole fleet (the panorama view)**
165
+ - A unified **cross-agent activity feed** — one scannable timeline of what every
166
+ agent did, in plain English:
167
+
168
+ ```text
169
+ Agent Activity — May 28, 14:30–15:00
170
+
171
+ research-assistant → searched the web, summarized 3 papers ✓ success
172
+ scheduling-assistant → checked the calendar, handed the task to a human ⤴ escalated
173
+ weather-assistant → looked up the weather (retried once), emailed it ✓ success
174
+ billing-agent → issued 2 refunds, flagged 1 for review ⚠ anomaly
175
+ ```
176
+ - Aggregate many traces into one report (by session, time window, or file glob)
177
+ - Per-agent rollups: runs, actions, success / escalation / retry rates
178
+ - Cross-agent decision log spanning every agent in the window
179
+
180
+ **📈 v0.3 — Trends & regressions**
181
+ - Track rates over time, not just a point-in-time snapshot
182
+ - Flag regressions (escalations or retries spiking vs. a baseline)
183
+ - Period-over-period comparison ("this week vs. last")
184
+
185
+ **🔌 v0.4 — More sources & deeper detail**
186
+ - OpenTelemetry / OpenInference and raw OpenAI-style logs
187
+ - Optionally fetch full input/output from the Langfuse API to enrich
188
+ decision-log parameters
189
+ - Pluggable parser interface for custom trace formats
190
+
191
+ **🎯 The vision — Continuous oversight**
192
+ - A live dashboard: the activity feed above, always-on, filterable by agent /
193
+ outcome / time
194
+ - Scheduled/continuous reports instead of one-off runs
195
+ - Accountability views a non-engineer can sign off on (what happened, what needs
196
+ a human)
197
+ - Alerting on anomalies across the fleet
198
+
199
+ > Have a use case or a trace format you want supported? Open an issue.
200
+
201
+ ## Development
202
+
203
+ ```bash
204
+ uv pip install -e ".[dev]"
205
+ python tests/run_all_tests.py # run the full suite
206
+ ruff check . && ruff format --check .
207
+ ```
208
+
209
+ ## License
210
+
211
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,182 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/Idank96/agent-panorama/main/assets/logo.png" alt="agent-panorama" width="320">
3
+ </p>
4
+
5
+ <h1 align="center">agent-panorama</h1>
6
+
7
+ <p align="center">
8
+ <a href="https://github.com/Idank96/agent-panorama/actions/workflows/ci.yml"><img src="https://github.com/Idank96/agent-panorama/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
9
+ </p>
10
+
11
+ Turn raw LLM agent traces into a **human-readable Agent Activity Report** that a
12
+ non-engineer can actually read. Point it at a Langfuse (or LangSmith) trace
13
+ export and get clean Markdown + a self-contained HTML report that explains, in
14
+ business language, what your agents did, what they decided, and anything that
15
+ looks off.
16
+
17
+ ## Why
18
+
19
+ Traces are great for engineers and terrible for everyone else. `agent-panorama`
20
+ translates tool calls, retries, token usage, and errors into plain English. It
21
+ also pulls the real user request and final answer out of LangGraph/LangChain
22
+ `messages` payloads, so the report reads like a story, not a JSON dump:
23
+
24
+ - `get_weather({"city": "Paris"})` → **"Looked up the weather"**
25
+ - 3 failed model calls → **"High retry count: 3 failed attempts before completing."**
26
+ - `human_handoff(...)` → run outcome **human-escalated**
27
+
28
+ > Cost/USD estimation is intentionally out of scope for now — the report shows
29
+ > token usage, not dollars.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install agent-panorama
35
+ # or, for local development:
36
+ uv pip install -e ".[dev]"
37
+ ```
38
+
39
+ Requires Python 3.10+. Dependencies are intentionally minimal: `click`,
40
+ `jinja2`, `pyyaml`.
41
+
42
+ ## CLI usage
43
+
44
+ ```bash
45
+ agent-panorama generate --input traces.json --output ./report --format html
46
+ ```
47
+
48
+ Options:
49
+
50
+ | Option | Description |
51
+ | --- | --- |
52
+ | `--input` | Path to the Langfuse/LangSmith JSON export (required). |
53
+ | `--output` | Output directory (default `./report`). |
54
+ | `--format` | `md`, `html`, or `both` (default `both`). |
55
+ | `--input-type` | `langfuse` or `langsmith` (default `langfuse`). |
56
+ | `--config` | Optional YAML config (tool naming, thresholds). |
57
+
58
+ Try it on the bundled example:
59
+
60
+ ```bash
61
+ agent-panorama generate --input examples/langfuse_traces.json --output ./report
62
+ ```
63
+
64
+ ## Library usage
65
+
66
+ ```python
67
+ from agent_panorama import generate_report
68
+
69
+ report = generate_report(
70
+ "traces.json",
71
+ output_dir="./report",
72
+ formats=["md", "html"],
73
+ input_type="langfuse",
74
+ config="config.yaml", # optional
75
+ )
76
+
77
+ print(report.total_runs, report.total_tokens)
78
+ ```
79
+
80
+ `generate_report` returns the in-memory `Report`, so you can also inspect runs,
81
+ the decision log, and anomalies programmatically without re-reading the written output (use
82
+ `build_report_from_file` if you want the report without writing files).
83
+
84
+ ## What's in a report
85
+
86
+ - **Summary** — time range, total runs, total actions, total tokens.
87
+ - **Per-agent section** — what it was asked to do, what it decided/did (tool calls
88
+ in plain English), final outcome, and a confidence signal (retries / fallback).
89
+ - **Decision log** — a sortable table of every consequential action: timestamp,
90
+ agent, action, parameters summarized in plain English, outcome.
91
+ - **Anomalies** — high retry counts, slow runs, high activity, errors, fallbacks.
92
+
93
+ ## Configuration
94
+
95
+ All configuration is optional. See [`config.example.yaml`](config.example.yaml)
96
+ for the full set. Highlights:
97
+
98
+ ```yaml
99
+ tool_descriptions:
100
+ get_weather: "Looked up the weather"
101
+
102
+ consequential_tools: [send_email, human_handoff]
103
+ escalation_tools: [human_handoff, handoff_to_agent]
104
+
105
+ anomaly_thresholds:
106
+ max_retries: 2
107
+ max_latency_seconds: 30
108
+ max_tool_calls: 15
109
+ ```
110
+
111
+ ## Supported inputs
112
+
113
+ - **Langfuse** trace exports — a single trace dict, the single-trace
114
+ `{"trace": {...}, "observations": [...]}` shape, a list of traces, or the
115
+ `{"data": [...]}` list-API shape. Tool calls are read from `TOOL`
116
+ observations (falling back to tool spans), and from `toolCalls` / OpenAI-style
117
+ `tool_calls` declared on generations.
118
+ - **LangSmith** run exports — a flat list (or `{"runs": [...]}`) of run nodes;
119
+ each root run is flattened into one agent run.
120
+
121
+ Token usage is read from the trace (`inputUsage`/`outputUsage` or
122
+ `usage`/`usage_metadata`). Dollar-cost estimation is intentionally out of scope.
123
+
124
+ ## Roadmap
125
+
126
+ `agent-panorama` starts as a report generator and is growing into an **oversight
127
+ layer for fleets of agents** — a single pane of glass for everything your agents
128
+ did, decided, and got wrong. More than logs, across more than one agent.
129
+
130
+ **✅ v0.1 — Read one run clearly _(today)_**
131
+ - Langfuse + LangSmith trace ingestion
132
+ - Plain-language per-agent summaries, decision log, anomalies
133
+ - Markdown + self-contained HTML output; CLI and library API
134
+
135
+ **🔜 v0.2 — See the whole fleet (the panorama view)**
136
+ - A unified **cross-agent activity feed** — one scannable timeline of what every
137
+ agent did, in plain English:
138
+
139
+ ```text
140
+ Agent Activity — May 28, 14:30–15:00
141
+
142
+ research-assistant → searched the web, summarized 3 papers ✓ success
143
+ scheduling-assistant → checked the calendar, handed the task to a human ⤴ escalated
144
+ weather-assistant → looked up the weather (retried once), emailed it ✓ success
145
+ billing-agent → issued 2 refunds, flagged 1 for review ⚠ anomaly
146
+ ```
147
+ - Aggregate many traces into one report (by session, time window, or file glob)
148
+ - Per-agent rollups: runs, actions, success / escalation / retry rates
149
+ - Cross-agent decision log spanning every agent in the window
150
+
151
+ **📈 v0.3 — Trends & regressions**
152
+ - Track rates over time, not just a point-in-time snapshot
153
+ - Flag regressions (escalations or retries spiking vs. a baseline)
154
+ - Period-over-period comparison ("this week vs. last")
155
+
156
+ **🔌 v0.4 — More sources & deeper detail**
157
+ - OpenTelemetry / OpenInference and raw OpenAI-style logs
158
+ - Optionally fetch full input/output from the Langfuse API to enrich
159
+ decision-log parameters
160
+ - Pluggable parser interface for custom trace formats
161
+
162
+ **🎯 The vision — Continuous oversight**
163
+ - A live dashboard: the activity feed above, always-on, filterable by agent /
164
+ outcome / time
165
+ - Scheduled/continuous reports instead of one-off runs
166
+ - Accountability views a non-engineer can sign off on (what happened, what needs
167
+ a human)
168
+ - Alerting on anomalies across the fleet
169
+
170
+ > Have a use case or a trace format you want supported? Open an issue.
171
+
172
+ ## Development
173
+
174
+ ```bash
175
+ uv pip install -e ".[dev]"
176
+ python tests/run_all_tests.py # run the full suite
177
+ ruff check . && ruff format --check .
178
+ ```
179
+
180
+ ## License
181
+
182
+ MIT — see [LICENSE](LICENSE).
Binary file
@@ -0,0 +1,27 @@
1
+ # agent-panorama configuration (all sections optional).
2
+ #
3
+ # Pass with: agent-panorama generate --input traces.json --output ./report --config config.yaml
4
+
5
+ # Map raw tool names to readable descriptions used in the report.
6
+ tool_descriptions:
7
+ web_search: "Searched the web"
8
+ get_weather: "Looked up the weather"
9
+ send_email: "Sent an email"
10
+ human_handoff: "Handed the task off to a person"
11
+
12
+ # Tool names whose calls are considered "consequential" (side effects) and
13
+ # therefore listed in the Decision Log. If omitted, every tool call is listed.
14
+ consequential_tools:
15
+ - send_email
16
+ - human_handoff
17
+
18
+ # Tool names that signal a human escalation (sets run outcome to "escalated").
19
+ escalation_tools:
20
+ - human_handoff
21
+ - handoff_to_agent
22
+
23
+ # Thresholds for the Anomalies section.
24
+ anomaly_thresholds:
25
+ max_retries: 2 # more than this many retries is flagged
26
+ max_latency_seconds: 30 # runs slower than this are flagged
27
+ max_tool_calls: 15 # runs with more tool calls than this are flagged
@@ -0,0 +1,143 @@
1
+ [
2
+ {
3
+ "id": "trace-001",
4
+ "name": "research-assistant",
5
+ "timestamp": "2026-05-20T09:15:00.000Z",
6
+ "input": {"question": "What are the top AI papers this week?"},
7
+ "output": {"content": "Here are three notable AI papers from this week, each with a short summary."},
8
+ "observations": [
9
+ {
10
+ "id": "obs-001-a",
11
+ "type": "GENERATION",
12
+ "name": "plan-response",
13
+ "startTime": "2026-05-20T09:15:01.000Z",
14
+ "endTime": "2026-05-20T09:15:03.500Z",
15
+ "model": "gpt-4o",
16
+ "usage": {"input": 800, "output": 120, "unit": "TOKENS"},
17
+ "level": "DEFAULT",
18
+ "output": {"tool_calls": [{"function": {"name": "web_search", "arguments": "{\"query\": \"top AI papers this week\"}"}}]}
19
+ },
20
+ {
21
+ "id": "obs-001-b",
22
+ "type": "SPAN",
23
+ "name": "web_search",
24
+ "startTime": "2026-05-20T09:15:03.600Z",
25
+ "endTime": "2026-05-20T09:15:04.200Z",
26
+ "input": {"query": "top AI papers this week"},
27
+ "output": {"results": 3},
28
+ "level": "DEFAULT"
29
+ },
30
+ {
31
+ "id": "obs-001-c",
32
+ "type": "SPAN",
33
+ "name": "summarize_text",
34
+ "startTime": "2026-05-20T09:15:04.300Z",
35
+ "endTime": "2026-05-20T09:15:04.900Z",
36
+ "input": {"documents": 3, "max_words": 80},
37
+ "output": {"summary": "three short summaries"},
38
+ "level": "DEFAULT"
39
+ },
40
+ {
41
+ "id": "obs-001-d",
42
+ "type": "GENERATION",
43
+ "name": "final-answer",
44
+ "startTime": "2026-05-20T09:15:05.000Z",
45
+ "endTime": "2026-05-20T09:15:06.400Z",
46
+ "model": "gpt-4o",
47
+ "usage": {"input": 1000, "output": 100, "unit": "TOKENS"},
48
+ "level": "DEFAULT",
49
+ "output": {"content": "Here are three notable AI papers from this week."}
50
+ }
51
+ ]
52
+ },
53
+ {
54
+ "id": "trace-002",
55
+ "name": "scheduling-assistant",
56
+ "timestamp": "2026-05-20T10:02:00.000Z",
57
+ "input": {"question": "Book a meeting with the design team next week."},
58
+ "output": {"content": "This request has been handed off to a human scheduler."},
59
+ "observations": [
60
+ {
61
+ "id": "obs-002-a",
62
+ "type": "GENERATION",
63
+ "name": "assess-request",
64
+ "startTime": "2026-05-20T10:02:01.000Z",
65
+ "endTime": "2026-05-20T10:02:04.000Z",
66
+ "model": "claude-3-5-sonnet-20241022",
67
+ "usage": {"input": 1200, "output": 200, "unit": "TOKENS"},
68
+ "level": "DEFAULT",
69
+ "output": {"content": "This needs a human to confirm availability."}
70
+ },
71
+ {
72
+ "id": "obs-002-b",
73
+ "type": "SPAN",
74
+ "name": "check_calendar",
75
+ "startTime": "2026-05-20T10:02:04.100Z",
76
+ "endTime": "2026-05-20T10:02:04.400Z",
77
+ "input": {"team": "design", "window": "next_week"},
78
+ "output": {"free_slots": 2},
79
+ "level": "DEFAULT"
80
+ },
81
+ {
82
+ "id": "obs-002-c",
83
+ "type": "SPAN",
84
+ "name": "human_handoff",
85
+ "startTime": "2026-05-20T10:02:04.500Z",
86
+ "endTime": "2026-05-20T10:02:04.800Z",
87
+ "input": {"reason": "needs human confirmation", "queue": "scheduling"},
88
+ "output": {"ticket_id": "hx-3320"},
89
+ "level": "DEFAULT"
90
+ }
91
+ ]
92
+ },
93
+ {
94
+ "id": "trace-003",
95
+ "name": "weather-assistant",
96
+ "timestamp": "2026-05-20T11:30:00.000Z",
97
+ "input": {"task": "Email me tomorrow's forecast for Paris."},
98
+ "output": {"content": "Forecast sent after recovering from a provider timeout."},
99
+ "observations": [
100
+ {
101
+ "id": "obs-003-a",
102
+ "type": "GENERATION",
103
+ "name": "plan",
104
+ "startTime": "2026-05-20T11:30:01.000Z",
105
+ "endTime": "2026-05-20T11:30:02.000Z",
106
+ "model": "gpt-4o-mini",
107
+ "usage": {"input": 500, "output": 70, "unit": "TOKENS"},
108
+ "level": "DEFAULT"
109
+ },
110
+ {
111
+ "id": "obs-003-b",
112
+ "type": "SPAN",
113
+ "name": "get_weather",
114
+ "startTime": "2026-05-20T11:30:02.100Z",
115
+ "endTime": "2026-05-20T11:30:14.100Z",
116
+ "input": {"city": "Paris", "provider": "primary"},
117
+ "output": {"error": "provider timeout"},
118
+ "level": "ERROR",
119
+ "statusMessage": "Weather provider timed out after 12s"
120
+ },
121
+ {
122
+ "id": "obs-003-c",
123
+ "type": "SPAN",
124
+ "name": "get_weather",
125
+ "startTime": "2026-05-20T11:30:14.200Z",
126
+ "endTime": "2026-05-20T11:30:15.400Z",
127
+ "input": {"city": "Paris", "provider": "backup"},
128
+ "output": {"temp_c": 18, "summary": "partly cloudy"},
129
+ "level": "DEFAULT"
130
+ },
131
+ {
132
+ "id": "obs-003-d",
133
+ "type": "SPAN",
134
+ "name": "send_email",
135
+ "startTime": "2026-05-20T11:30:15.500Z",
136
+ "endTime": "2026-05-20T11:30:16.000Z",
137
+ "input": {"to": "user", "subject": "Paris forecast for tomorrow"},
138
+ "output": {"sent": true},
139
+ "level": "DEFAULT"
140
+ }
141
+ ]
142
+ }
143
+ ]