agent-observer 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_observer-1.0.0/LICENSE +21 -0
- agent_observer-1.0.0/PKG-INFO +192 -0
- agent_observer-1.0.0/README.md +169 -0
- agent_observer-1.0.0/agent_observer.egg-info/PKG-INFO +192 -0
- agent_observer-1.0.0/agent_observer.egg-info/SOURCES.txt +9 -0
- agent_observer-1.0.0/agent_observer.egg-info/dependency_links.txt +1 -0
- agent_observer-1.0.0/agent_observer.egg-info/requires.txt +6 -0
- agent_observer-1.0.0/agent_observer.egg-info/top_level.txt +1 -0
- agent_observer-1.0.0/pyproject.toml +27 -0
- agent_observer-1.0.0/setup.cfg +4 -0
- agent_observer-1.0.0/setup.py +29 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Manya
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-observer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Real-time observability and auto-debugging toolkit for AI agents
|
|
5
|
+
Home-page: https://github.com/mannya05/agent-observer
|
|
6
|
+
Author: Mannya
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/mannya05/agent-observer
|
|
9
|
+
Project-URL: Repository, https://github.com/mannya05/agent-observer
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/mannya05/agent-observer/issues
|
|
11
|
+
Keywords: ai,agents,observability,monitoring,langchain,hallucination,rag
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: requests>=2.31.0
|
|
16
|
+
Requires-Dist: streamlit>=1.32.0
|
|
17
|
+
Provides-Extra: langchain
|
|
18
|
+
Requires-Dist: langchain>=0.1.0; extra == "langchain"
|
|
19
|
+
Requires-Dist: langchain-core>=0.1.0; extra == "langchain"
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
|
|
24
|
+
# AgentObserver
|
|
25
|
+
|
|
26
|
+
**Real-time observability and auto-debugging for AI agents.**
|
|
27
|
+
|
|
28
|
+
> Inspired by the fact that 40% of enterprise AI agent projects get cancelled — not because of model quality, but due to unobservable production failures. *(Gartner, 2026)*
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## The Problem
|
|
33
|
+
|
|
34
|
+
AI agents fail silently in production. A loop here, a timeout there, a hallucination nobody caught — and suddenly your agent has rebooked 1,247 passengers onto wrong flights *(Air Canada, Jan 2026)*.
|
|
35
|
+
|
|
36
|
+
Companies deploying agents have no easy way to:
|
|
37
|
+
- Know *when* an agent is stuck or failing
|
|
38
|
+
- Understand *why* it failed
|
|
39
|
+
- Get a human-readable explanation without digging through raw logs
|
|
40
|
+
|
|
41
|
+
**AgentObserver solves this.**
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## What It Does
|
|
46
|
+
|
|
47
|
+
Wrap any agent tool call with `observer.monitor_step()` and get:
|
|
48
|
+
|
|
49
|
+
- **Real-time failure alerts** — loops, timeouts, tool errors, empty outputs, hallucination risks
|
|
50
|
+
- **Auto-generated postmortem reports** — powered by Claude API, explains root cause + fix
|
|
51
|
+
- **Session logs** — full JSON trace of every step with status and failure reason
|
|
52
|
+
- **Dashboard** *(planned — see Roadmap)* — visual analytics of agent behavior over time
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Failure Types Detected
|
|
57
|
+
|
|
58
|
+
| Failure | Description |
|
|
59
|
+
|---|---|
|
|
60
|
+
| `loop_detected` | Agent repeating same action with same input N times |
|
|
61
|
+
| `tool_failure` | Tool raised an exception |
|
|
62
|
+
| `timeout` | Step exceeded configurable time threshold |
|
|
63
|
+
| `empty_output` | Agent returned None or blank output |
|
|
64
|
+
| `hallucination_risk` | Output contains contradictory statements |
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
git clone https://github.com/mannya05/agent-observer
|
|
72
|
+
cd agent-observer
|
|
73
|
+
pip install -r requirements.txt
|
|
74
|
+
export ANTHROPIC_API_KEY=your-key-here
|
|
75
|
+
python examples/demo.py
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Usage
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from core.observer import AgentObserver
|
|
84
|
+
from core.postmortem import print_postmortem
|
|
85
|
+
|
|
86
|
+
# 1. Create observer
|
|
87
|
+
observer = AgentObserver(
|
|
88
|
+
agent_name="MyAgent",
|
|
89
|
+
timeout_threshold=10.0,
|
|
90
|
+
loop_threshold=3
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# 2. Wrap any agent tool call
|
|
94
|
+
result = observer.monitor_step("web_search", search_function, query="AI trends")
|
|
95
|
+
result = observer.monitor_step("summarize", summarize_function, text=result)
|
|
96
|
+
|
|
97
|
+
# 3. Get session summary + postmortem
|
|
98
|
+
summary = observer.save_log("logs/session.json")
|
|
99
|
+
print_postmortem(summary, api_key=ANTHROPIC_API_KEY)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Output when failure detected:**
|
|
103
|
+
```
|
|
104
|
+
=======================================================
|
|
105
|
+
AGENT OBSERVER ALERT — MyAgent
|
|
106
|
+
=======================================================
|
|
107
|
+
Step : #4 — web_search
|
|
108
|
+
Status : LOOP_DETECTED
|
|
109
|
+
Reason : Agent repeated 'web_search' 3x with identical input — likely stuck in a loop
|
|
110
|
+
Duration : 234ms
|
|
111
|
+
Time : 2026-06-17T14:32:11
|
|
112
|
+
=======================================================
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Postmortem Report (AI-generated)
|
|
118
|
+
|
|
119
|
+
After each session, AgentObserver uses Claude API to generate:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
POSTMORTEM REPORT
|
|
123
|
+
────────────────────────────────────────
|
|
124
|
+
1. Summary
|
|
125
|
+
Agent got stuck in an infinite search loop on Step 4,
|
|
126
|
+
causing 3 redundant API calls with no new information.
|
|
127
|
+
|
|
128
|
+
2. Root Cause
|
|
129
|
+
The search query was not updated between retries — agent
|
|
130
|
+
had no logic to modify query on repeated failure.
|
|
131
|
+
|
|
132
|
+
3. Impact
|
|
133
|
+
Wasted 3 API calls, added 700ms latency, no useful output produced.
|
|
134
|
+
|
|
135
|
+
4. Fix Recommendation
|
|
136
|
+
Add query variation logic — if same query fails twice,
|
|
137
|
+
rephrase before retrying. Add max_retries=2 guard.
|
|
138
|
+
────────────────────────────────────────
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Project Structure
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
agent-observer/
|
|
147
|
+
├── core/
|
|
148
|
+
│ ├── observer.py # Core monitoring engine
|
|
149
|
+
│ └── postmortem.py # AI postmortem generator
|
|
150
|
+
├── dashboard/ # Streamlit dashboard (Week 2)
|
|
151
|
+
├── examples/
|
|
152
|
+
│ └── demo.py # Live demo with 3 failure scenarios
|
|
153
|
+
├── logs/ # Auto-saved session logs
|
|
154
|
+
└── README.md
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Roadmap
|
|
160
|
+
|
|
161
|
+
- [x] Core failure detection engine
|
|
162
|
+
- [x] AI-powered postmortem reports
|
|
163
|
+
- [x] Session logging (JSON)
|
|
164
|
+
- [ ] Streamlit dashboard with visual analytics
|
|
165
|
+
- [ ] Plug-and-play support for LangChain agents
|
|
166
|
+
- [ ] Slack/email alerts integration
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Tech Stack
|
|
171
|
+
|
|
172
|
+
- **Python** — core engine
|
|
173
|
+
- **Claude API** — postmortem generation
|
|
174
|
+
- **Streamlit** — dashboard (coming Week 2)
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Research Background
|
|
179
|
+
|
|
180
|
+
This project is inspired by open research challenges in agentic AI reliability:
|
|
181
|
+
- *"Agentic Uncertainty Quantification"* — arXiv:2601.15703 (Jan 2026)
|
|
182
|
+
- *"Why AI Agents Fail in Production"* — Gartner Hype Cycle for Agentic AI (2026)
|
|
183
|
+
- *"5 Production Scaling Challenges for Agentic AI"* — MachineLearningMastery (2026)
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Author
|
|
188
|
+
|
|
189
|
+
Built by Mannya — open to contributions, issues, and PRs!
|
|
190
|
+
***
|
|
191
|
+
# agent-observer
|
|
192
|
+
Real-time observability and auto-debugging toolkit for AI agents
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# AgentObserver
|
|
2
|
+
|
|
3
|
+
**Real-time observability and auto-debugging for AI agents.**
|
|
4
|
+
|
|
5
|
+
> Inspired by the fact that 40% of enterprise AI agent projects get cancelled — not because of model quality, but due to unobservable production failures. *(Gartner, 2026)*
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## The Problem
|
|
10
|
+
|
|
11
|
+
AI agents fail silently in production. A loop here, a timeout there, a hallucination nobody caught — and suddenly your agent has rebooked 1,247 passengers onto wrong flights *(Air Canada, Jan 2026)*.
|
|
12
|
+
|
|
13
|
+
Companies deploying agents have no easy way to:
|
|
14
|
+
- Know *when* an agent is stuck or failing
|
|
15
|
+
- Understand *why* it failed
|
|
16
|
+
- Get a human-readable explanation without digging through raw logs
|
|
17
|
+
|
|
18
|
+
**AgentObserver solves this.**
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## What It Does
|
|
23
|
+
|
|
24
|
+
Wrap any agent tool call with `observer.monitor_step()` and get:
|
|
25
|
+
|
|
26
|
+
- **Real-time failure alerts** — loops, timeouts, tool errors, empty outputs, hallucination risks
|
|
27
|
+
- **Auto-generated postmortem reports** — powered by Claude API, explains root cause + fix
|
|
28
|
+
- **Session logs** — full JSON trace of every step with status and failure reason
|
|
29
|
+
- **Dashboard** *(planned — see Roadmap)* — visual analytics of agent behavior over time
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Failure Types Detected
|
|
34
|
+
|
|
35
|
+
| Failure | Description |
|
|
36
|
+
|---|---|
|
|
37
|
+
| `loop_detected` | Agent repeating same action with same input N times |
|
|
38
|
+
| `tool_failure` | Tool raised an exception |
|
|
39
|
+
| `timeout` | Step exceeded configurable time threshold |
|
|
40
|
+
| `empty_output` | Agent returned None or blank output |
|
|
41
|
+
| `hallucination_risk` | Output contains contradictory statements |
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
git clone https://github.com/mannya05/agent-observer
|
|
49
|
+
cd agent-observer
|
|
50
|
+
pip install -r requirements.txt
|
|
51
|
+
export ANTHROPIC_API_KEY=your-key-here
|
|
52
|
+
python examples/demo.py
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from core.observer import AgentObserver
|
|
61
|
+
from core.postmortem import print_postmortem
|
|
62
|
+
|
|
63
|
+
# 1. Create observer
|
|
64
|
+
observer = AgentObserver(
|
|
65
|
+
agent_name="MyAgent",
|
|
66
|
+
timeout_threshold=10.0,
|
|
67
|
+
loop_threshold=3
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# 2. Wrap any agent tool call
|
|
71
|
+
result = observer.monitor_step("web_search", search_function, query="AI trends")
|
|
72
|
+
result = observer.monitor_step("summarize", summarize_function, text=result)
|
|
73
|
+
|
|
74
|
+
# 3. Get session summary + postmortem
|
|
75
|
+
summary = observer.save_log("logs/session.json")
|
|
76
|
+
print_postmortem(summary, api_key=ANTHROPIC_API_KEY)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Output when failure detected:**
|
|
80
|
+
```
|
|
81
|
+
=======================================================
|
|
82
|
+
AGENT OBSERVER ALERT — MyAgent
|
|
83
|
+
=======================================================
|
|
84
|
+
Step : #4 — web_search
|
|
85
|
+
Status : LOOP_DETECTED
|
|
86
|
+
Reason : Agent repeated 'web_search' 3x with identical input — likely stuck in a loop
|
|
87
|
+
Duration : 234ms
|
|
88
|
+
Time : 2026-06-17T14:32:11
|
|
89
|
+
=======================================================
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Postmortem Report (AI-generated)
|
|
95
|
+
|
|
96
|
+
After each session, AgentObserver uses Claude API to generate:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
POSTMORTEM REPORT
|
|
100
|
+
────────────────────────────────────────
|
|
101
|
+
1. Summary
|
|
102
|
+
Agent got stuck in an infinite search loop on Step 4,
|
|
103
|
+
causing 3 redundant API calls with no new information.
|
|
104
|
+
|
|
105
|
+
2. Root Cause
|
|
106
|
+
The search query was not updated between retries — agent
|
|
107
|
+
had no logic to modify query on repeated failure.
|
|
108
|
+
|
|
109
|
+
3. Impact
|
|
110
|
+
Wasted 3 API calls, added 700ms latency, no useful output produced.
|
|
111
|
+
|
|
112
|
+
4. Fix Recommendation
|
|
113
|
+
Add query variation logic — if same query fails twice,
|
|
114
|
+
rephrase before retrying. Add max_retries=2 guard.
|
|
115
|
+
────────────────────────────────────────
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Project Structure
|
|
121
|
+
|
|
122
|
+
```
|
|
123
|
+
agent-observer/
|
|
124
|
+
├── core/
|
|
125
|
+
│ ├── observer.py # Core monitoring engine
|
|
126
|
+
│ └── postmortem.py # AI postmortem generator
|
|
127
|
+
├── dashboard/ # Streamlit dashboard (Week 2)
|
|
128
|
+
├── examples/
|
|
129
|
+
│ └── demo.py # Live demo with 3 failure scenarios
|
|
130
|
+
├── logs/ # Auto-saved session logs
|
|
131
|
+
└── README.md
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Roadmap
|
|
137
|
+
|
|
138
|
+
- [x] Core failure detection engine
|
|
139
|
+
- [x] AI-powered postmortem reports
|
|
140
|
+
- [x] Session logging (JSON)
|
|
141
|
+
- [ ] Streamlit dashboard with visual analytics
|
|
142
|
+
- [ ] Plug-and-play support for LangChain agents
|
|
143
|
+
- [ ] Slack/email alerts integration
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Tech Stack
|
|
148
|
+
|
|
149
|
+
- **Python** — core engine
|
|
150
|
+
- **Claude API** — postmortem generation
|
|
151
|
+
- **Streamlit** — dashboard (coming Week 2)
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Research Background
|
|
156
|
+
|
|
157
|
+
This project is inspired by open research challenges in agentic AI reliability:
|
|
158
|
+
- *"Agentic Uncertainty Quantification"* — arXiv:2601.15703 (Jan 2026)
|
|
159
|
+
- *"Why AI Agents Fail in Production"* — Gartner Hype Cycle for Agentic AI (2026)
|
|
160
|
+
- *"5 Production Scaling Challenges for Agentic AI"* — MachineLearningMastery (2026)
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Author
|
|
165
|
+
|
|
166
|
+
Built by Mannya — open to contributions, issues, and PRs!
|
|
167
|
+
***
|
|
168
|
+
# agent-observer
|
|
169
|
+
Real-time observability and auto-debugging toolkit for AI agents
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-observer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Real-time observability and auto-debugging toolkit for AI agents
|
|
5
|
+
Home-page: https://github.com/mannya05/agent-observer
|
|
6
|
+
Author: Mannya
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/mannya05/agent-observer
|
|
9
|
+
Project-URL: Repository, https://github.com/mannya05/agent-observer
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/mannya05/agent-observer/issues
|
|
11
|
+
Keywords: ai,agents,observability,monitoring,langchain,hallucination,rag
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: requests>=2.31.0
|
|
16
|
+
Requires-Dist: streamlit>=1.32.0
|
|
17
|
+
Provides-Extra: langchain
|
|
18
|
+
Requires-Dist: langchain>=0.1.0; extra == "langchain"
|
|
19
|
+
Requires-Dist: langchain-core>=0.1.0; extra == "langchain"
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
|
|
24
|
+
# AgentObserver
|
|
25
|
+
|
|
26
|
+
**Real-time observability and auto-debugging for AI agents.**
|
|
27
|
+
|
|
28
|
+
> Inspired by the fact that 40% of enterprise AI agent projects get cancelled — not because of model quality, but due to unobservable production failures. *(Gartner, 2026)*
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## The Problem
|
|
33
|
+
|
|
34
|
+
AI agents fail silently in production. A loop here, a timeout there, a hallucination nobody caught — and suddenly your agent has rebooked 1,247 passengers onto wrong flights *(Air Canada, Jan 2026)*.
|
|
35
|
+
|
|
36
|
+
Companies deploying agents have no easy way to:
|
|
37
|
+
- Know *when* an agent is stuck or failing
|
|
38
|
+
- Understand *why* it failed
|
|
39
|
+
- Get a human-readable explanation without digging through raw logs
|
|
40
|
+
|
|
41
|
+
**AgentObserver solves this.**
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## What It Does
|
|
46
|
+
|
|
47
|
+
Wrap any agent tool call with `observer.monitor_step()` and get:
|
|
48
|
+
|
|
49
|
+
- **Real-time failure alerts** — loops, timeouts, tool errors, empty outputs, hallucination risks
|
|
50
|
+
- **Auto-generated postmortem reports** — powered by Claude API, explains root cause + fix
|
|
51
|
+
- **Session logs** — full JSON trace of every step with status and failure reason
|
|
52
|
+
- **Dashboard** *(planned — see Roadmap)* — visual analytics of agent behavior over time
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Failure Types Detected
|
|
57
|
+
|
|
58
|
+
| Failure | Description |
|
|
59
|
+
|---|---|
|
|
60
|
+
| `loop_detected` | Agent repeating same action with same input N times |
|
|
61
|
+
| `tool_failure` | Tool raised an exception |
|
|
62
|
+
| `timeout` | Step exceeded configurable time threshold |
|
|
63
|
+
| `empty_output` | Agent returned None or blank output |
|
|
64
|
+
| `hallucination_risk` | Output contains contradictory statements |
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
git clone https://github.com/mannya05/agent-observer
|
|
72
|
+
cd agent-observer
|
|
73
|
+
pip install -r requirements.txt
|
|
74
|
+
export ANTHROPIC_API_KEY=your-key-here
|
|
75
|
+
python examples/demo.py
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Usage
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from core.observer import AgentObserver
|
|
84
|
+
from core.postmortem import print_postmortem
|
|
85
|
+
|
|
86
|
+
# 1. Create observer
|
|
87
|
+
observer = AgentObserver(
|
|
88
|
+
agent_name="MyAgent",
|
|
89
|
+
timeout_threshold=10.0,
|
|
90
|
+
loop_threshold=3
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# 2. Wrap any agent tool call
|
|
94
|
+
result = observer.monitor_step("web_search", search_function, query="AI trends")
|
|
95
|
+
result = observer.monitor_step("summarize", summarize_function, text=result)
|
|
96
|
+
|
|
97
|
+
# 3. Get session summary + postmortem
|
|
98
|
+
summary = observer.save_log("logs/session.json")
|
|
99
|
+
print_postmortem(summary, api_key=ANTHROPIC_API_KEY)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Output when failure detected:**
|
|
103
|
+
```
|
|
104
|
+
=======================================================
|
|
105
|
+
AGENT OBSERVER ALERT — MyAgent
|
|
106
|
+
=======================================================
|
|
107
|
+
Step : #4 — web_search
|
|
108
|
+
Status : LOOP_DETECTED
|
|
109
|
+
Reason : Agent repeated 'web_search' 3x with identical input — likely stuck in a loop
|
|
110
|
+
Duration : 234ms
|
|
111
|
+
Time : 2026-06-17T14:32:11
|
|
112
|
+
=======================================================
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Postmortem Report (AI-generated)
|
|
118
|
+
|
|
119
|
+
After each session, AgentObserver uses Claude API to generate:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
POSTMORTEM REPORT
|
|
123
|
+
────────────────────────────────────────
|
|
124
|
+
1. Summary
|
|
125
|
+
Agent got stuck in an infinite search loop on Step 4,
|
|
126
|
+
causing 3 redundant API calls with no new information.
|
|
127
|
+
|
|
128
|
+
2. Root Cause
|
|
129
|
+
The search query was not updated between retries — agent
|
|
130
|
+
had no logic to modify query on repeated failure.
|
|
131
|
+
|
|
132
|
+
3. Impact
|
|
133
|
+
Wasted 3 API calls, added 700ms latency, no useful output produced.
|
|
134
|
+
|
|
135
|
+
4. Fix Recommendation
|
|
136
|
+
Add query variation logic — if same query fails twice,
|
|
137
|
+
rephrase before retrying. Add max_retries=2 guard.
|
|
138
|
+
────────────────────────────────────────
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Project Structure
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
agent-observer/
|
|
147
|
+
├── core/
|
|
148
|
+
│ ├── observer.py # Core monitoring engine
|
|
149
|
+
│ └── postmortem.py # AI postmortem generator
|
|
150
|
+
├── dashboard/ # Streamlit dashboard (Week 2)
|
|
151
|
+
├── examples/
|
|
152
|
+
│ └── demo.py # Live demo with 3 failure scenarios
|
|
153
|
+
├── logs/ # Auto-saved session logs
|
|
154
|
+
└── README.md
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Roadmap
|
|
160
|
+
|
|
161
|
+
- [x] Core failure detection engine
|
|
162
|
+
- [x] AI-powered postmortem reports
|
|
163
|
+
- [x] Session logging (JSON)
|
|
164
|
+
- [ ] Streamlit dashboard with visual analytics
|
|
165
|
+
- [ ] Plug-and-play support for LangChain agents
|
|
166
|
+
- [ ] Slack/email alerts integration
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Tech Stack
|
|
171
|
+
|
|
172
|
+
- **Python** — core engine
|
|
173
|
+
- **Claude API** — postmortem generation
|
|
174
|
+
- **Streamlit** — dashboard (coming Week 2)
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## Research Background
|
|
179
|
+
|
|
180
|
+
This project is inspired by open research challenges in agentic AI reliability:
|
|
181
|
+
- *"Agentic Uncertainty Quantification"* — arXiv:2601.15703 (Jan 2026)
|
|
182
|
+
- *"Why AI Agents Fail in Production"* — Gartner Hype Cycle for Agentic AI (2026)
|
|
183
|
+
- *"5 Production Scaling Challenges for Agentic AI"* — MachineLearningMastery (2026)
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Author
|
|
188
|
+
|
|
189
|
+
Built by Mannya — open to contributions, issues, and PRs!
|
|
190
|
+
***
|
|
191
|
+
# agent-observer
|
|
192
|
+
Real-time observability and auto-debugging toolkit for AI agents
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agent-observer"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Real-time observability and auto-debugging toolkit for AI agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Mannya"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["ai", "agents", "observability", "monitoring", "langchain", "hallucination", "rag"]
|
|
16
|
+
dependencies = [
|
|
17
|
+
"requests>=2.31.0",
|
|
18
|
+
"streamlit>=1.32.0",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
langchain = ["langchain>=0.1.0", "langchain-core>=0.1.0"]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/mannya05/agent-observer"
|
|
26
|
+
Repository = "https://github.com/mannya05/agent-observer"
|
|
27
|
+
"Bug Tracker" = "https://github.com/mannya05/agent-observer/issues"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Legacy setuptools entry point for the agent-observer distribution.

pyproject.toml declares the same project in its [project] table; the name,
version, author and core dependencies below are kept in step with it so that
both files describe one and the same package.
"""
from setuptools import setup, find_packages

# The README doubles as the long description shown on the package index.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    # Identity fields match pyproject.toml and the generated PKG-INFO
    # (previously "agent-observer-ai" / "0.1.0" / "Manya", which disagreed).
    name="agent-observer",
    version="1.0.0",
    author="Mannya",
    description="Real-time observability and auto-debugging toolkit for AI agents",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/mannya05/agent-observer",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    # NOTE(review): pyproject.toml declares requires-python ">=3.9" while
    # PKG-INFO records ">=3.8" — confirm which floor is intended.
    python_requires=">=3.8",
    # Core dependencies mirror the `dependencies` list in pyproject.toml.
    install_requires=[
        "requests>=2.31.0",
        "streamlit>=1.32.0",
    ],
    extras_require={
        # Kept so existing `pip install agent-observer[dashboard]` requests
        # still resolve; streamlit is also a core dependency above.
        "dashboard": ["streamlit>=1.32.0", "pandas>=1.5.0"],
        "langchain": ["langchain>=0.1.0", "langchain-core>=0.1.0"],
    },
)
|