agent-observability-studio 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ # agent-observability-studio
2
+
3
+ A real-time monitoring and debugging platform for multi-agent AI systems that provides live interaction tracing, token cost analysis, and performance bottleneck detection. Complements your existing mesh-health-dashboard by adding deep inspection tools for agent conversations, decision trees, and failure analysis — think "Chrome DevTools for AI agents."
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-observability-studio
3
+ Version: 0.1.0
4
+ Summary: Real-time monitoring and debugging platform for multi-agent AI systems
5
+ Project-URL: Homepage, https://github.com/yourusername/agent-observability-studio
6
+ Project-URL: Documentation, https://github.com/yourusername/agent-observability-studio#readme
7
+ Project-URL: Repository, https://github.com/yourusername/agent-observability-studio
8
+ Author-email: Zach <zach@example.com>
9
+ License: MIT
10
+ Keywords: agents,ai,debugging,llm,monitoring,observability
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: alembic>=1.13.1
19
+ Requires-Dist: anthropic>=0.18.0
20
+ Requires-Dist: fastapi>=0.109.0
21
+ Requires-Dist: httpx>=0.26.0
22
+ Requires-Dist: openai>=1.12.0
23
+ Requires-Dist: pydantic-settings>=2.1.0
24
+ Requires-Dist: pydantic>=2.5.3
25
+ Requires-Dist: python-multipart>=0.0.6
26
+ Requires-Dist: redis>=5.0.1
27
+ Requires-Dist: sqlalchemy>=2.0.25
28
+ Requires-Dist: tiktoken>=0.5.2
29
+ Requires-Dist: uvicorn[standard]>=0.27.0
30
+ Requires-Dist: websockets>=12.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: black>=24.1.1; extra == 'dev'
33
+ Requires-Dist: mypy>=1.8.0; extra == 'dev'
34
+ Requires-Dist: pytest-asyncio>=0.23.3; extra == 'dev'
35
+ Requires-Dist: pytest>=7.4.4; extra == 'dev'
36
+ Requires-Dist: ruff>=0.1.14; extra == 'dev'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Agent Observability Studio
40
+
41
+ Real-time monitoring and debugging platform for multi-agent AI systems—Chrome DevTools for AI agents.
42
+
43
+ ## What is this?
44
+
45
+ Agent Observability Studio provides deep inspection tools for multi-agent AI systems, complementing existing infrastructure dashboards. It captures live interaction traces, analyzes token costs, detects performance bottlenecks, and helps debug agent decision trees and failures in production environments.
46
+
47
+ ## Features
48
+
49
+ - **Live Interaction Tracing** — Real-time visibility into agent conversations and inter-agent communication
50
+ - **Token Cost Analysis** — Granular per-agent, per-interaction token accounting with cost attribution
51
+ - **Performance Bottleneck Detection** — Automatic identification of slow agents, network delays, and processing inefficiencies
52
+ - **Decision Tree Inspection** — Visualize agent reasoning paths and fallback logic execution
53
+ - **Failure Analysis** — Capture and replay failed interactions with full context
54
+ - **WebSocket Streaming** — Push-based updates for low-latency monitoring
55
+ - **RESTful API** — Programmatic access to traces, metrics, and historical data
56
+ - **CLI Tools** — Command-line utilities for local debugging and integration
57
+
58
+ ## Quick Start
59
+
60
+ ### Installation
61
+
62
+ ```bash
63
+ pip install agent-observability-studio
64
+ ```
65
+
66
+ ### Basic Setup
67
+
68
+ ```python
69
+ from agent_observability_studio import ObservabilityClient
70
+
71
+ # Initialize the client
72
+ client = ObservabilityClient(
73
+ api_url="http://localhost:8000",
74
+ api_key="your-api-key"
75
+ )
76
+
77
+ # Start monitoring an agent interaction
78
+ trace = client.start_trace(
79
+ agent_id="my-agent",
80
+ session_id="session-123"
81
+ )
82
+
83
+ # Log a step
84
+ trace.log_step(
85
+ name="retrieve_documents",
86
+ duration_ms=245,
87
+ tokens_used=1024,
88
+ status="success"
89
+ )
90
+
91
+ # End trace
92
+ trace.end()
93
+ ```
94
+
95
+ ### CLI Usage
96
+
97
+ ```bash
98
+ # Start the monitoring server
99
+ aos-server --port 8000 --db-url postgresql://localhost/observability
100
+
101
+ # Stream live traces
102
+ aos-trace watch
103
+
104
+ # Export session data
105
+ aos-export --session session-123 --format json --output trace.json
106
+
107
+ # Analyze costs
108
+ aos-costs --agent-id my-agent --date-range "2025-03-01:2025-03-18"
109
+ ```
110
+
111
+ ## Usage Examples
112
+
113
+ **Monitor agent costs in real-time:**
114
+ ```python
115
+ trace = client.start_trace(agent_id="researcher")
116
+ # ... agent work ...
117
+ cost_report = trace.get_cost_summary()
118
+ print(f"Total tokens: {cost_report.total_tokens}")
119
+ print(f"Estimated cost: ${cost_report.estimated_cost:.4f}")
120
+ ```
121
+
122
+ **Capture and replay failures:**
123
+ ```python
124
+ failed_traces = client.query_traces(status="error", limit=10)
125
+ for trace in failed_traces:
126
+ print(f"Error: {trace.error_message}")
127
+ print(f"Decision tree: {trace.decision_path}")
128
+ ```
129
+
130
+ **Subscribe to live events:**
131
+ ```python
132
+ async def handle_trace(event):
133
+ print(f"Trace {event.trace_id}: {event.status}")
134
+
135
+ client.subscribe("trace.completed", handle_trace)
136
+ ```
137
+
138
+ ## Tech Stack
139
+
140
+ - **Runtime**: Python 3.10+
141
+ - **API**: FastAPI with async support
142
+ - **Database**: PostgreSQL with SQLAlchemy ORM
143
+ - **Real-time**: WebSocket streams via websockets library
144
+ - **Configuration**: Pydantic for settings management
145
+ - **CLI**: argparse for command-line interface
146
+ - **Packaging**: Poetry for dependency management
147
+
148
+ ## Architecture
149
+
150
+ The studio consists of four main components:
151
+
152
+ - **API Server** (`api.py`) — RESTful endpoints for trace queries and configuration
153
+ - **WebSocket Manager** (`websocket_manager.py`) — Real-time event streaming
154
+ - **Database Layer** (`database.py`) — Persistent storage of traces and metrics
155
+ - **Client SDK** (`client.py`) — Python library for agent instrumentation
156
+
157
+ ## License
158
+
159
+ MIT
@@ -0,0 +1,121 @@
1
+ # Agent Observability Studio
2
+
3
+ Real-time monitoring and debugging platform for multi-agent AI systems—Chrome DevTools for AI agents.
4
+
5
+ ## What is this?
6
+
7
+ Agent Observability Studio provides deep inspection tools for multi-agent AI systems, complementing existing infrastructure dashboards. It captures live interaction traces, analyzes token costs, detects performance bottlenecks, and helps debug agent decision trees and failures in production environments.
8
+
9
+ ## Features
10
+
11
+ - **Live Interaction Tracing** — Real-time visibility into agent conversations and inter-agent communication
12
+ - **Token Cost Analysis** — Granular per-agent, per-interaction token accounting with cost attribution
13
+ - **Performance Bottleneck Detection** — Automatic identification of slow agents, network delays, and processing inefficiencies
14
+ - **Decision Tree Inspection** — Visualize agent reasoning paths and fallback logic execution
15
+ - **Failure Analysis** — Capture and replay failed interactions with full context
16
+ - **WebSocket Streaming** — Push-based updates for low-latency monitoring
17
+ - **RESTful API** — Programmatic access to traces, metrics, and historical data
18
+ - **CLI Tools** — Command-line utilities for local debugging and integration
19
+
20
+ ## Quick Start
21
+
22
+ ### Installation
23
+
24
+ ```bash
25
+ pip install agent-observability-studio
26
+ ```
27
+
28
+ ### Basic Setup
29
+
30
+ ```python
31
+ from agent_observability_studio import ObservabilityClient
32
+
33
+ # Initialize the client
34
+ client = ObservabilityClient(
35
+ api_url="http://localhost:8000",
36
+ api_key="your-api-key"
37
+ )
38
+
39
+ # Start monitoring an agent interaction
40
+ trace = client.start_trace(
41
+ agent_id="my-agent",
42
+ session_id="session-123"
43
+ )
44
+
45
+ # Log a step
46
+ trace.log_step(
47
+ name="retrieve_documents",
48
+ duration_ms=245,
49
+ tokens_used=1024,
50
+ status="success"
51
+ )
52
+
53
+ # End trace
54
+ trace.end()
55
+ ```
56
+
57
+ ### CLI Usage
58
+
59
+ ```bash
60
+ # Start the monitoring server
61
+ aos-server --port 8000 --db-url postgresql://localhost/observability
62
+
63
+ # Stream live traces
64
+ aos-trace watch
65
+
66
+ # Export session data
67
+ aos-export --session session-123 --format json --output trace.json
68
+
69
+ # Analyze costs
70
+ aos-costs --agent-id my-agent --date-range "2025-03-01:2025-03-18"
71
+ ```
72
+
73
+ ## Usage Examples
74
+
75
+ **Monitor agent costs in real-time:**
76
+ ```python
77
+ trace = client.start_trace(agent_id="researcher")
78
+ # ... agent work ...
79
+ cost_report = trace.get_cost_summary()
80
+ print(f"Total tokens: {cost_report.total_tokens}")
81
+ print(f"Estimated cost: ${cost_report.estimated_cost:.4f}")
82
+ ```
83
+
84
+ **Capture and replay failures:**
85
+ ```python
86
+ failed_traces = client.query_traces(status="error", limit=10)
87
+ for trace in failed_traces:
88
+ print(f"Error: {trace.error_message}")
89
+ print(f"Decision tree: {trace.decision_path}")
90
+ ```
91
+
92
+ **Subscribe to live events:**
93
+ ```python
94
+ async def handle_trace(event):
95
+ print(f"Trace {event.trace_id}: {event.status}")
96
+
97
+ client.subscribe("trace.completed", handle_trace)
98
+ ```
99
+
100
+ ## Tech Stack
101
+
102
+ - **Runtime**: Python 3.10+
103
+ - **API**: FastAPI with async support
104
+ - **Database**: PostgreSQL with SQLAlchemy ORM
105
+ - **Real-time**: WebSocket streams via websockets library
106
+ - **Configuration**: Pydantic for settings management
107
+ - **CLI**: argparse for command-line interface
108
+ - **Packaging**: Poetry for dependency management
109
+
110
+ ## Architecture
111
+
112
+ The studio consists of four main components:
113
+
114
+ - **API Server** (`api.py`) — RESTful endpoints for trace queries and configuration
115
+ - **WebSocket Manager** (`websocket_manager.py`) — Real-time event streaming
116
+ - **Database Layer** (`database.py`) — Persistent storage of traces and metrics
117
+ - **Client SDK** (`client.py`) — Python library for agent instrumentation
118
+
119
+ ## License
120
+
121
+ MIT
@@ -0,0 +1,18 @@
1
+ """Agent Observability Studio - Deep inspection for multi-agent AI systems."""
2
+
3
+ from agent_observability_studio.client import ObservabilityClient
4
+ from agent_observability_studio.models import (
5
+ Session,
6
+ Interaction,
7
+ SessionStatus,
8
+ InteractionType,
9
+ )
10
+
11
+ __version__ = "0.1.0"
12
+ __all__ = [
13
+ "ObservabilityClient",
14
+ "Session",
15
+ "Interaction",
16
+ "SessionStatus",
17
+ "InteractionType",
18
+ ]
@@ -0,0 +1,407 @@
1
+ """FastAPI application for observability platform."""
2
+
3
+ from datetime import datetime
4
+ from typing import List, Optional
5
+ from uuid import UUID
6
+ from fastapi import FastAPI, HTTPException, Depends, WebSocket, WebSocketDisconnect, Query
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.staticfiles import StaticFiles
9
+ from sqlalchemy.orm import Session
10
+ from sqlalchemy import func, and_
11
+
12
+ from agent_observability_studio.database import Database, SessionDB, InteractionDB
13
+ from agent_observability_studio.models import (
14
+ Session as SessionModel,
15
+ Interaction as InteractionModel,
16
+ SessionCreate,
17
+ SessionUpdate,
18
+ InteractionCreate,
19
+ SessionQuery,
20
+ SessionStatus,
21
+ CostBreakdown,
22
+ DecisionNode,
23
+ )
24
+ from agent_observability_studio.websocket_manager import ConnectionManager
25
+ from agent_observability_studio.config import get_settings
26
+
27
+
28
# Module-level singletons shared by every request handler in this module.
settings = get_settings()
db_manager = Database(settings.database_url)
app = FastAPI(title="Agent Observability Studio", version="0.1.0")
ws_manager = ConnectionManager()

# CORS middleware.
# Wildcard origins must not be combined with allow_credentials=True: the
# FastAPI/Starlette docs require explicit origins, methods and headers whenever
# credentials are allowed, otherwise any site can issue credentialed requests.
# Credentials are therefore disabled while the origin list stays open.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
41
+
42
+
43
def get_db():
    """FastAPI dependency that yields one database session per request.

    Delegates to ``db_manager.get_session()`` as a generator so that the
    ``finally`` block inside ``get_session`` (which closes the session) runs
    when the dependency is torn down, not when a discarded generator happens
    to be garbage-collected.

    Yields:
        The session object produced by ``db_manager.get_session()``.
    """
    yield from db_manager.get_session()
46
+
47
+
48
@app.get("/")
async def root():
    """Describe the service and list its main entry points."""
    endpoint_map = dict(api="/api/v1", docs="/docs", websocket="/ws", ui="/ui")
    return dict(
        service="Agent Observability Studio",
        version="0.1.0",
        endpoints=endpoint_map,
    )
61
+
62
+
63
@app.post("/api/v1/sessions", response_model=SessionModel)
async def create_session(
    session_data: SessionCreate, db: Session = Depends(get_db)
) -> SessionModel:
    """Persist a new agent session, broadcast it, and return it.

    Args:
        session_data: Agent name, optional task id, metadata and optional
            parent session id for the new session.
        db: Database session injected by ``get_db``.

    Returns:
        The stored session, including the values filled in by column defaults.
    """
    record = SessionDB(
        agent_name=session_data.agent_name,
        task_id=session_data.task_id,
        metadata=session_data.metadata,
        parent_session_id=session_data.parent_session_id,
    )
    db.add(record)
    db.commit()
    # Reload so that default-populated columns are available on the instance.
    db.refresh(record)

    copied_fields = (
        "id",
        "agent_name",
        "task_id",
        "status",
        "start_time",
        "end_time",
        "total_tokens",
        "total_cost_usd",
        "metadata",
        "parent_session_id",
    )
    created = SessionModel(**{name: getattr(record, name) for name in copied_fields})

    event = {"type": "session_created", "session": created.model_dump(mode="json")}
    await ws_manager.broadcast(event)

    return created
96
+
97
+
98
@app.patch("/api/v1/sessions/{session_id}", response_model=SessionModel)
async def update_session(
    session_id: UUID, update_data: SessionUpdate, db: Session = Depends(get_db)
) -> SessionModel:
    """Update status, end time and/or metadata of an existing session.

    Only truthy fields of ``update_data`` are applied. Metadata is merged into
    the stored metadata key by key rather than replacing it wholesale.

    Args:
        session_id: Identifier of the session to update.
        update_data: Fields to change.
        db: Database session injected by ``get_db``.

    Returns:
        The session as stored after the update.

    Raises:
        HTTPException: 404 when no session has ``session_id``.
    """
    db_session = db.query(SessionDB).filter(SessionDB.id == session_id).first()
    if not db_session:
        raise HTTPException(status_code=404, detail="Session not found")

    if update_data.status:
        db_session.status = update_data.status
    if update_data.end_time:
        db_session.end_time = update_data.end_time
    if update_data.metadata:
        # Assign a new dict instead of mutating in place: a plain JSON column
        # does not track in-place changes, so ``.update()`` on the loaded dict
        # would never be flushed to the database.
        db_session.metadata = {**(db_session.metadata or {}), **update_data.metadata}

    db.commit()
    db.refresh(db_session)

    session_model = SessionModel(
        id=db_session.id,
        agent_name=db_session.agent_name,
        task_id=db_session.task_id,
        status=db_session.status,
        start_time=db_session.start_time,
        end_time=db_session.end_time,
        total_tokens=db_session.total_tokens,
        total_cost_usd=db_session.total_cost_usd,
        metadata=db_session.metadata,
        parent_session_id=db_session.parent_session_id,
    )

    await ws_manager.broadcast(
        {"type": "session_updated", "session": session_model.model_dump(mode="json")}
    )

    return session_model
135
+
136
+
137
@app.get("/api/v1/sessions", response_model=List[SessionModel])
async def query_sessions(
    agent_name: Optional[str] = None,
    task_id: Optional[str] = None,
    status: Optional[SessionStatus] = None,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    limit: int = Query(100, ge=1, le=1000),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
) -> List[SessionModel]:
    """Query sessions with optional filters, newest first.

    Args:
        agent_name: Exact agent name to match.
        task_id: Exact task id to match.
        status: Session status to match.
        start_date: Keep sessions whose ``start_time`` is at or after this.
        end_date: Keep sessions whose ``start_time`` is at or before this.
        limit: Page size, between 1 and 1000. The lower bound prevents a
            negative value from disabling the cap or failing in the database.
        offset: Number of rows to skip; must not be negative.
        db: Database session injected by ``get_db``.

    Returns:
        Matching sessions ordered by ``start_time`` descending.
    """
    query = db.query(SessionDB)

    if agent_name:
        query = query.filter(SessionDB.agent_name == agent_name)
    if task_id:
        query = query.filter(SessionDB.task_id == task_id)
    if status:
        query = query.filter(SessionDB.status == status)
    if start_date:
        query = query.filter(SessionDB.start_time >= start_date)
    if end_date:
        query = query.filter(SessionDB.start_time <= end_date)

    sessions = query.order_by(SessionDB.start_time.desc()).limit(limit).offset(offset).all()

    return [
        SessionModel(
            id=s.id,
            agent_name=s.agent_name,
            task_id=s.task_id,
            status=s.status,
            start_time=s.start_time,
            end_time=s.end_time,
            total_tokens=s.total_tokens,
            total_cost_usd=s.total_cost_usd,
            metadata=s.metadata,
            parent_session_id=s.parent_session_id,
        )
        for s in sessions
    ]
179
+
180
+
181
@app.get("/api/v1/sessions/{session_id}", response_model=SessionModel)
async def get_session(session_id: UUID, db: Session = Depends(get_db)) -> SessionModel:
    """Fetch a single session by its identifier.

    Raises:
        HTTPException: 404 when no session has ``session_id``.
    """
    record = db.query(SessionDB).filter(SessionDB.id == session_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Session not found")

    copied_fields = (
        "id",
        "agent_name",
        "task_id",
        "status",
        "start_time",
        "end_time",
        "total_tokens",
        "total_cost_usd",
        "metadata",
        "parent_session_id",
    )
    return SessionModel(**{name: getattr(record, name) for name in copied_fields})
200
+
201
+
202
@app.post("/api/v1/interactions", response_model=InteractionModel)
async def log_interaction(
    interaction_data: InteractionCreate, db: Session = Depends(get_db)
) -> InteractionModel:
    """Record one interaction against an existing session.

    The interaction's tokens and cost are added to the owning session's
    running totals in the same commit, and an ``interaction_logged`` event is
    broadcast afterwards.

    Raises:
        HTTPException: 404 when ``interaction_data.session_id`` matches no
            session.
    """
    # The owning session must exist before anything is written.
    owner = db.query(SessionDB).filter(SessionDB.id == interaction_data.session_id).first()
    if not owner:
        raise HTTPException(status_code=404, detail="Session not found")

    payload_fields = (
        "session_id",
        "type",
        "request",
        "response",
        "tokens_used",
        "cost_usd",
        "latency_ms",
        "model",
        "error",
        "metadata",
    )
    record = InteractionDB(
        **{name: getattr(interaction_data, name) for name in payload_fields}
    )
    db.add(record)

    # Keep the session's aggregate counters in step with its interactions.
    owner.total_tokens += interaction_data.tokens_used
    owner.total_cost_usd += interaction_data.cost_usd

    db.commit()
    db.refresh(record)

    stored_fields = ("id", "timestamp") + payload_fields
    logged = InteractionModel(**{name: getattr(record, name) for name in stored_fields})

    event = {"type": "interaction_logged", "interaction": logged.model_dump(mode="json")}
    await ws_manager.broadcast(event)

    return logged
256
+
257
+
258
@app.get("/api/v1/sessions/{session_id}/interactions", response_model=List[InteractionModel])
async def get_session_interactions(
    session_id: UUID, db: Session = Depends(get_db)
) -> List[InteractionModel]:
    """List the interactions of one session, ordered by timestamp.

    An unknown ``session_id`` produces an empty list; no existence check is
    performed on the session itself.
    """
    rows = (
        db.query(InteractionDB)
        .filter(InteractionDB.session_id == session_id)
        .order_by(InteractionDB.timestamp)
        .all()
    )

    copied_fields = (
        "id",
        "session_id",
        "type",
        "timestamp",
        "request",
        "response",
        "tokens_used",
        "cost_usd",
        "latency_ms",
        "model",
        "error",
        "metadata",
    )
    result = []
    for row in rows:
        result.append(
            InteractionModel(**{name: getattr(row, name) for name in copied_fields})
        )
    return result
287
+
288
+
289
@app.get("/api/v1/analytics/cost-breakdown", response_model=CostBreakdown)
async def get_cost_breakdown(
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    db: Session = Depends(get_db),
) -> CostBreakdown:
    """Aggregate cost and token usage over sessions in a time window.

    Args:
        start_date: Keep sessions whose ``start_time`` is at or after this.
        end_date: Keep sessions whose ``start_time`` is at or before this.
        db: Database session injected by ``get_db``.

    Returns:
        Totals plus per-agent, per-task and per-model cost maps, and the
        earliest/latest session start time (both "now" when nothing matches).
    """
    session_query = db.query(SessionDB)
    if start_date:
        session_query = session_query.filter(SessionDB.start_time >= start_date)
    if end_date:
        session_query = session_query.filter(SessionDB.start_time <= end_date)
    sessions = session_query.all()

    total_cost = sum(s.total_cost_usd for s in sessions)
    total_tokens = sum(s.total_tokens for s in sessions)

    by_agent = {}
    by_task = {}
    for s in sessions:
        by_agent[s.agent_name] = by_agent.get(s.agent_name, 0) + s.total_cost_usd
        # Sessions without a task id contribute only to the per-agent map.
        if s.task_id:
            by_task[s.task_id] = by_task.get(s.task_id, 0) + s.total_cost_usd

    # Per-model costs come from the interactions of the selected sessions.
    session_ids = [s.id for s in sessions]
    model_rows = (
        db.query(InteractionDB.model, func.sum(InteractionDB.cost_usd))
        .filter(
            and_(
                InteractionDB.session_id.in_(session_ids),
                InteractionDB.model.isnot(None),
            )
        )
        .group_by(InteractionDB.model)
        .all()
    )
    by_model = {}
    for model_name, model_cost in model_rows:
        if model_name:
            by_model[model_name] = float(model_cost)

    if sessions:
        time_range = (
            min(s.start_time for s in sessions),
            max(s.start_time for s in sessions),
        )
    else:
        time_range = (datetime.utcnow(), datetime.utcnow())

    return CostBreakdown(
        total_cost_usd=total_cost,
        total_tokens=total_tokens,
        by_agent=by_agent,
        by_model=by_model,
        by_task=by_task,
        time_range=time_range,
    )
342
+
343
+
344
@app.get("/api/v1/analytics/decision-tree", response_model=List[DecisionNode])
async def get_decision_tree(
    root_session_id: Optional[UUID] = None, db: Session = Depends(get_db)
) -> List[DecisionNode]:
    """Return session nodes describing agent collaboration.

    With ``root_session_id`` the result covers that session and all of its
    descendants; without it, only sessions that have no parent are returned.
    Each node lists the ids of its direct children.

    Raises:
        HTTPException: 404 when ``root_session_id`` matches no session.
    """
    if not root_session_id:
        # No root given: start from every session that has no parent.
        sessions = db.query(SessionDB).filter(SessionDB.parent_session_id.is_(None)).all()
    else:
        root = db.query(SessionDB).filter(SessionDB.id == root_session_id).first()
        if not root:
            raise HTTPException(status_code=404, detail="Root session not found")
        sessions = [root] + _get_all_children(db, root.id)

    def _to_node(s):
        # One lookup per session for the ids of its direct children.
        child_rows = (
            db.query(SessionDB.id)
            .filter(SessionDB.parent_session_id == s.id)
            .all()
        )
        return DecisionNode(
            session_id=s.id,
            agent_name=s.agent_name,
            task_id=s.task_id,
            children=[row[0] for row in child_rows],
            status=s.status,
            total_cost_usd=s.total_cost_usd,
            metadata=s.metadata,
        )

    return [_to_node(s) for s in sessions]
378
+
379
+
380
def _get_all_children(db: Session, parent_id: UUID) -> List[SessionDB]:
    """Collect every descendant session of ``parent_id``.

    The result lists the direct children first, followed by each child's own
    descendants in turn. One query is issued per visited session.
    """
    direct = db.query(SessionDB).filter(SessionDB.parent_session_id == parent_id).all()
    descendants = list(direct)
    for node in direct:
        descendants += _get_all_children(db, node.id)
    return descendants
387
+
388
+
389
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time updates.

    Registers the socket with ``ws_manager`` and then reads JSON messages
    until the client goes away. A message of the form
    ``{"type": "filter", "filters": {...}}`` is acknowledged by echoing the
    filters back; this handler does not store them anywhere.

    The socket is always unregistered on exit, including when a message is
    not valid JSON or another unexpected error ends the loop, so the manager
    does not keep broadcasting to a dead connection.
    """
    await ws_manager.connect(websocket)
    try:
        while True:
            # Keep connection alive and receive client filters
            data = await websocket.receive_json()
            # Valid JSON need not be an object; ignore anything else.
            if isinstance(data, dict) and data.get("type") == "filter":
                await websocket.send_json(
                    {"status": "filter_applied", "filters": data.get("filters", {})}
                )
    except WebSocketDisconnect:
        # Normal client disconnect; cleanup happens in ``finally``.
        pass
    finally:
        ws_manager.disconnect(websocket)
402
+
403
+
404
@app.get("/health")
async def health_check():
    """Report liveness together with the current UTC time in ISO format."""
    now = datetime.utcnow()
    return dict(status="healthy", timestamp=now.isoformat())
@@ -0,0 +1,32 @@
1
+ """CLI for running the observability platform."""
2
+
3
+ import argparse
4
+ import uvicorn
5
+
6
+
7
def main():
    """Parse the command line and run the requested command.

    The only command is ``serve``, which starts uvicorn on the FastAPI app.
    Without a command the help text is printed.
    """
    cli = argparse.ArgumentParser(description="Agent Observability Studio CLI")
    commands = cli.add_subparsers(dest="command", help="Command to run")

    # "serve" and its options
    serve = commands.add_parser("serve", help="Start the API server")
    serve.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    serve.add_argument("--port", type=int, default=8000, help="Port to bind to")
    serve.add_argument("--reload", action="store_true", help="Enable auto-reload")

    options = cli.parse_args()

    if options.command != "serve":
        cli.print_help()
        return

    uvicorn.run(
        "agent_observability_studio.api:app",
        host=options.host,
        port=options.port,
        reload=options.reload,
    )
29
+
30
+
31
+ if __name__ == "__main__":
32
+ main()
@@ -0,0 +1,146 @@
1
+ """Python SDK for instrumenting agents."""
2
+
3
+ from datetime import datetime
4
+ from typing import Any, Dict, Optional
5
+ from uuid import UUID
6
+ import httpx
7
+
8
+ from agent_observability_studio.models import (
9
+ SessionCreate,
10
+ SessionUpdate,
11
+ InteractionCreate,
12
+ SessionStatus,
13
+ InteractionType,
14
+ )
15
+
16
+
17
class ObservabilityClient:
    """Client for logging agent interactions to the observability platform.

    Wraps a synchronous ``httpx.Client``. Use the instance as a context
    manager, or call :meth:`close` explicitly, to release the underlying
    HTTP client when done.
    """

    def __init__(
        self, api_url: str = "http://localhost:8000", timeout: float = 5.0
    ):
        """Initialize client.

        Args:
            api_url: Base URL of the observability API
            timeout: Request timeout in seconds
        """
        # Trailing slashes are dropped so endpoint paths can be appended as-is.
        self.api_url = api_url.rstrip("/")
        self.timeout = timeout
        self.client = httpx.Client(timeout=timeout)

    def start_session(
        self,
        agent_name: str,
        task_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        parent_session_id: Optional[UUID] = None,
    ) -> UUID:
        """Start a new agent session.

        Args:
            agent_name: Name of the agent
            task_id: Optional task identifier
            metadata: Optional metadata dictionary
            parent_session_id: Optional parent session for nested agents

        Returns:
            Session UUID

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        session_data = SessionCreate(
            agent_name=agent_name,
            task_id=task_id,
            metadata=metadata or {},
            parent_session_id=parent_session_id,
        )

        http_response = self.client.post(
            f"{self.api_url}/api/v1/sessions",
            json=session_data.model_dump(mode="json"),
        )
        http_response.raise_for_status()

        return UUID(http_response.json()["id"])

    def end_session(
        self,
        session_id: UUID,
        status: SessionStatus = SessionStatus.SUCCESS,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """End an agent session.

        Args:
            session_id: Session UUID to end
            status: Final status of the session
            metadata: Optional additional metadata

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        update_data = SessionUpdate(
            status=status, end_time=datetime.utcnow(), metadata=metadata
        )

        http_response = self.client.patch(
            f"{self.api_url}/api/v1/sessions/{session_id}",
            # Unset fields are omitted from the PATCH body.
            json=update_data.model_dump(mode="json", exclude_none=True),
        )
        http_response.raise_for_status()

    def log_interaction(
        self,
        session_id: UUID,
        request: Dict[str, Any],
        response: Dict[str, Any],
        tokens_used: int,
        cost_usd: float = 0.0,
        latency_ms: Optional[int] = None,
        model: Optional[str] = None,
        error: Optional[str] = None,
        interaction_type: InteractionType = InteractionType.LLM_CALL,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> UUID:
        """Log an interaction within a session.

        Args:
            session_id: Session UUID this interaction belongs to
            request: Request data (e.g., prompt, messages)
            response: Response data (e.g., completion, tool output)
            tokens_used: Number of tokens consumed
            cost_usd: Cost in USD
            latency_ms: Latency in milliseconds
            model: Model identifier
            error: Error message if interaction failed
            interaction_type: Type of interaction
            metadata: Optional additional metadata

        Returns:
            Interaction UUID

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        interaction_data = InteractionCreate(
            session_id=session_id,
            type=interaction_type,
            request=request,
            response=response,
            tokens_used=tokens_used,
            cost_usd=cost_usd,
            latency_ms=latency_ms,
            model=model,
            error=error,
            metadata=metadata or {},
        )

        # Named separately so the ``response`` argument is not shadowed.
        http_response = self.client.post(
            f"{self.api_url}/api/v1/interactions",
            json=interaction_data.model_dump(mode="json"),
        )
        http_response.raise_for_status()

        return UUID(http_response.json()["id"])

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()
@@ -0,0 +1,25 @@
1
+ """Configuration management."""
2
+
3
+ from functools import lru_cache
4
+ from pydantic_settings import BaseSettings
5
+
6
+
7
class Settings(BaseSettings):
    """Application settings.

    Each field below carries its default; a ``.env`` file (UTF-8) is also
    configured as a source of values.
    """

    database_url: str = "sqlite:///./agent_studio.db"
    redis_url: str = "redis://localhost:6379/0"
    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False
    log_level: str = "info"

    # pydantic v2 style configuration; replaces the deprecated inner
    # ``class Config`` with the same two options.
    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
20
+
21
+
22
@lru_cache()
def get_settings() -> Settings:
    """Build the settings once and hand back the same instance afterwards."""
    cached_settings = Settings()
    return cached_settings
@@ -0,0 +1,86 @@
1
+ """Database configuration and models."""
2
+
3
+ from datetime import datetime
4
+ from typing import Optional
5
+ from sqlalchemy import (
6
+ create_engine,
7
+ Column,
8
+ String,
9
+ Integer,
10
+ Float,
11
+ DateTime,
12
+ Enum,
13
+ JSON,
14
+ ForeignKey,
15
+ )
16
+ from sqlalchemy.ext.declarative import declarative_base
17
+ from sqlalchemy.orm import sessionmaker, relationship
18
+ from sqlalchemy.dialects.postgresql import UUID
19
+ import uuid
20
+
21
+ from agent_observability_studio.models import SessionStatus, InteractionType
22
+
23
+
24
+ Base = declarative_base()
25
+
26
+
27
class SessionDB(Base):
    """Database model for sessions (table ``sessions``).

    A session may reference another session through ``parent_session_id``,
    forming a tree of nested agent runs.
    """

    __tablename__ = "sessions"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    agent_name = Column(String, nullable=False, index=True)
    task_id = Column(String, nullable=True, index=True)
    status = Column(Enum(SessionStatus), nullable=False, default=SessionStatus.ACTIVE)
    start_time = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)
    end_time = Column(DateTime, nullable=True)
    total_tokens = Column(Integer, nullable=False, default=0)
    total_cost_usd = Column(Float, nullable=False, default=0.0)
    # NOTE(review): SQLAlchemy's Declarative API reserves the attribute name
    # ``metadata`` and raises InvalidRequestError when a column is declared
    # under it. Renaming (e.g. ``metadata_ = Column("metadata", JSON, ...)``)
    # requires updating every caller, so it is flagged here, not changed.
    metadata = Column(JSON, nullable=False, default=dict)
    parent_session_id = Column(UUID(as_uuid=True), ForeignKey("sessions.id"), nullable=True)

    interactions = relationship("InteractionDB", back_populates="session", cascade="all, delete")
    # ``remote_side=[id]`` marks the many-to-one side of a self-referential
    # relationship, so it belongs on ``parent``; the backref then provides
    # the one-to-many ``children`` collection.
    parent = relationship("SessionDB", remote_side=[id], backref="children")
45
+
46
+
47
class InteractionDB(Base):
    """Database model for interactions (table ``interactions``).

    Every row belongs to exactly one session through ``session_id``.
    """

    __tablename__ = "interactions"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    session_id = Column(
        UUID(as_uuid=True),
        ForeignKey("sessions.id"),
        nullable=False,
        index=True,
    )
    type = Column(
        Enum(InteractionType),
        nullable=False,
        default=InteractionType.LLM_CALL,
    )
    timestamp = Column(
        DateTime,
        nullable=False,
        default=datetime.utcnow,
        index=True,
    )
    # Request/response payloads are stored as JSON documents.
    request = Column(JSON, nullable=False)
    response = Column(JSON, nullable=False)
    tokens_used = Column(Integer, nullable=False)
    cost_usd = Column(Float, nullable=False, default=0.0)
    latency_ms = Column(Integer, nullable=True)
    model = Column(String, nullable=True, index=True)
    error = Column(String, nullable=True)
    # NOTE(review): ``metadata`` is a reserved attribute name in SQLAlchemy's
    # Declarative API; declaring a column under it is rejected when the class
    # is defined. Left unchanged here because callers use this name.
    metadata = Column(JSON, nullable=False, default=dict)

    session = relationship("SessionDB", back_populates="interactions")
68
+
69
+
70
class Database:
    """Database connection manager.

    Creates the engine and session factory for ``database_url`` and makes
    sure all tables declared on ``Base`` exist.
    """

    def __init__(self, database_url: str = "sqlite:///./agent_studio.db"):
        """Create the engine, the session factory and any missing tables.

        Args:
            database_url: SQLAlchemy database URL.
        """
        # ``check_same_thread`` is only passed for SQLite URLs. Match on the
        # scheme prefix so a URL that merely contains "sqlite" elsewhere
        # (e.g. in a database name) does not receive it.
        connect_args = (
            {"check_same_thread": False} if database_url.startswith("sqlite") else {}
        )
        self.engine = create_engine(database_url, connect_args=connect_args)
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
        Base.metadata.create_all(bind=self.engine)

    def get_session(self):
        """Yield a new session and close it once the caller is done with it."""
        db = self.SessionLocal()
        try:
            yield db
        finally:
            db.close()
@@ -0,0 +1,126 @@
1
+ """Data models for observability platform."""
2
+
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import Any, Dict, Optional
6
+ from pydantic import BaseModel, Field
7
+ from uuid import UUID, uuid4
8
+
9
+
10
class SessionStatus(str, Enum):
    """Lifecycle state of an agent session.

    The ``str`` mixin makes every member compare equal to its raw string value.
    """

    # Initial state (the default on ``Session``).
    ACTIVE = "active"

    # Terminal states.
    SUCCESS = "success"
    FAILED = "failed"
    TIMEOUT = "timeout"
17
+
18
+
19
class InteractionType(str, Enum):
    """Category of a recorded agent interaction.

    The ``str`` mixin makes every member compare equal to its raw string value.
    """

    LLM_CALL = "llm_call"  # default type on Interaction / InteractionCreate
    TOOL_USE = "tool_use"
    MEMORY_ACCESS = "memory_access"
    AGENT_COMMUNICATION = "agent_communication"
26
+
27
+
28
class Session(BaseModel):
    """One agent execution session.

    Unless the caller supplies them, a new instance gets a random ``id``,
    status ``ACTIVE``, a ``start_time`` taken from ``datetime.utcnow`` (a naive
    datetime), zeroed usage totals, and an empty ``metadata`` dict.
    """

    # Identity
    id: UUID = Field(default_factory=uuid4)
    agent_name: str
    task_id: Optional[str] = None

    # Lifecycle
    status: SessionStatus = SessionStatus.ACTIVE
    start_time: datetime = Field(default_factory=datetime.utcnow)
    end_time: Optional[datetime] = None

    # Usage totals
    total_tokens: int = 0
    total_cost_usd: float = 0.0

    # Free-form data and optional link to a parent session
    metadata: dict[str, Any] = Field(default_factory=dict)
    parent_session_id: Optional[UUID] = None
41
+
42
+
43
class Interaction(BaseModel):
    """A single recorded interaction that belongs to a session.

    ``id`` and ``timestamp`` are generated on construction when omitted;
    ``session_id``, ``request``, ``response`` and ``tokens_used`` are required.
    """

    # Identity and ownership
    id: UUID = Field(default_factory=uuid4)
    session_id: UUID

    # What happened and when
    type: InteractionType = InteractionType.LLM_CALL
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    # Payloads
    request: dict[str, Any]
    response: dict[str, Any]

    # Usage and timing
    tokens_used: int
    cost_usd: float = 0.0
    latency_ms: Optional[int] = None

    # Optional details
    model: Optional[str] = None
    error: Optional[str] = None
    metadata: dict[str, Any] = Field(default_factory=dict)
58
+
59
+
60
class SessionCreate(BaseModel):
    """Payload for creating a session.

    Only ``agent_name`` is required; ``metadata`` defaults to an empty dict and
    the remaining fields default to ``None``.
    """

    agent_name: str
    task_id: Optional[str] = None
    metadata: dict[str, Any] = Field(default_factory=dict)
    parent_session_id: Optional[UUID] = None
67
+
68
+
69
class SessionUpdate(BaseModel):
    """Payload for updating a session.

    Every field is optional and defaults to ``None``.
    """

    status: Optional[SessionStatus] = None
    end_time: Optional[datetime] = None
    metadata: Optional[dict[str, Any]] = None
75
+
76
+
77
class InteractionCreate(BaseModel):
    """Payload for logging an interaction against a session.

    Carries the same fields as ``Interaction`` except ``id`` and ``timestamp``.
    """

    # Ownership and category
    session_id: UUID
    type: InteractionType = InteractionType.LLM_CALL

    # Payloads
    request: dict[str, Any]
    response: dict[str, Any]

    # Usage and timing
    tokens_used: int
    cost_usd: float = 0.0
    latency_ms: Optional[int] = None

    # Optional details
    model: Optional[str] = None
    error: Optional[str] = None
    metadata: dict[str, Any] = Field(default_factory=dict)
90
+
91
+
92
class SessionQuery(BaseModel):
    """Filter and pagination parameters for listing sessions.

    Every filter is optional and defaults to ``None``. ``limit`` and ``offset``
    must be non-negative; a negative value is rejected at validation time
    instead of being passed on to the query layer.
    """

    # Filters
    agent_name: Optional[str] = None
    task_id: Optional[str] = None
    status: Optional[SessionStatus] = None
    start_date: Optional[datetime] = None
    end_date: Optional[datetime] = None
    min_tokens: Optional[int] = None
    max_tokens: Optional[int] = None

    # Pagination
    limit: int = Field(default=100, ge=0)
    offset: int = Field(default=0, ge=0)
104
+
105
+
106
class CostBreakdown(BaseModel):
    """Result of a token cost analysis.

    Holds overall totals, three name-to-float breakdowns, and the
    ``time_range`` pair of datetimes the analysis refers to.
    """

    # Overall totals
    total_cost_usd: float
    total_tokens: int

    # Breakdowns keyed by agent, model and task
    by_agent: dict[str, float]
    by_model: dict[str, float]
    by_task: dict[str, float]

    # Two datetimes bounding the analysed period
    time_range: tuple[datetime, datetime]
115
+
116
+
117
class DecisionNode(BaseModel):
    """Node in an agent decision tree.

    ``children`` holds UUIDs and defaults to an empty list.
    """

    session_id: UUID
    agent_name: str
    # Defaults to None like every other ``task_id`` in this module. Without a
    # default, Pydantic v2 treats an ``Optional[...]`` field as required.
    task_id: Optional[str] = None
    children: list[UUID] = Field(default_factory=list)
    status: SessionStatus
    total_cost_usd: float
    metadata: Dict[str, Any] = Field(default_factory=dict)
@@ -0,0 +1,30 @@
1
+ """WebSocket connection manager for real-time updates."""
2
+
3
+ from typing import List, Dict, Any
4
+ from fastapi import WebSocket
5
+
6
+
7
class ConnectionManager:
    """Tracks open WebSocket connections and broadcasts messages to them."""

    def __init__(self):
        """Start with no registered connections."""
        self.active_connections: List[WebSocket] = []

    async def connect(self, websocket: WebSocket):
        """Accept the WebSocket handshake and register the connection."""
        await websocket.accept()
        self.active_connections.append(websocket)

    def disconnect(self, websocket: WebSocket):
        """Unregister a connection; does nothing if it is not registered."""
        if websocket in self.active_connections:
            self.active_connections.remove(websocket)

    async def broadcast(self, message: Dict[str, Any]):
        """Send ``message`` as JSON to every registered connection.

        Delivery is best-effort: a failing connection never aborts the
        broadcast. A connection whose send raises is unregistered, so dead
        sockets do not accumulate and get retried on every later broadcast.
        """
        # Iterate over a snapshot: the list can change while a send is awaited
        # (another task connecting or disconnecting) and is also pruned below.
        for connection in list(self.active_connections):
            try:
                await connection.send_json(message)
            except Exception:
                # The send failed, so stop tracking this connection.
                # ``disconnect`` tolerates it having been removed already.
                self.disconnect(connection)
@@ -0,0 +1,72 @@
1
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "agent-observability-studio"
version = "0.1.0"
description = "Real-time monitoring and debugging platform for multi-agent AI systems"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
authors = [
    { name = "Zach", email = "zach@example.com" },
]
keywords = ["agents", "ai", "debugging", "llm", "monitoring", "observability"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
# Runtime requirements, kept in alphabetical order.
dependencies = [
    "alembic>=1.13.1",
    "anthropic>=0.18.0",
    "fastapi>=0.109.0",
    "httpx>=0.26.0",
    "openai>=1.12.0",
    "pydantic>=2.5.3",
    "pydantic-settings>=2.1.0",
    "python-multipart>=0.0.6",
    "redis>=5.0.1",
    "sqlalchemy>=2.0.25",
    "tiktoken>=0.5.2",
    "uvicorn[standard]>=0.27.0",
    "websockets>=12.0",
]

[project.optional-dependencies]
# Test, formatting, lint and type-check tooling.
dev = [
    "black>=24.1.1",
    "mypy>=1.8.0",
    "pytest>=7.4.4",
    "pytest-asyncio>=0.23.3",
    "ruff>=0.1.14",
]

[project.urls]
# NOTE(review): "yourusername" looks like a template placeholder; confirm the real repository owner.
Homepage = "https://github.com/yourusername/agent-observability-studio"
Documentation = "https://github.com/yourusername/agent-observability-studio#readme"
Repository = "https://github.com/yourusername/agent-observability-studio"

[project.scripts]
agent-studio = "agent_observability_studio.cli:main"

[tool.hatch.build.targets.wheel]
packages = ["agent_observability_studio"]

[tool.black]
line-length = 100
target-version = ["py310"]

[tool.ruff]
line-length = 100
target-version = "py310"

[tool.mypy]
python_version = "3.10"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true