finobs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- finobs-0.0.1/.gitignore +111 -0
- finobs-0.0.1/PKG-INFO +105 -0
- finobs-0.0.1/README.md +82 -0
- finobs-0.0.1/examples/sample_agent/main.py +83 -0
- finobs-0.0.1/finobs/__init__.py +7 -0
- finobs-0.0.1/finobs/agents/__init__.py +3 -0
- finobs-0.0.1/finobs/agents/behavior.py +48 -0
- finobs-0.0.1/finobs/agents/cost.py +36 -0
- finobs-0.0.1/finobs/agents/latency.py +39 -0
- finobs-0.0.1/finobs/agents/quality.py +51 -0
- finobs-0.0.1/finobs/agents/rag.py +60 -0
- finobs-0.0.1/finobs/cli.py +19 -0
- finobs-0.0.1/finobs/commands/__init__.py +0 -0
- finobs-0.0.1/finobs/commands/audit.py +175 -0
- finobs-0.0.1/finobs/commands/compare.py +67 -0
- finobs-0.0.1/finobs/commands/cost.py +55 -0
- finobs-0.0.1/finobs/commands/debug.py +63 -0
- finobs-0.0.1/finobs/commands/loops.py +30 -0
- finobs-0.0.1/finobs/commands/optimize.py +207 -0
- finobs-0.0.1/finobs/commands/performance.py +27 -0
- finobs-0.0.1/finobs/commands/trace.py +25 -0
- finobs-0.0.1/finobs/commands/watch.py +61 -0
- finobs-0.0.1/finobs/config/__init__.py +0 -0
- finobs-0.0.1/finobs/config/pricing.json +22 -0
- finobs-0.0.1/finobs/scripts/__init__.py +0 -0
- finobs-0.0.1/finobs/scripts/cost_analyzer.py +43 -0
- finobs-0.0.1/finobs/scripts/degradation_detector.py +146 -0
- finobs-0.0.1/finobs/scripts/diff_engine.py +93 -0
- finobs-0.0.1/finobs/scripts/llm_analyzer.py +186 -0
- finobs-0.0.1/finobs/scripts/loop_detector.py +21 -0
- finobs-0.0.1/finobs/scripts/metrics_aggregator.py +20 -0
- finobs-0.0.1/finobs/scripts/optimizer.py +216 -0
- finobs-0.0.1/finobs/scripts/patcher.py +432 -0
- finobs-0.0.1/finobs/scripts/report_generator.py +113 -0
- finobs-0.0.1/finobs/scripts/scoring.py +54 -0
- finobs-0.0.1/finobs/scripts/trace_generator.py +115 -0
- finobs-0.0.1/finobs/scripts/trace_interceptor.py +82 -0
- finobs-0.0.1/finobs/scripts/trace_parser.py +80 -0
- finobs-0.0.1/pyproject.toml +36 -0
- finobs-0.0.1/tests/test_core.py +127 -0
finobs-0.0.1/.gitignore
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
|
2
|
+
|
|
3
|
+
# dependencies
|
|
4
|
+
/node_modules
|
|
5
|
+
/.pnp
|
|
6
|
+
.pnp.*
|
|
7
|
+
.yarn/*
|
|
8
|
+
!.yarn/patches
|
|
9
|
+
!.yarn/plugins
|
|
10
|
+
!.yarn/releases
|
|
11
|
+
!.yarn/versions
|
|
12
|
+
|
|
13
|
+
# testing
|
|
14
|
+
/coverage
|
|
15
|
+
|
|
16
|
+
# next.js
|
|
17
|
+
/.next/
|
|
18
|
+
/out/
|
|
19
|
+
|
|
20
|
+
# production
|
|
21
|
+
/build
|
|
22
|
+
|
|
23
|
+
# misc
|
|
24
|
+
.DS_Store
|
|
25
|
+
*.pem
|
|
26
|
+
|
|
27
|
+
# debug
|
|
28
|
+
npm-debug.log*
|
|
29
|
+
yarn-debug.log*
|
|
30
|
+
yarn-error.log*
|
|
31
|
+
.pnpm-debug.log*
|
|
32
|
+
|
|
33
|
+
# env files (can opt-in for committing if needed)
|
|
34
|
+
.env*
|
|
35
|
+
|
|
36
|
+
# vercel
|
|
37
|
+
.vercel
|
|
38
|
+
|
|
39
|
+
# typescript
|
|
40
|
+
*.tsbuildinfo
|
|
41
|
+
next-env.d.ts
|
|
42
|
+
|
|
43
|
+
/src/generated/prisma
|
|
44
|
+
|
|
45
|
+
.vercel
|
|
46
|
+
|
|
47
|
+
# Documentation
|
|
48
|
+
/docs/
|
|
49
|
+
# SQLite databases
|
|
50
|
+
*.db
|
|
51
|
+
/intelligence_engine/data/
|
|
52
|
+
/intelligence_engine/**/data/
|
|
53
|
+
|
|
54
|
+
# Agent & Tools Clutter
|
|
55
|
+
.agents/
|
|
56
|
+
.playwright-mcp/
|
|
57
|
+
playwright-report/
|
|
58
|
+
test-results/
|
|
59
|
+
.gemini/
|
|
60
|
+
*.log
|
|
61
|
+
*.resolved
|
|
62
|
+
|
|
63
|
+
# Python
|
|
64
|
+
__pycache__/
|
|
65
|
+
*.pyc
|
|
66
|
+
|
|
67
|
+
# Temporary Scripts
|
|
68
|
+
fetch_leads.js
|
|
69
|
+
fix_db.js
|
|
70
|
+
save_intelligence.js
|
|
71
|
+
tmp_*.js
|
|
72
|
+
scripts/inject-test-lead.ts
|
|
73
|
+
|
|
74
|
+
node_modules/
|
|
75
|
+
.env
|
|
76
|
+
.next
|
|
77
|
+
fix-*.js
|
|
78
|
+
|
|
79
|
+
# Virtual Environments
|
|
80
|
+
venv/
|
|
81
|
+
.venv/
|
|
82
|
+
env/
|
|
83
|
+
|
|
84
|
+
# Claude Code
|
|
85
|
+
.claude/
|
|
86
|
+
|
|
87
|
+
# Local Deployment & Platform
|
|
88
|
+
/agents-platform/
|
|
89
|
+
launch_agencia_ai.ps1
|
|
90
|
+
|
|
91
|
+
# Local Strategy Assets (optional)
|
|
92
|
+
# Strategy/
|
|
93
|
+
|
|
94
|
+
# Backups & Data
|
|
95
|
+
*.sql
|
|
96
|
+
*.tar.gz
|
|
97
|
+
*.zip
|
|
98
|
+
|
|
99
|
+
# Agent & Platform Tools
|
|
100
|
+
.mcp.json
|
|
101
|
+
.agent/
|
|
102
|
+
.agents/
|
|
103
|
+
.gemini/
|
|
104
|
+
.claude/
|
|
105
|
+
|
|
106
|
+
# Logs & Debug
|
|
107
|
+
*.log
|
|
108
|
+
npm-debug.log*
|
|
109
|
+
yarn-debug.log*
|
|
110
|
+
yarn-error.log*
|
|
111
|
+
.pnpm-debug.log*
|
finobs-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: finobs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: CLI tool to monitor, analyze and optimize LLM agents in production
|
|
5
|
+
Project-URL: Repository, https://github.com/fmonfasani/finobs
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: agents,cli,llm,monitoring,observability
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: anthropic>=0.25
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Requires-Dist: numpy>=1.26
|
|
12
|
+
Requires-Dist: rich>=13
|
|
13
|
+
Requires-Dist: typer[all]>=0.12
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: hatch; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
18
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
19
|
+
Provides-Extra: fastapi
|
|
20
|
+
Requires-Dist: fastapi>=0.110; extra == 'fastapi'
|
|
21
|
+
Requires-Dist: uvicorn[standard]; extra == 'fastapi'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# finobs — LLM Agent Observability Suite
|
|
25
|
+
|
|
26
|
+
Monitor, analyze and optimize LLM agents in production.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install finobs
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Commands
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
finobs audit run_123 # Full observability audit (5 agents in parallel)
|
|
36
|
+
finobs trace run_123 # Visualize execution trace
|
|
37
|
+
finobs loops run_123 # Detect loops and redundant steps
|
|
38
|
+
finobs cost run_123 # Cost breakdown by run
|
|
39
|
+
finobs cost tenant_abc # Cost breakdown by tenant
|
|
40
|
+
finobs performance run_123 # Latency analysis (p50/p95/p99)
|
|
41
|
+
finobs debug run_123 # Debug failed steps
|
|
42
|
+
finobs debug run_123 --llm # Debug with Claude semantic analysis (Phase 4)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Instrument your agent
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from finobs import traced, flush_trace
|
|
49
|
+
|
|
50
|
+
@traced("search_documents")
|
|
51
|
+
def search_documents(query: str) -> str:
|
|
52
|
+
...
|
|
53
|
+
|
|
54
|
+
@traced("llm_call")
|
|
55
|
+
def call_llm(prompt: str) -> str:
|
|
56
|
+
...
|
|
57
|
+
|
|
58
|
+
# At end of run:
|
|
59
|
+
flush_trace(run_id="run_123", tenant="my_tenant")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Generate test traces
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from finobs.scripts.trace_generator import generate_trace, Scenario
|
|
66
|
+
|
|
67
|
+
generate_trace(Scenario.LOOP) # loop detection
|
|
68
|
+
generate_trace(Scenario.LATENCY_SPIKE) # p95 spike
|
|
69
|
+
generate_trace(Scenario.TOKEN_WASTE) # cost waste
|
|
70
|
+
generate_trace(Scenario.MULTI_FAILURE) # combined issues
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Then audit:
|
|
74
|
+
```bash
|
|
75
|
+
finobs audit loop_xxxxxx
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Scoring
|
|
79
|
+
|
|
80
|
+
| Dimension | Weight |
|
|
81
|
+
|-------------|--------|
|
|
82
|
+
| Performance | 25% |
|
|
83
|
+
| Quality | 25% |
|
|
84
|
+
| Cost | 20% |
|
|
85
|
+
| Stability | 15% |
|
|
86
|
+
| RAG | 15% |
|
|
87
|
+
|
|
88
|
+
Bands: 🟢 85–100 HEALTHY · 🟡 65–84 DEGRADED · 🔴 0–64 CRITICAL
|
|
89
|
+
|
|
90
|
+
## Development
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
git clone https://github.com/fmonfasani/finobs
|
|
94
|
+
cd finobs
|
|
95
|
+
pip install -e ".[dev,fastapi]"
|
|
96
|
+
pytest tests/ -v
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Roadmap
|
|
100
|
+
|
|
101
|
+
- [x] Phase 1 — CLI + trace interceptor + 5 analysis agents
|
|
102
|
+
- [x] Phase 2 — Scoring system + report generation
|
|
103
|
+
- [x] Phase 3 — Deterministic scripts (loop detector, cost analyzer, metrics)
|
|
104
|
+
- [ ] Phase 4 — Claude LLM semantic analysis (`--llm` flag)
|
|
105
|
+
- [ ] Phase 5 — Self-healing + A/B prompt comparison + degradation detection
|
finobs-0.0.1/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# finobs — LLM Agent Observability Suite
|
|
2
|
+
|
|
3
|
+
Monitor, analyze and optimize LLM agents in production.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install finobs
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
finobs audit run_123 # Full observability audit (5 agents in parallel)
|
|
13
|
+
finobs trace run_123 # Visualize execution trace
|
|
14
|
+
finobs loops run_123 # Detect loops and redundant steps
|
|
15
|
+
finobs cost run_123 # Cost breakdown by run
|
|
16
|
+
finobs cost tenant_abc # Cost breakdown by tenant
|
|
17
|
+
finobs performance run_123 # Latency analysis (p50/p95/p99)
|
|
18
|
+
finobs debug run_123 # Debug failed steps
|
|
19
|
+
finobs debug run_123 --llm # Debug with Claude semantic analysis (Phase 4)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Instrument your agent
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from finobs import traced, flush_trace
|
|
26
|
+
|
|
27
|
+
@traced("search_documents")
|
|
28
|
+
def search_documents(query: str) -> str:
|
|
29
|
+
...
|
|
30
|
+
|
|
31
|
+
@traced("llm_call")
|
|
32
|
+
def call_llm(prompt: str) -> str:
|
|
33
|
+
...
|
|
34
|
+
|
|
35
|
+
# At end of run:
|
|
36
|
+
flush_trace(run_id="run_123", tenant="my_tenant")
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Generate test traces
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from finobs.scripts.trace_generator import generate_trace, Scenario
|
|
43
|
+
|
|
44
|
+
generate_trace(Scenario.LOOP) # loop detection
|
|
45
|
+
generate_trace(Scenario.LATENCY_SPIKE) # p95 spike
|
|
46
|
+
generate_trace(Scenario.TOKEN_WASTE) # cost waste
|
|
47
|
+
generate_trace(Scenario.MULTI_FAILURE) # combined issues
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Then audit:
|
|
51
|
+
```bash
|
|
52
|
+
finobs audit loop_xxxxxx
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Scoring
|
|
56
|
+
|
|
57
|
+
| Dimension | Weight |
|
|
58
|
+
|-------------|--------|
|
|
59
|
+
| Performance | 25% |
|
|
60
|
+
| Quality | 25% |
|
|
61
|
+
| Cost | 20% |
|
|
62
|
+
| Stability | 15% |
|
|
63
|
+
| RAG | 15% |
|
|
64
|
+
|
|
65
|
+
Bands: 🟢 85–100 HEALTHY · 🟡 65–84 DEGRADED · 🔴 0–64 CRITICAL
|
|
66
|
+
|
|
67
|
+
## Development
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
git clone https://github.com/fmonfasani/finobs
|
|
71
|
+
cd finobs
|
|
72
|
+
pip install -e ".[dev,fastapi]"
|
|
73
|
+
pytest tests/ -v
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Roadmap
|
|
77
|
+
|
|
78
|
+
- [x] Phase 1 — CLI + trace interceptor + 5 analysis agents
|
|
79
|
+
- [x] Phase 2 — Scoring system + report generation
|
|
80
|
+
- [x] Phase 3 — Deterministic scripts (loop detector, cost analyzer, metrics)
|
|
81
|
+
- [ ] Phase 4 — Claude LLM semantic analysis (`--llm` flag)
|
|
82
|
+
- [ ] Phase 5 — Self-healing + A/B prompt comparison + degradation detection
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sample FastAPI agent instrumented with finobs @traced decorator.
|
|
3
|
+
Run: uvicorn examples.sample_agent.main:app --reload --port 8001
|
|
4
|
+
"""
|
|
5
|
+
import random
import time
import uuid

from fastapi import FastAPI
from pydantic import BaseModel

from finobs import traced, flush_trace
from finobs.scripts.trace_interceptor import set_run_id
|
|
12
|
+
|
|
13
|
+
# Module-level ASGI application; the route handlers below attach to it.
app = FastAPI(title="Sample LLM Agent")


class TaskRequest(BaseModel):
    """Request payload for POST /invoke."""

    # Free-form task description forwarded to the simulated agent pipeline.
    task: str
    # Tenant identifier recorded when the trace is flushed.
    tenant: str = "default"


class TaskResponse(BaseModel):
    """Response payload for POST /invoke."""

    # Short (8-char) id generated per request; keys the flushed trace.
    run_id: str
    # Final (simulated) LLM answer.
    result: str
    # Path returned by flush_trace() — where the trace was written.
    trace_path: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@traced("search_documents")
def search_documents(query: str) -> str:
    """Simulate a RAG retrieval tool.

    Sleeps 0.1–0.5 s to mimic retrieval latency, then returns a canned
    result string mentioning *query*.
    """
    # `time` is imported at module level; the original re-imported it on
    # every call and jammed two statements onto one line.
    time.sleep(random.uniform(0.1, 0.5))
    return f"Found 3 documents relevant to: {query}"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@traced("llm_call")
def llm_call(prompt: str) -> str:
    """Simulate an LLM call with 0.3–1.2 s of latency.

    Returns a canned response echoing the first 50 chars of *prompt*.
    """
    # Module-level `time` import replaces the per-call `import time; ...`.
    time.sleep(random.uniform(0.3, 1.2))
    return f"LLM response for: {prompt[:50]}"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@traced("validate_output")
def validate_output(output: str) -> str:
    """Simulate output validation; fails randomly ~10% of the time.

    Raises:
        ValueError: with probability 0.1, so the trace interceptor has
            an error step to capture.
    """
    time.sleep(0.05)
    if random.random() < 0.1:
        raise ValueError("Validation failed")
    return "valid"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@traced("write_file")
def write_file(content: str) -> str:
    """Simulate persisting *content*; returns a short summary string."""
    time.sleep(0.05)
    return f"Written {len(content)} chars"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@app.post("/invoke", response_model=TaskResponse)
async def invoke(request: TaskRequest):
    """Run the demo agent pipeline once and flush its trace to disk.

    The pipeline deliberately repeats tool calls so that `finobs audit`
    has loops, latency and (occasional) validation errors to detect.
    """
    # Short run id; set_run_id routes subsequent @traced calls to this run.
    run_id = str(uuid.uuid4())[:8]
    set_run_id(run_id)

    # Simulate agent execution
    docs = search_documents(request.task)
    response = llm_call(f"Given: {docs}. Task: {request.task}")
    search_documents(request.task)  # Intentional loop for demo
    llm_call(f"Refine: {response}")
    search_documents(request.task)  # Loop again
    try:
        validate_output(response)
    except ValueError:
        pass  # captured in trace as error
    write_file(response)

    # Persist the collected steps for later CLI analysis.
    trace_path = flush_trace(run_id=run_id, tenant=request.tenant)

    return TaskResponse(
        run_id=run_id,
        result=response,
        trace_path=trace_path,
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@app.get("/health")
def health():
    """Liveness probe; always reports the service as up."""
    return {"status": "ok"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from finobs.scripts.trace_parser import Trace
|
|
3
|
+
from finobs.scripts.loop_detector import detect_loops
|
|
4
|
+
from finobs.scripts.scoring import score_stability
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class AgentResult:
    """Uniform result envelope returned by every analysis agent."""

    agent: str
    score: int
    summary: str
    issues: list[dict] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)


async def analyze(trace: Trace) -> AgentResult:
    """Score behavioral stability: loops, excessive steps, error rate."""
    trace_steps = trace.steps
    step_count = len(trace_steps)
    found_loops = detect_loops(trace_steps)
    failures = sum(1 for step in trace_steps if step.status == "error")
    err_rate = failures / max(step_count, 1)
    stability = score_stability(len(found_loops), err_rate, step_count)

    # One high-severity issue per detected loop.
    found_issues = [
        {
            "severity": "high",
            "description": (
                f"Loop detected: tool='{lp['tool']}' "
                f"repeated {lp['occurrences']}x at steps {lp['step_indices']}"
            ),
            "recommendation": "Add loop guard — check result before re-invoking same tool with same input",
        }
        for lp in found_loops
    ]

    if step_count > 40:
        found_issues.append({
            "severity": "medium",
            "description": f"Excessive step count: {step_count} steps",
            "recommendation": "Review planner — may be generating redundant sub-tasks",
        })

    return AgentResult(
        agent="behavior",
        score=stability,
        summary=f"{len(found_loops)} loops" if found_loops else "no loops",
        issues=found_issues,
        metrics={
            "total_steps": step_count,
            "loops_detected": len(found_loops),
            "loop_detail": found_loops,
            "error_rate": round(err_rate, 3),
        },
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from finobs.scripts.trace_parser import Trace
|
|
3
|
+
from finobs.scripts.cost_analyzer import compute_cost
|
|
4
|
+
from finobs.scripts.scoring import score_cost
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class AgentResult:
    """Uniform result envelope returned by every analysis agent."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str
    score: int
    summary: str
    issues: list[dict] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)


async def analyze(trace: Trace) -> AgentResult:
    """Score run cost against budget and flag token waste."""
    # compute_cost returns a dict; this module relies on at least the
    # "total_usd" (float) and "waste_steps" (list) keys.
    result = compute_cost(trace.steps)
    score = score_cost(result["total_usd"], len(result["waste_steps"]))

    issues = []
    # Local import mirrors the other agent modules — presumably to avoid
    # an import cycle with the scoring module; TODO confirm.
    from finobs.scripts.scoring import THRESHOLDS
    if result["total_usd"] > THRESHOLDS["cost_per_run_usd"]:
        issues.append({
            "severity": "high",
            "description": f"Run cost ${result['total_usd']:.4f} exceeds budget (${THRESHOLDS['cost_per_run_usd']})",
            "recommendation": "Consider using a cheaper model for non-critical steps",
        })
    if result["waste_steps"]:
        issues.append({
            "severity": "medium",
            "description": f"{len(result['waste_steps'])} steps with high input/output token ratio",
            "recommendation": "Trim context window — avoid re-injecting full history each step",
        })

    summary = f"${result['total_usd']:.4f}"
    return AgentResult(agent="cost", score=score, summary=summary, issues=issues, metrics=result)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from finobs.scripts.trace_parser import Trace
|
|
3
|
+
from finobs.scripts.metrics_aggregator import compute_latency_metrics
|
|
4
|
+
from finobs.scripts.scoring import score_latency
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class AgentResult:
    """Uniform result envelope returned by every analysis agent."""

    agent: str
    score: int
    summary: str
    issues: list[dict] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)


async def analyze(trace: Trace) -> AgentResult:
    """Score latency (p95/p99) and error rate for a single run."""
    # compute_latency_metrics returns a dict; the "p95"/"p99" keys are
    # used here — milliseconds, judging by the thresholds below.
    metrics = compute_latency_metrics(trace.steps)
    error_rate = sum(1 for s in trace.steps if s.status == "error") / max(len(trace.steps), 1)

    score = score_latency(metrics["p95"], metrics["p99"], error_rate)
    issues = []

    # Local import mirrors the other agent modules; TODO: hoist if no cycle.
    from finobs.scripts.scoring import THRESHOLDS
    if metrics["p95"] > THRESHOLDS["latency_p95_ms"]:
        issues.append({
            "severity": "high",
            "description": f"p95 latency {metrics['p95']:.0f}ms exceeds threshold ({THRESHOLDS['latency_p95_ms']}ms)",
            "recommendation": "Profile slowest steps and consider caching or parallelization",
        })

    if error_rate > THRESHOLDS["error_rate_max"]:
        issues.append({
            "severity": "high",
            "description": f"Error rate {error_rate:.1%} exceeds threshold",
            "recommendation": "Review failed steps and add retry logic",
        })

    summary = f"p95: {metrics['p95']:.0f}ms"
    return AgentResult(agent="latency", score=score, summary=summary, issues=issues, metrics=metrics)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from finobs.scripts.trace_parser import Trace
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class AgentResult:
    """Uniform result envelope returned by every analysis agent."""

    agent: str
    score: int
    summary: str
    issues: list[dict] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)


async def analyze(trace: Trace) -> AgentResult:
    """Score output quality from the error rate and near-empty outputs."""
    all_steps = trace.steps
    step_count = len(all_steps)
    failed = [step for step in all_steps if step.status == "error"]
    failure_rate = len(failed) / max(step_count, 1)

    # Steps with very low output tokens are a proxy for empty responses.
    sparse = [step for step in all_steps if step.output_tokens < 10]
    sparse_rate = len(sparse) / max(step_count, 1)

    # Start from a perfect score and subtract weighted penalties,
    # clamping at zero.
    quality_score = max(
        0,
        100 - int(failure_rate * 100 * 1.5) - int(sparse_rate * 100 * 0.8),
    )

    found = []
    if failure_rate > 0.05:
        found.append({
            "severity": "high",
            "description": f"High failure rate: {failure_rate:.1%} of steps failed",
            "recommendation": "Add input validation before tool calls",
        })
    if sparse_rate > 0.10:
        found.append({
            "severity": "medium",
            "description": f"{len(sparse)} steps produced near-empty outputs",
            "recommendation": "Check prompt clarity and context sufficiency",
        })

    return AgentResult(
        agent="quality",
        score=quality_score,
        summary=f"error_rate: {failure_rate:.1%}",
        issues=found,
        metrics={
            "total_steps": step_count,
            "error_count": len(failed),
            "error_rate": round(failure_rate, 3),
            "low_output_steps": len(sparse),
        },
    )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from finobs.scripts.trace_parser import Trace
|
|
3
|
+
from finobs.scripts.scoring import score_rag
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class AgentResult:
    """Uniform result envelope returned by every analysis agent."""

    agent: str
    score: int
    summary: str
    issues: list[dict] = field(default_factory=list)
    metrics: dict = field(default_factory=dict)


async def analyze(trace: Trace) -> AgentResult:
    """Score RAG retrieval quality using token-ratio heuristics."""
    # Retrieval steps are identified by tool-name substring match.
    rag_steps = [s for s in trace.steps if "rag" in s.tool.lower() or "retrieve" in s.tool.lower()]

    if not rag_steps:
        # Nothing to evaluate — a run without retrieval is not penalized.
        return AgentResult(
            agent="rag",
            score=100,
            summary="no RAG steps",
            metrics={"rag_steps": 0},
        )

    # Relevance proxy: output/input token ratio (low = retrieved but unused).
    relevance_scores = []
    for s in rag_steps:
        ratio = s.output_tokens / max(s.input_tokens, 1)
        # Capped at 1.0; the x2 factor treats a 0.5 ratio as fully relevant.
        relevance_scores.append(min(1.0, ratio * 2))

    avg_relevance = sum(relevance_scores) / len(relevance_scores)
    # Retrievals returning < 15 output tokens count as near-empty.
    empty_steps = [s for s in rag_steps if s.output_tokens < 15]
    empty_rate = len(empty_steps) / len(rag_steps)

    score = score_rag(avg_relevance, empty_rate)
    issues = []

    # Local import mirrors the other agent modules.
    from finobs.scripts.scoring import THRESHOLDS
    if avg_relevance < THRESHOLDS["rag_relevance_min"]:
        issues.append({
            "severity": "high",
            "description": f"Low RAG utilization: avg relevance proxy {avg_relevance:.2f}",
            "recommendation": "Review chunking strategy and embedding model",
        })
    if empty_rate > 0.2:
        issues.append({
            "severity": "medium",
            "description": f"{len(empty_steps)}/{len(rag_steps)} retrievals returned near-empty results",
            "recommendation": "Check retrieval thresholds and fallback strategy",
        })

    metrics = {
        "rag_steps": len(rag_steps),
        "avg_relevance_proxy": round(avg_relevance, 3),
        "empty_retrieval_rate": round(empty_rate, 3),
    }

    summary = f"relevance: {avg_relevance:.2f}"
    return AgentResult(agent="rag", score=score, summary=summary, issues=issues, metrics=metrics)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""finobs CLI entry point: registers all sub-commands on a root Typer app."""
import typer
from finobs.commands import audit, trace, loops, cost, performance, debug

# Root Typer application; each sub-command lives in finobs.commands.*.
app = typer.Typer(
    name="finobs",
    help="finobs — LLM Agent Observability Suite",
    no_args_is_help=True,
    rich_markup_mode="rich",
)

app.add_typer(audit.app, name="audit", help="Full observability audit of an agent run")
app.add_typer(trace.app, name="trace", help="Reconstruct and visualize execution trace")
app.add_typer(loops.app, name="loops", help="Detect loops and redundant steps")
app.add_typer(cost.app, name="cost", help="Cost analysis by run or tenant")
app.add_typer(performance.app, name="performance", help="Latency and throughput analysis")
app.add_typer(debug.app, name="debug", help="Deep debug of a failed run")
# NOTE(review): the finobs.commands package also ships compare/watch/optimize
# modules that are not registered here — confirm whether that is intentional.

if __name__ == "__main__":
    app()
|
|
File without changes
|