finobs 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. finobs-0.0.1/.gitignore +111 -0
  2. finobs-0.0.1/PKG-INFO +105 -0
  3. finobs-0.0.1/README.md +82 -0
  4. finobs-0.0.1/examples/sample_agent/main.py +83 -0
  5. finobs-0.0.1/finobs/__init__.py +7 -0
  6. finobs-0.0.1/finobs/agents/__init__.py +3 -0
  7. finobs-0.0.1/finobs/agents/behavior.py +48 -0
  8. finobs-0.0.1/finobs/agents/cost.py +36 -0
  9. finobs-0.0.1/finobs/agents/latency.py +39 -0
  10. finobs-0.0.1/finobs/agents/quality.py +51 -0
  11. finobs-0.0.1/finobs/agents/rag.py +60 -0
  12. finobs-0.0.1/finobs/cli.py +19 -0
  13. finobs-0.0.1/finobs/commands/__init__.py +0 -0
  14. finobs-0.0.1/finobs/commands/audit.py +175 -0
  15. finobs-0.0.1/finobs/commands/compare.py +67 -0
  16. finobs-0.0.1/finobs/commands/cost.py +55 -0
  17. finobs-0.0.1/finobs/commands/debug.py +63 -0
  18. finobs-0.0.1/finobs/commands/loops.py +30 -0
  19. finobs-0.0.1/finobs/commands/optimize.py +207 -0
  20. finobs-0.0.1/finobs/commands/performance.py +27 -0
  21. finobs-0.0.1/finobs/commands/trace.py +25 -0
  22. finobs-0.0.1/finobs/commands/watch.py +61 -0
  23. finobs-0.0.1/finobs/config/__init__.py +0 -0
  24. finobs-0.0.1/finobs/config/pricing.json +22 -0
  25. finobs-0.0.1/finobs/scripts/__init__.py +0 -0
  26. finobs-0.0.1/finobs/scripts/cost_analyzer.py +43 -0
  27. finobs-0.0.1/finobs/scripts/degradation_detector.py +146 -0
  28. finobs-0.0.1/finobs/scripts/diff_engine.py +93 -0
  29. finobs-0.0.1/finobs/scripts/llm_analyzer.py +186 -0
  30. finobs-0.0.1/finobs/scripts/loop_detector.py +21 -0
  31. finobs-0.0.1/finobs/scripts/metrics_aggregator.py +20 -0
  32. finobs-0.0.1/finobs/scripts/optimizer.py +216 -0
  33. finobs-0.0.1/finobs/scripts/patcher.py +432 -0
  34. finobs-0.0.1/finobs/scripts/report_generator.py +113 -0
  35. finobs-0.0.1/finobs/scripts/scoring.py +54 -0
  36. finobs-0.0.1/finobs/scripts/trace_generator.py +115 -0
  37. finobs-0.0.1/finobs/scripts/trace_interceptor.py +82 -0
  38. finobs-0.0.1/finobs/scripts/trace_parser.py +80 -0
  39. finobs-0.0.1/pyproject.toml +36 -0
  40. finobs-0.0.1/tests/test_core.py +127 -0
@@ -0,0 +1,111 @@
1
+ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2
+
3
+ # dependencies
4
+ /node_modules
5
+ /.pnp
6
+ .pnp.*
7
+ .yarn/*
8
+ !.yarn/patches
9
+ !.yarn/plugins
10
+ !.yarn/releases
11
+ !.yarn/versions
12
+
13
+ # testing
14
+ /coverage
15
+
16
+ # next.js
17
+ /.next/
18
+ /out/
19
+
20
+ # production
21
+ /build
22
+
23
+ # misc
24
+ .DS_Store
25
+ *.pem
26
+
27
+ # debug
28
+ npm-debug.log*
29
+ yarn-debug.log*
30
+ yarn-error.log*
31
+ .pnpm-debug.log*
32
+
33
+ # env files (can opt-in for committing if needed)
34
+ .env*
35
+
36
+ # vercel
37
+ .vercel
38
+
39
+ # typescript
40
+ *.tsbuildinfo
41
+ next-env.d.ts
42
+
43
+ /src/generated/prisma
44
+
45
+ .vercel
46
+
47
+ # Documentation
48
+ /docs/
49
+ # SQLite databases
50
+ *.db
51
+ /intelligence_engine/data/
52
+ /intelligence_engine/**/data/
53
+
54
+ # Agent & Tools Clutter
55
+ .agents/
56
+ .playwright-mcp/
57
+ playwright-report/
58
+ test-results/
59
+ .gemini/
60
+ *.log
61
+ *.resolved
62
+
63
+ # Python
64
+ __pycache__/
65
+ *.pyc
66
+
67
+ # Temporary Scripts
68
+ fetch_leads.js
69
+ fix_db.js
70
+ save_intelligence.js
71
+ tmp_*.js
72
+ scripts/inject-test-lead.ts
73
+
74
+ node_modules/
75
+ .env
76
+ .next
77
+ fix-*.js
78
+
79
+ # Virtual Environments
80
+ venv/
81
+ .venv/
82
+ env/
83
+
84
+ # Claude Code
85
+ .claude/
86
+
87
+ # Local Deployment & Platform
88
+ /agents-platform/
89
+ launch_agencia_ai.ps1
90
+
91
+ # Local Strategy Assets (optional)
92
+ # Strategy/
93
+
94
+ # Backups & Data
95
+ *.sql
96
+ *.tar.gz
97
+ *.zip
98
+
99
+ # Agent & Platform Tools
100
+ .mcp.json
101
+ .agent/
102
+ .agents/
103
+ .gemini/
104
+ .claude/
105
+
106
+ # Logs & Debug
107
+ *.log
108
+ npm-debug.log*
109
+ yarn-debug.log*
110
+ yarn-error.log*
111
+ .pnpm-debug.log*
finobs-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: finobs
3
+ Version: 0.0.1
4
+ Summary: CLI tool to monitor, analyze and optimize LLM agents in production
5
+ Project-URL: Repository, https://github.com/fmonfasani/finobs
6
+ License: MIT
7
+ Keywords: agents,cli,llm,monitoring,observability
8
+ Requires-Python: >=3.11
9
+ Requires-Dist: anthropic>=0.25
10
+ Requires-Dist: httpx>=0.27
11
+ Requires-Dist: numpy>=1.26
12
+ Requires-Dist: rich>=13
13
+ Requires-Dist: typer[all]>=0.12
14
+ Provides-Extra: dev
15
+ Requires-Dist: hatch; extra == 'dev'
16
+ Requires-Dist: pytest; extra == 'dev'
17
+ Requires-Dist: pytest-asyncio; extra == 'dev'
18
+ Requires-Dist: ruff; extra == 'dev'
19
+ Provides-Extra: fastapi
20
+ Requires-Dist: fastapi>=0.110; extra == 'fastapi'
21
+ Requires-Dist: uvicorn[standard]; extra == 'fastapi'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # finobs — LLM Agent Observability Suite
25
+
26
+ Monitor, analyze and optimize LLM agents in production.
27
+
28
+ ```bash
29
+ pip install finobs
30
+ ```
31
+
32
+ ## Commands
33
+
34
+ ```bash
35
+ finobs audit run_123 # Full observability audit (5 agents in parallel)
36
+ finobs trace run_123 # Visualize execution trace
37
+ finobs loops run_123 # Detect loops and redundant steps
38
+ finobs cost run_123 # Cost breakdown by run
39
+ finobs cost tenant_abc # Cost breakdown by tenant
40
+ finobs performance run_123 # Latency analysis (p50/p95/p99)
41
+ finobs debug run_123 # Debug failed steps
42
+ finobs debug run_123 --llm # Debug with Claude semantic analysis (Phase 4)
43
+ ```
44
+
45
+ ## Instrument your agent
46
+
47
+ ```python
48
+ from finobs import traced, flush_trace
49
+
50
+ @traced("search_documents")
51
+ def search_documents(query: str) -> str:
52
+ ...
53
+
54
+ @traced("llm_call")
55
+ def call_llm(prompt: str) -> str:
56
+ ...
57
+
58
+ # At end of run:
59
+ flush_trace(run_id="run_123", tenant="my_tenant")
60
+ ```
61
+
62
+ ## Generate test traces
63
+
64
+ ```python
65
+ from finobs.scripts.trace_generator import generate_trace, Scenario
66
+
67
+ generate_trace(Scenario.LOOP) # loop detection
68
+ generate_trace(Scenario.LATENCY_SPIKE) # p95 spike
69
+ generate_trace(Scenario.TOKEN_WASTE) # cost waste
70
+ generate_trace(Scenario.MULTI_FAILURE) # combined issues
71
+ ```
72
+
73
+ Then audit:
74
+ ```bash
75
+ finobs audit loop_xxxxxx
76
+ ```
77
+
78
+ ## Scoring
79
+
80
+ | Dimension | Weight |
81
+ |-------------|--------|
82
+ | Performance | 25% |
83
+ | Quality | 25% |
84
+ | Cost | 20% |
85
+ | Stability | 15% |
86
+ | RAG | 15% |
87
+
88
+ Bands: 🟢 85–100 HEALTHY · 🟡 65–84 DEGRADED · 🔴 0–64 CRITICAL
89
+
90
+ ## Development
91
+
92
+ ```bash
93
+ git clone https://github.com/fmonfasani/finobs
94
+ cd finobs
95
+ pip install -e ".[dev,fastapi]"
96
+ pytest tests/ -v
97
+ ```
98
+
99
+ ## Roadmap
100
+
101
+ - [x] Phase 1 — CLI + trace interceptor + 5 analysis agents
102
+ - [x] Phase 2 — Scoring system + report generation
103
+ - [x] Phase 3 — Deterministic scripts (loop detector, cost analyzer, metrics)
104
+ - [ ] Phase 4 — Claude LLM semantic analysis (`--llm` flag)
105
+ - [ ] Phase 5 — Self-healing + A/B prompt comparison + degradation detection
finobs-0.0.1/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # finobs — LLM Agent Observability Suite
2
+
3
+ Monitor, analyze and optimize LLM agents in production.
4
+
5
+ ```bash
6
+ pip install finobs
7
+ ```
8
+
9
+ ## Commands
10
+
11
+ ```bash
12
+ finobs audit run_123 # Full observability audit (5 agents in parallel)
13
+ finobs trace run_123 # Visualize execution trace
14
+ finobs loops run_123 # Detect loops and redundant steps
15
+ finobs cost run_123 # Cost breakdown by run
16
+ finobs cost tenant_abc # Cost breakdown by tenant
17
+ finobs performance run_123 # Latency analysis (p50/p95/p99)
18
+ finobs debug run_123 # Debug failed steps
19
+ finobs debug run_123 --llm # Debug with Claude semantic analysis (Phase 4)
20
+ ```
21
+
22
+ ## Instrument your agent
23
+
24
+ ```python
25
+ from finobs import traced, flush_trace
26
+
27
+ @traced("search_documents")
28
+ def search_documents(query: str) -> str:
29
+ ...
30
+
31
+ @traced("llm_call")
32
+ def call_llm(prompt: str) -> str:
33
+ ...
34
+
35
+ # At end of run:
36
+ flush_trace(run_id="run_123", tenant="my_tenant")
37
+ ```
38
+
39
+ ## Generate test traces
40
+
41
+ ```python
42
+ from finobs.scripts.trace_generator import generate_trace, Scenario
43
+
44
+ generate_trace(Scenario.LOOP) # loop detection
45
+ generate_trace(Scenario.LATENCY_SPIKE) # p95 spike
46
+ generate_trace(Scenario.TOKEN_WASTE) # cost waste
47
+ generate_trace(Scenario.MULTI_FAILURE) # combined issues
48
+ ```
49
+
50
+ Then audit:
51
+ ```bash
52
+ finobs audit loop_xxxxxx
53
+ ```
54
+
55
+ ## Scoring
56
+
57
+ | Dimension | Weight |
58
+ |-------------|--------|
59
+ | Performance | 25% |
60
+ | Quality | 25% |
61
+ | Cost | 20% |
62
+ | Stability | 15% |
63
+ | RAG | 15% |
64
+
65
+ Bands: 🟢 85–100 HEALTHY · 🟡 65–84 DEGRADED · 🔴 0–64 CRITICAL
66
+
67
+ ## Development
68
+
69
+ ```bash
70
+ git clone https://github.com/fmonfasani/finobs
71
+ cd finobs
72
+ pip install -e ".[dev,fastapi]"
73
+ pytest tests/ -v
74
+ ```
75
+
76
+ ## Roadmap
77
+
78
+ - [x] Phase 1 — CLI + trace interceptor + 5 analysis agents
79
+ - [x] Phase 2 — Scoring system + report generation
80
+ - [x] Phase 3 — Deterministic scripts (loop detector, cost analyzer, metrics)
81
+ - [ ] Phase 4 — Claude LLM semantic analysis (`--llm` flag)
82
+ - [ ] Phase 5 — Self-healing + A/B prompt comparison + degradation detection
@@ -0,0 +1,83 @@
1
+ """
2
+ Sample FastAPI agent instrumented with finobs @traced decorator.
3
+ Run: uvicorn examples.sample_agent.main:app --reload --port 8001
4
+ """
5
import random
import time
import uuid

from fastapi import FastAPI
from pydantic import BaseModel

from finobs import traced, flush_trace
from finobs.scripts.trace_interceptor import set_run_id
12
+
13
+ app = FastAPI(title="Sample LLM Agent")
14
+
15
+
16
class TaskRequest(BaseModel):
    """Request body for the /invoke endpoint."""

    # Natural-language task passed through to the simulated pipeline.
    task: str
    # Tenant label recorded in the flushed trace.
    tenant: str = "default"
19
+
20
+
21
class TaskResponse(BaseModel):
    """Response body for the /invoke endpoint."""

    # 8-char run identifier generated per request.
    run_id: str
    # Final LLM answer produced by the pipeline.
    result: str
    # Path of the trace file written by flush_trace.
    trace_path: str
25
+
26
+
27
@traced("search_documents")
def search_documents(query: str) -> str:
    """Simulate a RAG retrieval step with 100-500 ms of latency.

    The original re-ran `import time` on every call in a semicolon-joined
    one-liner; `time` is now imported once at module level.
    """
    time.sleep(random.uniform(0.1, 0.5))
    return f"Found 3 documents relevant to: {query}"
32
+
33
+
34
@traced("llm_call")
def llm_call(prompt: str) -> str:
    """Simulate an LLM call with 300-1200 ms of latency.

    Uses the module-level `time` import instead of the original
    per-call `import time` one-liner.
    """
    time.sleep(random.uniform(0.3, 1.2))
    return f"LLM response for: {prompt[:50]}"
39
+
40
+
41
@traced("validate_output")
def validate_output(output: str) -> str:
    """Simulate output validation; fails randomly ~10% of the time.

    Returns:
        The literal string "valid" on success.

    Raises:
        ValueError: on the simulated 10% failure path (captured in the
            trace by the @traced decorator).
    """
    time.sleep(0.05)
    if random.random() < 0.1:
        raise ValueError("Validation failed")
    return "valid"
47
+
48
+
49
@traced("write_file")
def write_file(content: str) -> str:
    """Simulate persisting the final output; returns a size summary."""
    time.sleep(0.05)
    return f"Written {len(content)} chars"
53
+
54
+
55
@app.post("/invoke", response_model=TaskResponse)
async def invoke(request: TaskRequest):
    """Run the demo agent pipeline and flush its finobs trace.

    The pipeline deliberately re-calls search_documents so the behavior
    agent has a loop to detect when the trace is audited.
    """
    run_id = str(uuid.uuid4())[:8]
    set_run_id(run_id)

    documents = search_documents(request.task)
    answer = llm_call(f"Given: {documents}. Task: {request.task}")
    search_documents(request.task)  # intentional repeat for the loop-detection demo
    llm_call(f"Refine: {answer}")
    search_documents(request.task)  # repeated once more
    try:
        validate_output(answer)
    except ValueError:
        pass  # the failure is still captured in the trace as an error step

    write_file(answer)
    trace_file = flush_trace(run_id=run_id, tenant=request.tenant)

    return TaskResponse(
        run_id=run_id,
        result=answer,
        trace_path=trace_file,
    )
79
+
80
+
81
@app.get("/health")
def health():
    """Liveness probe endpoint."""
    return dict(status="ok")
@@ -0,0 +1,7 @@
1
"""finobs — LLM Agent Observability Suite"""

# Fixed: was "0.1.0", contradicting the packaged metadata (Version: 0.0.1).
# Keep in sync with pyproject.toml.
__version__ = "0.0.1"

from finobs.scripts.trace_interceptor import traced, flush_trace

__all__ = ["traced", "flush_trace"]
@@ -0,0 +1,3 @@
1
+ from finobs.agents import latency, quality, cost, behavior, rag
2
+
3
+ __all__ = ["latency", "quality", "cost", "behavior", "rag"]
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass, field
2
+ from finobs.scripts.trace_parser import Trace
3
+ from finobs.scripts.loop_detector import detect_loops
4
+ from finobs.scripts.scoring import score_stability
5
+
6
+
7
@dataclass
class AgentResult:
    """Result of one analysis agent run over a single trace."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str  # agent name, e.g. "behavior"
    score: int  # 0-100 health score for this dimension
    summary: str  # one-line human-readable summary
    issues: list[dict] = field(default_factory=list)  # severity/description/recommendation dicts
    metrics: dict = field(default_factory=dict)  # raw metrics backing the score
14
+
15
+
16
async def analyze(trace: Trace) -> AgentResult:
    """Analyze agent behavior: repeated-tool loops and runaway step counts.

    Each detected loop becomes a high-severity issue; runs longer than 40
    steps additionally get a medium-severity planner warning. The score
    comes from score_stability.
    """
    step_list = trace.steps
    step_total = len(step_list)
    detected = detect_loops(step_list)
    failed = sum(1 for step in step_list if step.status == "error")
    err_rate = failed / max(step_total, 1)
    stability = score_stability(len(detected), err_rate, step_total)

    found_issues = [
        {
            "severity": "high",
            "description": (
                f"Loop detected: tool='{entry['tool']}' "
                f"repeated {entry['occurrences']}x at steps {entry['step_indices']}"
            ),
            "recommendation": "Add loop guard — check result before re-invoking same tool with same input",
        }
        for entry in detected
    ]
    if step_total > 40:
        found_issues.append({
            "severity": "medium",
            "description": f"Excessive step count: {step_total} steps",
            "recommendation": "Review planner — may be generating redundant sub-tasks",
        })

    return AgentResult(
        agent="behavior",
        score=stability,
        summary=f"{len(detected)} loops" if detected else "no loops",
        issues=found_issues,
        metrics={
            "total_steps": step_total,
            "loops_detected": len(detected),
            "loop_detail": detected,
            "error_rate": round(err_rate, 3),
        },
    )
@@ -0,0 +1,36 @@
1
+ from dataclasses import dataclass, field
2
+ from finobs.scripts.trace_parser import Trace
3
+ from finobs.scripts.cost_analyzer import compute_cost
4
+ from finobs.scripts.scoring import score_cost
5
+
6
+
7
@dataclass
class AgentResult:
    """Result of one analysis agent run over a single trace."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str  # agent name, e.g. "cost"
    score: int  # 0-100 health score for this dimension
    summary: str  # one-line human-readable summary
    issues: list[dict] = field(default_factory=list)  # severity/description/recommendation dicts
    metrics: dict = field(default_factory=dict)  # raw metrics backing the score
14
+
15
+
16
async def analyze(trace: Trace) -> AgentResult:
    """Analyze run cost: total spend vs. budget and token-waste hotspots."""
    from finobs.scripts.scoring import THRESHOLDS

    breakdown = compute_cost(trace.steps)
    total_usd = breakdown["total_usd"]
    waste = breakdown["waste_steps"]

    flagged = []
    if total_usd > THRESHOLDS["cost_per_run_usd"]:
        flagged.append({
            "severity": "high",
            "description": f"Run cost ${total_usd:.4f} exceeds budget (${THRESHOLDS['cost_per_run_usd']})",
            "recommendation": "Consider using a cheaper model for non-critical steps",
        })
    if waste:
        flagged.append({
            "severity": "medium",
            "description": f"{len(waste)} steps with high input/output token ratio",
            "recommendation": "Trim context window — avoid re-injecting full history each step",
        })

    return AgentResult(
        agent="cost",
        score=score_cost(total_usd, len(waste)),
        summary=f"${total_usd:.4f}",
        issues=flagged,
        metrics=breakdown,
    )
@@ -0,0 +1,39 @@
1
+ from dataclasses import dataclass, field
2
+ from finobs.scripts.trace_parser import Trace
3
+ from finobs.scripts.metrics_aggregator import compute_latency_metrics
4
+ from finobs.scripts.scoring import score_latency
5
+
6
+
7
@dataclass
class AgentResult:
    """Result of one analysis agent run over a single trace."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str  # agent name, e.g. "latency"
    score: int  # 0-100 health score for this dimension
    summary: str  # one-line human-readable summary
    issues: list[dict] = field(default_factory=list)  # severity/description/recommendation dicts
    metrics: dict = field(default_factory=dict)  # raw metrics backing the score
14
+
15
+
16
async def analyze(trace: Trace) -> AgentResult:
    """Analyze latency: p95/p99 percentiles plus overall error rate."""
    from finobs.scripts.scoring import THRESHOLDS

    stats = compute_latency_metrics(trace.steps)
    failed = sum(1 for step in trace.steps if step.status == "error")
    err_rate = failed / max(len(trace.steps), 1)

    flagged = []
    if stats["p95"] > THRESHOLDS["latency_p95_ms"]:
        flagged.append({
            "severity": "high",
            "description": f"p95 latency {stats['p95']:.0f}ms exceeds threshold ({THRESHOLDS['latency_p95_ms']}ms)",
            "recommendation": "Profile slowest steps and consider caching or parallelization",
        })
    if err_rate > THRESHOLDS["error_rate_max"]:
        flagged.append({
            "severity": "high",
            "description": f"Error rate {err_rate:.1%} exceeds threshold",
            "recommendation": "Review failed steps and add retry logic",
        })

    return AgentResult(
        agent="latency",
        score=score_latency(stats["p95"], stats["p99"], err_rate),
        summary=f"p95: {stats['p95']:.0f}ms",
        issues=flagged,
        metrics=stats,
    )
@@ -0,0 +1,51 @@
1
+ from dataclasses import dataclass, field
2
+ from finobs.scripts.trace_parser import Trace
3
+
4
+
5
@dataclass
class AgentResult:
    """Result of one analysis agent run over a single trace."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str  # agent name, e.g. "quality"
    score: int  # 0-100 health score for this dimension
    summary: str  # one-line human-readable summary
    issues: list[dict] = field(default_factory=list)  # severity/description/recommendation dicts
    metrics: dict = field(default_factory=dict)  # raw metrics backing the score
12
+
13
+
14
async def analyze(trace: Trace) -> AgentResult:
    """Analyze output quality: failed steps and near-empty responses."""
    all_steps = trace.steps
    count = len(all_steps)
    failed = [step for step in all_steps if step.status == "error"]
    err_rate = len(failed) / max(count, 1)

    # Steps with very low output are treated as likely empty responses.
    sparse = [step for step in all_steps if step.output_tokens < 10]
    sparse_rate = len(sparse) / max(count, 1)

    # Start from a perfect score and subtract weighted penalties
    # (errors weigh 1.5x, near-empty outputs 0.8x), clamped at zero.
    penalty = int(err_rate * 100 * 1.5) + int(sparse_rate * 100 * 0.8)
    quality_score = max(0, 100 - penalty)

    flagged = []
    if err_rate > 0.05:
        flagged.append({
            "severity": "high",
            "description": f"High failure rate: {err_rate:.1%} of steps failed",
            "recommendation": "Add input validation before tool calls",
        })
    if sparse_rate > 0.10:
        flagged.append({
            "severity": "medium",
            "description": f"{len(sparse)} steps produced near-empty outputs",
            "recommendation": "Check prompt clarity and context sufficiency",
        })

    return AgentResult(
        agent="quality",
        score=quality_score,
        summary=f"error_rate: {err_rate:.1%}",
        issues=flagged,
        metrics={
            "total_steps": count,
            "error_count": len(failed),
            "error_rate": round(err_rate, 3),
            "low_output_steps": len(sparse),
        },
    )
@@ -0,0 +1,60 @@
1
+ from dataclasses import dataclass, field
2
+ from finobs.scripts.trace_parser import Trace
3
+ from finobs.scripts.scoring import score_rag
4
+
5
+
6
@dataclass
class AgentResult:
    """Result of one analysis agent run over a single trace."""

    # NOTE(review): this dataclass is duplicated in every finobs.agents
    # module — consider extracting a shared definition.
    agent: str  # agent name, e.g. "rag"
    score: int  # 0-100 health score for this dimension
    summary: str  # one-line human-readable summary
    issues: list[dict] = field(default_factory=list)  # severity/description/recommendation dicts
    metrics: dict = field(default_factory=dict)  # raw metrics backing the score
13
+
14
+
15
async def analyze(trace: Trace) -> AgentResult:
    """Analyze RAG retrieval steps: relevance proxy and empty-result rate."""
    retrievals = [
        step for step in trace.steps
        if "rag" in step.tool.lower() or "retrieve" in step.tool.lower()
    ]

    if not retrievals:
        # Nothing to score — a run without retrieval steps is trivially healthy.
        return AgentResult(
            agent="rag",
            score=100,
            summary="no RAG steps",
            metrics={"rag_steps": 0},
        )

    # Relevance proxy: output/input token ratio (low = retrieved but unused).
    proxies = [
        min(1.0, (step.output_tokens / max(step.input_tokens, 1)) * 2)
        for step in retrievals
    ]
    mean_relevance = sum(proxies) / len(proxies)
    empties = [step for step in retrievals if step.output_tokens < 15]
    empties_rate = len(empties) / len(retrievals)

    from finobs.scripts.scoring import THRESHOLDS
    flagged = []
    if mean_relevance < THRESHOLDS["rag_relevance_min"]:
        flagged.append({
            "severity": "high",
            "description": f"Low RAG utilization: avg relevance proxy {mean_relevance:.2f}",
            "recommendation": "Review chunking strategy and embedding model",
        })
    if empties_rate > 0.2:
        flagged.append({
            "severity": "medium",
            "description": f"{len(empties)}/{len(retrievals)} retrievals returned near-empty results",
            "recommendation": "Check retrieval thresholds and fallback strategy",
        })

    return AgentResult(
        agent="rag",
        score=score_rag(mean_relevance, empties_rate),
        summary=f"relevance: {mean_relevance:.2f}",
        issues=flagged,
        metrics={
            "rag_steps": len(retrievals),
            "avg_relevance_proxy": round(mean_relevance, 3),
            "empty_retrieval_rate": round(empties_rate, 3),
        },
    )
@@ -0,0 +1,19 @@
1
import typer

from finobs.commands import audit, cost, debug, loops, performance, trace

app = typer.Typer(
    name="finobs",
    help="finobs — LLM Agent Observability Suite",
    no_args_is_help=True,
    rich_markup_mode="rich",
)

# (module, sub-command name, help text) for every registered sub-app,
# in the order they should appear.
_SUBCOMMANDS = (
    (audit, "audit", "Full observability audit of an agent run"),
    (trace, "trace", "Reconstruct and visualize execution trace"),
    (loops, "loops", "Detect loops and redundant steps"),
    (cost, "cost", "Cost analysis by run or tenant"),
    (performance, "performance", "Latency and throughput analysis"),
    (debug, "debug", "Deep debug of a failed run"),
)
for _module, _name, _help in _SUBCOMMANDS:
    app.add_typer(_module.app, name=_name, help=_help)

if __name__ == "__main__":
    app()
File without changes