failure-forensics 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- failure_forensics-0.1.1/LICENSE +21 -0
- failure_forensics-0.1.1/MANIFEST.in +5 -0
- failure_forensics-0.1.1/PKG-INFO +269 -0
- failure_forensics-0.1.1/README.md +255 -0
- failure_forensics-0.1.1/data/ab_report.json +133 -0
- failure_forensics-0.1.1/data/baseline.json +10 -0
- failure_forensics-0.1.1/data/eval_candidates/candidates.jsonl +1 -0
- failure_forensics-0.1.1/data/logs/requests.jsonl +5 -0
- failure_forensics-0.1.1/failure_forensics/__init__.py +5 -0
- failure_forensics-0.1.1/failure_forensics/ab_report.py +116 -0
- failure_forensics-0.1.1/failure_forensics/alerts.py +100 -0
- failure_forensics-0.1.1/failure_forensics/baseline.py +91 -0
- failure_forensics-0.1.1/failure_forensics/config.py +36 -0
- failure_forensics-0.1.1/failure_forensics/dashboard.py +104 -0
- failure_forensics-0.1.1/failure_forensics/eval_collector.py +113 -0
- failure_forensics-0.1.1/failure_forensics/forensics.py +143 -0
- failure_forensics-0.1.1/failure_forensics/llm_analyzer.py +68 -0
- failure_forensics-0.1.1/failure_forensics/logger.py +105 -0
- failure_forensics-0.1.1/failure_forensics/pattern.py +175 -0
- failure_forensics-0.1.1/failure_forensics/prompt_optimizer.py +82 -0
- failure_forensics-0.1.1/failure_forensics/recommender.py +62 -0
- failure_forensics-0.1.1/failure_forensics/regression_guard.py +80 -0
- failure_forensics-0.1.1/failure_forensics/trace.py +72 -0
- failure_forensics-0.1.1/failure_forensics/versioning.py +120 -0
- failure_forensics-0.1.1/failure_forensics.egg-info/PKG-INFO +269 -0
- failure_forensics-0.1.1/failure_forensics.egg-info/SOURCES.txt +32 -0
- failure_forensics-0.1.1/failure_forensics.egg-info/dependency_links.txt +1 -0
- failure_forensics-0.1.1/failure_forensics.egg-info/requires.txt +3 -0
- failure_forensics-0.1.1/failure_forensics.egg-info/top_level.txt +1 -0
- failure_forensics-0.1.1/pyproject.toml +22 -0
- failure_forensics-0.1.1/setup.cfg +4 -0
- failure_forensics-0.1.1/tests/test_advanced.py +155 -0
- failure_forensics-0.1.1/tests/test_forensics.py +210 -0
- failure_forensics-0.1.1/tests/test_trace.py +64 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 jasstt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: failure-forensics
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard
|
|
5
|
+
License: MIT License
|
|
6
|
+
Project-URL: Homepage, https://github.com/jasstt/failure-forensics
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: requests
|
|
11
|
+
Requires-Dist: python-dotenv
|
|
12
|
+
Requires-Dist: google-generativeai
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# Failure Forensics
|
|
16
|
+
|
|
17
|
+
Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard, and Gemini-powered recommendations.
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install failure-forensics
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from failure_forensics import trace
|
|
29
|
+
|
|
30
|
+
@trace(step="retrieval", version="v1")
|
|
31
|
+
def my_retrieval_function(query):
|
|
32
|
+
# your code here
|
|
33
|
+
pass
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Features 🔬
|
|
37
|
+
|
|
38
|
+

|
|
39
|
+

|
|
40
|
+

|
|
41
|
+

|
|
42
|
+
|
|
43
|
+
A **self-hosted, zero-cost** LLM pipeline observability tool that gives you root cause detection, anomaly alerts, A/B reporting, and a live terminal dashboard — without sending your data to any third-party service.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 🆚 Why Not LangSmith or Braintrust?
|
|
48
|
+
|
|
49
|
+
| | **Failure Forensics** | LangSmith | Braintrust |
|
|
50
|
+
|---|---|---|---|
|
|
51
|
+
| Cost | **Free** | Paid tiers | Paid tiers |
|
|
52
|
+
| Data privacy | **Stays on your machine** | Sent to cloud | Sent to cloud |
|
|
53
|
+
| Customization | **Full control** | Limited | Limited |
|
|
54
|
+
| Slack alerts | **Built-in** | Premium only | Premium only |
|
|
55
|
+
| A/B reporting | **Built-in** | Basic | Basic |
|
|
56
|
+
| Circuit breaker / trend | **Built-in** | ❌ | ❌ |
|
|
57
|
+
|
|
58
|
+
**Failure Forensics is designed for teams who need production-grade observability without vendor lock-in.**
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## ✨ What It Does
|
|
63
|
+
|
|
64
|
+
Every pipeline run passes through a structured logging and analysis layer:
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
Pipeline Step → logger.py → requests.jsonl
|
|
68
|
+
↓
|
|
69
|
+
┌────────────────┴────────────────┐
|
|
70
|
+
│ │
|
|
71
|
+
forensics.py pattern.py
|
|
72
|
+
(root cause detection) (time series + anomaly)
|
|
73
|
+
│ │
|
|
74
|
+
versioning.py baseline.py
|
|
75
|
+
(v1 vs v2 comparison) (7-day moving average)
|
|
76
|
+
│ │
|
|
77
|
+
ab_report.py alerts.py
|
|
78
|
+
(A/B comparison table) (Slack / console alert)
|
|
79
|
+
└────────────────┬────────────────┘
|
|
80
|
+
↓
|
|
81
|
+
dashboard.py
|
|
82
|
+
(ASCII terminal dashboard)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## 📁 Project Structure
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
failure-forensics/
|
|
91
|
+
├── src/
|
|
92
|
+
│ ├── logger.py # Logs every pipeline step to JSONL
|
|
93
|
+
│ ├── forensics.py # Root cause detection (5 categories)
|
|
94
|
+
│ ├── pattern.py # Time-series failure rate + anomaly detection
|
|
95
|
+
│ ├── baseline.py # 7-day moving average + trend (IMPROVING/STABLE/DEGRADING)
|
|
96
|
+
│ ├── alerts.py # Slack webhook + console alerts
|
|
97
|
+
│ ├── versioning.py # Per-version failure rate stats
|
|
98
|
+
│ ├── ab_report.py # A/B comparison report (table + JSON)
|
|
99
|
+
│ └── dashboard.py # ASCII bar chart terminal dashboard
|
|
100
|
+
├── data/
|
|
101
|
+
│ └── logs/
|
|
102
|
+
│ └── requests.jsonl # All pipeline logs (gitignored)
|
|
103
|
+
├── tests/
|
|
104
|
+
│ └── test_forensics.py # 8 unit tests
|
|
105
|
+
├── config.py # Thresholds, Slack URL, step limits
|
|
106
|
+
├── main.py # 5-scenario demo runner
|
|
107
|
+
├── simulate.py # Realistic test data generator (100 runs, anomaly day)
|
|
108
|
+
└── requirements.txt
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## 🚀 Getting Started
|
|
114
|
+
|
|
115
|
+
### 1. Clone & Install
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
git clone https://github.com/jasstt/failure-forensics.git
|
|
119
|
+
cd failure-forensics
|
|
120
|
+
pip install -r requirements.txt
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 2. (Optional) Configure Slack Alerts
|
|
124
|
+
|
|
125
|
+
Edit `config.py`:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
If left empty, all alerts print to the console.
|
|
132
|
+
|
|
133
|
+
### 3. Run the Full Demo
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
python main.py
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
This runs 5 scenarios:
|
|
140
|
+
1. **Simulation** — generates 100 realistic pipeline runs (2 prompt versions, anomaly day)
|
|
141
|
+
2. **Root cause analysis** — detects the failing step and assigns a category
|
|
142
|
+
3. **7-day pattern report** — failure rate per day + step breakdown + anomaly check
|
|
143
|
+
4. **A/B report** — `prompt_v1` vs `prompt_v2` with per-step improvement table
|
|
144
|
+
5. **Terminal dashboard** — live ASCII bar charts, trend, top 5 failed runs
|
|
145
|
+
|
|
146
|
+
### 4. Run Unit Tests
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
python tests/test_forensics.py
|
|
150
|
+
python tests/test_advanced.py
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## 🚀 Advanced Features (New in v2)
|
|
156
|
+
|
|
157
|
+
| Layer | Feature | Technology |
|
|
158
|
+
|--------|---------|-----------|
|
|
159
|
+
| 1 | Automatic recommendation engine | Rule-based |
|
|
160
|
+
| 2 | AI-assisted failure analysis | Gemini 2.5 Pro |
|
|
161
|
+
| 3 | Automatic eval set expansion | Frequency analysis |
|
|
162
|
+
| 4 | Prompt optimization explanation | Gemini 2.5 Pro |
|
|
163
|
+
| 5 | Regression guard | Baseline comparison |
|
|
164
|
+
|
|
165
|
+
**Scenario 6: Regression Guard**
|
|
166
|
+
Runs an automatic regression check before a new prompt (`v3`) is deployed:
|
|
167
|
+
```
|
|
168
|
+
REGRESSION CHECK — v3
|
|
169
|
+
Baseline (v2): 11.0% failure rate
|
|
170
|
+
Yeni (v3): 24.5% failure rate
|
|
171
|
+
Delta: +13.5pp → REGRESSION_DETECTED ❌
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Test Results
|
|
175
|
+
|
|
176
|
+
| Layer | Test | Result |
|
|
177
|
+
|--------|------|-------|
|
|
178
|
+
| 1 — Recommender | Category → recommendation mapping | ✅ PASS |
|
|
179
|
+
| 2 — LLM Analyzer | Gemini fallback | ✅ PASS |
|
|
180
|
+
| 3 — Eval Collector | Duplicate prevention | ✅ PASS |
|
|
181
|
+
| 4 — Prompt Optimizer | A/B explanation (v2: +10pp) | ✅ PASS |
|
|
182
|
+
| 5 — Regression Guard | DETECTED + PASS scenarios | ✅ PASS |
|
|
183
|
+
|
|
184
|
+
### Key Results
|
|
185
|
+
- A/B: prompt_v2 improved by 10pp over v1
|
|
186
|
+
- Regression Guard: blocked the v3 deploy with a WARNING at a +6pp delta
|
|
187
|
+
- Eval Collector: 5 new eval candidates collected automatically
|
|
188
|
+
- LLM Analyzer: seamless fallback to rule-based analysis when Gemini is unavailable
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## 📊 Results
|
|
193
|
+
|
|
194
|
+
| Feature | Result |
|
|
195
|
+
|---------|--------|
|
|
196
|
+
| Unit Tests | **8/8 PASS** ✅ |
|
|
197
|
+
| Root cause categories | **5 types** (RETRIEVAL_QUALITY, RERANKER_FAILURE, LLM_HALLUCINATION, CITATION_MISS, API_ERROR) |
|
|
198
|
+
| Anomaly detection | **20pp delta threshold** — flags when today's rate exceeds the 7-day average by >20pp |
|
|
199
|
+
| A/B comparison | **v2: 11.5pp improvement** over v1 (22.5% → 11.0% failure rate) |
|
|
200
|
+
| Trend analysis | **IMPROVING / STABLE / DEGRADING** based on 7-day moving average |
|
|
201
|
+
| Slack integration | **Webhook ready** — fires on rate threshold, anomaly, or 3 consecutive failures |
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## ⚙️ Configuration (`config.py`)
|
|
206
|
+
|
|
207
|
+
| Parameter | Default | Description |
|
|
208
|
+
|-----------|---------|-------------|
|
|
209
|
+
| `FAILURE_RATE_THRESHOLD` | `0.25` | Alert fires above this failure rate |
|
|
210
|
+
| `ANOMALY_THRESHOLD` | `0.20` | Flag if today exceeds 7-day avg by this delta |
|
|
211
|
+
| `SLACK_WEBHOOK_URL` | `""` | Empty = console output |
|
|
212
|
+
| `CONSECUTIVE_FAILURE_THRESHOLD` | `3` | Alert after N consecutive step failures |
|
|
213
|
+
| `STEP_THRESHOLDS` | see config | Per-step max acceptable failure rate |
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## 🧪 Root Cause Categories
|
|
218
|
+
|
|
219
|
+
| Category | Trigger |
|
|
220
|
+
|----------|---------|
|
|
221
|
+
| `RETRIEVAL_QUALITY` | Retrieval step fails — no results, low score |
|
|
222
|
+
| `RERANKER_FAILURE` | Reranker can't parse LLM response or times out |
|
|
223
|
+
| `LLM_HALLUCINATION` | Generation returns empty or uncited response |
|
|
224
|
+
| `CITATION_MISS` | Answer produced but no source citations found |
|
|
225
|
+
| `API_ERROR` | Timeout, 429 rate limit, 503 service unavailable |
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## 📈 Terminal Dashboard (Sample Output)
|
|
230
|
+
|
|
231
|
+
```
|
|
232
|
+
═════════════════════════════════════════════════════════════
|
|
233
|
+
🔬 FAILURE FORENSICS — Terminal Dashboard
|
|
234
|
+
═════════════════════════════════════════════════════════════
|
|
235
|
+
|
|
236
|
+
📅 SON 7 GÜNÜN FAILURE RATE GRAFİĞİ
|
|
237
|
+
2026-06-03 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 13.0%
|
|
238
|
+
2026-06-07 [████████░░░░░░░░░░░░░░░░░░░░░░] 27.3% ⚠️
|
|
239
|
+
2026-06-10 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 12.0%
|
|
240
|
+
|
|
241
|
+
🔍 ADIM BAZINDA HATA DAĞILIMI
|
|
242
|
+
retrieval [███████░░░░░░░░░░░░░] 38.0% (38/100 hatalı)
|
|
243
|
+
reranking [██░░░░░░░░░░░░░░░░░░] 13.0% (13/100 hatalı)
|
|
244
|
+
generation [██░░░░░░░░░░░░░░░░░░] 10.0% (10/100 hatalı)
|
|
245
|
+
citation [█░░░░░░░░░░░░░░░░░░░] 6.0% (6/100 hatalı)
|
|
246
|
+
|
|
247
|
+
⚡ ANOMALİ: ✅ Normal: Bugün (12.0%) ≈ 7g ort. (16.2%)
|
|
248
|
+
📊 TREND: ➡️ STABLE — Hareketli Ort: 16.0%
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## 🛠 Technologies Used
|
|
254
|
+
|
|
255
|
+
* **Python standard library** — `json`, `collections`, `datetime`, `threading`
|
|
256
|
+
* **[requests](https://pypi.org/project/requests/)** — Slack webhook HTTP calls
|
|
257
|
+
* **[python-dotenv](https://pypi.org/project/python-dotenv/)** — Environment variable management
|
|
258
|
+
|
|
259
|
+
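No heavy dependencies. No cloud. No API keys required for the core features — the optional Gemini-powered analysis falls back to rule-based logic when Gemini is unavailable.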
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## 🔭 Roadmap
|
|
264
|
+
|
|
265
|
+
- [ ] FastAPI REST endpoint for remote log ingestion
|
|
266
|
+
- [ ] HTML report export
|
|
267
|
+
- [ ] PostgreSQL backend for large-scale log storage
|
|
268
|
+
- [ ] Multi-pipeline support (compare RAG vs fine-tuned model)
|
|
269
|
+
- [ ] Email alerts as alternative to Slack
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
# Failure Forensics
|
|
2
|
+
|
|
3
|
+
Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard, and Gemini-powered recommendations.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install failure-forensics
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from failure_forensics import trace
|
|
15
|
+
|
|
16
|
+
@trace(step="retrieval", version="v1")
|
|
17
|
+
def my_retrieval_function(query):
|
|
18
|
+
# your code here
|
|
19
|
+
pass
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Features 🔬
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+

|
|
26
|
+

|
|
27
|
+

|
|
28
|
+
|
|
29
|
+
A **self-hosted, zero-cost** LLM pipeline observability tool that gives you root cause detection, anomaly alerts, A/B reporting, and a live terminal dashboard — without sending your data to any third-party service.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 🆚 Why Not LangSmith or Braintrust?
|
|
34
|
+
|
|
35
|
+
| | **Failure Forensics** | LangSmith | Braintrust |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| Cost | **Free** | Paid tiers | Paid tiers |
|
|
38
|
+
| Data privacy | **Stays on your machine** | Sent to cloud | Sent to cloud |
|
|
39
|
+
| Customization | **Full control** | Limited | Limited |
|
|
40
|
+
| Slack alerts | **Built-in** | Premium only | Premium only |
|
|
41
|
+
| A/B reporting | **Built-in** | Basic | Basic |
|
|
42
|
+
| Circuit breaker / trend | **Built-in** | ❌ | ❌ |
|
|
43
|
+
|
|
44
|
+
**Failure Forensics is designed for teams who need production-grade observability without vendor lock-in.**
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## ✨ What It Does
|
|
49
|
+
|
|
50
|
+
Every pipeline run passes through a structured logging and analysis layer:
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
Pipeline Step → logger.py → requests.jsonl
|
|
54
|
+
↓
|
|
55
|
+
┌────────────────┴────────────────┐
|
|
56
|
+
│ │
|
|
57
|
+
forensics.py pattern.py
|
|
58
|
+
(root cause detection) (time series + anomaly)
|
|
59
|
+
│ │
|
|
60
|
+
versioning.py baseline.py
|
|
61
|
+
(v1 vs v2 comparison) (7-day moving average)
|
|
62
|
+
│ │
|
|
63
|
+
ab_report.py alerts.py
|
|
64
|
+
(A/B comparison table) (Slack / console alert)
|
|
65
|
+
└────────────────┬────────────────┘
|
|
66
|
+
↓
|
|
67
|
+
dashboard.py
|
|
68
|
+
(ASCII terminal dashboard)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## 📁 Project Structure
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
failure-forensics/
|
|
77
|
+
├── src/
|
|
78
|
+
│ ├── logger.py # Logs every pipeline step to JSONL
|
|
79
|
+
│ ├── forensics.py # Root cause detection (5 categories)
|
|
80
|
+
│ ├── pattern.py # Time-series failure rate + anomaly detection
|
|
81
|
+
│ ├── baseline.py # 7-day moving average + trend (IMPROVING/STABLE/DEGRADING)
|
|
82
|
+
│ ├── alerts.py # Slack webhook + console alerts
|
|
83
|
+
│ ├── versioning.py # Per-version failure rate stats
|
|
84
|
+
│ ├── ab_report.py # A/B comparison report (table + JSON)
|
|
85
|
+
│ └── dashboard.py # ASCII bar chart terminal dashboard
|
|
86
|
+
├── data/
|
|
87
|
+
│ └── logs/
|
|
88
|
+
│ └── requests.jsonl # All pipeline logs (gitignored)
|
|
89
|
+
├── tests/
|
|
90
|
+
│ └── test_forensics.py # 8 unit tests
|
|
91
|
+
├── config.py # Thresholds, Slack URL, step limits
|
|
92
|
+
├── main.py # 5-scenario demo runner
|
|
93
|
+
├── simulate.py # Realistic test data generator (100 runs, anomaly day)
|
|
94
|
+
└── requirements.txt
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## 🚀 Getting Started
|
|
100
|
+
|
|
101
|
+
### 1. Clone & Install
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
git clone https://github.com/jasstt/failure-forensics.git
|
|
105
|
+
cd failure-forensics
|
|
106
|
+
pip install -r requirements.txt
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 2. (Optional) Configure Slack Alerts
|
|
110
|
+
|
|
111
|
+
Edit `config.py`:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
If left empty, all alerts print to the console.
|
|
118
|
+
|
|
119
|
+
### 3. Run the Full Demo
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
python main.py
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
This runs 5 scenarios:
|
|
126
|
+
1. **Simulation** — generates 100 realistic pipeline runs (2 prompt versions, anomaly day)
|
|
127
|
+
2. **Root cause analysis** — detects the failing step and assigns a category
|
|
128
|
+
3. **7-day pattern report** — failure rate per day + step breakdown + anomaly check
|
|
129
|
+
4. **A/B report** — `prompt_v1` vs `prompt_v2` with per-step improvement table
|
|
130
|
+
5. **Terminal dashboard** — live ASCII bar charts, trend, top 5 failed runs
|
|
131
|
+
|
|
132
|
+
### 4. Run Unit Tests
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
python tests/test_forensics.py
|
|
136
|
+
python tests/test_advanced.py
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## 🚀 Advanced Features (New in v2)
|
|
142
|
+
|
|
143
|
+
| Layer | Feature | Technology |
|
|
144
|
+
|--------|---------|-----------|
|
|
145
|
+
| 1 | Automatic recommendation engine | Rule-based |
|
|
146
|
+
| 2 | AI-assisted failure analysis | Gemini 2.5 Pro |
|
|
147
|
+
| 3 | Automatic eval set expansion | Frequency analysis |
|
|
148
|
+
| 4 | Prompt optimization explanation | Gemini 2.5 Pro |
|
|
149
|
+
| 5 | Regression guard | Baseline comparison |
|
|
150
|
+
|
|
151
|
+
**Scenario 6: Regression Guard**
|
|
152
|
+
Runs an automatic regression check before a new prompt (`v3`) is deployed:
|
|
153
|
+
```
|
|
154
|
+
REGRESSION CHECK — v3
|
|
155
|
+
Baseline (v2): 11.0% failure rate
|
|
156
|
+
Yeni (v3): 24.5% failure rate
|
|
157
|
+
Delta: +13.5pp → REGRESSION_DETECTED ❌
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Test Results
|
|
161
|
+
|
|
162
|
+
| Layer | Test | Result |
|
|
163
|
+
|--------|------|-------|
|
|
164
|
+
| 1 — Recommender | Category → recommendation mapping | ✅ PASS |
|
|
165
|
+
| 2 — LLM Analyzer | Gemini fallback | ✅ PASS |
|
|
166
|
+
| 3 — Eval Collector | Duplicate prevention | ✅ PASS |
|
|
167
|
+
| 4 — Prompt Optimizer | A/B explanation (v2: +10pp) | ✅ PASS |
|
|
168
|
+
| 5 — Regression Guard | DETECTED + PASS scenarios | ✅ PASS |
|
|
169
|
+
|
|
170
|
+
### Key Results
|
|
171
|
+
- A/B: prompt_v2 improved by 10pp over v1
|
|
172
|
+
- Regression Guard: blocked the v3 deploy with a WARNING at a +6pp delta
|
|
173
|
+
- Eval Collector: 5 new eval candidates collected automatically
|
|
174
|
+
- LLM Analyzer: seamless fallback to rule-based analysis when Gemini is unavailable
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## 📊 Results
|
|
179
|
+
|
|
180
|
+
| Feature | Result |
|
|
181
|
+
|---------|--------|
|
|
182
|
+
| Unit Tests | **8/8 PASS** ✅ |
|
|
183
|
+
| Root cause categories | **5 types** (RETRIEVAL_QUALITY, RERANKER_FAILURE, LLM_HALLUCINATION, CITATION_MISS, API_ERROR) |
|
|
184
|
+
| Anomaly detection | **20pp delta threshold** — flags when today's rate exceeds the 7-day average by >20pp |
|
|
185
|
+
| A/B comparison | **v2: 11.5pp improvement** over v1 (22.5% → 11.0% failure rate) |
|
|
186
|
+
| Trend analysis | **IMPROVING / STABLE / DEGRADING** based on 7-day moving average |
|
|
187
|
+
| Slack integration | **Webhook ready** — fires on rate threshold, anomaly, or 3 consecutive failures |
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## ⚙️ Configuration (`config.py`)
|
|
192
|
+
|
|
193
|
+
| Parameter | Default | Description |
|
|
194
|
+
|-----------|---------|-------------|
|
|
195
|
+
| `FAILURE_RATE_THRESHOLD` | `0.25` | Alert fires above this failure rate |
|
|
196
|
+
| `ANOMALY_THRESHOLD` | `0.20` | Flag if today exceeds 7-day avg by this delta |
|
|
197
|
+
| `SLACK_WEBHOOK_URL` | `""` | Empty = console output |
|
|
198
|
+
| `CONSECUTIVE_FAILURE_THRESHOLD` | `3` | Alert after N consecutive step failures |
|
|
199
|
+
| `STEP_THRESHOLDS` | see config | Per-step max acceptable failure rate |
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## 🧪 Root Cause Categories
|
|
204
|
+
|
|
205
|
+
| Category | Trigger |
|
|
206
|
+
|----------|---------|
|
|
207
|
+
| `RETRIEVAL_QUALITY` | Retrieval step fails — no results, low score |
|
|
208
|
+
| `RERANKER_FAILURE` | Reranker can't parse LLM response or times out |
|
|
209
|
+
| `LLM_HALLUCINATION` | Generation returns empty or uncited response |
|
|
210
|
+
| `CITATION_MISS` | Answer produced but no source citations found |
|
|
211
|
+
| `API_ERROR` | Timeout, 429 rate limit, 503 service unavailable |
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## 📈 Terminal Dashboard (Sample Output)
|
|
216
|
+
|
|
217
|
+
```
|
|
218
|
+
═════════════════════════════════════════════════════════════
|
|
219
|
+
🔬 FAILURE FORENSICS — Terminal Dashboard
|
|
220
|
+
═════════════════════════════════════════════════════════════
|
|
221
|
+
|
|
222
|
+
📅 SON 7 GÜNÜN FAILURE RATE GRAFİĞİ
|
|
223
|
+
2026-06-03 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 13.0%
|
|
224
|
+
2026-06-07 [████████░░░░░░░░░░░░░░░░░░░░░░] 27.3% ⚠️
|
|
225
|
+
2026-06-10 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 12.0%
|
|
226
|
+
|
|
227
|
+
🔍 ADIM BAZINDA HATA DAĞILIMI
|
|
228
|
+
retrieval [███████░░░░░░░░░░░░░] 38.0% (38/100 hatalı)
|
|
229
|
+
reranking [██░░░░░░░░░░░░░░░░░░] 13.0% (13/100 hatalı)
|
|
230
|
+
generation [██░░░░░░░░░░░░░░░░░░] 10.0% (10/100 hatalı)
|
|
231
|
+
citation [█░░░░░░░░░░░░░░░░░░░] 6.0% (6/100 hatalı)
|
|
232
|
+
|
|
233
|
+
⚡ ANOMALİ: ✅ Normal: Bugün (12.0%) ≈ 7g ort. (16.2%)
|
|
234
|
+
📊 TREND: ➡️ STABLE — Hareketli Ort: 16.0%
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## 🛠 Technologies Used
|
|
240
|
+
|
|
241
|
+
* **Python standard library** — `json`, `collections`, `datetime`, `threading`
|
|
242
|
+
* **[requests](https://pypi.org/project/requests/)** — Slack webhook HTTP calls
|
|
243
|
+
* **[python-dotenv](https://pypi.org/project/python-dotenv/)** — Environment variable management
|
|
244
|
+
|
|
245
|
+
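No heavy dependencies. No cloud. No API keys required for the core features — the optional Gemini-powered analysis falls back to rule-based logic when Gemini is unavailable.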
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## 🔭 Roadmap
|
|
250
|
+
|
|
251
|
+
- [ ] FastAPI REST endpoint for remote log ingestion
|
|
252
|
+
- [ ] HTML report export
|
|
253
|
+
- [ ] PostgreSQL backend for large-scale log storage
|
|
254
|
+
- [ ] Multi-pipeline support (compare RAG vs fine-tuned model)
|
|
255
|
+
- [ ] Email alerts as alternative to Slack
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
{
|
|
2
|
+
"summary": {
|
|
3
|
+
"v1": "v1",
|
|
4
|
+
"v2": "v2",
|
|
5
|
+
"winner": "v2",
|
|
6
|
+
"v1_failure_rate": 0.25,
|
|
7
|
+
"v2_failure_rate": 0.15,
|
|
8
|
+
"improvement_pct": 10.0,
|
|
9
|
+
"v1_runs": 50,
|
|
10
|
+
"v2_runs": 50
|
|
11
|
+
},
|
|
12
|
+
"step_breakdown": {
|
|
13
|
+
"reranking": {
|
|
14
|
+
"v1": {
|
|
15
|
+
"total": 50,
|
|
16
|
+
"failed": 13,
|
|
17
|
+
"rate": 0.26
|
|
18
|
+
},
|
|
19
|
+
"v2": {
|
|
20
|
+
"total": 50,
|
|
21
|
+
"failed": 8,
|
|
22
|
+
"rate": 0.16
|
|
23
|
+
},
|
|
24
|
+
"delta_pct": 10.0,
|
|
25
|
+
"step_winner": "v2"
|
|
26
|
+
},
|
|
27
|
+
"citation": {
|
|
28
|
+
"v1": {
|
|
29
|
+
"total": 50,
|
|
30
|
+
"failed": 3,
|
|
31
|
+
"rate": 0.06
|
|
32
|
+
},
|
|
33
|
+
"v2": {
|
|
34
|
+
"total": 50,
|
|
35
|
+
"failed": 6,
|
|
36
|
+
"rate": 0.12
|
|
37
|
+
},
|
|
38
|
+
"delta_pct": -6.0,
|
|
39
|
+
"step_winner": "v1"
|
|
40
|
+
},
|
|
41
|
+
"generation": {
|
|
42
|
+
"v1": {
|
|
43
|
+
"total": 50,
|
|
44
|
+
"failed": 15,
|
|
45
|
+
"rate": 0.3
|
|
46
|
+
},
|
|
47
|
+
"v2": {
|
|
48
|
+
"total": 50,
|
|
49
|
+
"failed": 2,
|
|
50
|
+
"rate": 0.04
|
|
51
|
+
},
|
|
52
|
+
"delta_pct": 26.0,
|
|
53
|
+
"step_winner": "v2"
|
|
54
|
+
},
|
|
55
|
+
"retrieval": {
|
|
56
|
+
"v1": {
|
|
57
|
+
"total": 50,
|
|
58
|
+
"failed": 19,
|
|
59
|
+
"rate": 0.38
|
|
60
|
+
},
|
|
61
|
+
"v2": {
|
|
62
|
+
"total": 50,
|
|
63
|
+
"failed": 14,
|
|
64
|
+
"rate": 0.28
|
|
65
|
+
},
|
|
66
|
+
"delta_pct": 10.0,
|
|
67
|
+
"step_winner": "v2"
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"raw_stats": {
|
|
71
|
+
"v1": {
|
|
72
|
+
"total_steps": 200,
|
|
73
|
+
"failed_steps": 50,
|
|
74
|
+
"failure_rate": 0.25,
|
|
75
|
+
"runs": 50,
|
|
76
|
+
"steps": {
|
|
77
|
+
"retrieval": {
|
|
78
|
+
"total": 50,
|
|
79
|
+
"failed": 19,
|
|
80
|
+
"rate": 0.38
|
|
81
|
+
},
|
|
82
|
+
"reranking": {
|
|
83
|
+
"total": 50,
|
|
84
|
+
"failed": 13,
|
|
85
|
+
"rate": 0.26
|
|
86
|
+
},
|
|
87
|
+
"generation": {
|
|
88
|
+
"total": 50,
|
|
89
|
+
"failed": 15,
|
|
90
|
+
"rate": 0.3
|
|
91
|
+
},
|
|
92
|
+
"citation": {
|
|
93
|
+
"total": 50,
|
|
94
|
+
"failed": 3,
|
|
95
|
+
"rate": 0.06
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
},
|
|
99
|
+
"v2": {
|
|
100
|
+
"total_steps": 200,
|
|
101
|
+
"failed_steps": 30,
|
|
102
|
+
"failure_rate": 0.15,
|
|
103
|
+
"runs": 50,
|
|
104
|
+
"steps": {
|
|
105
|
+
"retrieval": {
|
|
106
|
+
"total": 50,
|
|
107
|
+
"failed": 14,
|
|
108
|
+
"rate": 0.28
|
|
109
|
+
},
|
|
110
|
+
"reranking": {
|
|
111
|
+
"total": 50,
|
|
112
|
+
"failed": 8,
|
|
113
|
+
"rate": 0.16
|
|
114
|
+
},
|
|
115
|
+
"generation": {
|
|
116
|
+
"total": 50,
|
|
117
|
+
"failed": 2,
|
|
118
|
+
"rate": 0.04
|
|
119
|
+
},
|
|
120
|
+
"citation": {
|
|
121
|
+
"total": 50,
|
|
122
|
+
"failed": 6,
|
|
123
|
+
"rate": 0.12
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
},
|
|
128
|
+
"improvement_reason": {
|
|
129
|
+
"error": "Gemini not available",
|
|
130
|
+
"improvements": [],
|
|
131
|
+
"next_iteration": "Manuel inceleme gerekiyor."
|
|
132
|
+
}
|
|
133
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"query": "Aynı sorgu iki kere patladı", "failure_count": 3, "root_cause_category": "RETRIEVAL_QUALITY", "first_seen": "2026-06-11T01:53:53.961023+00:00", "last_seen": "2026-06-11T01:53:53.963734+00:00", "auto_add_eligible": true}
|