failure-forensics 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. failure_forensics-0.1.1/LICENSE +21 -0
  2. failure_forensics-0.1.1/MANIFEST.in +5 -0
  3. failure_forensics-0.1.1/PKG-INFO +269 -0
  4. failure_forensics-0.1.1/README.md +255 -0
  5. failure_forensics-0.1.1/data/ab_report.json +133 -0
  6. failure_forensics-0.1.1/data/baseline.json +10 -0
  7. failure_forensics-0.1.1/data/eval_candidates/candidates.jsonl +1 -0
  8. failure_forensics-0.1.1/data/logs/requests.jsonl +5 -0
  9. failure_forensics-0.1.1/failure_forensics/__init__.py +5 -0
  10. failure_forensics-0.1.1/failure_forensics/ab_report.py +116 -0
  11. failure_forensics-0.1.1/failure_forensics/alerts.py +100 -0
  12. failure_forensics-0.1.1/failure_forensics/baseline.py +91 -0
  13. failure_forensics-0.1.1/failure_forensics/config.py +36 -0
  14. failure_forensics-0.1.1/failure_forensics/dashboard.py +104 -0
  15. failure_forensics-0.1.1/failure_forensics/eval_collector.py +113 -0
  16. failure_forensics-0.1.1/failure_forensics/forensics.py +143 -0
  17. failure_forensics-0.1.1/failure_forensics/llm_analyzer.py +68 -0
  18. failure_forensics-0.1.1/failure_forensics/logger.py +105 -0
  19. failure_forensics-0.1.1/failure_forensics/pattern.py +175 -0
  20. failure_forensics-0.1.1/failure_forensics/prompt_optimizer.py +82 -0
  21. failure_forensics-0.1.1/failure_forensics/recommender.py +62 -0
  22. failure_forensics-0.1.1/failure_forensics/regression_guard.py +80 -0
  23. failure_forensics-0.1.1/failure_forensics/trace.py +72 -0
  24. failure_forensics-0.1.1/failure_forensics/versioning.py +120 -0
  25. failure_forensics-0.1.1/failure_forensics.egg-info/PKG-INFO +269 -0
  26. failure_forensics-0.1.1/failure_forensics.egg-info/SOURCES.txt +32 -0
  27. failure_forensics-0.1.1/failure_forensics.egg-info/dependency_links.txt +1 -0
  28. failure_forensics-0.1.1/failure_forensics.egg-info/requires.txt +3 -0
  29. failure_forensics-0.1.1/failure_forensics.egg-info/top_level.txt +1 -0
  30. failure_forensics-0.1.1/pyproject.toml +22 -0
  31. failure_forensics-0.1.1/setup.cfg +4 -0
  32. failure_forensics-0.1.1/tests/test_advanced.py +155 -0
  33. failure_forensics-0.1.1/tests/test_forensics.py +210 -0
  34. failure_forensics-0.1.1/tests/test_trace.py +64 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 jasstt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include failure_forensics *.py
4
+ recursive-include data *.json
5
+ recursive-include data *.jsonl
@@ -0,0 +1,269 @@
1
+ Metadata-Version: 2.4
2
+ Name: failure-forensics
3
+ Version: 0.1.1
4
+ Summary: Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard
5
+ License: MIT License
6
+ Project-URL: Homepage, https://github.com/jasstt/failure-forensics
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: requests
11
+ Requires-Dist: python-dotenv
12
+ Requires-Dist: google-generativeai
13
+ Dynamic: license-file
14
+
15
+ # Failure Forensics
16
+
17
+ Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard, and Gemini-powered recommendations.
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ pip install failure-forensics
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from failure_forensics import trace
29
+
30
+ @trace(step="retrieval", version="v1")
31
+ def my_retrieval_function(query):
32
+     # your code here
33
+     pass
34
+ ```
35
+
36
+ ## Features 🔬
37
+
38
+ ![Python](https://img.shields.io/badge/Python-3.9+-blue)
39
+ ![Tests](https://img.shields.io/badge/Tests-8%2F8_PASS-brightgreen)
40
+ ![License](https://img.shields.io/badge/License-MIT-lightgrey)
41
+ ![Alerts](https://img.shields.io/badge/Slack-Alerts_Ready-4A154B)
42
+
43
+ A **self-hosted, zero-cost** LLM pipeline observability tool that gives you root cause detection, anomaly alerts, A/B reporting, and a live terminal dashboard — without sending your data to any third-party service.
44
+
45
+ ---
46
+
47
+ ## 🆚 Why Not LangSmith or Braintrust?
48
+
49
+ | | **Failure Forensics** | LangSmith | Braintrust |
50
+ |---|---|---|---|
51
+ | Cost | **Free** | Paid tiers | Paid tiers |
52
+ | Data privacy | **Stays on your machine** | Sent to cloud | Sent to cloud |
53
+ | Customization | **Full control** | Limited | Limited |
54
+ | Slack alerts | **Built-in** | Premium only | Premium only |
55
+ | A/B reporting | **Built-in** | Basic | Basic |
56
+ | Circuit breaker / trend | **Built-in** | ❌ | ❌ |
57
+
58
+ **Failure Forensics is designed for teams who need production-grade observability without vendor lock-in.**
59
+
60
+ ---
61
+
62
+ ## ✨ What It Does
63
+
64
+ Every pipeline run passes through a structured logging and analysis layer:
65
+
66
+ ```
67
+ Pipeline Step → logger.py → requests.jsonl
68
+
69
+ ┌────────────────┴────────────────┐
70
+ │ │
71
+ forensics.py pattern.py
72
+ (root cause detection) (time series + anomaly)
73
+ │ │
74
+ versioning.py baseline.py
75
+ (v1 vs v2 comparison) (7-day moving average)
76
+ │ │
77
+ ab_report.py alerts.py
78
+ (A/B comparison table) (Slack / console alert)
79
+ └────────────────┬────────────────┘
80
+
81
+ dashboard.py
82
+ (ASCII terminal dashboard)
83
+ ```
84
+
85
+ ---
86
+
87
+ ## 📁 Project Structure
88
+
89
+ ```
90
+ failure-forensics/
91
+ ├── src/
92
+ │ ├── logger.py # Logs every pipeline step to JSONL
93
+ │ ├── forensics.py # Root cause detection (5 categories)
94
+ │ ├── pattern.py # Time-series failure rate + anomaly detection
95
+ │ ├── baseline.py # 7-day moving average + trend (IMPROVING/STABLE/DEGRADING)
96
+ │ ├── alerts.py # Slack webhook + console alerts
97
+ │ ├── versioning.py # Per-version failure rate stats
98
+ │ ├── ab_report.py # A/B comparison report (table + JSON)
99
+ │ └── dashboard.py # ASCII bar chart terminal dashboard
100
+ ├── data/
101
+ │ └── logs/
102
+ │ └── requests.jsonl # All pipeline logs (gitignored)
103
+ ├── tests/
104
+ │ └── test_forensics.py # 8 unit tests
105
+ ├── config.py # Thresholds, Slack URL, step limits
106
+ ├── main.py # 5-scenario demo runner
107
+ ├── simulate.py # Realistic test data generator (100 runs, anomaly day)
108
+ └── requirements.txt
109
+ ```
110
+
111
+ ---
112
+
113
+ ## 🚀 Getting Started
114
+
115
+ ### 1. Clone & Install
116
+
117
+ ```bash
118
+ git clone https://github.com/jasstt/failure-forensics.git
119
+ cd failure-forensics
120
+ pip install -r requirements.txt
121
+ ```
122
+
123
+ ### 2. (Optional) Configure Slack Alerts
124
+
125
+ Edit `config.py`:
126
+
127
+ ```python
128
+ SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
129
+ ```
130
+
131
+ If left empty, all alerts print to the console.
132
+
133
+ ### 3. Run the Full Demo
134
+
135
+ ```bash
136
+ python main.py
137
+ ```
138
+
139
+ This runs 5 scenarios:
140
+ 1. **Simulation** — generates 100 realistic pipeline runs (2 prompt versions, anomaly day)
141
+ 2. **Root cause analysis** — detects the failing step and assigns a category
142
+ 3. **7-day pattern report** — failure rate per day + step breakdown + anomaly check
143
+ 4. **A/B report** — `prompt_v1` vs `prompt_v2` with per-step improvement table
144
+ 5. **Terminal dashboard** — live ASCII bar charts, trend, top 5 failed runs
145
+
146
+ ### 4. Run Unit Tests
147
+
148
+ ```bash
149
+ python tests/test_forensics.py
150
+ python tests/test_advanced.py
151
+ ```
152
+
153
+ ---
154
+
155
+ ## 🚀 Advanced Features (New in v2)
156
+
157
+ | Layer | Feature | Technology |
158
+ |-------|---------|------------|
159
+ | 1 | Automatic recommendation engine | Rule-based |
160
+ | 2 | AI-assisted failure analysis | Gemini 2.5 Pro |
161
+ | 3 | Automatic eval-set expansion | Frequency analysis |
162
+ | 4 | Prompt optimization explanation | Gemini 2.5 Pro |
163
+ | 5 | Regression guard | Baseline comparison |
164
+
165
+ **Scenario 6: Regression Guard**
166
+ Runs an automatic regression check before a new prompt (`v3`) is deployed:
167
+ ```
168
+ REGRESSION CHECK — v3
169
+ Baseline (v2): 11.0% failure rate
170
+ Yeni (v3): 24.5% failure rate
171
+ Delta: +13.5pp → REGRESSION_DETECTED ❌
172
+ ```
173
+
174
+ ## Test Results
175
+
176
+ | Layer | Test | Result |
177
+ |-------|------|--------|
178
+ | 1 — Recommender | Category → recommendation mapping | ✅ PASS |
179
+ | 2 — LLM Analyzer | Gemini fallback | ✅ PASS |
180
+ | 3 — Eval Collector | Duplicate prevention | ✅ PASS |
181
+ | 4 — Prompt Optimizer | A/B explanation (v2: +10pp) | ✅ PASS |
182
+ | 5 — Regression Guard | DETECTED + PASS scenarios | ✅ PASS |
183
+
184
+ ### Key Results
185
+ - A/B: prompt_v2 improves on v1 by 10pp
186
+ - Regression Guard: blocked the v3 deploy as a WARNING with a +6pp delta
187
+ - Eval Collector: 5 new eval candidates collected automatically
188
+ - LLM Analyzer: falls back seamlessly to rule-based analysis when Gemini is unavailable
189
+
190
+ ---
191
+
192
+ ## 📊 Results
193
+
194
+ | Feature | Result |
195
+ |---------|--------|
196
+ | Unit Tests | **8/8 PASS** ✅ |
197
+ | Root cause categories | **5 types** (RETRIEVAL_QUALITY, RERANKER_FAILURE, LLM_HALLUCINATION, CITATION_MISS, API_ERROR) |
198
+ | Anomaly detection | **20% delta threshold** — flags when today's rate exceeds 7-day average by >20pp |
199
+ | A/B comparison | **v2: 11.5pp improvement** over v1 (22.5% → 11.0% failure rate) |
200
+ | Trend analysis | **IMPROVING / STABLE / DEGRADING** based on 7-day moving average |
201
+ | Slack integration | **Webhook ready** — fires on rate threshold, anomaly, or 3 consecutive failures |
202
+
203
+ ---
204
+
205
+ ## ⚙️ Configuration (`config.py`)
206
+
207
+ | Parameter | Default | Description |
208
+ |-----------|---------|-------------|
209
+ | `FAILURE_RATE_THRESHOLD` | `0.25` | Alert fires above this failure rate |
210
+ | `ANOMALY_THRESHOLD` | `0.20` | Flag if today exceeds 7-day avg by this delta |
211
+ | `SLACK_WEBHOOK_URL` | `""` | Empty = console output |
212
+ | `CONSECUTIVE_FAILURE_THRESHOLD` | `3` | Alert after N consecutive step failures |
213
+ | `STEP_THRESHOLDS` | see config | Per-step max acceptable failure rate |
214
+
215
+ ---
216
+
217
+ ## 🧪 Root Cause Categories
218
+
219
+ | Category | Trigger |
220
+ |----------|---------|
221
+ | `RETRIEVAL_QUALITY` | Retrieval step fails — no results, low score |
222
+ | `RERANKER_FAILURE` | Reranker can't parse LLM response or times out |
223
+ | `LLM_HALLUCINATION` | Generation returns empty or uncited response |
224
+ | `CITATION_MISS` | Answer produced but no source citations found |
225
+ | `API_ERROR` | Timeout, 429 rate limit, 503 service unavailable |
226
+
227
+ ---
228
+
229
+ ## 📈 Terminal Dashboard (Sample Output)
230
+
231
+ ```
232
+ ═════════════════════════════════════════════════════════════
233
+ 🔬 FAILURE FORENSICS — Terminal Dashboard
234
+ ═════════════════════════════════════════════════════════════
235
+
236
+ 📅 SON 7 GÜNÜN FAILURE RATE GRAFİĞİ
237
+ 2026-06-03 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 13.0%
238
+ 2026-06-07 [████████░░░░░░░░░░░░░░░░░░░░░░] 27.3% ⚠️
239
+ 2026-06-10 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 12.0%
240
+
241
+ 🔍 ADIM BAZINDA HATA DAĞILIMI
242
+ retrieval [███████░░░░░░░░░░░░░] 38.0% (38/100 hatalı)
243
+ reranking [██░░░░░░░░░░░░░░░░░░] 13.0% (13/100 hatalı)
244
+ generation [██░░░░░░░░░░░░░░░░░░] 10.0% (10/100 hatalı)
245
+ citation [█░░░░░░░░░░░░░░░░░░░] 6.0% (6/100 hatalı)
246
+
247
+ ⚡ ANOMALİ: ✅ Normal: Bugün (12.0%) ≈ 7g ort. (16.2%)
248
+ 📊 TREND: ➡️ STABLE — Hareketli Ort: 16.0%
249
+ ```
250
+
251
+ ---
252
+
253
+ ## 🛠 Technologies Used
254
+
255
+ * **Python standard library** — `json`, `collections`, `datetime`, `threading`
256
+ * **[requests](https://pypi.org/project/requests/)** — Slack webhook HTTP calls
257
+ * **[python-dotenv](https://pypi.org/project/python-dotenv/)** — Environment variable management
258
+
259
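+ No heavy dependencies. No cloud. No API keys required for the core features. The optional Gemini-powered analysis relies on `google-generativeai` and falls back to rule-based analysis when Gemini is unavailable.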
260
+
261
+ ---
262
+
263
+ ## 🔭 Roadmap
264
+
265
+ - [ ] FastAPI REST endpoint for remote log ingestion
266
+ - [ ] HTML report export
267
+ - [ ] PostgreSQL backend for large-scale log storage
268
+ - [ ] Multi-pipeline support (compare RAG vs fine-tuned model)
269
+ - [ ] Email alerts as alternative to Slack
@@ -0,0 +1,255 @@
1
+ # Failure Forensics
2
+
3
+ Production AI pipeline monitoring — root cause detection, anomaly alerts, regression guard, and Gemini-powered recommendations.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install failure-forensics
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from failure_forensics import trace
15
+
16
+ @trace(step="retrieval", version="v1")
17
+ def my_retrieval_function(query):
18
+     # your code here
19
+     pass
20
+ ```
21
+
22
+ ## Features 🔬
23
+
24
+ ![Python](https://img.shields.io/badge/Python-3.9+-blue)
25
+ ![Tests](https://img.shields.io/badge/Tests-8%2F8_PASS-brightgreen)
26
+ ![License](https://img.shields.io/badge/License-MIT-lightgrey)
27
+ ![Alerts](https://img.shields.io/badge/Slack-Alerts_Ready-4A154B)
28
+
29
+ A **self-hosted, zero-cost** LLM pipeline observability tool that gives you root cause detection, anomaly alerts, A/B reporting, and a live terminal dashboard — without sending your data to any third-party service.
30
+
31
+ ---
32
+
33
+ ## 🆚 Why Not LangSmith or Braintrust?
34
+
35
+ | | **Failure Forensics** | LangSmith | Braintrust |
36
+ |---|---|---|---|
37
+ | Cost | **Free** | Paid tiers | Paid tiers |
38
+ | Data privacy | **Stays on your machine** | Sent to cloud | Sent to cloud |
39
+ | Customization | **Full control** | Limited | Limited |
40
+ | Slack alerts | **Built-in** | Premium only | Premium only |
41
+ | A/B reporting | **Built-in** | Basic | Basic |
42
+ | Circuit breaker / trend | **Built-in** | ❌ | ❌ |
43
+
44
+ **Failure Forensics is designed for teams who need production-grade observability without vendor lock-in.**
45
+
46
+ ---
47
+
48
+ ## ✨ What It Does
49
+
50
+ Every pipeline run passes through a structured logging and analysis layer:
51
+
52
+ ```
53
+ Pipeline Step → logger.py → requests.jsonl
54
+
55
+ ┌────────────────┴────────────────┐
56
+ │ │
57
+ forensics.py pattern.py
58
+ (root cause detection) (time series + anomaly)
59
+ │ │
60
+ versioning.py baseline.py
61
+ (v1 vs v2 comparison) (7-day moving average)
62
+ │ │
63
+ ab_report.py alerts.py
64
+ (A/B comparison table) (Slack / console alert)
65
+ └────────────────┬────────────────┘
66
+
67
+ dashboard.py
68
+ (ASCII terminal dashboard)
69
+ ```
70
+
71
+ ---
72
+
73
+ ## 📁 Project Structure
74
+
75
+ ```
76
+ failure-forensics/
77
+ ├── src/
78
+ │ ├── logger.py # Logs every pipeline step to JSONL
79
+ │ ├── forensics.py # Root cause detection (5 categories)
80
+ │ ├── pattern.py # Time-series failure rate + anomaly detection
81
+ │ ├── baseline.py # 7-day moving average + trend (IMPROVING/STABLE/DEGRADING)
82
+ │ ├── alerts.py # Slack webhook + console alerts
83
+ │ ├── versioning.py # Per-version failure rate stats
84
+ │ ├── ab_report.py # A/B comparison report (table + JSON)
85
+ │ └── dashboard.py # ASCII bar chart terminal dashboard
86
+ ├── data/
87
+ │ └── logs/
88
+ │ └── requests.jsonl # All pipeline logs (gitignored)
89
+ ├── tests/
90
+ │ └── test_forensics.py # 8 unit tests
91
+ ├── config.py # Thresholds, Slack URL, step limits
92
+ ├── main.py # 5-scenario demo runner
93
+ ├── simulate.py # Realistic test data generator (100 runs, anomaly day)
94
+ └── requirements.txt
95
+ ```
96
+
97
+ ---
98
+
99
+ ## 🚀 Getting Started
100
+
101
+ ### 1. Clone & Install
102
+
103
+ ```bash
104
+ git clone https://github.com/jasstt/failure-forensics.git
105
+ cd failure-forensics
106
+ pip install -r requirements.txt
107
+ ```
108
+
109
+ ### 2. (Optional) Configure Slack Alerts
110
+
111
+ Edit `config.py`:
112
+
113
+ ```python
114
+ SLACK_WEBHOOK_URL = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
115
+ ```
116
+
117
+ If left empty, all alerts print to the console.
118
+
119
+ ### 3. Run the Full Demo
120
+
121
+ ```bash
122
+ python main.py
123
+ ```
124
+
125
+ This runs 5 scenarios:
126
+ 1. **Simulation** — generates 100 realistic pipeline runs (2 prompt versions, anomaly day)
127
+ 2. **Root cause analysis** — detects the failing step and assigns a category
128
+ 3. **7-day pattern report** — failure rate per day + step breakdown + anomaly check
129
+ 4. **A/B report** — `prompt_v1` vs `prompt_v2` with per-step improvement table
130
+ 5. **Terminal dashboard** — live ASCII bar charts, trend, top 5 failed runs
131
+
132
+ ### 4. Run Unit Tests
133
+
134
+ ```bash
135
+ python tests/test_forensics.py
136
+ python tests/test_advanced.py
137
+ ```
138
+
139
+ ---
140
+
141
+ ## 🚀 Advanced Features (New in v2)
142
+
143
+ | Layer | Feature | Technology |
144
+ |-------|---------|------------|
145
+ | 1 | Automatic recommendation engine | Rule-based |
146
+ | 2 | AI-assisted failure analysis | Gemini 2.5 Pro |
147
+ | 3 | Automatic eval-set expansion | Frequency analysis |
148
+ | 4 | Prompt optimization explanation | Gemini 2.5 Pro |
149
+ | 5 | Regression guard | Baseline comparison |
150
+
151
+ **Scenario 6: Regression Guard**
152
+ Runs an automatic regression check before a new prompt (`v3`) is deployed:
153
+ ```
154
+ REGRESSION CHECK — v3
155
+ Baseline (v2): 11.0% failure rate
156
+ Yeni (v3): 24.5% failure rate
157
+ Delta: +13.5pp → REGRESSION_DETECTED ❌
158
+ ```
159
+
160
+ ## Test Results
161
+
162
+ | Layer | Test | Result |
163
+ |-------|------|--------|
164
+ | 1 — Recommender | Category → recommendation mapping | ✅ PASS |
165
+ | 2 — LLM Analyzer | Gemini fallback | ✅ PASS |
166
+ | 3 — Eval Collector | Duplicate prevention | ✅ PASS |
167
+ | 4 — Prompt Optimizer | A/B explanation (v2: +10pp) | ✅ PASS |
168
+ | 5 — Regression Guard | DETECTED + PASS scenarios | ✅ PASS |
169
+
170
+ ### Key Results
171
+ - A/B: prompt_v2 improves on v1 by 10pp
172
+ - Regression Guard: blocked the v3 deploy as a WARNING with a +6pp delta
173
+ - Eval Collector: 5 new eval candidates collected automatically
174
+ - LLM Analyzer: falls back seamlessly to rule-based analysis when Gemini is unavailable
175
+
176
+ ---
177
+
178
+ ## 📊 Results
179
+
180
+ | Feature | Result |
181
+ |---------|--------|
182
+ | Unit Tests | **8/8 PASS** ✅ |
183
+ | Root cause categories | **5 types** (RETRIEVAL_QUALITY, RERANKER_FAILURE, LLM_HALLUCINATION, CITATION_MISS, API_ERROR) |
184
+ | Anomaly detection | **20% delta threshold** — flags when today's rate exceeds 7-day average by >20pp |
185
+ | A/B comparison | **v2: 11.5pp improvement** over v1 (22.5% → 11.0% failure rate) |
186
+ | Trend analysis | **IMPROVING / STABLE / DEGRADING** based on 7-day moving average |
187
+ | Slack integration | **Webhook ready** — fires on rate threshold, anomaly, or 3 consecutive failures |
188
+
189
+ ---
190
+
191
+ ## ⚙️ Configuration (`config.py`)
192
+
193
+ | Parameter | Default | Description |
194
+ |-----------|---------|-------------|
195
+ | `FAILURE_RATE_THRESHOLD` | `0.25` | Alert fires above this failure rate |
196
+ | `ANOMALY_THRESHOLD` | `0.20` | Flag if today exceeds 7-day avg by this delta |
197
+ | `SLACK_WEBHOOK_URL` | `""` | Empty = console output |
198
+ | `CONSECUTIVE_FAILURE_THRESHOLD` | `3` | Alert after N consecutive step failures |
199
+ | `STEP_THRESHOLDS` | see config | Per-step max acceptable failure rate |
200
+
201
+ ---
202
+
203
+ ## 🧪 Root Cause Categories
204
+
205
+ | Category | Trigger |
206
+ |----------|---------|
207
+ | `RETRIEVAL_QUALITY` | Retrieval step fails — no results, low score |
208
+ | `RERANKER_FAILURE` | Reranker can't parse LLM response or times out |
209
+ | `LLM_HALLUCINATION` | Generation returns empty or uncited response |
210
+ | `CITATION_MISS` | Answer produced but no source citations found |
211
+ | `API_ERROR` | Timeout, 429 rate limit, 503 service unavailable |
212
+
213
+ ---
214
+
215
+ ## 📈 Terminal Dashboard (Sample Output)
216
+
217
+ ```
218
+ ═════════════════════════════════════════════════════════════
219
+ 🔬 FAILURE FORENSICS — Terminal Dashboard
220
+ ═════════════════════════════════════════════════════════════
221
+
222
+ 📅 SON 7 GÜNÜN FAILURE RATE GRAFİĞİ
223
+ 2026-06-03 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 13.0%
224
+ 2026-06-07 [████████░░░░░░░░░░░░░░░░░░░░░░] 27.3% ⚠️
225
+ 2026-06-10 [███░░░░░░░░░░░░░░░░░░░░░░░░░░░] 12.0%
226
+
227
+ 🔍 ADIM BAZINDA HATA DAĞILIMI
228
+ retrieval [███████░░░░░░░░░░░░░] 38.0% (38/100 hatalı)
229
+ reranking [██░░░░░░░░░░░░░░░░░░] 13.0% (13/100 hatalı)
230
+ generation [██░░░░░░░░░░░░░░░░░░] 10.0% (10/100 hatalı)
231
+ citation [█░░░░░░░░░░░░░░░░░░░] 6.0% (6/100 hatalı)
232
+
233
+ ⚡ ANOMALİ: ✅ Normal: Bugün (12.0%) ≈ 7g ort. (16.2%)
234
+ 📊 TREND: ➡️ STABLE — Hareketli Ort: 16.0%
235
+ ```
236
+
237
+ ---
238
+
239
+ ## 🛠 Technologies Used
240
+
241
+ * **Python standard library** — `json`, `collections`, `datetime`, `threading`
242
+ * **[requests](https://pypi.org/project/requests/)** — Slack webhook HTTP calls
243
+ * **[python-dotenv](https://pypi.org/project/python-dotenv/)** — Environment variable management
244
+
245
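+ No heavy dependencies. No cloud. No API keys required for the core features. The optional Gemini-powered analysis relies on `google-generativeai` and falls back to rule-based analysis when Gemini is unavailable.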
246
+
247
+ ---
248
+
249
+ ## 🔭 Roadmap
250
+
251
+ - [ ] FastAPI REST endpoint for remote log ingestion
252
+ - [ ] HTML report export
253
+ - [ ] PostgreSQL backend for large-scale log storage
254
+ - [ ] Multi-pipeline support (compare RAG vs fine-tuned model)
255
+ - [ ] Email alerts as alternative to Slack
@@ -0,0 +1,133 @@
1
+ {
2
+ "summary": {
3
+ "v1": "v1",
4
+ "v2": "v2",
5
+ "winner": "v2",
6
+ "v1_failure_rate": 0.25,
7
+ "v2_failure_rate": 0.15,
8
+ "improvement_pct": 10.0,
9
+ "v1_runs": 50,
10
+ "v2_runs": 50
11
+ },
12
+ "step_breakdown": {
13
+ "reranking": {
14
+ "v1": {
15
+ "total": 50,
16
+ "failed": 13,
17
+ "rate": 0.26
18
+ },
19
+ "v2": {
20
+ "total": 50,
21
+ "failed": 8,
22
+ "rate": 0.16
23
+ },
24
+ "delta_pct": 10.0,
25
+ "step_winner": "v2"
26
+ },
27
+ "citation": {
28
+ "v1": {
29
+ "total": 50,
30
+ "failed": 3,
31
+ "rate": 0.06
32
+ },
33
+ "v2": {
34
+ "total": 50,
35
+ "failed": 6,
36
+ "rate": 0.12
37
+ },
38
+ "delta_pct": -6.0,
39
+ "step_winner": "v1"
40
+ },
41
+ "generation": {
42
+ "v1": {
43
+ "total": 50,
44
+ "failed": 15,
45
+ "rate": 0.3
46
+ },
47
+ "v2": {
48
+ "total": 50,
49
+ "failed": 2,
50
+ "rate": 0.04
51
+ },
52
+ "delta_pct": 26.0,
53
+ "step_winner": "v2"
54
+ },
55
+ "retrieval": {
56
+ "v1": {
57
+ "total": 50,
58
+ "failed": 19,
59
+ "rate": 0.38
60
+ },
61
+ "v2": {
62
+ "total": 50,
63
+ "failed": 14,
64
+ "rate": 0.28
65
+ },
66
+ "delta_pct": 10.0,
67
+ "step_winner": "v2"
68
+ }
69
+ },
70
+ "raw_stats": {
71
+ "v1": {
72
+ "total_steps": 200,
73
+ "failed_steps": 50,
74
+ "failure_rate": 0.25,
75
+ "runs": 50,
76
+ "steps": {
77
+ "retrieval": {
78
+ "total": 50,
79
+ "failed": 19,
80
+ "rate": 0.38
81
+ },
82
+ "reranking": {
83
+ "total": 50,
84
+ "failed": 13,
85
+ "rate": 0.26
86
+ },
87
+ "generation": {
88
+ "total": 50,
89
+ "failed": 15,
90
+ "rate": 0.3
91
+ },
92
+ "citation": {
93
+ "total": 50,
94
+ "failed": 3,
95
+ "rate": 0.06
96
+ }
97
+ }
98
+ },
99
+ "v2": {
100
+ "total_steps": 200,
101
+ "failed_steps": 30,
102
+ "failure_rate": 0.15,
103
+ "runs": 50,
104
+ "steps": {
105
+ "retrieval": {
106
+ "total": 50,
107
+ "failed": 14,
108
+ "rate": 0.28
109
+ },
110
+ "reranking": {
111
+ "total": 50,
112
+ "failed": 8,
113
+ "rate": 0.16
114
+ },
115
+ "generation": {
116
+ "total": 50,
117
+ "failed": 2,
118
+ "rate": 0.04
119
+ },
120
+ "citation": {
121
+ "total": 50,
122
+ "failed": 6,
123
+ "rate": 0.12
124
+ }
125
+ }
126
+ }
127
+ },
128
+ "improvement_reason": {
129
+ "error": "Gemini not available",
130
+ "improvements": [],
131
+ "next_iteration": "Manuel inceleme gerekiyor."
132
+ }
133
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "2026-01-01": 0.1,
3
+ "2026-01-02": 0.15,
4
+ "2026-01-03": 0.2,
5
+ "2026-01-04": 0.25,
6
+ "2026-01-05": 0.3,
7
+ "2026-01-06": 0.35,
8
+ "2026-01-07": 0.4,
9
+ "2026-06-11": 0.2
10
+ }
@@ -0,0 +1 @@
1
+ {"query": "Aynı sorgu iki kere patladı", "failure_count": 3, "root_cause_category": "RETRIEVAL_QUALITY", "first_seen": "2026-06-11T01:53:53.961023+00:00", "last_seen": "2026-06-11T01:53:53.963734+00:00", "auto_add_eligible": true}