flow-doctor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flow_doctor-0.1.0/LICENSE +21 -0
- flow_doctor-0.1.0/PKG-INFO +365 -0
- flow_doctor-0.1.0/README.md +300 -0
- flow_doctor-0.1.0/flow_doctor/__init__.py +8 -0
- flow_doctor-0.1.0/flow_doctor/core/__init__.py +0 -0
- flow_doctor-0.1.0/flow_doctor/core/client.py +723 -0
- flow_doctor-0.1.0/flow_doctor/core/config.py +362 -0
- flow_doctor-0.1.0/flow_doctor/core/dedup.py +94 -0
- flow_doctor-0.1.0/flow_doctor/core/handler.py +105 -0
- flow_doctor-0.1.0/flow_doctor/core/models.py +126 -0
- flow_doctor-0.1.0/flow_doctor/core/rate_limiter.py +57 -0
- flow_doctor-0.1.0/flow_doctor/core/scrubber.py +114 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/__init__.py +13 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/agent_provider.py +180 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/context.py +200 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/git_context.py +94 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/knowledge_base.py +89 -0
- flow_doctor-0.1.0/flow_doctor/diagnosis/provider.py +130 -0
- flow_doctor-0.1.0/flow_doctor/digest/__init__.py +5 -0
- flow_doctor-0.1.0/flow_doctor/digest/generator.py +122 -0
- flow_doctor-0.1.0/flow_doctor/fix/__init__.py +1 -0
- flow_doctor-0.1.0/flow_doctor/fix/cli.py +439 -0
- flow_doctor-0.1.0/flow_doctor/fix/generator.py +96 -0
- flow_doctor-0.1.0/flow_doctor/fix/pr_creator.py +188 -0
- flow_doctor-0.1.0/flow_doctor/fix/prompts.py +85 -0
- flow_doctor-0.1.0/flow_doctor/fix/replay_store.py +41 -0
- flow_doctor-0.1.0/flow_doctor/fix/scope_guard.py +56 -0
- flow_doctor-0.1.0/flow_doctor/fix/validator.py +39 -0
- flow_doctor-0.1.0/flow_doctor/notify/__init__.py +0 -0
- flow_doctor-0.1.0/flow_doctor/notify/base.py +29 -0
- flow_doctor-0.1.0/flow_doctor/notify/email.py +114 -0
- flow_doctor-0.1.0/flow_doctor/notify/github.py +196 -0
- flow_doctor-0.1.0/flow_doctor/notify/slack.py +94 -0
- flow_doctor-0.1.0/flow_doctor/remediation/__init__.py +0 -0
- flow_doctor-0.1.0/flow_doctor/remediation/decision_gate.py +229 -0
- flow_doctor-0.1.0/flow_doctor/remediation/executor.py +279 -0
- flow_doctor-0.1.0/flow_doctor/remediation/playbook.py +100 -0
- flow_doctor-0.1.0/flow_doctor/storage/__init__.py +0 -0
- flow_doctor-0.1.0/flow_doctor/storage/base.py +89 -0
- flow_doctor-0.1.0/flow_doctor/storage/sqlite.py +540 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/PKG-INFO +365 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/SOURCES.txt +70 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/dependency_links.txt +1 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/entry_points.txt +2 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/requires.txt +24 -0
- flow_doctor-0.1.0/flow_doctor.egg-info/top_level.txt +1 -0
- flow_doctor-0.1.0/pyproject.toml +55 -0
- flow_doctor-0.1.0/setup.cfg +4 -0
- flow_doctor-0.1.0/tests/test_config.py +125 -0
- flow_doctor-0.1.0/tests/test_context_assembler.py +132 -0
- flow_doctor-0.1.0/tests/test_coverage_gaps.py +314 -0
- flow_doctor-0.1.0/tests/test_dedup.py +116 -0
- flow_doctor-0.1.0/tests/test_diagnosis_provider.py +165 -0
- flow_doctor-0.1.0/tests/test_digest.py +169 -0
- flow_doctor-0.1.0/tests/test_fix_cli.py +197 -0
- flow_doctor-0.1.0/tests/test_fix_generator.py +155 -0
- flow_doctor-0.1.0/tests/test_git_context.py +71 -0
- flow_doctor-0.1.0/tests/test_github_notifier.py +122 -0
- flow_doctor-0.1.0/tests/test_handler.py +239 -0
- flow_doctor-0.1.0/tests/test_knowledge_base.py +148 -0
- flow_doctor-0.1.0/tests/test_models.py +81 -0
- flow_doctor-0.1.0/tests/test_notifications.py +114 -0
- flow_doctor-0.1.0/tests/test_phase2_integration.py +251 -0
- flow_doctor-0.1.0/tests/test_pr_creator.py +113 -0
- flow_doctor-0.1.0/tests/test_rate_limiter.py +104 -0
- flow_doctor-0.1.0/tests/test_remediation_pipeline.py +618 -0
- flow_doctor-0.1.0/tests/test_replay_store.py +67 -0
- flow_doctor-0.1.0/tests/test_reporter.py +272 -0
- flow_doctor-0.1.0/tests/test_scope_guard.py +62 -0
- flow_doctor-0.1.0/tests/test_scrubber.py +81 -0
- flow_doctor-0.1.0/tests/test_storage.py +147 -0
- flow_doctor-0.1.0/tests/test_validator.py +78 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Brian McMahon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flow-doctor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pipeline error handler: capture, deduplicate, diagnose, and auto-fix failures.
|
|
5
|
+
Author: Brian McMahon
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Brian McMahon
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/cipher813/flow-doctor
|
|
29
|
+
Project-URL: Repository, https://github.com/cipher813/flow-doctor
|
|
30
|
+
Project-URL: Issues, https://github.com/cipher813/flow-doctor/issues
|
|
31
|
+
Keywords: error-handling,pipeline,monitoring,diagnosis,auto-fix,llm
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
41
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
42
|
+
Classifier: Topic :: System :: Monitoring
|
|
43
|
+
Requires-Python: >=3.9
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
License-File: LICENSE
|
|
46
|
+
Requires-Dist: pyyaml>=6.0
|
|
47
|
+
Provides-Extra: slack
|
|
48
|
+
Requires-Dist: requests>=2.28; extra == "slack"
|
|
49
|
+
Provides-Extra: diagnosis
|
|
50
|
+
Requires-Dist: anthropic>=0.40; extra == "diagnosis"
|
|
51
|
+
Provides-Extra: remediation
|
|
52
|
+
Requires-Dist: boto3>=1.26; extra == "remediation"
|
|
53
|
+
Provides-Extra: agent
|
|
54
|
+
Requires-Dist: claude-agent-sdk>=0.1; extra == "agent"
|
|
55
|
+
Requires-Dist: anthropic>=0.40; extra == "agent"
|
|
56
|
+
Provides-Extra: all
|
|
57
|
+
Requires-Dist: requests>=2.28; extra == "all"
|
|
58
|
+
Requires-Dist: anthropic>=0.40; extra == "all"
|
|
59
|
+
Requires-Dist: boto3>=1.26; extra == "all"
|
|
60
|
+
Provides-Extra: dev
|
|
61
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: requests>=2.28; extra == "dev"
|
|
63
|
+
Requires-Dist: anthropic>=0.40; extra == "dev"
|
|
64
|
+
Dynamic: license-file
|
|
65
|
+
|
|
66
|
+
# Flow Doctor
|
|
67
|
+
|
|
68
|
+
[PyPI version](https://pypi.org/project/flow-doctor/)
|
|
69
|
+
[Python versions](https://pypi.org/project/flow-doctor/)
|
|
70
|
+
[CI](https://github.com/cipher813/flow-doctor/actions)
|
|
71
|
+
[GitHub repository](https://github.com/cipher813/flow-doctor)
|
|
72
|
+
[License: MIT](LICENSE)
|
|
73
|
+
|
|
74
|
+
Pipeline error handler for Python. Captures exceptions, diagnoses root causes with LLMs, files GitHub issues, and generates fix PRs.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import logging
import flow_doctor
|
|
78
|
+
|
|
79
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
80
|
+
handler = flow_doctor.FlowDoctorHandler(fd, level=logging.WARNING)
|
|
81
|
+
logging.getLogger().addHandler(handler)
|
|
82
|
+
|
|
83
|
+
# Every WARNING+ log is now captured, deduplicated, diagnosed, and routed.
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## How It Works
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
Exception → Capture → Dedup → Diagnose (LLM) → GitHub Issue → Fix PR
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
1. **Capture** — exception, traceback, logs, and runtime context
|
|
93
|
+
2. **Dedup** — same error signature within cooldown window is suppressed
|
|
94
|
+
3. **Cascade** — if a declared upstream dependency also failed, tag the error as a cascade failure and skip diagnosis
|
|
95
|
+
4. **Diagnose** — check the knowledge base (free), then call Claude if rate limit allows
|
|
96
|
+
5. **Notify** — file a GitHub issue, send Slack/email (rate-limited with daily digest fallback)
|
|
97
|
+
6. **Fix** — human adds `flow-doctor:fix` label on the issue, triggering automated fix PR generation
|
|
98
|
+
|
|
99
|
+
## Installation
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install flow-doctor # core only
|
|
103
|
+
pip install "flow-doctor[diagnosis]" # + LLM diagnosis (anthropic SDK)
|
|
104
|
+
pip install "flow-doctor[diagnosis,remediation]" # + auto-remediation (boto3 for SSM/Step Functions)
|
|
105
|
+
pip install "flow-doctor[all]" # Slack + diagnosis + remediation (the agent extra is installed separately)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Quick Start
|
|
109
|
+
|
|
110
|
+
### Option 1: Logging handler (recommended)
|
|
111
|
+
|
|
112
|
+
Attach to Python's logging system. Zero changes at call sites — any `WARNING+` log triggers the full pipeline.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import logging
|
|
116
|
+
import flow_doctor
|
|
117
|
+
|
|
118
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
119
|
+
handler = flow_doctor.FlowDoctorHandler(fd, level=logging.WARNING)
|
|
120
|
+
logging.getLogger().addHandler(handler)
|
|
121
|
+
|
|
122
|
+
logger = logging.getLogger(__name__)
# These now trigger dedup, diagnosis, and notifications automatically:
|
|
123
|
+
logger.warning("Upstream data is 48h stale")
|
|
124
|
+
logger.error("S3 backup failed: AccessDenied")
|
|
125
|
+
logger.exception("Pipeline crashed")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
The handler is **non-blocking** — `emit()` enqueues work and returns immediately. A background thread calls `fd.report()` asynchronously.
|
|
129
|
+
|
|
130
|
+
### Option 2: Direct reporting
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
134
|
+
|
|
135
|
+
try:
|
|
136
|
+
run_pipeline()
|
|
137
|
+
except Exception as e:
|
|
138
|
+
fd.report(e) # never crashes the caller
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Option 3: Context manager / decorator
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
with fd.guard():
|
|
145
|
+
run_pipeline() # exceptions are reported and re-raised
|
|
146
|
+
|
|
147
|
+
@fd.monitor
|
|
148
|
+
def handler(event, context):
|
|
149
|
+
run_pipeline()
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Log capture
|
|
153
|
+
|
|
154
|
+
Attach recent logs to the next error report for richer diagnosis context:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
with fd.capture_logs(level=logging.INFO):
|
|
158
|
+
logger.info("Starting scan with 900 tickers...")
|
|
159
|
+
run_pipeline()
|
|
160
|
+
# All captured logs are attached to the next fd.report() call
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Configuration
|
|
164
|
+
|
|
165
|
+
Create a `flow-doctor.yaml` in your project root:
|
|
166
|
+
|
|
167
|
+
```yaml
|
|
168
|
+
flow_name: my-pipeline
|
|
169
|
+
repo: owner/repo
|
|
170
|
+
|
|
171
|
+
notify:
|
|
172
|
+
- type: github
|
|
173
|
+
repo: owner/repo
|
|
174
|
+
- type: email
|
|
175
|
+
sender: alerts@example.com
|
|
176
|
+
recipients: oncall@example.com
|
|
177
|
+
|
|
178
|
+
store:
|
|
179
|
+
type: sqlite
|
|
180
|
+
path: flow_doctor.db
|
|
181
|
+
|
|
182
|
+
diagnosis:
|
|
183
|
+
enabled: true
|
|
184
|
+
model: claude-sonnet-4-6-20250514
|
|
185
|
+
api_key: ${ANTHROPIC_API_KEY}
|
|
186
|
+
timeout_seconds: 30
|
|
187
|
+
max_daily_cost_usd: 1.00
|
|
188
|
+
|
|
189
|
+
github:
|
|
190
|
+
token: ${GITHUB_TOKEN}
|
|
191
|
+
labels: [flow-doctor]
|
|
192
|
+
|
|
193
|
+
rate_limits:
|
|
194
|
+
max_diagnosed_per_day: 3
|
|
195
|
+
max_issues_per_day: 3
|
|
196
|
+
dedup_cooldown_minutes: 60
|
|
197
|
+
|
|
198
|
+
dependencies:
|
|
199
|
+
- upstream-pipeline
|
|
200
|
+
|
|
201
|
+
remediation:
|
|
202
|
+
enabled: true
|
|
203
|
+
dry_run: true
|
|
204
|
+
auto_remediate_min_confidence: 0.9
|
|
205
|
+
market_hours_lockout: false
|
|
206
|
+
|
|
207
|
+
auto_fix:
|
|
208
|
+
enabled: true
|
|
209
|
+
confidence_threshold: 0.90
|
|
210
|
+
test_command: "python -m pytest tests/ -x -q"
|
|
211
|
+
scope:
|
|
212
|
+
allow: ["src/", "lib/"]
|
|
213
|
+
deny: ["*.yaml", "*.yml"]
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Environment variables in `${VAR}` syntax are resolved at load time.
|
|
217
|
+
|
|
218
|
+
Inline configuration (no YAML file):
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
fd = flow_doctor.init(
|
|
222
|
+
flow_name="my-pipeline",
|
|
223
|
+
repo="owner/repo",
|
|
224
|
+
store={"type": "sqlite", "path": "flow_doctor.db"},
|
|
225
|
+
notify=["github:owner/repo"],
|
|
226
|
+
)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Features
|
|
230
|
+
|
|
231
|
+
### Error Capture and Dedup
|
|
232
|
+
|
|
233
|
+
- Traceback extraction with frame-based signature hashing
|
|
234
|
+
- Configurable cooldown window (default 60 min) — same error is captured once, not spammed
|
|
235
|
+
- Cascade detection tags downstream failures caused by upstream dependency outages
|
|
236
|
+
- Automatic secret scrubbing (AWS keys, Bearer tokens, passwords in URLs)
|
|
237
|
+
|
|
238
|
+
### LLM Diagnosis
|
|
239
|
+
|
|
240
|
+
- Structured root cause analysis via Claude: category, confidence, affected files, remediation
|
|
241
|
+
- Six categories: `TRANSIENT`, `DATA`, `CODE`, `CONFIG`, `EXTERNAL`, `INFRA`
|
|
242
|
+
- Knowledge base caching — known patterns are matched for free before calling the LLM
|
|
243
|
+
- Git context assembly (recent commits, changed files) for better diagnosis accuracy
|
|
244
|
+
- Daily cost cap (default $1.00) and rate limiting (default 3 diagnoses/day)
|
|
245
|
+
|
|
246
|
+
### GitHub Issues
|
|
247
|
+
|
|
248
|
+
- Auto-filed with diagnosis, traceback, and captured logs
|
|
249
|
+
- Machine-readable metadata embedded in HTML comments for downstream automation
|
|
250
|
+
- Rate-limited with graceful degradation to daily digest
|
|
251
|
+
|
|
252
|
+
### Auto-Fix PRs
|
|
253
|
+
|
|
254
|
+
Human-in-the-loop: a human reviews the diagnosis, adds a `flow-doctor:fix` label, and a GitHub Actions workflow generates a validated fix PR.
|
|
255
|
+
|
|
256
|
+
1. An error occurs and Flow Doctor creates a GitHub issue with structured diagnosis
|
|
257
|
+
2. A human reviews the diagnosis and adds the `flow-doctor:fix` label
|
|
258
|
+
3. GitHub Actions triggers `flow-doctor generate-fix`
|
|
259
|
+
4. The CLI generates a diff via LLM, validates against scope rules, runs tests
|
|
260
|
+
5. If tests pass, a PR is opened. If tests fail, a comment explains what went wrong.
|
|
261
|
+
|
|
262
|
+
**Safety gates** — fix generation is skipped when:
|
|
263
|
+
- Confidence below threshold (default 90%)
|
|
264
|
+
- Category is `EXTERNAL` or `INFRA` (nothing to fix in code)
|
|
265
|
+
- Config issue involves credentials/secrets
|
|
266
|
+
- Generated diff touches files outside configured scope
|
|
267
|
+
- Tests fail after applying the fix
|
|
268
|
+
|
|
269
|
+
### Remediation Playbooks
|
|
270
|
+
|
|
271
|
+
Define patterns that map failure signatures to automated actions:
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from flow_doctor.remediation.playbook import Playbook, PlaybookPattern, RemediationAction, RemediationType
|
|
275
|
+
|
|
276
|
+
my_playbook = Playbook(patterns=[
|
|
277
|
+
PlaybookPattern(
|
|
278
|
+
name="service_down",
|
|
279
|
+
description="App service not responding",
|
|
280
|
+
category="INFRA",
|
|
281
|
+
message_pattern=r"(connection refused|service unavailable)",
|
|
282
|
+
action=RemediationAction(
|
|
283
|
+
action_type=RemediationType.RESTART_SERVICE,
|
|
284
|
+
description="Restart the app service",
|
|
285
|
+
commands=["sudo systemctl restart myapp"],
|
|
286
|
+
ssm_target="app-server",
|
|
287
|
+
),
|
|
288
|
+
),
|
|
289
|
+
])
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
### Notifications
|
|
293
|
+
|
|
294
|
+
- **GitHub issues** — primary notification with full diagnosis
|
|
295
|
+
- **Slack** — webhook-based alerts with severity emoji and diagnosis snippet
|
|
296
|
+
- **Email** — SMTP with detailed body (traceback, diagnosis, affected files)
|
|
297
|
+
- **Daily digest** — summarizes rate-limited/suppressed errors at end of day
|
|
298
|
+
|
|
299
|
+
## Auto-Fix CLI
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
flow-doctor generate-fix \
|
|
303
|
+
--issue-number 42 \
|
|
304
|
+
--repo owner/repo \
|
|
305
|
+
--token $GITHUB_TOKEN \
|
|
306
|
+
--config flow-doctor.yaml \
|
|
307
|
+
--dry-run
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
GitHub Actions workflow (copy to your repo at `.github/workflows/flow-doctor-fix.yml`):
|
|
311
|
+
|
|
312
|
+
```yaml
|
|
313
|
+
name: Flow Doctor Fix
|
|
314
|
+
on:
|
|
315
|
+
issues:
|
|
316
|
+
types: [labeled]
|
|
317
|
+
jobs:
|
|
318
|
+
generate-fix:
|
|
319
|
+
if: github.event.label.name == 'flow-doctor:fix'
|
|
320
|
+
runs-on: ubuntu-latest
|
|
321
|
+
steps:
|
|
322
|
+
- uses: actions/checkout@v4
|
|
323
|
+
- uses: actions/setup-python@v5
|
|
324
|
+
with:
|
|
325
|
+
python-version: '3.11'
|
|
326
|
+
- run: pip install flow-doctor[diagnosis]
|
|
327
|
+
- run: |
|
|
328
|
+
python -m flow_doctor.fix.cli generate-fix \
|
|
329
|
+
--issue-number ${{ github.event.issue.number }} \
|
|
330
|
+
--repo ${{ github.repository }} \
|
|
331
|
+
--token $GITHUB_TOKEN
|
|
332
|
+
env:
|
|
333
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
334
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Architecture
|
|
338
|
+
|
|
339
|
+
```
|
|
340
|
+
flow_doctor/
|
|
341
|
+
core/ # Client, config, models, dedup, rate limiting, scrubber, logging handler
|
|
342
|
+
diagnosis/ # LLM provider, context assembly, knowledge base, git context
|
|
343
|
+
digest/ # Daily digest generator
|
|
344
|
+
fix/ # Auto-fix: LLM generator, scope guard, test validator, PR creator, CLI
|
|
345
|
+
notify/ # Slack, email, GitHub issue backends
|
|
346
|
+
remediation/ # Decision gate, executor, playbook patterns
|
|
347
|
+
storage/ # SQLite backend (thread-safe, per-thread connections)
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## Development
|
|
351
|
+
|
|
352
|
+
```bash
|
|
353
|
+
git clone https://github.com/cipher813/flow-doctor.git
|
|
354
|
+
cd flow-doctor
|
|
355
|
+
python -m venv .venv && source .venv/bin/activate
|
|
356
|
+
pip install -e ".[dev]"
|
|
357
|
+
|
|
358
|
+
python -m pytest tests/ -x -q # 212 tests
|
|
359
|
+
python -m pytest tests/ --cov=flow_doctor # coverage report
|
|
360
|
+
python examples/smoke_test.py # end-to-end smoke test
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
## License
|
|
364
|
+
|
|
365
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
# Flow Doctor
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/flow-doctor/)
|
|
4
|
+
[Python versions](https://pypi.org/project/flow-doctor/)
|
|
5
|
+
[CI](https://github.com/cipher813/flow-doctor/actions)
|
|
6
|
+
[GitHub repository](https://github.com/cipher813/flow-doctor)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
Pipeline error handler for Python. Captures exceptions, diagnoses root causes with LLMs, files GitHub issues, and generates fix PRs.
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
import logging
import flow_doctor
|
|
13
|
+
|
|
14
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
15
|
+
handler = flow_doctor.FlowDoctorHandler(fd, level=logging.WARNING)
|
|
16
|
+
logging.getLogger().addHandler(handler)
|
|
17
|
+
|
|
18
|
+
# Every WARNING+ log is now captured, deduplicated, diagnosed, and routed.
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## How It Works
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
Exception → Capture → Dedup → Diagnose (LLM) → GitHub Issue → Fix PR
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
1. **Capture** — exception, traceback, logs, and runtime context
|
|
28
|
+
2. **Dedup** — same error signature within cooldown window is suppressed
|
|
29
|
+
3. **Cascade** — if a declared upstream dependency also failed, tag the error as a cascade failure and skip diagnosis
|
|
30
|
+
4. **Diagnose** — check the knowledge base (free), then call Claude if rate limit allows
|
|
31
|
+
5. **Notify** — file a GitHub issue, send Slack/email (rate-limited with daily digest fallback)
|
|
32
|
+
6. **Fix** — human adds `flow-doctor:fix` label on the issue, triggering automated fix PR generation
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install flow-doctor # core only
|
|
38
|
+
pip install "flow-doctor[diagnosis]" # + LLM diagnosis (anthropic SDK)
|
|
39
|
+
pip install "flow-doctor[diagnosis,remediation]" # + auto-remediation (boto3 for SSM/Step Functions)
|
|
40
|
+
pip install "flow-doctor[all]" # Slack + diagnosis + remediation (the agent extra is installed separately)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
### Option 1: Logging handler (recommended)
|
|
46
|
+
|
|
47
|
+
Attach to Python's logging system. Zero changes at call sites — any `WARNING+` log triggers the full pipeline.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import logging
|
|
51
|
+
import flow_doctor
|
|
52
|
+
|
|
53
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
54
|
+
handler = flow_doctor.FlowDoctorHandler(fd, level=logging.WARNING)
|
|
55
|
+
logging.getLogger().addHandler(handler)
|
|
56
|
+
|
|
57
|
+
logger = logging.getLogger(__name__)
# These now trigger dedup, diagnosis, and notifications automatically:
|
|
58
|
+
logger.warning("Upstream data is 48h stale")
|
|
59
|
+
logger.error("S3 backup failed: AccessDenied")
|
|
60
|
+
logger.exception("Pipeline crashed")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The handler is **non-blocking** — `emit()` enqueues work and returns immediately. A background thread calls `fd.report()` asynchronously.
|
|
64
|
+
|
|
65
|
+
### Option 2: Direct reporting
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
fd = flow_doctor.init(config_path="flow-doctor.yaml")
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
run_pipeline()
|
|
72
|
+
except Exception as e:
|
|
73
|
+
fd.report(e) # never crashes the caller
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Option 3: Context manager / decorator
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
with fd.guard():
|
|
80
|
+
run_pipeline() # exceptions are reported and re-raised
|
|
81
|
+
|
|
82
|
+
@fd.monitor
|
|
83
|
+
def handler(event, context):
|
|
84
|
+
run_pipeline()
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Log capture
|
|
88
|
+
|
|
89
|
+
Attach recent logs to the next error report for richer diagnosis context:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
with fd.capture_logs(level=logging.INFO):
|
|
93
|
+
logger.info("Starting scan with 900 tickers...")
|
|
94
|
+
run_pipeline()
|
|
95
|
+
# All captured logs are attached to the next fd.report() call
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Configuration
|
|
99
|
+
|
|
100
|
+
Create a `flow-doctor.yaml` in your project root:
|
|
101
|
+
|
|
102
|
+
```yaml
|
|
103
|
+
flow_name: my-pipeline
|
|
104
|
+
repo: owner/repo
|
|
105
|
+
|
|
106
|
+
notify:
|
|
107
|
+
- type: github
|
|
108
|
+
repo: owner/repo
|
|
109
|
+
- type: email
|
|
110
|
+
sender: alerts@example.com
|
|
111
|
+
recipients: oncall@example.com
|
|
112
|
+
|
|
113
|
+
store:
|
|
114
|
+
type: sqlite
|
|
115
|
+
path: flow_doctor.db
|
|
116
|
+
|
|
117
|
+
diagnosis:
|
|
118
|
+
enabled: true
|
|
119
|
+
model: claude-sonnet-4-6-20250514
|
|
120
|
+
api_key: ${ANTHROPIC_API_KEY}
|
|
121
|
+
timeout_seconds: 30
|
|
122
|
+
max_daily_cost_usd: 1.00
|
|
123
|
+
|
|
124
|
+
github:
|
|
125
|
+
token: ${GITHUB_TOKEN}
|
|
126
|
+
labels: [flow-doctor]
|
|
127
|
+
|
|
128
|
+
rate_limits:
|
|
129
|
+
max_diagnosed_per_day: 3
|
|
130
|
+
max_issues_per_day: 3
|
|
131
|
+
dedup_cooldown_minutes: 60
|
|
132
|
+
|
|
133
|
+
dependencies:
|
|
134
|
+
- upstream-pipeline
|
|
135
|
+
|
|
136
|
+
remediation:
|
|
137
|
+
enabled: true
|
|
138
|
+
dry_run: true
|
|
139
|
+
auto_remediate_min_confidence: 0.9
|
|
140
|
+
market_hours_lockout: false
|
|
141
|
+
|
|
142
|
+
auto_fix:
|
|
143
|
+
enabled: true
|
|
144
|
+
confidence_threshold: 0.90
|
|
145
|
+
test_command: "python -m pytest tests/ -x -q"
|
|
146
|
+
scope:
|
|
147
|
+
allow: ["src/", "lib/"]
|
|
148
|
+
deny: ["*.yaml", "*.yml"]
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Environment variables in `${VAR}` syntax are resolved at load time.
|
|
152
|
+
|
|
153
|
+
Inline configuration (no YAML file):
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
fd = flow_doctor.init(
|
|
157
|
+
flow_name="my-pipeline",
|
|
158
|
+
repo="owner/repo",
|
|
159
|
+
store={"type": "sqlite", "path": "flow_doctor.db"},
|
|
160
|
+
notify=["github:owner/repo"],
|
|
161
|
+
)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Features
|
|
165
|
+
|
|
166
|
+
### Error Capture and Dedup
|
|
167
|
+
|
|
168
|
+
- Traceback extraction with frame-based signature hashing
|
|
169
|
+
- Configurable cooldown window (default 60 min) — same error is captured once, not spammed
|
|
170
|
+
- Cascade detection tags downstream failures caused by upstream dependency outages
|
|
171
|
+
- Automatic secret scrubbing (AWS keys, Bearer tokens, passwords in URLs)
|
|
172
|
+
|
|
173
|
+
### LLM Diagnosis
|
|
174
|
+
|
|
175
|
+
- Structured root cause analysis via Claude: category, confidence, affected files, remediation
|
|
176
|
+
- Six categories: `TRANSIENT`, `DATA`, `CODE`, `CONFIG`, `EXTERNAL`, `INFRA`
|
|
177
|
+
- Knowledge base caching — known patterns are matched for free before calling the LLM
|
|
178
|
+
- Git context assembly (recent commits, changed files) for better diagnosis accuracy
|
|
179
|
+
- Daily cost cap (default $1.00) and rate limiting (default 3 diagnoses/day)
|
|
180
|
+
|
|
181
|
+
### GitHub Issues
|
|
182
|
+
|
|
183
|
+
- Auto-filed with diagnosis, traceback, and captured logs
|
|
184
|
+
- Machine-readable metadata embedded in HTML comments for downstream automation
|
|
185
|
+
- Rate-limited with graceful degradation to daily digest
|
|
186
|
+
|
|
187
|
+
### Auto-Fix PRs
|
|
188
|
+
|
|
189
|
+
Human-in-the-loop: a human reviews the diagnosis, adds a `flow-doctor:fix` label, and a GitHub Actions workflow generates a validated fix PR.
|
|
190
|
+
|
|
191
|
+
1. An error occurs and Flow Doctor creates a GitHub issue with structured diagnosis
|
|
192
|
+
2. A human reviews the diagnosis and adds the `flow-doctor:fix` label
|
|
193
|
+
3. GitHub Actions triggers `flow-doctor generate-fix`
|
|
194
|
+
4. The CLI generates a diff via LLM, validates against scope rules, runs tests
|
|
195
|
+
5. If tests pass, a PR is opened. If tests fail, a comment explains what went wrong.
|
|
196
|
+
|
|
197
|
+
**Safety gates** — fix generation is skipped when:
|
|
198
|
+
- Confidence below threshold (default 90%)
|
|
199
|
+
- Category is `EXTERNAL` or `INFRA` (nothing to fix in code)
|
|
200
|
+
- Config issue involves credentials/secrets
|
|
201
|
+
- Generated diff touches files outside configured scope
|
|
202
|
+
- Tests fail after applying the fix
|
|
203
|
+
|
|
204
|
+
### Remediation Playbooks
|
|
205
|
+
|
|
206
|
+
Define patterns that map failure signatures to automated actions:
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from flow_doctor.remediation.playbook import Playbook, PlaybookPattern, RemediationAction, RemediationType
|
|
210
|
+
|
|
211
|
+
my_playbook = Playbook(patterns=[
|
|
212
|
+
PlaybookPattern(
|
|
213
|
+
name="service_down",
|
|
214
|
+
description="App service not responding",
|
|
215
|
+
category="INFRA",
|
|
216
|
+
message_pattern=r"(connection refused|service unavailable)",
|
|
217
|
+
action=RemediationAction(
|
|
218
|
+
action_type=RemediationType.RESTART_SERVICE,
|
|
219
|
+
description="Restart the app service",
|
|
220
|
+
commands=["sudo systemctl restart myapp"],
|
|
221
|
+
ssm_target="app-server",
|
|
222
|
+
),
|
|
223
|
+
),
|
|
224
|
+
])
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Notifications
|
|
228
|
+
|
|
229
|
+
- **GitHub issues** — primary notification with full diagnosis
|
|
230
|
+
- **Slack** — webhook-based alerts with severity emoji and diagnosis snippet
|
|
231
|
+
- **Email** — SMTP with detailed body (traceback, diagnosis, affected files)
|
|
232
|
+
- **Daily digest** — summarizes rate-limited/suppressed errors at end of day
|
|
233
|
+
|
|
234
|
+
## Auto-Fix CLI
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
flow-doctor generate-fix \
|
|
238
|
+
--issue-number 42 \
|
|
239
|
+
--repo owner/repo \
|
|
240
|
+
--token $GITHUB_TOKEN \
|
|
241
|
+
--config flow-doctor.yaml \
|
|
242
|
+
--dry-run
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
GitHub Actions workflow (copy to your repo at `.github/workflows/flow-doctor-fix.yml`):
|
|
246
|
+
|
|
247
|
+
```yaml
|
|
248
|
+
name: Flow Doctor Fix
|
|
249
|
+
on:
|
|
250
|
+
issues:
|
|
251
|
+
types: [labeled]
|
|
252
|
+
jobs:
|
|
253
|
+
generate-fix:
|
|
254
|
+
if: github.event.label.name == 'flow-doctor:fix'
|
|
255
|
+
runs-on: ubuntu-latest
|
|
256
|
+
steps:
|
|
257
|
+
- uses: actions/checkout@v4
|
|
258
|
+
- uses: actions/setup-python@v5
|
|
259
|
+
with:
|
|
260
|
+
python-version: '3.11'
|
|
261
|
+
- run: pip install flow-doctor[diagnosis]
|
|
262
|
+
- run: |
|
|
263
|
+
python -m flow_doctor.fix.cli generate-fix \
|
|
264
|
+
--issue-number ${{ github.event.issue.number }} \
|
|
265
|
+
--repo ${{ github.repository }} \
|
|
266
|
+
--token $GITHUB_TOKEN
|
|
267
|
+
env:
|
|
268
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
269
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Architecture
|
|
273
|
+
|
|
274
|
+
```
|
|
275
|
+
flow_doctor/
|
|
276
|
+
core/ # Client, config, models, dedup, rate limiting, scrubber, logging handler
|
|
277
|
+
diagnosis/ # LLM provider, context assembly, knowledge base, git context
|
|
278
|
+
digest/ # Daily digest generator
|
|
279
|
+
fix/ # Auto-fix: LLM generator, scope guard, test validator, PR creator, CLI
|
|
280
|
+
notify/ # Slack, email, GitHub issue backends
|
|
281
|
+
remediation/ # Decision gate, executor, playbook patterns
|
|
282
|
+
storage/ # SQLite backend (thread-safe, per-thread connections)
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
## Development
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
git clone https://github.com/cipher813/flow-doctor.git
|
|
289
|
+
cd flow-doctor
|
|
290
|
+
python -m venv .venv && source .venv/bin/activate
|
|
291
|
+
pip install -e ".[dev]"
|
|
292
|
+
|
|
293
|
+
python -m pytest tests/ -x -q # 212 tests
|
|
294
|
+
python -m pytest tests/ --cov=flow_doctor # coverage report
|
|
295
|
+
python examples/smoke_test.py # end-to-end smoke test
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
## License
|
|
299
|
+
|
|
300
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Flow Doctor -- call-site error handler for pipeline reliability."""
|
|
2
|
+
|
|
3
|
+
from flow_doctor.core.client import FlowDoctor, init
|
|
4
|
+
from flow_doctor.core.handler import FlowDoctorHandler
|
|
5
|
+
from flow_doctor.core.models import Severity
|
|
6
|
+
|
|
7
|
+
__all__ = ["FlowDoctor", "FlowDoctorHandler", "Severity", "init"]
|
|
8
|
+
__version__ = "0.1.0"
|
|
File without changes
|