async-runtime-auditor 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- async_auditor/__init__.py +0 -0
- async_auditor/cli.py +350 -0
- async_auditor/default_metrics.yaml +21 -0
- async_runtime_auditor-0.1.0.dist-info/METADATA +261 -0
- async_runtime_auditor-0.1.0.dist-info/RECORD +9 -0
- async_runtime_auditor-0.1.0.dist-info/WHEEL +5 -0
- async_runtime_auditor-0.1.0.dist-info/entry_points.txt +2 -0
- async_runtime_auditor-0.1.0.dist-info/licenses/LICENSE +21 -0
- async_runtime_auditor-0.1.0.dist-info/top_level.txt +1 -0
|
File without changes
|
async_auditor/cli.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import math
|
|
4
|
+
import requests
|
|
5
|
+
import shutil
|
|
6
|
+
import sys
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
# ---------------------------------------------------------
|
|
12
|
+
# DEFAULT CONFIG PATH (Used ONLY for --init)
|
|
13
|
+
# ---------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
# Template shipped next to this module; init_template_config() copies it
# into the working directory as metrics.yaml.
DEFAULT_CONFIG_PATH = Path(__file__).with_name("default_metrics.yaml")
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------
|
|
20
|
+
# CLI ARGUMENT PARSER
|
|
21
|
+
# ---------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
def parse_args(argv=None):
    """Parse the command-line options of the ``async-auditor`` CLI.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the console entry point uses. Passing an explicit list makes
            the parser usable from other code without touching ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``config``, ``target``,
        ``output_format``, ``fail_on_critical`` and ``init``.
    """
    parser = argparse.ArgumentParser(
        prog="async-auditor",
        description=(
            "Runtime audit CLI for detecting asyncio "
            "event-loop starvation and threadpool saturation."
        ),
    )

    parser.add_argument(
        "--config",
        type=str,
        # No default path here: a missing config is resolved (or rejected)
        # later instead of silently falling back to a packaged file.
        default=None,
        help="Path to metrics configuration YAML",
    )

    parser.add_argument(
        "--target",
        type=str,
        default=None,
        help="Override Prometheus URL target",
    )

    parser.add_argument(
        "--output-format",
        type=str,
        choices=["text", "json"],
        default="text",
        help="Output format",
    )

    parser.add_argument(
        "--fail-on-critical",
        action="store_true",
        help=(
            "Exit with code 1 if runtime "
            "state is CRITICAL or COLLAPSE_RISK"
        ),
    )

    parser.add_argument(
        "--init",
        action="store_true",
        help="Generate a template metrics.yaml in the current directory",
    )

    return parser.parse_args(argv)
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------
|
|
73
|
+
# CONFIG PATH RESOLUTION
|
|
74
|
+
# ---------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
def resolve_config_path(explicit_config, output_format):
    """Decide which metrics configuration file the audit will use.

    Resolution order:
      1. ``explicit_config`` when it is truthy (returned as-is, existence is
         not checked here).
      2. ``metrics.yaml`` in the current working directory, if it exists.
      3. Otherwise the process is terminated; there is no packaged fallback.

    Args:
        explicit_config: Value of ``--config`` or ``None``.
        output_format: ``"text"`` or ``"json"``; selects the stream used for
            the fatal diagnostic.

    Returns:
        pathlib.Path of the configuration file to load.

    Raises:
        SystemExit: with code 1 when no configuration can be found.
    """
    # 1. Explicit override wins.
    if explicit_config:
        return Path(explicit_config)

    # 2. Conventional file in the working directory.
    local_config = Path.cwd() / "metrics.yaml"
    if local_config.exists():
        return local_config

    # 3. Hard fail. Text mode keeps reporting on stdout; in JSON mode stdout
    # is reserved for the report, so the diagnostic goes to stderr rather
    # than being dropped (previously the process exited 1 with no output).
    stream = sys.stdout if output_format == "text" else sys.stderr
    print(
        "[FATAL] No metrics configuration found.\n"
        "Run:\n"
        " async-auditor --init\n"
        "or provide explicit path:\n"
        " async-auditor --config <path>",
        file=stream,
    )
    sys.exit(1)
|
|
97
|
+
|
|
98
|
+
# ---------------------------------------------------------
|
|
99
|
+
# CLI UX: INIT CONFIG
|
|
100
|
+
# ---------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
def init_template_config():
    """Write a starter ``metrics.yaml`` into the current directory and exit.

    Refuses to overwrite an existing file. The function never returns: it
    exits with code 0 after a successful copy and with code 1 when the file
    already exists or the copy fails.
    """
    destination = Path.cwd().joinpath("metrics.yaml")

    # Guard: never clobber a config the user already has.
    if destination.exists():
        print(f"[ERROR] {destination.name} already exists in this directory.")
        sys.exit(1)

    try:
        shutil.copy(DEFAULT_CONFIG_PATH, destination)
        print(f"Successfully generated template: {destination.absolute()}")
    except Exception as e:
        print(f"[FATAL] Could not generate template: {e}")
        sys.exit(1)
    else:
        sys.exit(0)
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------
|
|
118
|
+
# PRE-FLIGHT TELEMETRY CHECK
|
|
119
|
+
# ---------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def verify_telemetry_backend(prometheus_url, output_format):
    """Abort the run unless the telemetry backend's health endpoint responds.

    Issues a GET to ``<prometheus_url>/-/healthy`` with a 5 second timeout.
    Any exception raised by the request or by ``raise_for_status()`` is
    treated as fatal.

    Args:
        prometheus_url: Base URL of the backend. It is used verbatim; no
            trailing slash is added or removed here.
        output_format: ``"text"`` or ``"json"``; selects the stream used for
            the fatal diagnostic.

    Raises:
        SystemExit: with code 1 when the health check fails.
    """
    health_endpoint = f"{prometheus_url}/-/healthy"
    try:
        response = requests.get(health_endpoint, timeout=5)
        response.raise_for_status()
    except Exception as e:
        # Text mode keeps reporting on stdout. In JSON mode stdout is
        # reserved for the report, so the diagnostic goes to stderr instead
        # of being dropped (previously the process exited 1 with no output).
        stream = sys.stdout if output_format == "text" else sys.stderr
        print(f"[FATAL] Telemetry backend unreachable at {health_endpoint}", file=stream)
        print(f"Error: {e}", file=stream)
        print("Cannot audit runtime state without observability data.", file=stream)
        sys.exit(1)
|
|
132
|
+
|
|
133
|
+
# ---------------------------------------------------------
|
|
134
|
+
# SAFE NUMERIC HANDLING
|
|
135
|
+
# ---------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
def sanitize_metric(value):
    """Return ``value`` unchanged when it is a finite number, else ``0.0``.

    ``None``, NaN and positive/negative infinity are all mapped to ``0.0`` so
    downstream arithmetic and comparisons never see a non-finite operand.
    """
    if value is not None and math.isfinite(value):
        return value
    return 0.0
|
|
141
|
+
|
|
142
|
+
# ---------------------------------------------------------
|
|
143
|
+
# CONFIGURATION LOADER
|
|
144
|
+
# ---------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
def load_config(config_path, output_format):
    """Load the metrics configuration YAML and return it as a dict.

    Args:
        config_path: Path of the YAML file to read.
        output_format: ``"text"`` or ``"json"``; selects the stream used for
            the fatal diagnostic.

    Returns:
        The parsed configuration mapping.

    Raises:
        SystemExit: with code 1 when the file cannot be read or parsed, is
            empty, or does not contain a mapping at the top level.
    """
    try:
        # Explicit encoding keeps parsing independent of the platform locale.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        if config is None:
            raise ValueError("metrics config is empty")
        # Callers use config.get(...); reject lists/scalars here with a clear
        # message instead of failing later with an AttributeError.
        if not isinstance(config, dict):
            raise ValueError(
                f"metrics config must be a YAML mapping, got {type(config).__name__}"
            )
        return config
    except Exception as e:
        # Text mode keeps reporting on stdout. In JSON mode stdout is
        # reserved for the report, so the diagnostic goes to stderr instead
        # of being dropped (previously the process exited 1 with no output).
        stream = sys.stdout if output_format == "text" else sys.stderr
        print(f"[FATAL] Failed to load config {config_path}: {e}", file=stream)
        sys.exit(1)
|
|
157
|
+
|
|
158
|
+
# ---------------------------------------------------------
|
|
159
|
+
# PROMETHEUS QUERY HELPER
|
|
160
|
+
# ---------------------------------------------------------
|
|
161
|
+
|
|
162
|
+
def query_prometheus(query_key, queries, query_endpoint, output_format, config_path):
    """Run one configured instant query and return its first sample as a float.

    Args:
        query_key: Name of the query to look up in ``queries``.
        queries: Mapping of query name to query expression.
        query_endpoint: Full URL of the query API endpoint.
        output_format: Warnings/errors are printed only when this is "text".
        config_path: Config file path, used only in the warning message.

    Returns:
        The value of the first series in the response passed through
        ``sanitize_metric``. ``0.0`` is returned when the key is missing or
        empty, when the result set is empty, or when the request or response
        parsing raises any exception.
    """
    verbose = output_format == "text"

    expression = queries.get(query_key)
    if not expression:
        if verbose:
            print(f"[WARNING] Query key '{query_key}' not found in {config_path}")
        return 0.0

    try:
        reply = requests.get(query_endpoint, params={"query": expression}, timeout=10)
        reply.raise_for_status()
        series = reply.json()["data"]["result"]

        if series:
            # Each sample is a [timestamp, value] pair; index 1 is the value.
            sample = float(series[0]["value"][1])
            return sanitize_metric(sample)
        return 0.0

    except Exception as e:
        if verbose:
            print(f"[ERROR] Prometheus query failed for '{query_key}': {e}")
        return 0.0
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------
|
|
190
|
+
# RUNTIME HEALTH CLASSIFICATION
|
|
191
|
+
# ---------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def classify_runtime_health(score, thresholds):
    """Map a runtime health score onto a status label.

    The score is sanitized first, then compared against the thresholds from
    most to least severe; the first threshold it meets or exceeds wins.

    Args:
        score: Numeric health score (higher is worse).
        thresholds: Mapping that may override ``health_score_collapse``
            (default 2500), ``health_score_critical`` (default 1200) and
            ``health_score_degraded`` (default 400).

    Returns:
        One of "COLLAPSE_RISK", "CRITICAL", "DEGRADED" or "HEALTHY".
    """
    score = sanitize_metric(score)

    # Ordered from most to least severe: (threshold key, default, label).
    ladder = (
        ("health_score_collapse", 2500, "COLLAPSE_RISK"),
        ("health_score_critical", 1200, "CRITICAL"),
        ("health_score_degraded", 400, "DEGRADED"),
    )
    for key, fallback, label in ladder:
        if score >= thresholds.get(key, fallback):
            return label
    return "HEALTHY"
|
|
199
|
+
|
|
200
|
+
# ---------------------------------------------------------
|
|
201
|
+
# FAILURE MODE CLASSIFICATION
|
|
202
|
+
# ---------------------------------------------------------
|
|
203
|
+
|
|
204
|
+
def classify_failure_modes(max_lag, avg_blocking_duration, peak_active_requests, threadpool_queue_wait, threadpool_backlog, thresholds):
    """Derive the list of named failure modes from the collected metrics.

    Args:
        max_lag: Event-loop lag, compared against ``max_event_loop_lag_s``
            (default 0.5).
        avg_blocking_duration: Compared against ``max_blocking_duration_s``
            (default 1.0).
        peak_active_requests: Compared against ``max_active_requests``
            (default 150).
        threadpool_queue_wait: Compared against ``max_queue_wait_s``
            (default 1.0).
        threadpool_backlog: Compared against ``max_threadpool_backlog``
            (default 3).
        thresholds: Mapping with optional overrides for the limits above.

    Returns:
        A list of dicts with "type", "severity" and "description" keys, in
        the fixed order starvation, threadpool saturation, load saturation.
    """
    def limit(key, fallback):
        return thresholds.get(key, fallback)

    detected = []

    # Starvation needs BOTH high loop lag and long blocking calls.
    loop_starved = (
        max_lag > limit("max_event_loop_lag_s", 0.5)
        and avg_blocking_duration > limit("max_blocking_duration_s", 1.0)
    )
    if loop_starved:
        detected.append({
            "type": "EVENT_LOOP_STARVATION",
            "severity": "HIGH",
            "description": "Synchronous blocking is stalling the asyncio scheduler.",
        })

    # Saturation needs BOTH long queue waits and a growing backlog.
    pool_saturated = (
        threadpool_queue_wait > limit("max_queue_wait_s", 1.0)
        and threadpool_backlog > limit("max_threadpool_backlog", 3)
    )
    if pool_saturated:
        detected.append({
            "type": "THREADPOOL_SATURATION",
            "severity": "HIGH",
            "description": "Executor queue amplification detected.",
        })

    if peak_active_requests > limit("max_active_requests", 150):
        detected.append({
            "type": "LOAD_SATURATION",
            "severity": "MEDIUM",
            "description": "Concurrent request pressure is approaching runtime limits.",
        })

    return detected
|
|
217
|
+
|
|
218
|
+
# ---------------------------------------------------------
|
|
219
|
+
# AUDIT REPORT GENERATOR
|
|
220
|
+
# ---------------------------------------------------------
|
|
221
|
+
|
|
222
|
+
def generate_audit_report(args):
    """Run the audit end to end: load config, query metrics, score, report, exit.

    Args:
        args: Parsed CLI namespace. Reads ``config`` (path of the metrics
            YAML), ``target`` (optional backend URL override),
            ``output_format`` ("text" or "json") and ``fail_on_critical``.

    Side effects:
        * text mode: prints a human-readable summary and writes the report
          to ``audit_results.json`` in the current working directory.
        * json mode: prints the report as JSON on stdout; no file is written.

    Raises:
        SystemExit: always. Code 1 when ``fail_on_critical`` is set and the
            status is CRITICAL or COLLAPSE_RISK (or when config loading or
            the backend pre-flight check fails); code 0 otherwise.
    """
    config = load_config(args.config, args.output_format)

    # A key that is present but empty in YAML loads as None, which bypasses
    # the .get() default; `or` keeps such configs from crashing later.
    queries = config.get("queries") or {}
    thresholds = config.get("thresholds") or {}

    # CLI override first, then the config file, then the local default.
    prometheus_url = args.target or config.get("target") or "http://localhost:9090"
    prometheus_url = prometheus_url.rstrip("/")
    query_endpoint = f"{prometheus_url}/api/v1/query"

    # Pre-flight check: exits the process if the backend is unreachable.
    verify_telemetry_backend(prometheus_url, args.output_format)

    def fetch(query_key):
        """Run one configured query with this audit's shared settings."""
        return query_prometheus(
            query_key, queries, query_endpoint, args.output_format, args.config
        )

    # -----------------------------------------------------
    # METRIC INGESTION
    # -----------------------------------------------------
    p99_latency = fetch("p99_latency")
    max_lag = fetch("event_loop_lag")
    total_blocking_events = fetch("blocking_events_total")
    avg_blocking_duration = fetch("blocking_duration_avg")
    peak_active_requests = fetch("peak_active_requests")
    threadpool_queue_wait = fetch("threadpool_queue_wait")
    tasks_started = fetch("threadpool_tasks_started")
    tasks_completed = fetch("threadpool_tasks_completed")
    # Backlog is started-minus-completed, clamped so it is never negative.
    threadpool_backlog = sanitize_metric(max(0, tasks_started - tasks_completed))

    # -----------------------------------------------------
    # RUNTIME HEALTH SCORE
    # -----------------------------------------------------
    # Weighted sum of the signals; higher means less healthy.
    runtime_health_score = sanitize_metric(
        (max_lag * 1000 * 1.5)
        + (avg_blocking_duration * 400)
        + (threadpool_queue_wait * 300)
        + (threadpool_backlog * 8)
        + (peak_active_requests * 1)
    )
    runtime_status = classify_runtime_health(runtime_health_score, thresholds)

    # -----------------------------------------------------
    # FAILURE MODE ANALYSIS
    # -----------------------------------------------------
    failure_modes = classify_failure_modes(
        max_lag, avg_blocking_duration, peak_active_requests,
        threadpool_queue_wait, threadpool_backlog, thresholds
    )

    # -----------------------------------------------------
    # STRUCTURAL FINDINGS
    # -----------------------------------------------------
    findings = []
    if max_lag > thresholds.get("max_event_loop_lag_s", 0.5):
        findings.append("Event-loop starvation detected.")
    if avg_blocking_duration > thresholds.get("max_blocking_duration_s", 1.0):
        findings.append("Long-duration synchronous blocking observed.")
    # High loop lag while p99 latency still looks fine.
    if (
        max_lag > thresholds.get("latency_hide_threshold_s", 1.0)
        and p99_latency < thresholds.get("p99_latency_hide_threshold_s", 0.15)
    ):
        findings.append("Customer-visible latency hides runtime instability.")
    if peak_active_requests > thresholds.get("load_warning_threshold", 100):
        findings.append("Concurrent load saturation detected.")
    if threadpool_backlog > thresholds.get("max_threadpool_backlog", 3):
        findings.append("Threadpool backlog accumulation detected.")
    if threadpool_queue_wait > thresholds.get("max_queue_wait_s", 1.0):
        findings.append("Executor queue amplification detected.")

    # -----------------------------------------------------
    # REPORT OBJECT
    # -----------------------------------------------------
    report = {
        "status": runtime_status,
        "runtime_health_score": round(runtime_health_score, 2),
        "p99_latency_ms": round(p99_latency * 1000, 2),
        "max_event_loop_lag_ms": round(max_lag * 1000, 2),
        "blocking_events": int(total_blocking_events),
        "avg_blocking_duration_s": round(avg_blocking_duration, 2),
        "peak_active_requests": int(peak_active_requests),
        "threadpool_queue_wait_s": round(threadpool_queue_wait, 2),
        "threadpool_backlog": int(threadpool_backlog),
        "failure_modes": failure_modes,
        "findings": findings,
    }

    # -----------------------------------------------------
    # OUTPUT
    # -----------------------------------------------------
    if args.output_format == "text":
        print("=" * 60)
        print("ASYNC RUNTIME AUDITOR")
        print("=" * 60)
        print(f"Target: {prometheus_url}")
        print(f"Config: {args.config}\n")
        print(f"Runtime Status: {runtime_status}")
        print(f"Runtime Health Score: {runtime_health_score:.0f}\n")

        if findings:
            print("Findings:")
            for finding in findings:
                print(f" - {finding}")
        else:
            print("No major runtime issues detected.")
        print()

        output_file = Path.cwd() / "audit_results.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"JSON audit report written to:\n{output_file}")
    else:
        print(json.dumps(report, indent=2))

    # -----------------------------------------------------
    # CI EXIT SEMANTICS
    # -----------------------------------------------------
    if args.fail_on_critical and runtime_status in ["CRITICAL", "COLLAPSE_RISK"]:
        sys.exit(1)

    sys.exit(0)
|
|
330
|
+
|
|
331
|
+
# ---------------------------------------------------------
|
|
332
|
+
# CLI ENTRYPOINT
|
|
333
|
+
# ---------------------------------------------------------
|
|
334
|
+
|
|
335
|
+
def main():
    """CLI entry point: parse options, optionally init, then run the audit."""
    args = parse_args()

    # --init takes precedence over everything else; the helper exits the
    # process itself, so nothing below runs in that case.
    if args.init:
        init_template_config()

    # Pin the config to one concrete path (or exit) before auditing, and
    # store it back on the namespace as a string for the report generator.
    args.config = str(resolve_config_path(args.config, args.output_format))

    generate_audit_report(args)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
target: "http://localhost:9090"
|
|
2
|
+
|
|
3
|
+
queries:
|
|
4
|
+
p99_latency: "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
|
|
5
|
+
event_loop_lag: "max_over_time(asyncio_event_loop_lag_seconds[15m])"
|
|
6
|
+
blocking_events_total: "blocking_events_total"
|
|
7
|
+
blocking_duration_avg: "rate(blocking_duration_seconds_sum[15m]) / rate(blocking_duration_seconds_count[15m])"
|
|
8
|
+
peak_active_requests: "max_over_time(active_requests[15m])"
|
|
9
|
+
threadpool_queue_wait: "rate(threadpool_queue_wait_seconds_sum[15m]) / rate(threadpool_queue_wait_seconds_count[15m])"
|
|
10
|
+
threadpool_tasks_started: "threadpool_tasks_started_total"
|
|
11
|
+
threadpool_tasks_completed: "threadpool_tasks_completed_total"
|
|
12
|
+
|
|
13
|
+
thresholds:
|
|
14
|
+
health_score_collapse: 2500
|
|
15
|
+
health_score_critical: 1200
|
|
16
|
+
health_score_degraded: 400
|
|
17
|
+
max_event_loop_lag_s: 0.5
|
|
18
|
+
max_blocking_duration_s: 1.0
|
|
19
|
+
max_queue_wait_s: 1.0
|
|
20
|
+
max_threadpool_backlog: 3
|
|
21
|
+
max_active_requests: 150
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: async-runtime-auditor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Runtime audit CLI for detecting asyncio degradation and threadpool saturation.
|
|
5
|
+
Author: Priyanshu
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
10
|
+
Classifier: Topic :: System :: Monitoring
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: requests>=2.31.0
|
|
17
|
+
Requires-Dist: PyYAML>=6.0.1
|
|
18
|
+
Dynamic: license-file
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Async Runtime Auditor
|
|
22
|
+
|
|
23
|
+
A lightweight CI/CD runtime audit CLI for Python `asyncio` applications.
|
|
24
|
+
|
|
25
|
+
Standard APM metrics (like P99 HTTP latency) frequently hide severe async runtime degradation. Synchronous blocking calls and executor queue amplification can stall the event loop while external HTTP metrics still appear healthy.
|
|
26
|
+
|
|
27
|
+
**Async Runtime Auditor** is a heuristic-driven operational tool designed to run in staging environments and deployment pipelines. It queries a telemetry backend (such as Prometheus), evaluates runtime state against configurable thresholds, and fails the build before blocking code reaches production.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
# The Problem: Telemetry Asymmetry
|
|
32
|
+
|
|
33
|
+
In many production Python systems:
|
|
34
|
+
|
|
35
|
+
```text
|
|
36
|
+
healthy HTTP latency != healthy runtime state
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Standard infrastructure metrics frequently fail to detect:
|
|
40
|
+
|
|
41
|
+
- synchronous I/O executed on the main async thread
|
|
42
|
+
- threadpool saturation and executor queue wait times
|
|
43
|
+
- asynchronous scheduler starvation
|
|
44
|
+
- hidden queue amplification
|
|
45
|
+
|
|
46
|
+
This tool exposes these blind spots by evaluating event-loop and executor telemetry directly.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
# Installation
|
|
51
|
+
|
|
52
|
+
Requires Python 3.9+
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install async-runtime-auditor
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
For local development:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install -e .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
from the repository root.
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
# Quick Start
|
|
69
|
+
|
|
70
|
+
## 1. Initialize Configuration
|
|
71
|
+
|
|
72
|
+
Generate a local `metrics.yaml` configuration file:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
async-auditor --init
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 2. Run an Audit
|
|
81
|
+
|
|
82
|
+
Run the tool against a staging or local telemetry backend:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
async-auditor --target http://localhost:9090
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 3. CI/CD Pipeline Gating
|
|
91
|
+
|
|
92
|
+
Run with strict exit semantics to fail the pipeline if critical degradation is detected:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
async-auditor --target http://localhost:9090 --fail-on-critical
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
The CLI exits with code `1` if runtime state is:
|
|
99
|
+
|
|
100
|
+
- `CRITICAL`
|
|
101
|
+
- `COLLAPSE_RISK`
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
# Configuration (`metrics.yaml`)
|
|
106
|
+
|
|
107
|
+
The auditor is intentionally decoupled from specific telemetry naming conventions.
|
|
108
|
+
|
|
109
|
+
Prometheus queries are mapped into the heuristic engine through `metrics.yaml`.
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
target: "http://localhost:9090"
|
|
113
|
+
|
|
114
|
+
queries:
|
|
115
|
+
p99_latency: "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))"
|
|
116
|
+
event_loop_lag: "max_over_time(asyncio_event_loop_lag_seconds[15m])"
|
|
117
|
+
blocking_events_total: "blocking_events_total"
|
|
118
|
+
blocking_duration_avg: "rate(blocking_duration_seconds_sum[15m]) / rate(blocking_duration_seconds_count[15m])"
|
|
119
|
+
peak_active_requests: "max_over_time(active_requests[15m])"
|
|
120
|
+
threadpool_queue_wait: "rate(threadpool_queue_wait_seconds_sum[15m]) / rate(threadpool_queue_wait_seconds_count[15m])"
|
|
121
|
+
threadpool_tasks_started: "threadpool_tasks_started_total"
|
|
122
|
+
threadpool_tasks_completed: "threadpool_tasks_completed_total"
|
|
123
|
+
|
|
124
|
+
thresholds:
|
|
125
|
+
health_score_collapse: 2500
|
|
126
|
+
health_score_critical: 1200
|
|
127
|
+
health_score_degraded: 400
|
|
128
|
+
max_event_loop_lag_s: 0.5
|
|
129
|
+
max_blocking_duration_s: 1.0
|
|
130
|
+
max_queue_wait_s: 1.0
|
|
131
|
+
max_threadpool_backlog: 3
|
|
132
|
+
max_active_requests: 150
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
# Output Formats
|
|
138
|
+
|
|
139
|
+
The CLI supports:
|
|
140
|
+
|
|
141
|
+
- human-readable terminal output
|
|
142
|
+
- structured JSON output for CI pipelines
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Standard Text Output
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
async-auditor --output-format text
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## JSON Output
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
async-auditor --output-format json
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
When run with `--output-format text`, a structured JSON report is also written to:
|
|
161
|
+
|
|
162
|
+
```text
|
|
163
|
+
audit_results.json
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
inside the current working directory.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
# CI/CD Integration Example
|
|
171
|
+
|
|
172
|
+
A reference GitHub Actions workflow is included in:
|
|
173
|
+
|
|
174
|
+
```text
|
|
175
|
+
examples/github-actions-gating.yml
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
This demonstrates how to use the auditor as a deployment quality gate inside pull request pipelines.
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
# Operational Model
|
|
183
|
+
|
|
184
|
+
The auditor computes a deterministic runtime health score using:
|
|
185
|
+
|
|
186
|
+
- event-loop lag
|
|
187
|
+
- blocking duration
|
|
188
|
+
- queue wait amplification
|
|
189
|
+
- active request pressure
|
|
190
|
+
- threadpool backlog behavior
|
|
191
|
+
|
|
192
|
+
The scoring model is intentionally threshold-driven and explainable.
|
|
193
|
+
|
|
194
|
+
This project does not use:
|
|
195
|
+
|
|
196
|
+
- machine learning classification
|
|
197
|
+
- anomaly detection systems
|
|
198
|
+
- probabilistic runtime forecasting
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
# Intended Usage
|
|
203
|
+
|
|
204
|
+
This tool is designed primarily for:
|
|
205
|
+
|
|
206
|
+
- deployment pipeline gating
|
|
207
|
+
- staging environment validation
|
|
208
|
+
- runtime regression detection
|
|
209
|
+
- async infrastructure diagnostics
|
|
210
|
+
|
|
211
|
+
It is not intended to replace full observability platforms.
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
# Scope
|
|
216
|
+
|
|
217
|
+
This project is intentionally narrow in scope.
|
|
218
|
+
|
|
219
|
+
## Included
|
|
220
|
+
|
|
221
|
+
- heuristic-based async runtime failure classification
|
|
222
|
+
- CI/CD exit-code semantics
|
|
223
|
+
- configurable operational thresholds
|
|
224
|
+
- Prometheus-compatible telemetry querying
|
|
225
|
+
- runtime degradation detection
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Not Included
|
|
230
|
+
|
|
231
|
+
- distributed tracing infrastructure
|
|
232
|
+
- observability data storage systems
|
|
233
|
+
- automated remediation
|
|
234
|
+
- Kubernetes orchestration
|
|
235
|
+
- OpenTelemetry collectors
|
|
236
|
+
- AI-driven diagnosis systems
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
# Design Constraints
|
|
241
|
+
|
|
242
|
+
The project intentionally prioritizes:
|
|
243
|
+
|
|
244
|
+
- operational clarity
|
|
245
|
+
- deterministic output
|
|
246
|
+
- explainable runtime scoring
|
|
247
|
+
- low dependency overhead
|
|
248
|
+
- simple deployment integration
|
|
249
|
+
|
|
250
|
+
over:
|
|
251
|
+
|
|
252
|
+
- infrastructure breadth
|
|
253
|
+
- platform extensibility
|
|
254
|
+
- distributed orchestration
|
|
255
|
+
- autonomous remediation
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
# License
|
|
260
|
+
|
|
261
|
+
MIT License
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
async_auditor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
async_auditor/cli.py,sha256=8rc6nfgsmxXxuWJUaRdpVoqQuIfpJ6rq4BtsZmZ376E,13540
|
|
3
|
+
async_auditor/default_metrics.yaml,sha256=L2fcOA3KCmr7N7mUy3zub_uNLOT0EdmGtmoBvB-7CYs,933
|
|
4
|
+
async_runtime_auditor-0.1.0.dist-info/licenses/LICENSE,sha256=ISMzH0EMJ7qQm0voHwx5hn6p0MFFWcFMcuSDtgn5Vqs,1093
|
|
5
|
+
async_runtime_auditor-0.1.0.dist-info/METADATA,sha256=LgVqYRGZfYlPzhbpWNL2koBPHgtPbFMCQAG7hRtuJgs,6042
|
|
6
|
+
async_runtime_auditor-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
async_runtime_auditor-0.1.0.dist-info/entry_points.txt,sha256=kmLXYS_YPMeT6O3yAzulN1qweMu5EM6TGxQRjD9lLeM,57
|
|
8
|
+
async_runtime_auditor-0.1.0.dist-info/top_level.txt,sha256=hEkLzKd3KM2TVu3zbh0f8uqEWq6Ns2m7vtOfvgFNe-U,14
|
|
9
|
+
async_runtime_auditor-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 PRIYANSHU KUMAR
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
async_auditor
|