kailash 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/nodes/api/__init__.py +5 -0
- kailash/nodes/api/monitoring.py +463 -0
- kailash/nodes/api/security.py +822 -0
- kailash/nodes/base.py +3 -3
- kailash/nodes/data/__init__.py +6 -0
- kailash/nodes/data/event_generation.py +297 -0
- kailash/nodes/data/file_discovery.py +601 -0
- kailash/nodes/transform/processors.py +1 -1
- kailash/runtime/async_local.py +1 -1
- kailash/runtime/docker.py +4 -4
- kailash/runtime/local.py +39 -15
- kailash/runtime/parallel.py +2 -2
- kailash/runtime/parallel_cyclic.py +2 -2
- kailash/runtime/testing.py +2 -2
- kailash/utils/templates.py +6 -6
- kailash/visualization/performance.py +16 -3
- kailash/visualization/reports.py +5 -1
- kailash/workflow/cycle_analyzer.py +8 -1
- kailash/workflow/cyclic_runner.py +1 -1
- kailash/workflow/graph.py +18 -6
- kailash/workflow/visualization.py +10 -2
- kailash-0.3.0.dist-info/METADATA +428 -0
- {kailash-0.2.2.dist-info → kailash-0.3.0.dist-info}/RECORD +28 -24
- kailash-0.2.2.dist-info/METADATA +0 -121
- {kailash-0.2.2.dist-info → kailash-0.3.0.dist-info}/WHEEL +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.2.2.dist-info → kailash-0.3.0.dist-info}/top_level.txt +0 -0
kailash/__init__.py CHANGED
kailash/nodes/api/__init__.py CHANGED
@@ -23,6 +23,7 @@ import warnings
 from .auth import APIKeyNode, BasicAuthNode, OAuth2Node
 from .graphql import AsyncGraphQLClientNode, GraphQLClientNode
 from .http import AsyncHTTPRequestNode, HTTPRequestNode
+from .monitoring import HealthCheckNode
 from .rate_limiting import (
     AsyncRateLimitedAPINode,
     RateLimitConfig,
@@ -33,6 +34,7 @@ from .rate_limiting import (
     create_rate_limiter,
 )
 from .rest import AsyncRESTClientNode, RESTClientNode
+from .security import SecurityScannerNode

 # Backwards compatibility aliases
 HTTPClientNode = HTTPRequestNode  # Deprecated: Use HTTPRequestNode instead
@@ -68,6 +70,9 @@ __all__ = [
     "RateLimitedAPINode",
     "AsyncRateLimitedAPINode",
     "create_rate_limiter",
+    # Monitoring and Security
+    "HealthCheckNode",
+    "SecurityScannerNode",
     # Backwards compatibility
     "HTTPClientNode",  # Deprecated alias
 ]
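With both nodes exported from the package's public API, they can be imported directly after upgrading. A minimal sketch, assuming kailash 0.3.0 is installed; the alias check only illustrates the backwards-compatibility line shown in the hunk above:

# Import the monitoring/security nodes added in 0.3.0
from kailash.nodes.api import HealthCheckNode, SecurityScannerNode

# The deprecated alias still resolves to HTTPRequestNode
from kailash.nodes.api import HTTPClientNode, HTTPRequestNode
assert HTTPClientNode is HTTPRequestNode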
kailash/nodes/api/monitoring.py ADDED
@@ -0,0 +1,463 @@
"""Monitoring and health check nodes for system observability."""

import asyncio
import socket
import subprocess
import time
from datetime import datetime, timezone
from typing import Any, Dict, List

import requests

from kailash.nodes.base import Node, NodeParameter, register_node


@register_node()
class HealthCheckNode(Node):
    """
    Performs health checks on various system components and services.

    This node provides comprehensive health monitoring capabilities for
    distributed systems, replacing DataTransformer with embedded Python code
    for monitoring tasks. It supports HTTP endpoints, TCP ports, databases,
    file systems, and custom health check commands.

    Design Philosophy:
        Modern distributed systems require robust health monitoring. This node
        provides a declarative way to define health checks without writing
        custom code in DataTransformer nodes. It standardizes health check
        patterns and provides consistent output formats.

    Upstream Dependencies:
        - Configuration nodes with endpoint definitions
        - Service discovery nodes
        - Timer nodes for scheduled checks
        - Alert threshold nodes

    Downstream Consumers:
        - Alert generation nodes
        - Dashboard visualization nodes
        - Logging and metrics nodes
        - Auto-scaling decision nodes
        - Incident response workflows

    Configuration:
        - Target endpoints and services
        - Check types and parameters
        - Timeout and retry settings
        - Success/failure criteria
        - Alert thresholds

    Implementation Details:
        - Parallel execution of multiple checks
        - Proper timeout handling
        - Retry logic with exponential backoff
        - Structured output with metrics
        - Support for various check types

    Error Handling:
        - Graceful handling of network failures
        - Timeout management
        - Invalid configuration detection
        - Partial failure reporting

    Side Effects:
        - Network requests to target systems
        - File system access for disk checks
        - Process execution for custom commands
        - Minimal impact design

    Examples:
        >>> # HTTP endpoint health checks
        >>> health_check = HealthCheckNode(
        ...     targets=[
        ...         {'type': 'http', 'url': 'https://api.example.com/health'},
        ...         {'type': 'http', 'url': 'https://app.example.com/status'}
        ...     ],
        ...     timeout=30
        ... )
        >>> result = health_check.execute()
        >>> assert 'health_results' in result
        >>> assert result['summary']['total_checks'] == 2
        >>>
        >>> # Mixed health checks
        >>> health_check = HealthCheckNode(
        ...     targets=[
        ...         {'type': 'tcp', 'host': 'database.example.com', 'port': 5432},
        ...         {'type': 'disk', 'path': '/var/log', 'threshold': 80},
        ...         {'type': 'command', 'command': 'systemctl is-active nginx'}
        ...     ]
        ... )
        >>> result = health_check.execute()
        >>> assert 'health_results' in result
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        return {
            "targets": NodeParameter(
                name="targets",
                type=list,
                required=True,
                description="List of health check targets with type and configuration",
            ),
            "timeout": NodeParameter(
                name="timeout",
                type=int,
                required=False,
                default=30,
                description="Timeout in seconds for each health check",
            ),
            "retries": NodeParameter(
                name="retries",
                type=int,
                required=False,
                default=2,
                description="Number of retry attempts for failed checks",
            ),
            "parallel": NodeParameter(
                name="parallel",
                type=bool,
                required=False,
                default=True,
                description="Execute health checks in parallel",
            ),
            "include_metrics": NodeParameter(
                name="include_metrics",
                type=bool,
                required=False,
                default=True,
                description="Include performance metrics in results",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        targets = kwargs["targets"]
        timeout = kwargs.get("timeout", 30)
        retries = kwargs.get("retries", 2)
        parallel = kwargs.get("parallel", True)
        include_metrics = kwargs.get("include_metrics", True)

        start_time = time.time()

        if parallel:
            # Use asyncio for parallel execution
            results = asyncio.run(
                self._run_checks_parallel(targets, timeout, retries, include_metrics)
            )
        else:
            # Sequential execution
            results = self._run_checks_sequential(
                targets, timeout, retries, include_metrics
            )

        execution_time = time.time() - start_time

        # Generate summary
        summary = self._generate_summary(results, execution_time)

        return {
            "health_results": results,
            "summary": summary,
            "check_count": len(results),
            "healthy_count": len([r for r in results if r["status"] == "healthy"]),
            "unhealthy_count": len([r for r in results if r["status"] == "unhealthy"]),
            "execution_time": execution_time,
            "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
        }

    async def _run_checks_parallel(
        self, targets: List[Dict], timeout: int, retries: int, include_metrics: bool
    ) -> List[Dict[str, Any]]:
        """Run health checks in parallel using asyncio."""

        async def run_single_check(target):
            return await asyncio.get_event_loop().run_in_executor(
                None,
                self._perform_health_check,
                target,
                timeout,
                retries,
                include_metrics,
            )

        tasks = [run_single_check(target) for target in targets]
        return await asyncio.gather(*tasks, return_exceptions=True)

    def _run_checks_sequential(
        self, targets: List[Dict], timeout: int, retries: int, include_metrics: bool
    ) -> List[Dict[str, Any]]:
        """Run health checks sequentially."""
        return [
            self._perform_health_check(target, timeout, retries, include_metrics)
            for target in targets
        ]

    def _perform_health_check(
        self, target: Dict, timeout: int, retries: int, include_metrics: bool
    ) -> Dict[str, Any]:
        """Perform a single health check with retry logic."""

        check_type = target.get("type", "unknown")
        check_id = target.get("id", f"{check_type}_{hash(str(target)) % 10000}")

        for attempt in range(retries + 1):
            try:
                start_time = time.time()

                if check_type == "http":
                    result = self._check_http(target, timeout)
                elif check_type == "tcp":
                    result = self._check_tcp(target, timeout)
                elif check_type == "disk":
                    result = self._check_disk(target)
                elif check_type == "command":
                    result = self._check_command(target, timeout)
                elif check_type == "database":
                    result = self._check_database(target, timeout)
                else:
                    result = {
                        "status": "unhealthy",
                        "message": f"Unknown check type: {check_type}",
                        "details": {},
                    }

                # Add timing information
                response_time = time.time() - start_time
                result["response_time"] = response_time
                result["attempt"] = attempt + 1
                result["check_id"] = check_id
                result["check_type"] = check_type
                result["target"] = target
                result["timestamp"] = datetime.now(timezone.utc).isoformat() + "Z"

                # If successful, return immediately
                if result["status"] == "healthy":
                    return result

            except Exception as e:
                if attempt == retries:  # Last attempt
                    return {
                        "check_id": check_id,
                        "check_type": check_type,
                        "target": target,
                        "status": "unhealthy",
                        "message": f"Health check failed after {retries + 1} attempts: {str(e)}",
                        "details": {"error": str(e), "error_type": type(e).__name__},
                        "response_time": time.time() - start_time,
                        "attempt": attempt + 1,
                        "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
                    }

            # Wait before retry (exponential backoff)
            time.sleep(min(2**attempt, 10))

        return result

    def _check_http(self, target: Dict, timeout: int) -> Dict[str, Any]:
        """Perform HTTP health check."""
        url = target["url"]
        expected_status = target.get("expected_status", 200)
        expected_content = target.get("expected_content")
        headers = target.get("headers", {})

        response = requests.get(url, timeout=timeout, headers=headers)

        # Check status code
        if response.status_code != expected_status:
            return {
                "status": "unhealthy",
                "message": f"HTTP status {response.status_code}, expected {expected_status}",
                "details": {
                    "status_code": response.status_code,
                    "response_size": len(response.content),
                    "url": url,
                },
            }

        # Check content if specified
        if expected_content and expected_content not in response.text:
            return {
                "status": "unhealthy",
                "message": f"Expected content '{expected_content}' not found in response",
                "details": {
                    "status_code": response.status_code,
                    "response_size": len(response.content),
                    "url": url,
                },
            }

        return {
            "status": "healthy",
            "message": f"HTTP check successful: {response.status_code}",
            "details": {
                "status_code": response.status_code,
                "response_size": len(response.content),
                "url": url,
            },
        }

    def _check_tcp(self, target: Dict, timeout: int) -> Dict[str, Any]:
        """Perform TCP port connectivity check."""
        host = target["host"]
        port = target["port"]

        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)

        try:
            result = sock.connect_ex((host, port))
            if result == 0:
                return {
                    "status": "healthy",
                    "message": f"TCP connection successful to {host}:{port}",
                    "details": {"host": host, "port": port},
                }
            else:
                return {
                    "status": "unhealthy",
                    "message": f"TCP connection failed to {host}:{port}",
                    "details": {"host": host, "port": port, "error_code": result},
                }
        finally:
            sock.close()

    def _check_disk(self, target: Dict) -> Dict[str, Any]:
        """Perform disk space check."""
        import shutil

        path = target["path"]
        threshold = target.get("threshold", 90)  # Default 90% threshold

        try:
            total, used, free = shutil.disk_usage(path)
            usage_percent = (used / total) * 100

            if usage_percent > threshold:
                return {
                    "status": "unhealthy",
                    "message": f"Disk usage {usage_percent:.1f}% exceeds threshold {threshold}%",
                    "details": {
                        "path": path,
                        "usage_percent": usage_percent,
                        "threshold": threshold,
                        "total_gb": total / (1024**3),
                        "used_gb": used / (1024**3),
                        "free_gb": free / (1024**3),
                    },
                }
            else:
                return {
                    "status": "healthy",
                    "message": f"Disk usage {usage_percent:.1f}% within threshold",
                    "details": {
                        "path": path,
                        "usage_percent": usage_percent,
                        "threshold": threshold,
                        "total_gb": total / (1024**3),
                        "used_gb": used / (1024**3),
                        "free_gb": free / (1024**3),
                    },
                }
        except Exception as e:
            return {
                "status": "unhealthy",
                "message": f"Disk check failed: {str(e)}",
                "details": {"path": path, "error": str(e)},
            }

    def _check_command(self, target: Dict, timeout: int) -> Dict[str, Any]:
        """Perform custom command health check."""
        command = target["command"]
        expected_exit_code = target.get("expected_exit_code", 0)

        try:
            result = subprocess.run(
                command,
                shell=True,
                timeout=timeout,
                capture_output=True,
                text=True,
            )

            if result.returncode == expected_exit_code:
                return {
                    "status": "healthy",
                    "message": f"Command succeeded with exit code {result.returncode}",
                    "details": {
                        "command": command,
                        "exit_code": result.returncode,
                        "stdout": result.stdout.strip(),
                        "stderr": result.stderr.strip(),
                    },
                }
            else:
                return {
                    "status": "unhealthy",
                    "message": f"Command failed with exit code {result.returncode}",
                    "details": {
                        "command": command,
                        "exit_code": result.returncode,
                        "expected_exit_code": expected_exit_code,
                        "stdout": result.stdout.strip(),
                        "stderr": result.stderr.strip(),
                    },
                }
        except subprocess.TimeoutExpired:
            return {
                "status": "unhealthy",
                "message": f"Command timed out after {timeout} seconds",
                "details": {"command": command, "timeout": timeout},
            }

    def _check_database(self, target: Dict, timeout: int) -> Dict[str, Any]:
        """Perform database connectivity check."""
        # This is a simplified example - in production, you'd use actual database drivers
        db_type = target.get("db_type", "postgresql")
        host = target["host"]
        port = target.get("port", 5432 if db_type == "postgresql" else 3306)

        # For now, just check TCP connectivity
        # In a real implementation, you'd use database-specific health checks
        return self._check_tcp({"host": host, "port": port}, timeout)

    def _generate_summary(
        self, results: List[Dict], execution_time: float
    ) -> Dict[str, Any]:
        """Generate summary statistics from health check results."""
        total_checks = len(results)
        healthy_checks = len([r for r in results if r.get("status") == "healthy"])
        unhealthy_checks = total_checks - healthy_checks

        # Calculate average response time
        response_times = [
            r.get("response_time", 0) for r in results if "response_time" in r
        ]
        avg_response_time = (
            sum(response_times) / len(response_times) if response_times else 0
        )

        # Group by check type
        check_types = {}
        for result in results:
            check_type = result.get("check_type", "unknown")
            if check_type not in check_types:
                check_types[check_type] = {"total": 0, "healthy": 0, "unhealthy": 0}

            check_types[check_type]["total"] += 1
            if result.get("status") == "healthy":
                check_types[check_type]["healthy"] += 1
            else:
                check_types[check_type]["unhealthy"] += 1

        return {
            "total_checks": total_checks,
            "healthy_checks": healthy_checks,
            "unhealthy_checks": unhealthy_checks,
            "health_percentage": (
                (healthy_checks / total_checks * 100) if total_checks > 0 else 0
            ),
            "average_response_time": avg_response_time,
            "execution_time": execution_time,
            "check_types": check_types,
            "overall_status": "healthy" if unhealthy_checks == 0 else "unhealthy",
        }
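For reference, a short usage sketch based on the class docstring above. The targets are illustrative; the result keys (health_results, summary, overall_status, health_percentage) come from run() and _generate_summary() as shown in the file:

from kailash.nodes.api import HealthCheckNode

health_check = HealthCheckNode(
    targets=[
        {"type": "tcp", "host": "database.example.com", "port": 5432},
        {"type": "disk", "path": "/var/log", "threshold": 80},
    ],
    timeout=10,
    parallel=False,  # sequential path; parallel=True uses asyncio.run internally
)
result = health_check.execute()
print(result["summary"]["overall_status"])
print(result["summary"]["health_percentage"])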