kailash 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +1 -1
- kailash/access_control/__init__.py +1 -1
- kailash/core/actors/adaptive_pool_controller.py +630 -0
- kailash/core/ml/__init__.py +1 -0
- kailash/core/ml/query_patterns.py +544 -0
- kailash/core/monitoring/__init__.py +19 -0
- kailash/core/monitoring/connection_metrics.py +488 -0
- kailash/core/optimization/__init__.py +1 -0
- kailash/core/resilience/__init__.py +17 -0
- kailash/core/resilience/circuit_breaker.py +382 -0
- kailash/middleware/auth/access_control.py +6 -6
- kailash/middleware/communication/ai_chat.py +7 -7
- kailash/middleware/communication/api_gateway.py +5 -15
- kailash/middleware/gateway/event_store.py +66 -26
- kailash/middleware/mcp/enhanced_server.py +2 -2
- kailash/nodes/data/query_pipeline.py +641 -0
- kailash/nodes/data/query_router.py +895 -0
- kailash/nodes/data/workflow_connection_pool.py +451 -23
- kailash/nodes/monitoring/__init__.py +3 -5
- kailash/nodes/monitoring/connection_dashboard.py +822 -0
- kailash/nodes/rag/__init__.py +1 -3
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/METADATA +13 -1
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/RECORD +27 -16
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/WHEEL +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.0.dist-info → kailash-0.6.1.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,17 @@ from kailash.core.actors import (
|
|
19
19
|
ConnectionState,
|
20
20
|
SupervisionStrategy,
|
21
21
|
)
|
22
|
+
from kailash.core.actors.adaptive_pool_controller import AdaptivePoolController
|
23
|
+
from kailash.core.ml.query_patterns import QueryPatternTracker
|
24
|
+
from kailash.core.monitoring.connection_metrics import (
|
25
|
+
ConnectionMetricsCollector,
|
26
|
+
ErrorCategory,
|
27
|
+
)
|
28
|
+
from kailash.core.resilience.circuit_breaker import (
|
29
|
+
CircuitBreakerConfig,
|
30
|
+
CircuitBreakerError,
|
31
|
+
ConnectionCircuitBreaker,
|
32
|
+
)
|
22
33
|
from kailash.nodes.base import NodeParameter, register_node
|
23
34
|
from kailash.nodes.base_async import AsyncNode
|
24
35
|
from kailash.sdk_exceptions import NodeExecutionError
|
@@ -187,6 +198,8 @@ class WorkflowConnectionPool(AsyncNode):
|
|
187
198
|
self.max_connections = config.get("max_connections", 10)
|
188
199
|
self.health_threshold = config.get("health_threshold", 50)
|
189
200
|
self.pre_warm_enabled = config.get("pre_warm", True)
|
201
|
+
self.adaptive_sizing_enabled = config.get("adaptive_sizing", False)
|
202
|
+
self.enable_query_routing = config.get("enable_query_routing", False)
|
190
203
|
|
191
204
|
# Database configuration
|
192
205
|
self.db_config = {
|
@@ -223,6 +236,41 @@ class WorkflowConnectionPool(AsyncNode):
|
|
223
236
|
self._initialized = False
|
224
237
|
self._closing = False
|
225
238
|
|
239
|
+
# Phase 2 components
|
240
|
+
self.query_pattern_tracker = None
|
241
|
+
self.adaptive_controller = None
|
242
|
+
|
243
|
+
if self.enable_query_routing:
|
244
|
+
self.query_pattern_tracker = QueryPatternTracker()
|
245
|
+
|
246
|
+
if self.adaptive_sizing_enabled:
|
247
|
+
self.adaptive_controller = AdaptivePoolController(
|
248
|
+
min_size=self.min_connections, max_size=self.max_connections
|
249
|
+
)
|
250
|
+
|
251
|
+
# Phase 3 components
|
252
|
+
# Circuit breaker for connection failures
|
253
|
+
self.circuit_breaker_config = CircuitBreakerConfig(
|
254
|
+
failure_threshold=config.get("circuit_breaker_failure_threshold", 5),
|
255
|
+
recovery_timeout=config.get("circuit_breaker_recovery_timeout", 60),
|
256
|
+
error_rate_threshold=config.get("circuit_breaker_error_rate", 0.5),
|
257
|
+
)
|
258
|
+
self.circuit_breaker = ConnectionCircuitBreaker(self.circuit_breaker_config)
|
259
|
+
|
260
|
+
# Comprehensive metrics collector
|
261
|
+
self.metrics_collector = ConnectionMetricsCollector(
|
262
|
+
pool_name=self.metadata.name,
|
263
|
+
retention_minutes=config.get("metrics_retention_minutes", 60),
|
264
|
+
)
|
265
|
+
|
266
|
+
# Enable query pipelining support
|
267
|
+
self.enable_pipelining = config.get("enable_pipelining", False)
|
268
|
+
self.pipeline_batch_size = config.get("pipeline_batch_size", 100)
|
269
|
+
|
270
|
+
# Monitoring dashboard integration
|
271
|
+
self.enable_monitoring = config.get("enable_monitoring", False)
|
272
|
+
self.monitoring_port = config.get("monitoring_port", 8080)
|
273
|
+
|
226
274
|
def get_parameters(self) -> Dict[str, NodeParameter]:
|
227
275
|
"""Define node parameters."""
|
228
276
|
params = [
|
@@ -287,6 +335,77 @@ class WorkflowConnectionPool(AsyncNode):
|
|
287
335
|
default=True,
|
288
336
|
description="Enable pattern-based pre-warming",
|
289
337
|
),
|
338
|
+
NodeParameter(
|
339
|
+
name="adaptive_sizing",
|
340
|
+
type=bool,
|
341
|
+
required=False,
|
342
|
+
default=False,
|
343
|
+
description="Enable adaptive pool sizing based on workload",
|
344
|
+
),
|
345
|
+
NodeParameter(
|
346
|
+
name="enable_query_routing",
|
347
|
+
type=bool,
|
348
|
+
required=False,
|
349
|
+
default=False,
|
350
|
+
description="Enable query pattern tracking for routing optimization",
|
351
|
+
),
|
352
|
+
# Phase 3 parameters
|
353
|
+
NodeParameter(
|
354
|
+
name="circuit_breaker_failure_threshold",
|
355
|
+
type=int,
|
356
|
+
required=False,
|
357
|
+
default=5,
|
358
|
+
description="Failures before circuit breaker opens",
|
359
|
+
),
|
360
|
+
NodeParameter(
|
361
|
+
name="circuit_breaker_recovery_timeout",
|
362
|
+
type=int,
|
363
|
+
required=False,
|
364
|
+
default=60,
|
365
|
+
description="Seconds before circuit breaker tries recovery",
|
366
|
+
),
|
367
|
+
NodeParameter(
|
368
|
+
name="circuit_breaker_error_rate",
|
369
|
+
type=float,
|
370
|
+
required=False,
|
371
|
+
default=0.5,
|
372
|
+
description="Error rate threshold to open circuit",
|
373
|
+
),
|
374
|
+
NodeParameter(
|
375
|
+
name="metrics_retention_minutes",
|
376
|
+
type=int,
|
377
|
+
required=False,
|
378
|
+
default=60,
|
379
|
+
description="How long to retain detailed metrics",
|
380
|
+
),
|
381
|
+
NodeParameter(
|
382
|
+
name="enable_pipelining",
|
383
|
+
type=bool,
|
384
|
+
required=False,
|
385
|
+
default=False,
|
386
|
+
description="Enable query pipelining for batch operations",
|
387
|
+
),
|
388
|
+
NodeParameter(
|
389
|
+
name="pipeline_batch_size",
|
390
|
+
type=int,
|
391
|
+
required=False,
|
392
|
+
default=100,
|
393
|
+
description="Maximum queries per pipeline batch",
|
394
|
+
),
|
395
|
+
NodeParameter(
|
396
|
+
name="enable_monitoring",
|
397
|
+
type=bool,
|
398
|
+
required=False,
|
399
|
+
default=False,
|
400
|
+
description="Enable monitoring dashboard",
|
401
|
+
),
|
402
|
+
NodeParameter(
|
403
|
+
name="monitoring_port",
|
404
|
+
type=int,
|
405
|
+
required=False,
|
406
|
+
default=8080,
|
407
|
+
description="Port for monitoring dashboard",
|
408
|
+
),
|
290
409
|
# Operation parameters
|
291
410
|
NodeParameter(
|
292
411
|
name="operation",
|
@@ -355,6 +474,20 @@ class WorkflowConnectionPool(AsyncNode):
|
|
355
474
|
return await self._execute_query(inputs)
|
356
475
|
elif operation == "stats":
|
357
476
|
return await self._get_stats()
|
477
|
+
elif operation == "get_status":
|
478
|
+
return await self._get_pool_status()
|
479
|
+
elif operation == "adjust_pool_size":
|
480
|
+
return await self.adjust_pool_size(inputs.get("new_size"))
|
481
|
+
elif operation == "get_pool_statistics":
|
482
|
+
return await self.get_pool_statistics()
|
483
|
+
elif operation == "get_comprehensive_status":
|
484
|
+
return await self.get_comprehensive_status()
|
485
|
+
elif operation == "start_monitoring":
|
486
|
+
return await self._start_monitoring_dashboard()
|
487
|
+
elif operation == "stop_monitoring":
|
488
|
+
return await self._stop_monitoring_dashboard()
|
489
|
+
elif operation == "export_metrics":
|
490
|
+
return {"prometheus_metrics": self.metrics_collector.export_prometheus()}
|
358
491
|
else:
|
359
492
|
raise NodeExecutionError(f"Unknown operation: {operation}")
|
360
493
|
|
@@ -374,12 +507,20 @@ class WorkflowConnectionPool(AsyncNode):
|
|
374
507
|
# Create minimum connections
|
375
508
|
await self._ensure_min_connections()
|
376
509
|
|
510
|
+
# Start adaptive controller if enabled
|
511
|
+
if self.adaptive_controller:
|
512
|
+
await self.adaptive_controller.start(
|
513
|
+
pool_ref=self, pattern_tracker=self.query_pattern_tracker
|
514
|
+
)
|
515
|
+
|
377
516
|
self._initialized = True
|
378
517
|
|
379
518
|
return {
|
380
519
|
"status": "initialized",
|
381
520
|
"min_connections": self.min_connections,
|
382
521
|
"max_connections": self.max_connections,
|
522
|
+
"adaptive_sizing": self.adaptive_sizing_enabled,
|
523
|
+
"query_routing": self.enable_query_routing,
|
383
524
|
}
|
384
525
|
|
385
526
|
except Exception as e:
|
@@ -394,28 +535,39 @@ class WorkflowConnectionPool(AsyncNode):
|
|
394
535
|
start_time = time.time()
|
395
536
|
|
396
537
|
try:
|
397
|
-
#
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
#
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
538
|
+
# Use circuit breaker to protect connection acquisition
|
539
|
+
async def acquire_with_circuit_breaker():
|
540
|
+
# Try to get available connection
|
541
|
+
connection = None
|
542
|
+
|
543
|
+
# Fast path: try to get immediately available connection
|
544
|
+
try:
|
545
|
+
connection = await asyncio.wait_for(
|
546
|
+
self.available_connections.get(), timeout=0.1
|
547
|
+
)
|
548
|
+
except asyncio.TimeoutError:
|
549
|
+
# Need to create new connection or wait
|
550
|
+
if len(self.all_connections) < self.max_connections:
|
551
|
+
# Create new connection
|
552
|
+
connection = await self._create_connection()
|
553
|
+
# Don't put it in available queue - we'll use it directly
|
554
|
+
else:
|
555
|
+
# Wait for available connection
|
556
|
+
connection = await self.available_connections.get()
|
557
|
+
|
558
|
+
return connection
|
559
|
+
|
560
|
+
# Execute with circuit breaker protection
|
561
|
+
connection = await self.circuit_breaker.call(acquire_with_circuit_breaker)
|
414
562
|
|
415
563
|
# Record acquisition time
|
416
564
|
wait_time = time.time() - start_time
|
417
565
|
self.metrics.record_acquisition_time(wait_time)
|
418
566
|
|
567
|
+
# Track in comprehensive metrics
|
568
|
+
with self.metrics_collector.track_acquisition() as timer:
|
569
|
+
pass # Already acquired, just recording time
|
570
|
+
|
419
571
|
# Move to active
|
420
572
|
self.active_connections[connection.id] = connection
|
421
573
|
|
@@ -431,8 +583,14 @@ class WorkflowConnectionPool(AsyncNode):
|
|
431
583
|
"acquisition_time_ms": wait_time * 1000,
|
432
584
|
}
|
433
585
|
|
586
|
+
except CircuitBreakerError as e:
|
587
|
+
# Circuit is open - pool is experiencing failures
|
588
|
+
self.metrics_collector.track_pool_exhaustion()
|
589
|
+
logger.error(f"Circuit breaker open: {e}")
|
590
|
+
raise NodeExecutionError(f"Connection pool circuit breaker open: {e}")
|
434
591
|
except Exception as e:
|
435
592
|
logger.error(f"Failed to acquire connection: {e}")
|
593
|
+
self.metrics_collector.track_query_error("ACQUIRE", e)
|
436
594
|
raise NodeExecutionError(f"Connection acquisition failed: {e}")
|
437
595
|
|
438
596
|
async def _release_connection(self, connection_id: Optional[str]) -> Dict[str, Any]:
|
@@ -462,18 +620,45 @@ class WorkflowConnectionPool(AsyncNode):
|
|
462
620
|
|
463
621
|
connection = self.active_connections[connection_id]
|
464
622
|
|
623
|
+
# Determine query type for metrics
|
624
|
+
query = inputs.get("query", "").strip().upper()
|
625
|
+
query_type = "UNKNOWN"
|
626
|
+
if query.startswith("SELECT"):
|
627
|
+
query_type = "SELECT"
|
628
|
+
elif query.startswith("INSERT"):
|
629
|
+
query_type = "INSERT"
|
630
|
+
elif query.startswith("UPDATE"):
|
631
|
+
query_type = "UPDATE"
|
632
|
+
elif query.startswith("DELETE"):
|
633
|
+
query_type = "DELETE"
|
634
|
+
|
465
635
|
try:
|
466
|
-
# Execute query
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
636
|
+
# Execute query with comprehensive metrics tracking
|
637
|
+
with self.metrics_collector.track_query(query_type) as timer:
|
638
|
+
result = await connection.execute(
|
639
|
+
query=inputs.get("query"),
|
640
|
+
params=inputs.get("params"),
|
641
|
+
fetch_mode=inputs.get("fetch_mode", "all"),
|
642
|
+
)
|
472
643
|
|
473
644
|
# Update metrics
|
474
645
|
self.metrics.queries_executed += 1
|
475
646
|
if not result.success:
|
476
647
|
self.metrics.query_errors += 1
|
648
|
+
self.metrics_collector.track_query_error(
|
649
|
+
query_type, Exception(result.error)
|
650
|
+
)
|
651
|
+
|
652
|
+
# Track query pattern if enabled
|
653
|
+
if self.query_pattern_tracker and inputs.get("query"):
|
654
|
+
self.query_pattern_tracker.record_execution(
|
655
|
+
fingerprint=inputs.get("query_fingerprint", inputs.get("query")),
|
656
|
+
execution_time_ms=result.execution_time * 1000,
|
657
|
+
connection_id=connection_id,
|
658
|
+
parameters=inputs.get("params", {}),
|
659
|
+
success=result.success,
|
660
|
+
result_size=len(result.data) if result.data else 0,
|
661
|
+
)
|
477
662
|
|
478
663
|
return {
|
479
664
|
"success": result.success,
|
@@ -593,6 +778,10 @@ class WorkflowConnectionPool(AsyncNode):
|
|
593
778
|
# Stop accepting new connections
|
594
779
|
self._initialized = False
|
595
780
|
|
781
|
+
# Stop adaptive controller if running
|
782
|
+
if self.adaptive_controller:
|
783
|
+
await self.adaptive_controller.stop()
|
784
|
+
|
596
785
|
# Stop all connection actors gracefully
|
597
786
|
actors_to_stop = list(self.all_connections.values())
|
598
787
|
for actor in actors_to_stop:
|
@@ -641,3 +830,242 @@ class WorkflowConnectionPool(AsyncNode):
|
|
641
830
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
642
831
|
"""Context manager exit."""
|
643
832
|
await self._cleanup()
|
833
|
+
|
834
|
+
async def _get_pool_status(self) -> Dict[str, Any]:
|
835
|
+
"""Get pool status for query router."""
|
836
|
+
connections = {}
|
837
|
+
|
838
|
+
for conn_id, conn in self.all_connections.items():
|
839
|
+
connections[conn_id] = {
|
840
|
+
"health_score": conn.health_score,
|
841
|
+
"active_queries": 1 if conn_id in self.active_connections else 0,
|
842
|
+
"capabilities": [
|
843
|
+
"read",
|
844
|
+
"write",
|
845
|
+
], # TODO: Add actual capability detection
|
846
|
+
"avg_latency_ms": 0.0, # TODO: Track actual latency
|
847
|
+
"last_used": datetime.now().isoformat(),
|
848
|
+
}
|
849
|
+
|
850
|
+
return {
|
851
|
+
"connections": connections,
|
852
|
+
"pool_size": len(self.all_connections),
|
853
|
+
"active_count": len(self.active_connections),
|
854
|
+
"available_count": self.available_connections.qsize(),
|
855
|
+
}
|
856
|
+
|
857
|
+
async def adjust_pool_size(self, new_size: int) -> Dict[str, Any]:
|
858
|
+
"""Dynamically adjust pool size."""
|
859
|
+
if new_size < self.min_connections or new_size > self.max_connections:
|
860
|
+
return {
|
861
|
+
"success": False,
|
862
|
+
"reason": f"Size must be between {self.min_connections} and {self.max_connections}",
|
863
|
+
}
|
864
|
+
|
865
|
+
current_size = len(self.all_connections)
|
866
|
+
|
867
|
+
if new_size > current_size:
|
868
|
+
# Scale up
|
869
|
+
connections_to_add = new_size - current_size
|
870
|
+
for _ in range(connections_to_add):
|
871
|
+
try:
|
872
|
+
await self._create_connection()
|
873
|
+
except Exception as e:
|
874
|
+
logger.error(f"Failed to create connection during scale up: {e}")
|
875
|
+
|
876
|
+
elif new_size < current_size:
|
877
|
+
# Scale down - remove idle connections first
|
878
|
+
connections_to_remove = current_size - new_size
|
879
|
+
removed = 0
|
880
|
+
|
881
|
+
# Try to remove idle connections
|
882
|
+
while (
|
883
|
+
removed < connections_to_remove
|
884
|
+
and not self.available_connections.empty()
|
885
|
+
):
|
886
|
+
try:
|
887
|
+
conn = await asyncio.wait_for(
|
888
|
+
self.available_connections.get(), timeout=0.1
|
889
|
+
)
|
890
|
+
await self._recycle_connection(conn)
|
891
|
+
removed += 1
|
892
|
+
except asyncio.TimeoutError:
|
893
|
+
break
|
894
|
+
|
895
|
+
return {
|
896
|
+
"success": True,
|
897
|
+
"previous_size": current_size,
|
898
|
+
"new_size": len(self.all_connections),
|
899
|
+
}
|
900
|
+
|
901
|
+
async def get_pool_statistics(self) -> Dict[str, Any]:
|
902
|
+
"""Get detailed pool statistics for adaptive sizing."""
|
903
|
+
total_connections = len(self.all_connections)
|
904
|
+
active_connections = len(self.active_connections)
|
905
|
+
idle_connections = self.available_connections.qsize()
|
906
|
+
|
907
|
+
# Calculate metrics
|
908
|
+
utilization_rate = (
|
909
|
+
active_connections / total_connections if total_connections > 0 else 0
|
910
|
+
)
|
911
|
+
|
912
|
+
# Get average health score
|
913
|
+
health_scores = [conn.health_score for conn in self.all_connections.values()]
|
914
|
+
avg_health_score = (
|
915
|
+
sum(health_scores) / len(health_scores) if health_scores else 100
|
916
|
+
)
|
917
|
+
|
918
|
+
# Queue depth (approximate based on waiters)
|
919
|
+
queue_depth = 0 # TODO: Track actual queue depth
|
920
|
+
|
921
|
+
# Get timing metrics from pool metrics
|
922
|
+
stats = self.metrics.get_stats()
|
923
|
+
|
924
|
+
return {
|
925
|
+
"total_connections": total_connections,
|
926
|
+
"active_connections": active_connections,
|
927
|
+
"idle_connections": idle_connections,
|
928
|
+
"queue_depth": queue_depth,
|
929
|
+
"utilization_rate": utilization_rate,
|
930
|
+
"avg_health_score": avg_health_score,
|
931
|
+
"avg_acquisition_time_ms": stats["performance"]["avg_acquisition_time_ms"],
|
932
|
+
"avg_query_time_ms": 50.0, # TODO: Track actual query time
|
933
|
+
"queries_per_second": (
|
934
|
+
stats["queries"]["executed"] / stats["uptime_seconds"]
|
935
|
+
if stats["uptime_seconds"] > 0
|
936
|
+
else 0
|
937
|
+
),
|
938
|
+
# Phase 3 additions
|
939
|
+
"circuit_breaker_status": self.circuit_breaker.get_status(),
|
940
|
+
"comprehensive_metrics": self.metrics_collector.get_all_metrics(),
|
941
|
+
"error_rate": self.metrics_collector.get_error_summary()["error_rate"],
|
942
|
+
"health_score": avg_health_score,
|
943
|
+
"pool_name": self.metadata.name,
|
944
|
+
}
|
945
|
+
|
946
|
+
async def get_comprehensive_status(self) -> Dict[str, Any]:
|
947
|
+
"""Get comprehensive status including all Phase 3 features."""
|
948
|
+
base_stats = await self.get_pool_statistics()
|
949
|
+
|
950
|
+
# Add circuit breaker details
|
951
|
+
cb_status = self.circuit_breaker.get_status()
|
952
|
+
|
953
|
+
# Add comprehensive metrics
|
954
|
+
metrics = self.metrics_collector.get_all_metrics()
|
955
|
+
|
956
|
+
# Add pattern learning insights if enabled
|
957
|
+
pattern_insights = {}
|
958
|
+
if self.query_pattern_tracker:
|
959
|
+
patterns = self.query_pattern_tracker.get_all_patterns()
|
960
|
+
pattern_insights = {
|
961
|
+
"detected_patterns": len(patterns),
|
962
|
+
"workload_forecast": self.query_pattern_tracker.get_workload_forecast(
|
963
|
+
15
|
964
|
+
),
|
965
|
+
}
|
966
|
+
|
967
|
+
# Add adaptive controller status if enabled
|
968
|
+
adaptive_status = {}
|
969
|
+
if self.adaptive_controller:
|
970
|
+
adaptive_status = {
|
971
|
+
"current_size": len(self.all_connections),
|
972
|
+
"recommended_size": self.adaptive_controller.get_recommended_size(),
|
973
|
+
"last_adjustment": self.adaptive_controller.get_last_adjustment(),
|
974
|
+
}
|
975
|
+
|
976
|
+
return {
|
977
|
+
**base_stats,
|
978
|
+
"circuit_breaker": {
|
979
|
+
"state": cb_status["state"],
|
980
|
+
"metrics": cb_status["metrics"],
|
981
|
+
"time_until_recovery": cb_status.get("time_until_recovery"),
|
982
|
+
},
|
983
|
+
"detailed_metrics": {
|
984
|
+
"counters": metrics["counters"],
|
985
|
+
"gauges": metrics["gauges"],
|
986
|
+
"histograms": metrics["histograms"],
|
987
|
+
"errors": metrics["errors"],
|
988
|
+
"query_summary": metrics["queries"],
|
989
|
+
},
|
990
|
+
"pattern_insights": pattern_insights,
|
991
|
+
"adaptive_control": adaptive_status,
|
992
|
+
"monitoring": {
|
993
|
+
"dashboard_enabled": self.enable_monitoring,
|
994
|
+
"dashboard_url": (
|
995
|
+
f"http://localhost:{self.monitoring_port}"
|
996
|
+
if self.enable_monitoring
|
997
|
+
else None
|
998
|
+
),
|
999
|
+
},
|
1000
|
+
}
|
1001
|
+
|
1002
|
+
async def _start_monitoring_dashboard(self) -> Dict[str, Any]:
|
1003
|
+
"""Start the monitoring dashboard if enabled."""
|
1004
|
+
if not self.enable_monitoring:
|
1005
|
+
return {"error": "Monitoring not enabled in configuration"}
|
1006
|
+
|
1007
|
+
try:
|
1008
|
+
# Register this pool with the global metrics aggregator
|
1009
|
+
if hasattr(self.runtime, "metrics_aggregator"):
|
1010
|
+
self.runtime.metrics_aggregator.register_collector(
|
1011
|
+
self.metrics_collector
|
1012
|
+
)
|
1013
|
+
|
1014
|
+
# Start monitoring dashboard if not already running
|
1015
|
+
if not hasattr(self.runtime, "monitoring_dashboard"):
|
1016
|
+
from kailash.nodes.monitoring.connection_dashboard import (
|
1017
|
+
ConnectionDashboardNode,
|
1018
|
+
)
|
1019
|
+
|
1020
|
+
dashboard = ConnectionDashboardNode(
|
1021
|
+
name="global_dashboard",
|
1022
|
+
port=self.monitoring_port,
|
1023
|
+
update_interval=1.0,
|
1024
|
+
)
|
1025
|
+
|
1026
|
+
# Store dashboard in runtime for sharing
|
1027
|
+
self.runtime.monitoring_dashboard = dashboard
|
1028
|
+
await dashboard.start()
|
1029
|
+
|
1030
|
+
return {
|
1031
|
+
"status": "started",
|
1032
|
+
"dashboard_url": f"http://localhost:{self.monitoring_port}",
|
1033
|
+
}
|
1034
|
+
else:
|
1035
|
+
return {
|
1036
|
+
"status": "already_running",
|
1037
|
+
"dashboard_url": f"http://localhost:{self.monitoring_port}",
|
1038
|
+
}
|
1039
|
+
|
1040
|
+
except Exception as e:
|
1041
|
+
logger.error(f"Failed to start monitoring dashboard: {e}")
|
1042
|
+
return {"error": str(e)}
|
1043
|
+
|
1044
|
+
async def _stop_monitoring_dashboard(self) -> Dict[str, Any]:
|
1045
|
+
"""Stop the monitoring dashboard."""
|
1046
|
+
try:
|
1047
|
+
if hasattr(self.runtime, "monitoring_dashboard"):
|
1048
|
+
await self.runtime.monitoring_dashboard.stop()
|
1049
|
+
del self.runtime.monitoring_dashboard
|
1050
|
+
return {"status": "stopped"}
|
1051
|
+
else:
|
1052
|
+
return {"status": "not_running"}
|
1053
|
+
except Exception as e:
|
1054
|
+
logger.error(f"Failed to stop monitoring dashboard: {e}")
|
1055
|
+
return {"error": str(e)}
|
1056
|
+
|
1057
|
+
def _update_pool_metrics(self):
|
1058
|
+
"""Update pool metrics for monitoring."""
|
1059
|
+
total = len(self.all_connections)
|
1060
|
+
active = len(self.active_connections)
|
1061
|
+
idle = self.available_connections.qsize()
|
1062
|
+
|
1063
|
+
# Update comprehensive metrics
|
1064
|
+
self.metrics_collector.update_pool_stats(active, idle, total)
|
1065
|
+
|
1066
|
+
# Track health checks
|
1067
|
+
for conn in self.all_connections.values():
|
1068
|
+
self.metrics_collector.track_health_check(
|
1069
|
+
success=conn.health_score > self.health_threshold,
|
1070
|
+
duration_ms=5.0, # Placeholder - real implementation would track actual time
|
1071
|
+
)
|
@@ -1,7 +1,5 @@
|
|
1
|
-
"""Monitoring
|
1
|
+
"""Monitoring nodes for connection and workflow visualization."""
|
2
2
|
|
3
|
-
from .
|
3
|
+
from .connection_dashboard import ConnectionDashboardNode
|
4
4
|
|
5
|
-
__all__ = [
|
6
|
-
"PerformanceBenchmarkNode",
|
7
|
-
]
|
5
|
+
__all__ = ["ConnectionDashboardNode"]
|