kailash 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. kailash/__init__.py +1 -1
  2. kailash/access_control/__init__.py +1 -1
  3. kailash/core/actors/adaptive_pool_controller.py +630 -0
  4. kailash/core/actors/connection_actor.py +3 -3
  5. kailash/core/ml/__init__.py +1 -0
  6. kailash/core/ml/query_patterns.py +544 -0
  7. kailash/core/monitoring/__init__.py +19 -0
  8. kailash/core/monitoring/connection_metrics.py +488 -0
  9. kailash/core/optimization/__init__.py +1 -0
  10. kailash/core/resilience/__init__.py +17 -0
  11. kailash/core/resilience/circuit_breaker.py +382 -0
  12. kailash/gateway/api.py +7 -5
  13. kailash/gateway/enhanced_gateway.py +1 -1
  14. kailash/middleware/auth/access_control.py +11 -11
  15. kailash/middleware/communication/ai_chat.py +7 -7
  16. kailash/middleware/communication/api_gateway.py +5 -15
  17. kailash/middleware/gateway/checkpoint_manager.py +45 -8
  18. kailash/middleware/gateway/event_store.py +66 -26
  19. kailash/middleware/mcp/enhanced_server.py +2 -2
  20. kailash/nodes/admin/permission_check.py +110 -30
  21. kailash/nodes/admin/schema.sql +387 -0
  22. kailash/nodes/admin/tenant_isolation.py +249 -0
  23. kailash/nodes/admin/transaction_utils.py +244 -0
  24. kailash/nodes/admin/user_management.py +37 -9
  25. kailash/nodes/ai/ai_providers.py +55 -3
  26. kailash/nodes/ai/llm_agent.py +115 -13
  27. kailash/nodes/data/query_pipeline.py +641 -0
  28. kailash/nodes/data/query_router.py +895 -0
  29. kailash/nodes/data/sql.py +24 -0
  30. kailash/nodes/data/workflow_connection_pool.py +451 -23
  31. kailash/nodes/monitoring/__init__.py +3 -5
  32. kailash/nodes/monitoring/connection_dashboard.py +822 -0
  33. kailash/nodes/rag/__init__.py +1 -3
  34. kailash/resources/registry.py +6 -0
  35. kailash/runtime/async_local.py +7 -0
  36. kailash/utils/export.py +152 -0
  37. kailash/workflow/builder.py +42 -0
  38. kailash/workflow/graph.py +86 -17
  39. kailash/workflow/templates.py +4 -9
  40. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/METADATA +14 -1
  41. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/RECORD +45 -31
  42. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/WHEEL +0 -0
  43. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/entry_points.txt +0 -0
  44. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/licenses/LICENSE +0 -0
  45. {kailash-0.6.0.dist-info → kailash-0.6.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,244 @@
1
+ """Transaction utilities for admin nodes to handle timing and persistence issues.
2
+
3
+ This module provides utilities to handle common transaction and timing issues
4
+ encountered in admin node operations, particularly around user creation,
5
+ role assignment, and permission checks.
6
+ """
7
+
8
+ import logging
9
+ import time
10
+ from typing import Any, Callable, Dict, Optional, TypeVar
11
+
12
+ from kailash.sdk_exceptions import NodeExecutionError, NodeValidationError
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ T = TypeVar("T")
17
+
18
+
19
class TransactionHelper:
    """Helper class for handling database transaction timing and persistence issues.

    Wraps a database node and offers two building blocks:

    * ``execute_with_retry`` - run a callable, retrying with exponential backoff.
    * ``verify_operation_success`` - poll a query until it returns at least one row.

    ``create_user_with_verification`` and ``assign_role_with_verification``
    combine both for the user-creation and role-assignment flows.
    """

    def __init__(self, db_node, max_retries: int = 3, retry_delay: float = 0.1):
        """
        Initialize transaction helper.

        Args:
            db_node: Database node instance (SQLDatabaseNode)
            max_retries: Total number of attempts made for an operation
            retry_delay: Base delay between attempts in seconds; it is doubled
                after every failed attempt
        """
        self.db_node = db_node
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def execute_with_retry(
        self, operation: "Callable[[], T]", operation_name: str
    ) -> "T":
        """
        Execute a database operation with retry logic.

        Args:
            operation: Function that performs the database operation
            operation_name: Description of the operation for logging

        Returns:
            Result of the operation

        Raises:
            NodeExecutionError: If operation fails after all retries (also
                raised without calling ``operation`` when ``max_retries`` is
                zero or negative). The last failure is attached as
                ``__cause__``.
        """
        last_exception = None

        for attempt in range(self.max_retries):
            try:
                result = operation()
                if attempt > 0:
                    logger.info(f"{operation_name} succeeded on attempt {attempt + 1}")
                return result
            except Exception as e:
                last_exception = e
                if attempt < self.max_retries - 1:
                    logger.warning(
                        f"{operation_name} failed on attempt {attempt + 1}, retrying: {e}"
                    )
                    time.sleep(self.retry_delay * (2**attempt))  # Exponential backoff
                else:
                    logger.error(
                        f"{operation_name} failed after {self.max_retries} attempts: {e}"
                    )

        # Chain the last failure so its traceback is not lost to callers.
        raise NodeExecutionError(
            f"{operation_name} failed after {self.max_retries} attempts: {last_exception}"
        ) from last_exception

    def verify_operation_success(
        self,
        verification_query: str,
        expected_result: Any,
        operation_name: str,
        timeout_seconds: float = 5.0,
        parameters: Optional[list] = None,
    ) -> bool:
        """
        Verify that a database operation was successful by checking the result.

        The query is re-run roughly every 50ms until it returns at least one
        row or the timeout elapses. Errors raised by the query are logged at
        debug level and treated as "not yet visible".

        Args:
            verification_query: SQL query to verify the operation
            expected_result: Expected result from the verification query.
                Currently not compared against the returned rows; any
                non-empty result set counts as success.
            operation_name: Description of the operation for logging
            timeout_seconds: Maximum time to wait for verification
            parameters: Values bound to the query's placeholders. When omitted
                the query is executed without a ``parameters`` argument.

        Returns:
            True if verification succeeds

        Raises:
            NodeValidationError: If verification fails after timeout
        """
        query_kwargs: Dict[str, Any] = {
            "query": verification_query,
            "result_format": "dict",
        }
        if parameters is not None:
            # NOTE(review): assumes db_node.run() accepts ``parameters`` for
            # placeholder binding - confirm against SQLDatabaseNode.
            query_kwargs["parameters"] = parameters

        # Monotonic clock: the elapsed time must not jump on wall-clock changes.
        start_time = time.monotonic()

        while time.monotonic() - start_time < timeout_seconds:
            try:
                result = self.db_node.run(**query_kwargs)
                data = result.get("data", [])

                if data:
                    # Operation was successful
                    logger.debug(f"{operation_name} verification succeeded")
                    return True

            except Exception as e:
                logger.debug(f"{operation_name} verification error: {e}")

            # Wait before retrying
            time.sleep(0.05)  # 50ms

        raise NodeValidationError(
            f"{operation_name} verification failed after {timeout_seconds}s"
        )

    def create_user_with_verification(
        self, user_data: Dict[str, Any], tenant_id: str
    ) -> Dict[str, Any]:
        """
        Create a user and verify the creation was successful.

        Args:
            user_data: User data dictionary
            tenant_id: Tenant ID

        Returns:
            User creation result

        Raises:
            NodeExecutionError: If creation fails after all retries
            NodeValidationError: If the created user cannot be read back
        """
        user_id = user_data.get("user_id")

        def create_operation():
            # Perform the user creation
            from .user_management import UserManagementNode

            user_mgmt = UserManagementNode(database_url=self.db_node.connection_string)
            return user_mgmt.run(
                operation="create_user", user_data=user_data, tenant_id=tenant_id
            )

        # Execute creation with retry
        result = self.execute_with_retry(
            create_operation, f"User creation for {user_id}"
        )

        # Verify user was created
        verification_query = """
        SELECT user_id FROM users
        WHERE user_id = $1 AND tenant_id = $2
        """

        # Bind $1/$2; without parameters the placeholder query cannot match.
        self.verify_operation_success(
            verification_query,
            user_id,
            f"User {user_id} creation verification",
            timeout_seconds=2.0,
            parameters=[user_id, tenant_id],
        )

        return result

    def assign_role_with_verification(
        self, user_id: str, role_id: str, tenant_id: str
    ) -> Dict[str, Any]:
        """
        Assign a role to a user and verify the assignment was successful.

        Args:
            user_id: User ID
            role_id: Role ID
            tenant_id: Tenant ID

        Returns:
            Role assignment result

        Raises:
            NodeExecutionError: If the assignment fails after all retries
            NodeValidationError: If the assignment cannot be read back
        """

        def assign_operation():
            from .role_management import RoleManagementNode

            role_mgmt = RoleManagementNode(database_url=self.db_node.connection_string)
            return role_mgmt.run(
                operation="assign_user",
                user_id=user_id,
                role_id=role_id,
                tenant_id=tenant_id,
            )

        # Execute assignment with retry
        result = self.execute_with_retry(
            assign_operation, f"Role assignment {role_id} to {user_id}"
        )

        # Verify role was assigned
        verification_query = """
        SELECT user_id, role_id FROM user_role_assignments
        WHERE user_id = $1 AND role_id = $2 AND tenant_id = $3 AND is_active = true
        """

        # Bind $1/$2/$3 in the order they appear in the query.
        self.verify_operation_success(
            verification_query,
            {"user_id": user_id, "role_id": role_id},
            f"Role assignment {role_id} to {user_id} verification",
            timeout_seconds=2.0,
            parameters=[user_id, role_id, tenant_id],
        )

        return result
208
+
209
+
210
def with_transaction_retry(max_retries: int = 3, retry_delay: float = 0.1):
    """
    Decorator to add retry logic to admin node operations.

    The wrapped function is called up to ``max_retries`` times. After each
    failed attempt except the last, the wrapper sleeps for
    ``retry_delay * 2**attempt`` seconds (exponential backoff).

    Args:
        max_retries: Total number of attempts
        retry_delay: Initial delay between retries

    Returns:
        A decorator whose wrapper returns the wrapped function's result.

    Raises:
        NodeExecutionError: From the wrapper, when every attempt failed. The
            last failure is attached as ``__cause__``.
    """
    # Local import keeps this block independent of the module's import list.
    import functools

    def decorator(func):
        # Preserve the wrapped function's name/docstring for logs and tooling.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None

            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if attempt < max_retries - 1:
                        logger.warning(
                            f"{func.__name__} failed on attempt {attempt + 1}, retrying: {e}"
                        )
                        time.sleep(retry_delay * (2**attempt))
                    else:
                        logger.error(
                            f"{func.__name__} failed after {max_retries} attempts: {e}"
                        )

            raise NodeExecutionError(
                f"{func.__name__} failed after {max_retries} attempts: {last_exception}"
            ) from last_exception

        return wrapper

    return decorator
@@ -25,6 +25,8 @@ from enum import Enum
25
25
  from typing import Any, Dict, List, Optional, Set, Union
26
26
  from uuid import uuid4
27
27
 
28
+ import bcrypt
29
+
28
30
  from kailash.nodes.base import Node, NodeParameter, register_node
29
31
  from kailash.nodes.data import SQLDatabaseNode
30
32
  from kailash.sdk_exceptions import NodeExecutionError, NodeValidationError
@@ -32,6 +34,25 @@ from kailash.sdk_exceptions import NodeExecutionError, NodeValidationError
32
34
  from .schema_manager import AdminSchemaManager
33
35
 
34
36
 
37
def hash_password(password: str) -> str:
    """Return a salted bcrypt hash of ``password`` as text.

    A falsy password (empty string or ``None``) is not hashed; an empty
    string is returned instead.
    """
    if password:
        fresh_salt = bcrypt.gensalt()
        digest = bcrypt.hashpw(password.encode("utf-8"), fresh_salt)
        return digest.decode("utf-8")
    return ""
44
+
45
+
46
def verify_password(password: str, hashed: str) -> bool:
    """Check ``password`` against a bcrypt hash.

    Returns False when either argument is falsy, and also when the bcrypt
    comparison raises for any reason (for example a malformed hash).
    """
    if password and hashed:
        try:
            candidate = password.encode("utf-8")
            stored = hashed.encode("utf-8")
            return bcrypt.checkpw(candidate, stored)
        except Exception:
            return False
    return False
54
+
55
+
35
56
  def parse_datetime(value: Union[str, datetime, None]) -> Optional[datetime]:
36
57
  """Parse datetime from various formats."""
37
58
  if value is None:
@@ -496,7 +517,7 @@ class UserManagementNode(Node):
496
517
  user.user_id,
497
518
  user.email,
498
519
  user.username,
499
- inputs.get("password_hash"),
520
+ hash_password(inputs.get("password", "")),
500
521
  user.first_name,
501
522
  user.last_name,
502
523
  user.display_name,
@@ -509,12 +530,15 @@ class UserManagementNode(Node):
509
530
  ],
510
531
  )
511
532
 
512
- # Get the created user to return complete data
513
- created_user = self._get_user_by_id(user.user_id, tenant_id)
533
+ # Return the user data that was successfully inserted
534
+ # Add timestamps that would be set by the database
535
+ user_dict = user.to_dict()
536
+ user_dict["created_at"] = datetime.now(UTC).isoformat()
537
+ user_dict["updated_at"] = datetime.now(UTC).isoformat()
514
538
 
515
539
  return {
516
540
  "result": {
517
- "user": created_user.to_dict(),
541
+ "user": user_dict,
518
542
  "operation": "create_user",
519
543
  "timestamp": datetime.now(UTC).isoformat(),
520
544
  }
@@ -918,7 +942,8 @@ class UserManagementNode(Node):
918
942
  """Set user password hash."""
919
943
  user_id = inputs["user_id"]
920
944
  tenant_id = inputs["tenant_id"]
921
- password_hash = inputs["password_hash"]
945
+ password = inputs.get("password", "")
946
+ password_hash = hash_password(password)
922
947
 
923
948
  update_query = """
924
949
  UPDATE users
@@ -964,9 +989,13 @@ class UserManagementNode(Node):
964
989
  for i, user_data in enumerate(users_data):
965
990
  try:
966
991
  # Create each user individually for better error handling
992
+ # Extract password from user_data if present
993
+ user_data_copy = user_data.copy()
994
+ password = user_data_copy.pop("password", "")
967
995
  create_inputs = {
968
996
  "operation": "create_user",
969
- "user_data": user_data,
997
+ "user_data": user_data_copy,
998
+ "password": password,
970
999
  "tenant_id": tenant_id,
971
1000
  "database_config": inputs["database_config"],
972
1001
  }
@@ -1370,7 +1399,7 @@ class UserManagementNode(Node):
1370
1399
  user_id = result["data"][0]["user_id"]
1371
1400
 
1372
1401
  # Update password
1373
- password_hash = hashlib.sha256(new_password.encode()).hexdigest()
1402
+ password_hash = hash_password(new_password)
1374
1403
  update_query = """
1375
1404
  UPDATE users
1376
1405
  SET password_hash = :password_hash,
@@ -1441,9 +1470,8 @@ class UserManagementNode(Node):
1441
1470
 
1442
1471
  user_data = result["data"][0]
1443
1472
  stored_hash = user_data["password_hash"]
1444
- provided_hash = hashlib.sha256(password.encode()).hexdigest()
1445
1473
 
1446
- if stored_hash != provided_hash:
1474
+ if not verify_password(password, stored_hash):
1447
1475
  return {"authenticated": False, "message": "Invalid password"}
1448
1476
 
1449
1477
  if user_data["status"] != "active":
@@ -387,10 +387,16 @@ class OllamaProvider(UnifiedAIProvider):
387
387
  return self._available
388
388
 
389
389
  try:
390
+ import os
391
+
390
392
  import ollama
391
393
 
394
+ # Check with environment-configured host if available
395
+ host = os.getenv("OLLAMA_BASE_URL") or os.getenv("OLLAMA_HOST")
396
+ client = ollama.Client(host=host) if host else ollama.Client()
397
+
392
398
  # Check if Ollama is running
393
- ollama.list()
399
+ client.list()
394
400
  self._available = True
395
401
  except Exception:
396
402
  self._available = False
@@ -409,6 +415,9 @@ class OllamaProvider(UnifiedAIProvider):
409
415
  * temperature, max_tokens, top_p, top_k, repeat_penalty
410
416
  * seed, stop, num_ctx, num_batch, num_thread
411
417
  * tfs_z, typical_p, mirostat, mirostat_tau, mirostat_eta
418
+ backend_config (dict): Backend configuration including:
419
+ * host (str): Ollama host URL (default: from env or http://localhost:11434)
420
+ * port (int): Ollama port (if provided, will be appended to host)
412
421
 
413
422
  Returns:
414
423
  Dict containing the standardized response.
@@ -418,6 +427,28 @@ class OllamaProvider(UnifiedAIProvider):
418
427
 
419
428
  model = kwargs.get("model", "llama3.1:8b-instruct-q8_0")
420
429
  generation_config = kwargs.get("generation_config", {})
430
+ backend_config = kwargs.get("backend_config", {})
431
+
432
+ # Configure Ollama client with custom host if provided
433
+ if backend_config:
434
+ host = backend_config.get("host", "localhost")
435
+ port = backend_config.get("port")
436
+ if port:
437
+ # Construct full URL if port is provided
438
+ host = (
439
+ f"http://{host}:{port}"
440
+ if not host.startswith("http")
441
+ else f"{host}:{port}"
442
+ )
443
+ elif backend_config.get("base_url"):
444
+ host = backend_config["base_url"]
445
+ self._client = ollama.Client(host=host)
446
+ elif self._client is None:
447
+ # Use default client
448
+ import os
449
+
450
+ host = os.getenv("OLLAMA_BASE_URL") or os.getenv("OLLAMA_HOST")
451
+ self._client = ollama.Client(host=host) if host else ollama.Client()
421
452
 
422
453
  # Map generation_config to Ollama options
423
454
  options = {
@@ -482,7 +513,7 @@ class OllamaProvider(UnifiedAIProvider):
482
513
  processed_messages.append(msg)
483
514
 
484
515
  # Call Ollama
485
- response = ollama.chat(
516
+ response = self._client.chat(
486
517
  model=model, messages=processed_messages, options=options
487
518
  )
488
519
 
@@ -522,16 +553,37 @@ class OllamaProvider(UnifiedAIProvider):
522
553
  Supported kwargs:
523
554
  - model (str): Ollama model name (default: "snowflake-arctic-embed2")
524
555
  - normalize (bool): Normalize embeddings to unit length
556
+ - backend_config (dict): Backend configuration (host, port, base_url)
525
557
  """
526
558
  try:
527
559
  import ollama
528
560
 
529
561
  model = kwargs.get("model", "snowflake-arctic-embed2")
530
562
  normalize = kwargs.get("normalize", False)
563
+ backend_config = kwargs.get("backend_config", {})
564
+
565
+ # Configure Ollama client if not already configured
566
+ if backend_config and not hasattr(self, "_client"):
567
+ host = backend_config.get("host", "localhost")
568
+ port = backend_config.get("port")
569
+ if port:
570
+ host = (
571
+ f"http://{host}:{port}"
572
+ if not host.startswith("http")
573
+ else f"{host}:{port}"
574
+ )
575
+ elif backend_config.get("base_url"):
576
+ host = backend_config["base_url"]
577
+ self._client = ollama.Client(host=host)
578
+ elif not hasattr(self, "_client") or self._client is None:
579
+ import os
580
+
581
+ host = os.getenv("OLLAMA_BASE_URL") or os.getenv("OLLAMA_HOST")
582
+ self._client = ollama.Client(host=host) if host else ollama.Client()
531
583
 
532
584
  embeddings = []
533
585
  for text in texts:
534
- response = ollama.embeddings(model=model, prompt=text)
586
+ response = self._client.embeddings(model=model, prompt=text)
535
587
  embedding = response.get("embedding", [])
536
588
 
537
589
  if normalize and embedding:
@@ -853,6 +853,62 @@ class LLMAgentNode(Node):
853
853
  "loaded_from": "mock_storage",
854
854
  }
855
855
 
856
+ def _run_async_in_sync_context(self, coro):
857
+ """
858
+ Run async coroutine in a synchronous context, handling existing event loops.
859
+
860
+ This helper method detects if an event loop is already running and handles
861
+ the execution appropriately to avoid "RuntimeError: This event loop is already running".
862
+
863
+ Args:
864
+ coro: The coroutine to execute
865
+
866
+ Returns:
867
+ The result of the coroutine execution
868
+
869
+ Raises:
870
+ TimeoutError: If the operation times out (30 seconds)
871
+ Exception: Any exception raised by the coroutine
872
+ """
873
+ import asyncio
874
+
875
+ try:
876
+ # Check if there's already a running event loop
877
+ loop = asyncio.get_running_loop()
878
+ # If we're here, there's a running loop - create a new thread
879
+ import threading
880
+
881
+ result = None
882
+ exception = None
883
+
884
+ def run_in_thread():
885
+ nonlocal result, exception
886
+ try:
887
+ # Create new event loop in thread
888
+ new_loop = asyncio.new_event_loop()
889
+ asyncio.set_event_loop(new_loop)
890
+ try:
891
+ result = new_loop.run_until_complete(coro)
892
+ finally:
893
+ new_loop.close()
894
+ except Exception as e:
895
+ exception = e
896
+
897
+ thread = threading.Thread(target=run_in_thread)
898
+ thread.start()
899
+ thread.join(timeout=30) # 30 second timeout
900
+
901
+ if thread.is_alive():
902
+ raise TimeoutError("MCP operation timed out after 30 seconds")
903
+
904
+ if exception:
905
+ raise exception
906
+ return result
907
+
908
+ except RuntimeError:
909
+ # No running event loop, use asyncio.run()
910
+ return asyncio.run(coro)
911
+
856
912
  def _retrieve_mcp_context(
857
913
  self, mcp_servers: list[dict], mcp_context: list[str]
858
914
  ) -> list[dict[str, Any]]:
@@ -939,14 +995,14 @@ class LLMAgentNode(Node):
939
995
  for server_config in mcp_servers:
940
996
  try:
941
997
  # List resources from server
942
- resources = asyncio.run(
998
+ resources = self._run_async_in_sync_context(
943
999
  self._mcp_client.list_resources(server_config)
944
1000
  )
945
1001
 
946
1002
  # Read specific resources if requested
947
1003
  for uri in mcp_context:
948
1004
  try:
949
- resource_data = asyncio.run(
1005
+ resource_data = self._run_async_in_sync_context(
950
1006
  self._mcp_client.read_resource(server_config, uri)
951
1007
  )
952
1008
 
@@ -1014,17 +1070,48 @@ class LLMAgentNode(Node):
1014
1070
  }
1015
1071
  )
1016
1072
 
1073
+ except TimeoutError as e:
1074
+ self.logger.warning(
1075
+ f"MCP server '{server_config.get('name', 'unknown')}' timed out after 30 seconds: {e}"
1076
+ )
1077
+ # Fall back to mock for this server
1078
+ context_data.append(
1079
+ {
1080
+ "uri": f"mcp://{server_config.get('name', 'unknown')}/fallback",
1081
+ "content": "MCP server timed out - using fallback content. Check if the server is running and accessible.",
1082
+ "source": server_config.get("name", "unknown"),
1083
+ "retrieved_at": datetime.now().isoformat(),
1084
+ "relevance_score": 0.5,
1085
+ "metadata": {
1086
+ "error": "timeout",
1087
+ "error_message": str(e),
1088
+ },
1089
+ }
1090
+ )
1017
1091
  except Exception as e:
1018
- self.logger.debug(f"MCP server connection failed: {e}")
1092
+ error_type = type(e).__name__
1093
+ self.logger.error(
1094
+ f"MCP server '{server_config.get('name', 'unknown')}' connection failed ({error_type}): {e}"
1095
+ )
1096
+
1097
+ # Provide helpful error messages based on exception type
1098
+ if "coroutine" in str(e).lower() and "await" in str(e).lower():
1099
+ self.logger.error(
1100
+ "This appears to be an async/await issue. Please report this bug to the Kailash SDK team."
1101
+ )
1102
+
1019
1103
  # Fall back to mock for this server
1020
1104
  context_data.append(
1021
1105
  {
1022
1106
  "uri": f"mcp://{server_config.get('name', 'unknown')}/fallback",
1023
- "content": "Connection failed, using fallback content",
1107
+ "content": f"Connection failed ({error_type}) - using fallback content. Error: {str(e)}",
1024
1108
  "source": server_config.get("name", "unknown"),
1025
1109
  "retrieved_at": datetime.now().isoformat(),
1026
1110
  "relevance_score": 0.5,
1027
- "metadata": {"error": str(e)},
1111
+ "metadata": {
1112
+ "error": error_type,
1113
+ "error_message": str(e),
1114
+ },
1028
1115
  }
1029
1116
  )
1030
1117
 
@@ -1032,11 +1119,17 @@ class LLMAgentNode(Node):
1032
1119
  if context_data:
1033
1120
  return context_data
1034
1121
 
1035
- except ImportError:
1122
+ except ImportError as e:
1036
1123
  # MCPClient not available, fall back to mock
1124
+ self.logger.info(
1125
+ "MCP client not available. Install the MCP SDK with 'pip install mcp' to use real MCP servers."
1126
+ )
1037
1127
  pass
1038
1128
  except Exception as e:
1039
- self.logger.debug(f"MCP retrieval error: {e}")
1129
+ self.logger.error(
1130
+ f"Unexpected error in MCP retrieval: {type(e).__name__}: {e}"
1131
+ )
1132
+ self.logger.info("Falling back to mock MCP implementation.")
1040
1133
 
1041
1134
  # Fallback to mock implementation
1042
1135
  for uri in mcp_context:
@@ -1089,8 +1182,6 @@ class LLMAgentNode(Node):
1089
1182
 
1090
1183
  if use_real_mcp:
1091
1184
  try:
1092
- import asyncio
1093
-
1094
1185
  from kailash.mcp import MCPClient
1095
1186
 
1096
1187
  # Initialize MCP client if not already done
@@ -1101,7 +1192,7 @@ class LLMAgentNode(Node):
1101
1192
  for server_config in mcp_servers:
1102
1193
  try:
1103
1194
  # Discover tools asynchronously
1104
- tools = asyncio.run(
1195
+ tools = self._run_async_in_sync_context(
1105
1196
  self._mcp_client.discover_tools(server_config)
1106
1197
  )
1107
1198
 
@@ -1131,16 +1222,27 @@ class LLMAgentNode(Node):
1131
1222
  {"type": "function", "function": function_def}
1132
1223
  )
1133
1224
 
1225
+ except TimeoutError as e:
1226
+ self.logger.warning(
1227
+ f"Tool discovery timed out for MCP server '{server_config.get('name', 'unknown')}': {e}"
1228
+ )
1134
1229
  except Exception as e:
1135
- self.logger.debug(
1136
- f"Failed to discover tools from {server_config.get('name', 'unknown')}: {e}"
1230
+ error_type = type(e).__name__
1231
+ self.logger.error(
1232
+ f"Failed to discover tools from '{server_config.get('name', 'unknown')}' ({error_type}): {e}"
1137
1233
  )
1138
1234
 
1139
1235
  except ImportError:
1140
1236
  # MCPClient not available, use mock tools
1237
+ self.logger.info(
1238
+ "MCP client not available for tool discovery. Install with 'pip install mcp' for real MCP tools."
1239
+ )
1141
1240
  pass
1142
1241
  except Exception as e:
1143
- self.logger.debug(f"MCP tool discovery error: {e}")
1242
+ self.logger.error(
1243
+ f"Unexpected error in MCP tool discovery: {type(e).__name__}: {e}"
1244
+ )
1245
+ self.logger.info("Using mock tools as fallback.")
1144
1246
 
1145
1247
  # If no real tools discovered, provide minimal generic tools
1146
1248
  if not discovered_tools: