kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
kailash/nodes/rag/federated.py (new file)
@@ -0,0 +1,1188 @@
"""
Federated RAG Implementation

Implements RAG across distributed data sources without centralization:
- Federated learning for distributed embeddings
- Cross-silo and cross-device federation
- Secure aggregation protocols
- Heterogeneous data handling
- Communication-efficient protocols

Based on federated learning and distributed systems research.
"""

import asyncio
import hashlib
import json
import logging
import random
import re
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union

from ...workflow.builder import WorkflowBuilder
from ..api.rest import RESTClientNode
from ..base import Node, NodeParameter, register_node
from ..code.python import PythonCodeNode
from ..logic.workflow import WorkflowNode

logger = logging.getLogger(__name__)


@register_node()
class FederatedRAGNode(WorkflowNode):
    """
    Federated RAG for Distributed Data Sources

    Implements RAG that operates across multiple distributed data sources
    without requiring data centralization, preserving data locality and privacy.

    When to use:
    - Best for: Multi-organization data, edge computing, privacy-critical scenarios
    - Not ideal for: Small datasets, single-source data
    - Performance: 2-10 seconds depending on federation size
    - Privacy: Data never leaves source organizations

    Key features:
    - Distributed query execution across federated nodes
    - Local computation with global aggregation
    - Heterogeneous data source support
    - Communication-efficient protocols
    - Fault tolerance for node failures
    - Secure aggregation of results

    Example:
        federated_rag = FederatedRAGNode(
            federation_nodes=["hospital_a", "hospital_b", "research_lab"],
            aggregation_strategy="weighted_average",
            min_participating_nodes=2
        )

        # Query across all federated sources
        result = await federated_rag.run(
            query="Latest treatment protocols for condition X",
            node_endpoints={
                "hospital_a": "https://hospitalA.api/rag",
                "hospital_b": "https://hospitalB.api/rag",
                "research_lab": "https://lab.api/rag"
            }
        )

        # Returns aggregated results without exposing individual data

    Parameters:
        federation_nodes: List of participating nodes
        aggregation_strategy: How to combine results
        min_participating_nodes: Minimum nodes for valid result
        timeout_per_node: Maximum wait time per node
        enable_caching: Cache results at edge nodes

    Returns:
        federated_results: Aggregated results from all nodes
        node_contributions: Which nodes participated
        aggregation_metadata: How results were combined
        federation_health: Status of federated network
    """

    def __init__(
        self,
        name: str = "federated_rag",
        federation_nodes: List[str] = None,
        aggregation_strategy: str = "weighted_average",
        min_participating_nodes: int = 2,
        timeout_per_node: float = 5.0,
        enable_caching: bool = True,
    ):
        self.federation_nodes = federation_nodes or []
        self.aggregation_strategy = aggregation_strategy
        self.min_participating_nodes = min_participating_nodes
        self.timeout_per_node = timeout_per_node
        self.enable_caching = enable_caching
        super().__init__(name, self._create_workflow())

    def _create_workflow(self) -> WorkflowNode:
        """Create federated RAG workflow"""
        builder = WorkflowBuilder()

        # Query distributor
        query_distributor_id = builder.add_node(
            "PythonCodeNode",
            node_id="query_distributor",
            config={
                "code": f"""
import hashlib
from datetime import datetime

def distribute_query(query, node_endpoints, federation_config):
    '''Prepare query for distribution to federated nodes'''

    # Generate query ID for tracking
    query_id = hashlib.sha256(
        f"{{query}}_{{datetime.now().isoformat()}}".encode()
    ).hexdigest()[:16]

    # Prepare distribution plan
    distribution_plan = {{
        "query_id": query_id,
        "query": query,
        "timestamp": datetime.now().isoformat(),
        "target_nodes": [],
        "federation_metadata": {{
            "total_nodes": len(node_endpoints),
            "min_required": {self.min_participating_nodes},
            "timeout_per_node": {self.timeout_per_node},
            "aggregation_strategy": "{self.aggregation_strategy}"
        }}
    }}

    # Create node-specific queries
    for node_id, endpoint in node_endpoints.items():
        node_query = {{
            "node_id": node_id,
            "endpoint": endpoint,
            "query_payload": {{
                "query": query,
                "query_id": query_id,
                "federation_context": {{
                    "requesting_node": "coordinator",
                    "protocol_version": "1.0",
                    "response_format": "standardized"
                }}
            }},
            "timeout": {self.timeout_per_node}
        }}

        distribution_plan["target_nodes"].append(node_query)

    result = {{
        "distribution_plan": distribution_plan,
        "ready_for_distribution": True
    }}
"""
            },
        )

        # Federated query executor (simulated - would use actual network calls)
        federated_executor_id = builder.add_node(
            "PythonCodeNode",
            node_id="federated_executor",
            config={
                "code": f"""
import asyncio
import random
import time
from datetime import datetime

def execute_federated_queries(distribution_plan):
    '''Execute queries across federated nodes'''

    node_responses = []
    failed_nodes = []

    # Simulate parallel execution across nodes
    for node_info in distribution_plan["target_nodes"]:
        node_id = node_info["node_id"]

        # Simulate network call with varying latency
        start_time = time.time()

        # Simulate different node behaviors
        if random.random() > 0.9:  # 10% failure rate
            failed_nodes.append({{
                "node_id": node_id,
                "error": "Connection timeout",
                "timestamp": datetime.now().isoformat()
            }})
            continue

        # Simulate node processing
        latency = random.uniform(0.5, 3.0)

        # Generate simulated response based on node type
        if "hospital" in node_id:
            results = [
                {{
                    "content": f"Clinical protocol from {{node_id}}: Treatment approach...",
                    "score": 0.85 + random.random() * 0.1,
                    "metadata": {{"source": "clinical_database", "last_updated": "2024-01"}}
                }},
                {{
                    "content": f"Patient outcomes data from {{node_id}}...",
                    "score": 0.80 + random.random() * 0.1,
                    "metadata": {{"source": "patient_records", "anonymized": True}}
                }}
            ]
        elif "research" in node_id:
            results = [
                {{
                    "content": f"Research findings from {{node_id}}: Latest studies show...",
                    "score": 0.90 + random.random() * 0.05,
                    "metadata": {{"source": "research_papers", "peer_reviewed": True}}
                }},
                {{
                    "content": f"Experimental data from {{node_id}}...",
                    "score": 0.75 + random.random() * 0.15,
                    "metadata": {{"source": "lab_results", "trial_phase": "3"}}
                }}
            ]
        else:
            results = [
                {{
                    "content": f"General data from {{node_id}}...",
                    "score": 0.70 + random.random() * 0.2,
                    "metadata": {{"source": "general_database"}}
                }}
            ]

        # Add response
        response_time = time.time() - start_time

        node_responses.append({{
            "node_id": node_id,
            "status": "success",
            "results": results,
            "metadata": {{
                "response_time": response_time,
                "result_count": len(results),
                "node_load": random.uniform(0.3, 0.9),
                "cache_hit": random.random() > 0.7 if {self.enable_caching} else False
            }},
            "timestamp": datetime.now().isoformat()
        }})

    # Check if minimum nodes responded
    successful_nodes = len(node_responses)
    minimum_met = successful_nodes >= {self.min_participating_nodes}

    result = {{
        "federated_responses": {{
            "query_id": distribution_plan["query_id"],
            "node_responses": node_responses,
            "failed_nodes": failed_nodes,
            "statistics": {{
                "total_nodes": len(distribution_plan["target_nodes"]),
                "successful_nodes": successful_nodes,
                "failed_nodes": len(failed_nodes),
                "minimum_requirement_met": minimum_met,
                "avg_response_time": sum(r["metadata"]["response_time"] for r in node_responses) / len(node_responses) if node_responses else 0
            }}
        }}
    }}
"""
            },
        )

        # Result aggregator
        result_aggregator_id = builder.add_node(
            "PythonCodeNode",
            node_id="result_aggregator",
            config={
                "code": f"""
from collections import defaultdict
import statistics

def aggregate_federated_results(federated_responses):
    '''Aggregate results from multiple federated nodes'''

    if not federated_responses["statistics"]["minimum_requirement_met"]:
        return {{
            "aggregated_results": {{
                "error": "Insufficient nodes responded",
                "required": {self.min_participating_nodes},
                "received": federated_responses["statistics"]["successful_nodes"]
            }}
        }}

    # Collect all results
    all_results = []
    node_weights = {{}}

    for node_response in federated_responses["node_responses"]:
        node_id = node_response["node_id"]

        # Calculate node weight based on various factors
        weight = 1.0

        # Adjust weight based on response time (faster = higher weight)
        avg_response_time = federated_responses["statistics"]["avg_response_time"]
        if avg_response_time > 0:
            weight *= avg_response_time / node_response["metadata"]["response_time"]

        # Adjust weight based on result count
        weight *= min(2.0, 1 + node_response["metadata"]["result_count"] / 10)

        # Boost weight for cache hits
        if node_response["metadata"].get("cache_hit"):
            weight *= 1.2

        node_weights[node_id] = weight

        # Add results with node information
        for result in node_response["results"]:
            result_with_node = result.copy()
            result_with_node["source_node"] = node_id
            result_with_node["node_weight"] = weight
            all_results.append(result_with_node)

    # Aggregate based on strategy
    if "{self.aggregation_strategy}" == "weighted_average":
        # Group similar results and weight scores
        grouped_results = defaultdict(list)

        for result in all_results:
            # Simple content hashing for grouping
            content_key = result["content"][:50]  # First 50 chars as key
            grouped_results[content_key].append(result)

        aggregated = []
        for content_key, group in grouped_results.items():
            # Calculate weighted average score
            total_weight = sum(r["node_weight"] for r in group)
            weighted_score = sum(r["score"] * r["node_weight"] for r in group) / total_weight

            # Merge metadata
            merged_metadata = {{
                "source_nodes": list(set(r["source_node"] for r in group)),
                "aggregation_method": "weighted_average",
                "individual_scores": {{r["source_node"]: r["score"] for r in group}},
                "confidence": statistics.stdev([r["score"] for r in group]) if len(group) > 1 else 1.0
            }}

            aggregated.append({{
                "content": group[0]["content"],  # Use full content from first
                "score": weighted_score,
                "metadata": merged_metadata,
                "node_agreement": len(group) / len(federated_responses["node_responses"])
            }})

    elif "{self.aggregation_strategy}" == "voting":
        # Majority voting on top results
        node_top_results = {{}}

        for node_response in federated_responses["node_responses"]:
            node_id = node_response["node_id"]
            # Get top 3 results from each node
            top_results = sorted(node_response["results"], key=lambda x: x["score"], reverse=True)[:3]
            node_top_results[node_id] = [r["content"] for r in top_results]

        # Count votes
        content_votes = defaultdict(int)
        for node_id, results in node_top_results.items():
            for i, content in enumerate(results):
                # Higher rank = more votes
                content_votes[content] += (3 - i)

        # Sort by votes
        aggregated = []
        for content, votes in sorted(content_votes.items(), key=lambda x: x[1], reverse=True)[:10]:
            # Find original result data
            for result in all_results:
                if result["content"] == content:
                    aggregated.append({{
                        "content": content,
                        "score": votes / (3 * len(federated_responses["node_responses"])),
                        "metadata": {{
                            "aggregation_method": "voting",
                            "vote_count": votes,
                            "max_possible_votes": 3 * len(federated_responses["node_responses"])
                        }}
                    }})
                    break

    else:  # Simple merge
        # Just combine and sort by score
        aggregated = sorted(all_results, key=lambda x: x["score"], reverse=True)[:10]
        for result in aggregated:
            result["metadata"]["aggregation_method"] = "simple_merge"

    # Sort final results by score
    aggregated.sort(key=lambda x: x["score"], reverse=True)

    # Calculate federation health metrics
    federation_health = {{
        "overall_health": "healthy" if federated_responses["statistics"]["successful_nodes"] >= len(federated_responses["node_responses"]) * 0.8 else "degraded",
        "node_participation_rate": federated_responses["statistics"]["successful_nodes"] / federated_responses["statistics"]["total_nodes"],
        "avg_node_latency": federated_responses["statistics"]["avg_response_time"],
        "result_diversity": len(set(r["content"][:30] for r in all_results)) / len(all_results) if all_results else 0
    }}

    result = {{
        "aggregated_results": {{
            "results": aggregated[:10],  # Top 10 aggregated results
            "total_raw_results": len(all_results),
            "aggregation_metadata": {{
                "strategy": "{self.aggregation_strategy}",
                "node_weights": node_weights,
                "participating_nodes": list(node_weights.keys())
            }},
            "federation_health": federation_health
        }}
    }}
"""
            },
        )

        # Cache coordinator (if enabled)
        if self.enable_caching:
            cache_coordinator_id = builder.add_node(
                "PythonCodeNode",
                node_id="cache_coordinator",
                config={
                    "code": """
import hashlib
from datetime import datetime

def coordinate_caching(aggregated_results, distribution_plan):
    '''Coordinate caching across federated nodes'''

    # Identify high-value results to cache
    cache_candidates = []

    for result in aggregated_results["results"][:5]:  # Top 5 results
        if result["score"] > 0.8 and result.get("node_agreement", 0) > 0.5:
            cache_candidates.append({
                "content_hash": hashlib.sha256(result["content"].encode()).hexdigest()[:16],
                "result": result,
                "cache_priority": result["score"] * result.get("node_agreement", 1),
                "ttl": 3600  # 1 hour
            })

    # Create cache distribution plan
    cache_distribution = {
        "cache_candidates": cache_candidates,
        "distribution_strategy": "broadcast",  # Send to all nodes
        "cache_metadata": {
            "query_id": distribution_plan["query_id"],
            "cached_at": datetime.now().isoformat(),
            "cache_version": "1.0"
        }
    }

    result = {
        "cache_coordination": {
            "candidates_identified": len(cache_candidates),
            "cache_distribution": cache_distribution,
            "estimated_hit_rate_improvement": min(0.3, len(cache_candidates) * 0.05)
        }
    }
"""
                },
            )

        # Result formatter
        result_formatter_id = builder.add_node(
            "PythonCodeNode",
            node_id="result_formatter",
            config={
                "code": f"""
def format_federated_results(aggregated_results, federated_responses, cache_coordination=None):
    '''Format final federated RAG results'''

    # Extract key information
    results = aggregated_results.get("results", [])
    aggregation_metadata = aggregated_results.get("aggregation_metadata", {{}})
    federation_health = aggregated_results.get("federation_health", {{}})

    # Build node contribution summary
    node_contributions = {{}}
    for node_response in federated_responses["node_responses"]:
        node_id = node_response["node_id"]
        node_contributions[node_id] = {{
            "status": node_response["status"],
            "results_contributed": node_response["metadata"]["result_count"],
            "response_time": node_response["metadata"]["response_time"],
            "weight": aggregation_metadata["node_weights"].get(node_id, 0)
        }}

    # Add failed nodes
    for failed_node in federated_responses["failed_nodes"]:
        node_contributions[failed_node["node_id"]] = {{
            "status": "failed",
            "error": failed_node["error"]
        }}

    # Build final output
    formatted_output = {{
        "federated_results": results,
        "node_contributions": node_contributions,
        "aggregation_metadata": {{
            "strategy_used": aggregation_metadata.get("strategy", "{self.aggregation_strategy}"),
            "nodes_participated": len(aggregation_metadata.get("participating_nodes", [])),
            "total_results_aggregated": aggregated_results.get("total_raw_results", 0),
            "minimum_nodes_required": {self.min_participating_nodes},
            "minimum_requirement_met": federated_responses["statistics"]["minimum_requirement_met"]
        }},
        "federation_health": federation_health,
        "performance_metrics": {{
            "total_query_time": federated_responses["statistics"]["avg_response_time"],
            "successful_node_rate": federated_responses["statistics"]["successful_nodes"] / federated_responses["statistics"]["total_nodes"],
            "result_diversity_score": federation_health.get("result_diversity", 0)
        }}
    }}

    # Add caching information if available
    if cache_coordination and {self.enable_caching}:
        formatted_output["cache_optimization"] = {{
            "cache_candidates": cache_coordination["cache_coordination"]["candidates_identified"],
            "expected_hit_rate_improvement": cache_coordination["cache_coordination"]["estimated_hit_rate_improvement"]
        }}

    result = {{"federated_rag_output": formatted_output}}
"""
            },
        )

        # Connect workflow
        builder.add_connection(
            query_distributor_id,
            "distribution_plan",
            federated_executor_id,
            "distribution_plan",
        )
        builder.add_connection(
            federated_executor_id,
            "federated_responses",
            result_aggregator_id,
            "federated_responses",
        )

        if self.enable_caching:
            builder.add_connection(
                result_aggregator_id,
                "aggregated_results",
                cache_coordinator_id,
                "aggregated_results",
            )
            builder.add_connection(
                query_distributor_id,
                "distribution_plan",
                cache_coordinator_id,
                "distribution_plan",
            )
            builder.add_connection(
                cache_coordinator_id,
                "cache_coordination",
                result_formatter_id,
                "cache_coordination",
            )

        builder.add_connection(
            result_aggregator_id,
            "aggregated_results",
            result_formatter_id,
            "aggregated_results",
        )
        builder.add_connection(
            federated_executor_id,
            "federated_responses",
            result_formatter_id,
            "federated_responses",
        )

        return builder.build(name="federated_rag_workflow")

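# A minimal standalone sketch of the weighted-average aggregation performed
# by the result_aggregator node above (illustrative values; not part of the
# published API):
#
#   group = [
#       {"score": 0.9, "node_weight": 2.0},  # faster node, boosted weight
#       {"score": 0.7, "node_weight": 1.0},
#   ]
#   total_weight = sum(r["node_weight"] for r in group)  # 3.0
#   weighted_score = (
#       sum(r["score"] * r["node_weight"] for r in group) / total_weight
#   )  # (0.9*2.0 + 0.7*1.0) / 3.0 = 0.833...
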
@register_node()
class EdgeRAGNode(Node):
    """
    Edge Computing RAG Node

    Optimized RAG for edge devices with limited resources.

    When to use:
    - Best for: IoT devices, mobile apps, offline scenarios
    - Constraints: Limited memory, CPU, storage
    - Features: Model quantization, selective caching, incremental updates

    Example:
        edge_rag = EdgeRAGNode(
            model_size="tiny",  # 50MB model
            max_cache_size_mb=100,
            update_strategy="incremental"
        )

        result = await edge_rag.run(
            query="Local sensor anomaly detection",
            local_data=sensor_readings,
            sync_with_cloud=False
        )

    Parameters:
        model_size: Size constraints (tiny, small, medium)
        max_cache_size_mb: Maximum cache size
        update_strategy: How to update the edge model
        power_mode: Optimization for battery life

    Returns:
        results: Local RAG results
        resource_usage: Memory and CPU consumption
        sync_recommendations: When to sync with cloud
    """

    def __init__(
        self,
        name: str = "edge_rag",
        model_size: str = "small",
        max_cache_size_mb: int = 100,
        update_strategy: str = "incremental",
        power_mode: str = "balanced",
    ):
        self.model_size = model_size
        self.max_cache_size_mb = max_cache_size_mb
        self.update_strategy = update_strategy
        self.power_mode = power_mode
        self.cache = {}
        self.cache_size_bytes = 0
        super().__init__(name)

    def get_parameters(self) -> Dict[str, NodeParameter]:
        return {
            "query": NodeParameter(
                name="query", type=str, required=True, description="Query to process"
            ),
            "local_data": NodeParameter(
                name="local_data",
                type=list,
                required=True,
                description="Local data available on edge",
            ),
            "sync_with_cloud": NodeParameter(
                name="sync_with_cloud",
                type=bool,
                required=False,
                default=False,
                description="Whether to sync with cloud",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute edge-optimized RAG"""
        query = kwargs.get("query", "")
        local_data = kwargs.get("local_data", [])
        sync_with_cloud = kwargs.get("sync_with_cloud", False)

        # Check cache first
        cache_key = hashlib.sha256(query.encode()).hexdigest()[:8]
        if cache_key in self.cache and self.power_mode != "performance":
            logger.info(f"Cache hit for query: {cache_key}")
            return self.cache[cache_key]

        # Resource tracking
        start_memory = self._estimate_memory_usage()

        # Lightweight retrieval optimized for edge
        results = self._edge_optimized_retrieval(query, local_data)

        # Generate response with constrained model
        response = self._generate_edge_response(query, results)

        # Calculate resource usage
        end_memory = self._estimate_memory_usage()
        resource_usage = {
            "memory_mb": (end_memory - start_memory) / 1024 / 1024,
            "estimated_cpu_ms": 50 if self.model_size == "tiny" else 200,
            "model_size": self.model_size,
            "cache_size_mb": self.cache_size_bytes / 1024 / 1024,
        }

        # Determine sync recommendations
        sync_recommendations = self._calculate_sync_recommendations(
            len(local_data), self.cache_size_bytes, sync_with_cloud
        )

        # Cache result if space available
        result = {
            "results": response["results"],
            "resource_usage": resource_usage,
            "sync_recommendations": sync_recommendations,
            "edge_metadata": {
                "model_size": self.model_size,
                "power_mode": self.power_mode,
                "cache_hit": False,
                "local_data_size": len(local_data),
            },
        }

        # Update cache
        self._update_cache(cache_key, result)

        return result

    def _edge_optimized_retrieval(
        self, query: str, local_data: List[Dict]
    ) -> List[Dict]:
        """Perform retrieval optimized for edge constraints"""
        # Simple keyword matching for efficiency
        query_words = set(query.lower().split())
        scored_results = []

        # Limit processing based on power mode
        max_docs = 50 if self.power_mode == "low_power" else 200

        for doc in local_data[:max_docs]:
            content = doc.get("content", "").lower()
            doc_words = set(content.split())

            # Quick scoring
            if query_words:
                score = len(query_words & doc_words) / len(query_words)
                if score > 0:
                    scored_results.append({"document": doc, "score": score})

        # Sort and limit results
        scored_results.sort(key=lambda x: x["score"], reverse=True)
        return scored_results[:5]  # Keep only top 5 for edge
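    # A minimal sketch of the keyword-overlap scoring above, assuming a
    # two-word query (illustrative values only):
    #
    #   query_words = {"sensor", "anomaly"}
    #   doc_words = {"local", "sensor", "readings"}
    #   score = len(query_words & doc_words) / len(query_words)  # 1/2 = 0.5
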
    def _generate_edge_response(
        self, query: str, results: List[Dict]
    ) -> Dict[str, Any]:
        """Generate response with edge-constrained model"""
        # Simulate different model sizes
        if self.model_size == "tiny":
            # Very basic response
            if results:
                response = f"Found {len(results)} relevant results for: {query}"
            else:
                response = f"No local results for: {query}"
        elif self.model_size == "small":
            # Slightly better response
            if results:
                top_content = results[0]["document"].get("content", "")[:100]
                response = f"Based on local data: {top_content}..."
            else:
                response = "No relevant local data found. Consider syncing with cloud."
        else:  # medium
            # Best edge response
            if results:
                contents = [r["document"].get("content", "")[:200] for r in results[:2]]
                response = f"Local analysis for '{query}': " + " ".join(contents)
            else:
                response = f"No local matches for '{query}'. Cloud sync recommended."

        return {
            "results": [
                {
                    "content": response,
                    "score": results[0]["score"] if results else 0,
                    "source": "edge_processing",
                }
            ]
        }

    def _estimate_memory_usage(self) -> int:
        """Estimate current memory usage in bytes"""
        # Simplified estimation
        base_memory = {
            "tiny": 50 * 1024 * 1024,  # 50MB
            "small": 200 * 1024 * 1024,  # 200MB
            "medium": 500 * 1024 * 1024,  # 500MB
        }
        return (
            base_memory.get(self.model_size, 200 * 1024 * 1024) + self.cache_size_bytes
        )

    def _calculate_sync_recommendations(
        self, local_data_size: int, cache_size: int, sync_requested: bool
    ) -> Dict[str, Any]:
        """Calculate when to sync with cloud"""
        recommendations = {"should_sync": False, "sync_priority": "low", "reasons": []}

        # Check various conditions
        if local_data_size < 10:
            recommendations["should_sync"] = True
            recommendations["reasons"].append("Insufficient local data")
            recommendations["sync_priority"] = "high"

        if cache_size > self.max_cache_size_mb * 1024 * 1024 * 0.9:
            recommendations["should_sync"] = True
            recommendations["reasons"].append("Cache near capacity")
            recommendations["sync_priority"] = "medium"

        if sync_requested:
            recommendations["should_sync"] = True
            recommendations["reasons"].append("User requested sync")
            recommendations["sync_priority"] = "high"

        # Add sync strategy
        if self.update_strategy == "incremental":
            recommendations["sync_type"] = "differential"
        else:
            recommendations["sync_type"] = "full"

        return recommendations

    def _update_cache(self, key: str, result: Dict):
        """Update cache with size management"""
        result_size = len(json.dumps(result))

        # Check if we need to evict
        while (
            self.cache_size_bytes + result_size > self.max_cache_size_mb * 1024 * 1024
            and self.cache
        ):
            # Evict oldest (simple FIFO)
            oldest_key = next(iter(self.cache))
            evicted_size = len(json.dumps(self.cache[oldest_key]))
            del self.cache[oldest_key]
            self.cache_size_bytes -= evicted_size
            logger.debug(f"Evicted cache entry: {oldest_key}")

        # Add to cache
        self.cache[key] = result
        self.cache_size_bytes += result_size

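# A minimal sketch of the FIFO eviction used in EdgeRAGNode._update_cache
# above: Python dicts preserve insertion order, so next(iter(cache)) always
# yields the oldest entry (illustrative values only):
#
#   cache = {"a": {"results": []}, "b": {"results": []}}  # "a" inserted first
#   oldest_key = next(iter(cache))  # "a"
#   del cache[oldest_key]           # cache now holds only "b"
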
@register_node()
class CrossSiloRAGNode(Node):
    """
    Cross-Silo Federated RAG

    RAG across organizational boundaries with strict data governance.

    When to use:
    - Best for: Multi-organization collaborations, consortiums
    - Features: Data sovereignty, audit trails, access control
    - Compliance: GDPR, HIPAA compatible

    Example:
        cross_silo_rag = CrossSiloRAGNode(
            silos=["org_a", "org_b", "org_c"],
            data_sharing_agreement="minimal",
            audit_mode="comprehensive"
        )

        result = await cross_silo_rag.run(
            query="Industry-wide trend analysis",
            requester_org="org_a",
            access_permissions=["read_aggregated", "no_raw_data"]
        )

    Parameters:
        silos: Participating organizations
        data_sharing_agreement: Level of data sharing allowed
        audit_mode: Audit trail comprehensiveness
        governance_rules: Data governance policies

    Returns:
        silo_results: Results respecting data boundaries
        audit_trail: Complete audit of data access
        compliance_report: Governance compliance status
    """

    def __init__(
        self,
        name: str = "cross_silo_rag",
        silos: List[str] = None,
        data_sharing_agreement: str = "minimal",
        audit_mode: str = "standard",
        governance_rules: Dict[str, Any] = None,
    ):
        self.silos = silos or []
        self.data_sharing_agreement = data_sharing_agreement
        self.audit_mode = audit_mode
        self.governance_rules = governance_rules or {}
        super().__init__(name)

    def get_parameters(self) -> Dict[str, NodeParameter]:
        return {
            "query": NodeParameter(
                name="query", type=str, required=True, description="Cross-silo query"
            ),
            "requester_org": NodeParameter(
                name="requester_org",
                type=str,
                required=True,
                description="Organization making request",
            ),
            "access_permissions": NodeParameter(
                name="access_permissions",
                type=list,
                required=True,
                description="Granted permissions",
            ),
            "purpose": NodeParameter(
                name="purpose", type=str, required=False, description="Purpose of query"
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute cross-silo federated RAG"""
        query = kwargs.get("query", "")
        requester_org = kwargs.get("requester_org", "")
        access_permissions = kwargs.get("access_permissions", [])
        purpose = kwargs.get("purpose", "analysis")

        # Validate access
        access_valid = self._validate_cross_silo_access(
            requester_org, access_permissions, purpose
        )

        if not access_valid["granted"]:
            return {
                "error": "Access denied",
                "reason": access_valid["reason"],
                "required_permissions": access_valid["required"],
            }

        # Execute query across silos
        silo_results = self._execute_cross_silo_query(
            query, requester_org, access_permissions
        )

        # Apply data governance rules
        governed_results = self._apply_governance(
            silo_results, requester_org, self.data_sharing_agreement
        )

        # Generate audit trail
        audit_trail = self._generate_audit_trail(
            query, requester_org, silo_results, governed_results
        )

        # Create compliance report
        compliance_report = self._generate_compliance_report(
            requester_org, access_permissions, governed_results
        )

        return {
            "silo_results": governed_results,
            "audit_trail": (
                audit_trail
                if self.audit_mode != "minimal"
                else "Audit available on request"
            ),
            "compliance_report": compliance_report,
            "federation_metadata": {
                "participating_silos": len(
                    [r for r in silo_results if r["participated"]]
                ),
                "data_sharing_level": self.data_sharing_agreement,
                "governance_applied": True,
            },
        }

    def _validate_cross_silo_access(
        self, requester: str, permissions: List[str], purpose: str
    ) -> Dict[str, Any]:
        """Validate cross-silo access request"""
        # Check if requester is part of federation
        if requester not in self.silos:
            return {
                "granted": False,
                "reason": "Organization not part of federation",
                "required": ["federation_membership"],
            }

        # Check required permissions
        required_permissions = {
            "minimal": ["read_aggregated"],
            "standard": ["read_aggregated", "read_anonymized"],
            "full": ["read_aggregated", "read_anonymized", "read_samples"],
        }

        required = required_permissions.get(
            self.data_sharing_agreement, ["read_aggregated"]
        )

        if not all(perm in permissions for perm in required):
            return {
                "granted": False,
                "reason": "Insufficient permissions",
                "required": required,
            }

        # Purpose-based validation
        allowed_purposes = self.governance_rules.get(
            "allowed_purposes", ["analysis", "research", "compliance", "improvement"]
        )

        if purpose not in allowed_purposes:
            return {
                "granted": False,
                "reason": f"Purpose '{purpose}' not allowed",
                "required": allowed_purposes,
            }

        return {"granted": True, "reason": "Access approved"}
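    # A minimal sketch of the permission check above: access is granted only
    # when every permission required by the sharing agreement is present in
    # the caller-supplied list (illustrative values only):
    #
    #   required = ["read_aggregated", "read_anonymized"]  # "standard" tier
    #   permissions = ["read_aggregated"]
    #   all(perm in permissions for perm in required)      # False -> denied
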
    def _execute_cross_silo_query(
        self, query: str, requester: str, permissions: List[str]
    ) -> List[Dict[str, Any]]:
        """Execute query across organizational silos"""
        silo_results = []

        for silo in self.silos:
            if silo == requester:
                # Full access to own data
                access_level = "full"
            else:
                # Restricted access based on agreement
                access_level = self.data_sharing_agreement

            # Simulate silo response
            if random.random() > 0.1:  # 90% success rate
                results = []

                # Generate results based on access level
                if access_level == "full":
                    results = [
                        {
                            "content": f"Detailed data from {silo}: {query} analysis...",
                            "score": 0.9,
                            "raw_data_included": True,
                        }
                    ]
                elif access_level == "standard":
                    results = [
                        {
                            "content": f"Anonymized data from {silo}: aggregated {query} insights...",
                            "score": 0.8,
                            "raw_data_included": False,
                        }
                    ]
                else:  # minimal
                    results = [
                        {
                            "content": f"Summary from {silo}: high-level {query} trends...",
                            "score": 0.7,
                            "raw_data_included": False,
                        }
                    ]

                silo_results.append(
                    {
                        "silo": silo,
                        "participated": True,
                        "results": results,
                        "access_level": access_level,
                        "response_time": random.uniform(1, 3),
                    }
                )
            else:
                silo_results.append(
                    {
                        "silo": silo,
                        "participated": False,
                        "reason": "Silo temporarily unavailable",
                    }
                )

        return silo_results

    def _apply_governance(
        self, silo_results: List[Dict], requester: str, agreement: str
    ) -> List[Dict[str, Any]]:
        """Apply data governance rules to results"""
        governed_results = []

        for silo_result in silo_results:
            if not silo_result["participated"]:
                governed_results.append(silo_result)
                continue

            # Apply governance based on agreement
            governed_silo_result = silo_result.copy()

            if silo_result["silo"] != requester:
                # Apply restrictions for other silos
                if agreement == "minimal":
                    # Remove any detailed information
                    for result in governed_silo_result["results"]:
                        result["content"] = self._minimize_content(result["content"])
                        result["governance_applied"] = "minimal_sharing"

                elif agreement == "standard":
                    # Ensure anonymization
                    for result in governed_silo_result["results"]:
                        result["content"] = self._anonymize_content(result["content"])
                        result["governance_applied"] = "anonymized"

            governed_results.append(governed_silo_result)

        return governed_results

    def _minimize_content(self, content: str) -> str:
        """Minimize content to high-level summary"""
        # In production, would use NLP summarization
        words = content.split()[:20]
        return " ".join(words) + "... [Details restricted by data sharing agreement]"

    def _anonymize_content(self, content: str) -> str:
        """Anonymize content while preserving insights"""
        # Simple anonymization (would be more sophisticated in production)
        anonymized = content

        # Remove organization names
        for silo in self.silos:
            anonymized = anonymized.replace(silo, "[Organization]")

        # Remove potential identifiers
        anonymized = re.sub(r"\b\d{3,}\b", "[Number]", anonymized)
        anonymized = re.sub(r"\b[A-Z]{2,}\b", "[Identifier]", anonymized)

        return anonymized

    def _generate_audit_trail(
        self,
        query: str,
        requester: str,
        silo_results: List[Dict],
        governed_results: List[Dict],
    ) -> Dict[str, Any]:
        """Generate comprehensive audit trail"""
        audit = {
            "timestamp": datetime.now().isoformat(),
            "query_hash": hashlib.sha256(query.encode()).hexdigest()[:16],
            "requester": requester,
            "federation_activity": {
                "silos_queried": len(self.silos),
                "silos_responded": len([r for r in silo_results if r["participated"]]),
                "data_governance_applied": True,
            },
            "data_flow": [],
        }

        # Track data flow
        for silo_result in silo_results:
            flow = {
                "silo": silo_result["silo"],
                "data_shared": silo_result["participated"],
                "access_level": silo_result.get("access_level", "none"),
                "governance_applied": any(
                    r.get("governance_applied") for r in silo_result.get("results", [])
                ),
            }
            audit["data_flow"].append(flow)

        if self.audit_mode == "comprehensive":
            # Add detailed audit information
            audit["detailed_access"] = {
                "permissions_used": ["read_aggregated"],
                "data_categories_accessed": ["aggregated_insights"],
                "purpose_stated": "analysis",
                "retention_period": "0 days",  # No retention
            }

        return audit

    def _generate_compliance_report(
        self, requester: str, permissions: List[str], results: List[Dict]
    ) -> Dict[str, Any]:
        """Generate compliance report"""
        return {
            "compliance_status": "compliant",
            "regulations_checked": ["GDPR", "CCPA", "Industry Standards"],
            "data_minimization": True,
            "purpose_limitation": True,
            "access_controls": "enforced",
            "audit_trail": "maintained",
            "data_retention": "none",
            "cross_border_transfer": "not_applicable",
            "user_rights": {
                "access": "supported",
                "rectification": "supported",
                "erasure": "supported",
                "portability": "limited",
            },
        }


# Export all federated nodes
__all__ = ["FederatedRAGNode", "EdgeRAGNode", "CrossSiloRAGNode"]