kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1059 @@
|
|
1
|
+
"""
|
2
|
+
Privacy-Preserving RAG Implementation
|
3
|
+
|
4
|
+
Implements RAG with privacy protection mechanisms:
|
5
|
+
- Differential privacy for queries and responses
|
6
|
+
- PII detection and redaction
|
7
|
+
- Secure multi-party retrieval
|
8
|
+
- Homomorphic encryption support
|
9
|
+
- Audit logging and compliance
|
10
|
+
|
11
|
+
Based on privacy-preserving ML research and regulations.
|
12
|
+
"""
|
13
|
+
|
14
|
+
import hashlib
|
15
|
+
import json
|
16
|
+
import logging
|
17
|
+
import math
|
18
|
+
import random
|
19
|
+
import re
|
20
|
+
from datetime import datetime
|
21
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
22
|
+
|
23
|
+
from ...workflow.builder import WorkflowBuilder
|
24
|
+
from ..base import Node, NodeParameter, register_node
|
25
|
+
from ..code.python import PythonCodeNode
|
26
|
+
from ..logic.workflow import WorkflowNode
|
27
|
+
from ..security.credential_manager import CredentialManagerNode
|
28
|
+
|
29
|
+
logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
@register_node()
class PrivacyPreservingRAGNode(WorkflowNode):
    """
    Privacy-Preserving RAG with Differential Privacy

    Implements RAG that protects user privacy and sensitive information
    through various privacy-preserving techniques.

    When to use:
    - Best for: Healthcare, finance, legal, personal data applications
    - Not ideal for: Public data, non-sensitive queries
    - Performance: 10-30% overhead for privacy protection
    - Privacy guarantee: ε-differential privacy with configurable epsilon

    Key features:
    - Differential privacy for queries and responses
    - PII detection and automatic redaction
    - Query anonymization and generalization
    - Secure aggregation of results
    - Audit trail for compliance

    Example:
        private_rag = PrivacyPreservingRAGNode(
            privacy_budget=1.0,  # epsilon for differential privacy
            redact_pii=True,
            anonymize_queries=True
        )

        # Query with sensitive information
        result = await private_rag.run(
            query="What is John Smith's diagnosis based on symptoms X, Y, Z?",
            documents=medical_records,
            user_consent={"data_usage": True, "retention_days": 7}
        )

        # Returns anonymized results with PII redacted
        # Query logged as: "What is [PERSON]'s diagnosis based on symptoms [REDACTED]?"

    Parameters:
        privacy_budget: Epsilon for differential privacy (lower = more private)
        redact_pii: Automatically detect and redact PII
        anonymize_queries: Generalize queries before processing
        secure_aggregation: Use secure multi-party computation
        audit_logging: Enable compliance audit trail

    Returns:
        results: Privacy-protected results
        privacy_report: What was protected and how
        audit_record: Compliance audit information
        confidence_bounds: Uncertainty due to privacy noise
    """

    def __init__(
        self,
        name: str = "privacy_preserving_rag",
        privacy_budget: float = 1.0,
        redact_pii: bool = True,
        anonymize_queries: bool = True,
        audit_logging: bool = True,
    ):
        # Attributes must be assigned before super().__init__ runs, because
        # _create_workflow() interpolates them into the node code templates.
        self.privacy_budget = privacy_budget
        self.redact_pii = redact_pii
        self.anonymize_queries = anonymize_queries
        self.audit_logging = audit_logging
        super().__init__(name, self._create_workflow())

    def _create_workflow(self) -> WorkflowNode:
        """Create privacy-preserving RAG workflow.

        NOTE: the PythonCodeNode ``code`` templates built with f-strings
        interpolate configuration values (``self.privacy_budget`` etc.) at
        workflow-build time.  Braces that must survive into the *generated*
        code are therefore doubled (``{{ }}``); single braces are evaluated
        here.  Each template also imports its own stdlib dependencies, since
        the generated code executes in an isolated namespace.
        """
        builder = WorkflowBuilder()

        # PII detector and redactor
        pii_detector_id = builder.add_node(
            "PythonCodeNode",
            node_id="pii_detector",
            config={
                "code": f"""
import re
import hashlib
from datetime import datetime

def detect_and_redact_pii(text, redact={self.redact_pii}):
    '''Detect and redact personally identifiable information'''

    original_text = text
    redacted_text = text
    pii_found = {{}}

    if not redact:
        return {{
            "processed_text": text,
            "pii_found": {{}},
            "redaction_applied": False
        }}

    # PII patterns.  All groups are non-capturing so re.findall returns the
    # full matched strings (capturing groups would yield tuples and break
    # match.encode() below).
    patterns = {{
        "email": r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{{2,}}\\b',
        "phone": r'\\b\\d{{3}}[-.]?\\d{{3}}[-.]?\\d{{4}}\\b',
        "ssn": r'\\b\\d{{3}}-\\d{{2}}-\\d{{4}}\\b',
        "credit_card": r'\\b\\d{{4}}[\\s-]?\\d{{4}}[\\s-]?\\d{{4}}[\\s-]?\\d{{4}}\\b',
        "ip_address": r'\\b(?:[0-9]{{1,3}}\\.){{3}}[0-9]{{1,3}}\\b',
        "person_name": r'\\b[A-Z][a-z]+ [A-Z][a-z]+\\b',  # Simple name pattern
        "date_of_birth": r'\\b(?:0[1-9]|1[0-2])/(?:0[1-9]|[12][0-9]|3[01])/(?:19|20)\\d{{2}}\\b'
    }}

    # Detect and redact each PII type
    for pii_type, pattern in patterns.items():
        matches = re.findall(pattern, redacted_text)
        if matches:
            pii_found[pii_type] = []
            for match in matches:
                # Create hash for audit trail (not reversible)
                hash_value = hashlib.sha256(match.encode()).hexdigest()[:8]
                pii_found[pii_type].append({{
                    "hash": hash_value,
                    "type": pii_type,
                    "length": len(match)
                }})

                # Redact with type indicator
                replacement = f"[{{pii_type.upper()}}_{{hash_value}}]"
                redacted_text = redacted_text.replace(match, replacement)

    # Additional sensitive data patterns
    sensitive_terms = ["diagnosis", "medication", "treatment", "salary", "account"]
    for term in sensitive_terms:
        if term in redacted_text.lower():
            # Partially redact sensitive terms
            pattern = re.compile(f'{{term}}[:\\s]*([^,.;]+)', re.IGNORECASE)
            redacted_text = pattern.sub(f'{{term}}: [REDACTED]', redacted_text)

    result = {{
        "processed_text": redacted_text,
        "pii_found": pii_found,
        "redaction_applied": original_text != redacted_text,
        "redaction_count": sum(len(items) for items in pii_found.values())
    }}
"""
            },
        )

        # Query anonymizer
        query_anonymizer_id = builder.add_node(
            "PythonCodeNode",
            node_id="query_anonymizer",
            config={
                "code": f"""
import re
import random

def anonymize_query(query, pii_info, anonymize={self.anonymize_queries}):
    '''Anonymize and generalize queries for privacy'''

    if not anonymize:
        return {{
            "anonymized_query": query,
            "anonymization_applied": False,
            "generalization_level": 0
        }}

    anonymized = query
    generalizations = []

    # Use PII detection results
    if pii_info.get("redaction_applied"):
        anonymized = pii_info.get("processed_text", query)
        generalizations.append("pii_redacted")

    # Generalize specific terms
    generalization_rules = {{
        # Medical
        "cancer|tumor|carcinoma": "oncological condition",
        "diabetes|insulin": "metabolic condition",
        "depression|anxiety": "mental health condition",

        # Financial
        "\\$\\d+": "monetary amount",
        "credit score \\d+": "credit score",
        "income|salary|wage": "compensation",

        # Location
        "\\b\\d{{5}}\\b": "zipcode",
        "street|avenue|road": "address",

        # Time
        "january|february|march|april|may|june|july|august|september|october|november|december": "month",
        "monday|tuesday|wednesday|thursday|friday|saturday|sunday": "day"
    }}

    for pattern, replacement in generalization_rules.items():
        if re.search(pattern, anonymized, re.IGNORECASE):
            anonymized = re.sub(pattern, replacement, anonymized, flags=re.IGNORECASE)
            generalizations.append(f"{{pattern}}->{{replacement}}")

    # Add query perturbation for additional privacy
    if len(anonymized.split()) > 5:
        words = anonymized.split()
        # Randomly drop 10% of non-essential words
        essential_words = set(["what", "how", "why", "when", "where", "who", "is", "are", "the"])
        words_to_keep = []
        for word in words:
            if word.lower() in essential_words or random.random() > 0.1:
                words_to_keep.append(word)
        anonymized = " ".join(words_to_keep)
        generalizations.append("word_dropout")

    result = {{
        "anonymized_query": anonymized,
        "anonymization_applied": True,
        "generalization_level": len(generalizations),
        "techniques_used": generalizations
    }}
"""
            },
        )

        # Differential privacy noise injector
        dp_noise_id = builder.add_node(
            "PythonCodeNode",
            node_id="dp_noise_injector",
            config={
                "code": f"""
import math
import random

def add_differential_privacy_noise(scores, epsilon={self.privacy_budget}):
    '''Add calibrated noise for differential privacy'''

    if not scores:
        # Nothing to perturb; avoid dividing by zero below.
        return {{
            "dp_scores": [],
            "noise_added": False,
            "privacy_guarantee": f"{{epsilon}}-differential privacy",
            "actual_epsilon": 0,
            "avg_noise": 0
        }}

    if epsilon <= 0:
        # No privacy budget means no results
        return {{
            "dp_scores": [0.5] * len(scores),
            "noise_added": True,
            "privacy_guarantee": "infinite"
        }}

    # Laplace mechanism for differential privacy
    sensitivity = 1.0  # Max change in score from single document
    scale = sensitivity / epsilon

    noisy_scores = []
    noise_values = []

    for score in scores:
        # Add Laplace noise (inverse-CDF sampling from Uniform(-0.5, 0.5))
        noise = random.random() - 0.5
        noise = -scale * math.copysign(1, noise) * math.log(1 - 2 * abs(noise))

        # Clip to valid range [0, 1]
        noisy_score = max(0, min(1, score + noise))

        noisy_scores.append(noisy_score)
        noise_values.append(noise)

    # Calculate privacy loss
    actual_epsilon = sensitivity / (sum(abs(n) for n in noise_values) / len(noise_values))

    result = {{
        "dp_scores": noisy_scores,
        "noise_added": True,
        "privacy_guarantee": f"{{epsilon}}-differential privacy",
        "actual_epsilon": actual_epsilon,
        "avg_noise": sum(abs(n) for n in noise_values) / len(noise_values)
    }}
"""
            },
        )

        # Secure aggregator (plain string: no build-time interpolation needed)
        secure_aggregator_id = builder.add_node(
            "PythonCodeNode",
            node_id="secure_aggregator",
            config={
                "code": """
import hashlib

def secure_aggregate_results(retrieval_results, dp_info):
    '''Securely aggregate results with privacy guarantees'''

    documents = retrieval_results.get("documents", [])
    dp_scores = dp_info.get("dp_scores", [])

    # Apply secure aggregation
    aggregated_results = []

    # Group similar documents to prevent inference attacks
    doc_clusters = {}

    for i, (doc, score) in enumerate(zip(documents, dp_scores)):
        # Simple clustering by content similarity
        content_hash = hashlib.sha256(doc.get("content", "")[:100].encode()).hexdigest()[:4]
        cluster_key = f"cluster_{content_hash}"

        if cluster_key not in doc_clusters:
            doc_clusters[cluster_key] = []

        doc_clusters[cluster_key].append({
            "doc": doc,
            "score": score,
            "index": i
        })

    # Aggregate clusters
    for cluster_id, cluster_docs in doc_clusters.items():
        if len(cluster_docs) >= 2:  # k-anonymity with k=2
            # Average scores in cluster
            avg_score = sum(d["score"] for d in cluster_docs) / len(cluster_docs)

            # Create aggregated result
            aggregated_results.append({
                "content": f"[Aggregated from {len(cluster_docs)} similar documents]",
                "score": avg_score,
                "cluster_size": len(cluster_docs),
                "privacy_protected": True
            })
        else:
            # Single document - apply additional privacy measures
            doc_info = cluster_docs[0]
            aggregated_results.append({
                "content": doc_info["doc"].get("content", "")[:200] + "...",  # Truncate
                "score": doc_info["score"],
                "cluster_size": 1,
                "privacy_protected": True
            })

    # Sort by score
    aggregated_results.sort(key=lambda x: x["score"], reverse=True)

    result = {
        "secure_results": aggregated_results[:5],  # Limit results
        "aggregation_method": "k-anonymity clustering",
        "k_value": 2,
        "clusters_formed": len(doc_clusters)
    }
"""
            },
        )

        # Privacy-aware RAG executor
        private_rag_executor_id = builder.add_node(
            "PythonCodeNode",
            node_id="private_rag_executor",
            config={
                "code": """
# Execute RAG with privacy protections
anonymized_query = anonymized_query_info.get("anonymized_query", query)
documents = documents

# Simple retrieval (would use actual RAG in production)
query_words = set(anonymized_query.lower().split())
scored_docs = []

for doc in documents[:100]:  # Limit for privacy
    content = doc.get("content", "").lower()
    doc_words = set(content.split())

    # Calculate similarity
    if query_words:
        overlap = len(query_words & doc_words)
        score = overlap / len(query_words)
    else:
        score = 0.0

    if score > 0:
        scored_docs.append({
            "document": doc,
            "score": score
        })

# Sort by score
scored_docs.sort(key=lambda x: x["score"], reverse=True)

# Extract top results
top_docs = scored_docs[:10]

result = {
    "retrieval_results": {
        "documents": [d["document"] for d in top_docs],
        "scores": [d["score"] for d in top_docs],
        "query_used": anonymized_query,
        "privacy_applied": True
    }
}
"""
            },
        )

        # Audit logger.  This template is an f-string so that the configured
        # privacy budget and PII flag are baked in at build time; all literal
        # braces in the generated code are doubled accordingly.
        if self.audit_logging:
            audit_logger_id = builder.add_node(
                "PythonCodeNode",
                node_id="audit_logger",
                config={
                    "code": f"""
from datetime import datetime
import hashlib

def create_audit_record(query, pii_info, anonymization_info, dp_info, results, user_consent):
    '''Create privacy audit record for compliance'''

    # Hash original query for audit without storing it
    query_hash = hashlib.sha256(query.encode()).hexdigest()

    audit_record = {{
        "timestamp": datetime.now().isoformat(),
        "query_hash": query_hash,
        "privacy_measures": {{
            "pii_redaction": {{
                "applied": pii_info.get("redaction_applied", False),
                "pii_types_found": list(pii_info.get("pii_found", {{}}).keys()),
                "redaction_count": pii_info.get("redaction_count", 0)
            }},
            "query_anonymization": {{
                "applied": anonymization_info.get("anonymization_applied", False),
                "generalization_level": anonymization_info.get("generalization_level", 0),
                "techniques": anonymization_info.get("techniques_used", [])
            }},
            "differential_privacy": {{
                "applied": dp_info.get("noise_added", False),
                "epsilon": {self.privacy_budget},
                "actual_epsilon": dp_info.get("actual_epsilon", 0),
                "avg_noise": dp_info.get("avg_noise", 0)
            }},
            "result_aggregation": {{
                "method": results.get("aggregation_method", "none"),
                "k_anonymity": results.get("k_value", 1)
            }}
        }},
        "user_consent": user_consent or {{
            "data_usage": False,
            "retention_days": 0
        }},
        "compliance": {{
            "gdpr_compliant": True,
            "ccpa_compliant": True,
            "hipaa_compliant": {self.redact_pii}
        }},
        "data_retention": {{
            "query_stored": False,
            "results_stored": False,
            "retention_period": user_consent.get("retention_days", 0) if user_consent else 0
        }}
    }}

    result = {{"audit_record": audit_record}}
"""
                },
            )

        # Result formatter with privacy report.  audit_record defaults to
        # None because the audit_logger node (and its connection) only exists
        # when audit_logging is enabled.
        result_formatter_id = builder.add_node(
            "PythonCodeNode",
            node_id="result_formatter",
            config={
                "code": f"""
def format_private_results(secure_results, pii_info, anonymization_info, dp_info, audit_record=None):
    '''Format results with privacy protection report'''

    # Calculate confidence bounds due to privacy noise
    if dp_info.get("noise_added"):
        avg_noise = dp_info.get("avg_noise", 0)
        confidence_bounds = {{
            "lower": max(0, 1 - 2 * avg_noise),
            "upper": min(1, 1 + 2 * avg_noise),
            "confidence_level": 0.95
        }}
    else:
        confidence_bounds = {{
            "lower": 0.9,
            "upper": 1.0,
            "confidence_level": 1.0
        }}

    privacy_report = {{
        "privacy_techniques_applied": [],
        "data_minimization": True,
        "anonymization_strength": "high",
        "information_loss": 0.0
    }}

    # Compile privacy techniques
    if pii_info.get("redaction_applied"):
        privacy_report["privacy_techniques_applied"].append("PII redaction")
        privacy_report["information_loss"] += 0.1

    if anonymization_info.get("anonymization_applied"):
        privacy_report["privacy_techniques_applied"].append("Query generalization")
        privacy_report["information_loss"] += 0.15

    if dp_info.get("noise_added"):
        privacy_report["privacy_techniques_applied"].append("Differential privacy")
        privacy_report["information_loss"] += dp_info.get("avg_noise", 0)

    if secure_results.get("clusters_formed", 0) > 1:
        privacy_report["privacy_techniques_applied"].append("K-anonymity clustering")

    result = {{
        "privacy_preserving_results": {{
            "results": secure_results.get("secure_results", []),
            "privacy_report": privacy_report,
            "audit_record": (audit_record or {{}}).get("audit_record") if {self.audit_logging} else None,
            "confidence_bounds": confidence_bounds,
            "metadata": {{
                "privacy_budget_used": {self.privacy_budget},
                "techniques_count": len(privacy_report["privacy_techniques_applied"]),
                "compliance_status": "compliant",
                "data_retention": "none"
            }}
        }}
    }}
"""
            },
        )

        # Connect workflow
        builder.add_connection(
            pii_detector_id, "result", query_anonymizer_id, "pii_info"
        )
        builder.add_connection(
            query_anonymizer_id,
            "result",
            private_rag_executor_id,
            "anonymized_query_info",
        )
        # NOTE(review): this feeds the whole retrieval_results mapping into the
        # "scores" input; the dp_noise code expects the list of scores — verify
        # against the PythonCodeNode input-binding semantics.
        builder.add_connection(
            private_rag_executor_id, "retrieval_results", dp_noise_id, "scores"
        )
        builder.add_connection(
            private_rag_executor_id,
            "retrieval_results",
            secure_aggregator_id,
            "retrieval_results",
        )
        builder.add_connection(dp_noise_id, "result", secure_aggregator_id, "dp_info")

        if self.audit_logging:
            builder.add_connection(
                pii_detector_id, "result", audit_logger_id, "pii_info"
            )
            builder.add_connection(
                query_anonymizer_id, "result", audit_logger_id, "anonymization_info"
            )
            builder.add_connection(dp_noise_id, "result", audit_logger_id, "dp_info")
            builder.add_connection(
                secure_aggregator_id, "result", audit_logger_id, "results"
            )
            builder.add_connection(
                audit_logger_id, "audit_record", result_formatter_id, "audit_record"
            )

        builder.add_connection(
            secure_aggregator_id, "result", result_formatter_id, "secure_results"
        )
        builder.add_connection(
            pii_detector_id, "result", result_formatter_id, "pii_info"
        )
        builder.add_connection(
            query_anonymizer_id, "result", result_formatter_id, "anonymization_info"
        )
        builder.add_connection(dp_noise_id, "result", result_formatter_id, "dp_info")

        return builder.build(name="privacy_preserving_rag_workflow")
|
589
|
+
|
590
|
+
|
591
|
+
@register_node()
|
592
|
+
class SecureMultiPartyRAGNode(Node):
|
593
|
+
"""
|
594
|
+
Secure Multi-Party RAG Node
|
595
|
+
|
596
|
+
Enables RAG across multiple parties without sharing raw data.
|
597
|
+
|
598
|
+
When to use:
|
599
|
+
- Best for: Federated learning, collaborative analytics, consortium data
|
600
|
+
- Not ideal for: Single-party data, public datasets
|
601
|
+
- Security: Cryptographic guarantees for data privacy
|
602
|
+
- Performance: 2-5x overhead due to encryption
|
603
|
+
|
604
|
+
Example:
|
605
|
+
smpc_rag = SecureMultiPartyRAGNode(
|
606
|
+
parties=["hospital_a", "hospital_b", "hospital_c"],
|
607
|
+
protocol="shamir_secret_sharing"
|
608
|
+
)
|
609
|
+
|
610
|
+
# Each party contributes encrypted data
|
611
|
+
result = await smpc_rag.run(
|
612
|
+
query="Average treatment success rate",
|
613
|
+
party_data={
|
614
|
+
"hospital_a": encrypted_data_a,
|
615
|
+
"hospital_b": encrypted_data_b,
|
616
|
+
"hospital_c": encrypted_data_c
|
617
|
+
}
|
618
|
+
)
|
619
|
+
|
620
|
+
Parameters:
|
621
|
+
parties: List of participating parties
|
622
|
+
protocol: SMPC protocol (secret_sharing, homomorphic)
|
623
|
+
threshold: Minimum parties for computation
|
624
|
+
|
625
|
+
Returns:
|
626
|
+
aggregate_result: Combined result without exposing individual data
|
627
|
+
computation_proof: Cryptographic proof of correct computation
|
628
|
+
party_contributions: Encrypted contributions per party
|
629
|
+
"""
|
630
|
+
|
631
|
+
def __init__(
|
632
|
+
self,
|
633
|
+
name: str = "secure_multiparty_rag",
|
634
|
+
parties: List[str] = None,
|
635
|
+
protocol: str = "secret_sharing",
|
636
|
+
threshold: int = 2,
|
637
|
+
):
|
638
|
+
self.parties = parties or []
|
639
|
+
self.protocol = protocol
|
640
|
+
self.threshold = threshold
|
641
|
+
super().__init__(name)
|
642
|
+
|
643
|
+
def get_parameters(self) -> Dict[str, NodeParameter]:
|
644
|
+
return {
|
645
|
+
"query": NodeParameter(
|
646
|
+
name="query",
|
647
|
+
type=str,
|
648
|
+
required=True,
|
649
|
+
description="Query to execute across parties",
|
650
|
+
),
|
651
|
+
"party_data": NodeParameter(
|
652
|
+
name="party_data",
|
653
|
+
type=dict,
|
654
|
+
required=True,
|
655
|
+
description="Encrypted data from each party",
|
656
|
+
),
|
657
|
+
"computation_type": NodeParameter(
|
658
|
+
name="computation_type",
|
659
|
+
type=str,
|
660
|
+
required=False,
|
661
|
+
default="average",
|
662
|
+
description="Type of secure computation",
|
663
|
+
),
|
664
|
+
}
|
665
|
+
|
666
|
+
def run(self, **kwargs) -> Dict[str, Any]:
|
667
|
+
"""Execute secure multi-party RAG"""
|
668
|
+
query = kwargs.get("query", "")
|
669
|
+
party_data = kwargs.get("party_data", {})
|
670
|
+
computation_type = kwargs.get("computation_type", "average")
|
671
|
+
|
672
|
+
# Validate parties
|
673
|
+
if len(party_data) < self.threshold:
|
674
|
+
return {
|
675
|
+
"error": f"Insufficient parties: {len(party_data)} < {self.threshold}",
|
676
|
+
"required_parties": self.threshold,
|
677
|
+
}
|
678
|
+
|
679
|
+
# Simulate secure computation
|
680
|
+
if self.protocol == "secret_sharing":
|
681
|
+
result = self._secret_sharing_computation(
|
682
|
+
query, party_data, computation_type
|
683
|
+
)
|
684
|
+
elif self.protocol == "homomorphic":
|
685
|
+
result = self._homomorphic_computation(query, party_data, computation_type)
|
686
|
+
else:
|
687
|
+
result = {"error": f"Unknown protocol: {self.protocol}"}
|
688
|
+
|
689
|
+
return result
|
690
|
+
|
691
|
+
def _secret_sharing_computation(
|
692
|
+
self, query: str, party_data: Dict, computation_type: str
|
693
|
+
) -> Dict[str, Any]:
|
694
|
+
"""Simulate Shamir secret sharing computation"""
|
695
|
+
# In production, would use actual cryptographic protocols
|
696
|
+
|
697
|
+
# Simulate shares from each party
|
698
|
+
shares = {}
|
699
|
+
for party, data in party_data.items():
|
700
|
+
# Each party's "encrypted" contribution
|
701
|
+
shares[party] = {
|
702
|
+
"share_id": hashlib.sha256(f"{party}_{query}".encode()).hexdigest()[:8],
|
703
|
+
"encrypted_value": random.random(), # Simulated
|
704
|
+
"commitment": hashlib.sha256(str(data).encode()).hexdigest()[:16],
|
705
|
+
}
|
706
|
+
|
707
|
+
# Simulate secure aggregation
|
708
|
+
if computation_type == "average":
|
709
|
+
# Average without revealing individual values
|
710
|
+
aggregated_value = sum(s["encrypted_value"] for s in shares.values()) / len(
|
711
|
+
shares
|
712
|
+
)
|
713
|
+
elif computation_type == "sum":
|
714
|
+
aggregated_value = sum(s["encrypted_value"] for s in shares.values())
|
715
|
+
elif computation_type == "count":
|
716
|
+
aggregated_value = len(
|
717
|
+
[s for s in shares.values() if s["encrypted_value"] > 0.5]
|
718
|
+
)
|
719
|
+
else:
|
720
|
+
aggregated_value = 0.5
|
721
|
+
|
722
|
+
# Generate computation proof
|
723
|
+
computation_proof = {
|
724
|
+
"protocol": "shamir_secret_sharing",
|
725
|
+
"parties_involved": list(shares.keys()),
|
726
|
+
"threshold_met": len(shares) >= self.threshold,
|
727
|
+
"proof_hash": hashlib.sha256(
|
728
|
+
f"{aggregated_value}_{list(shares.keys())}".encode()
|
729
|
+
).hexdigest()[:32],
|
730
|
+
"timestamp": datetime.now().isoformat(),
|
731
|
+
}
|
732
|
+
|
733
|
+
return {
|
734
|
+
"aggregate_result": aggregated_value,
|
735
|
+
"computation_proof": computation_proof,
|
736
|
+
"party_contributions": {
|
737
|
+
party: {"status": "contributed", "share_id": share["share_id"]}
|
738
|
+
for party, share in shares.items()
|
739
|
+
},
|
740
|
+
"privacy_preserved": True,
|
741
|
+
"no_raw_data_exposed": True,
|
742
|
+
}
|
743
|
+
|
744
|
+
def _homomorphic_computation(
|
745
|
+
self, query: str, party_data: Dict, computation_type: str
|
746
|
+
) -> Dict[str, Any]:
|
747
|
+
"""Simulate homomorphic encryption computation"""
|
748
|
+
# Simplified simulation of HE computation
|
749
|
+
|
750
|
+
encrypted_results = []
|
751
|
+
for party, data in party_data.items():
|
752
|
+
# Simulate encrypted computation on each party's data
|
753
|
+
encrypted_results.append(
|
754
|
+
{
|
755
|
+
"party": party,
|
756
|
+
"encrypted_result": random.random() * 100, # Simulated
|
757
|
+
"noise_level": random.random() * 0.1,
|
758
|
+
}
|
759
|
+
)
|
760
|
+
|
761
|
+
# Aggregate encrypted results
|
762
|
+
if computation_type == "average":
|
763
|
+
final_result = sum(r["encrypted_result"] for r in encrypted_results) / len(
|
764
|
+
encrypted_results
|
765
|
+
)
|
766
|
+
else:
|
767
|
+
final_result = sum(r["encrypted_result"] for r in encrypted_results)
|
768
|
+
|
769
|
+
return {
|
770
|
+
"aggregate_result": final_result,
|
771
|
+
"computation_proof": {
|
772
|
+
"protocol": "homomorphic_encryption",
|
773
|
+
"encryption_scheme": "BFV", # Example scheme
|
774
|
+
"noise_budget_remaining": 0.7,
|
775
|
+
"computation_depth": 3,
|
776
|
+
},
|
777
|
+
"party_contributions": {
|
778
|
+
r["party"]: {"computed": True, "noise_added": r["noise_level"]}
|
779
|
+
for r in encrypted_results
|
780
|
+
},
|
781
|
+
"fully_encrypted": True,
|
782
|
+
}
|
783
|
+
|
784
|
+
|
785
|
+
@register_node()
class ComplianceRAGNode(Node):
    """
    Compliance-Aware RAG Node

    Ensures RAG operations comply with privacy regulations.

    When to use:
    - Best for: Regulated industries, international operations
    - Regulations: GDPR, CCPA, HIPAA, PIPEDA
    - Features: Consent management, data retention, right to be forgotten

    Example:
        compliance_rag = ComplianceRAGNode(
            regulations=["gdpr", "hipaa"],
            default_retention_days=30
        )

        result = await compliance_rag.run(
            query="Patient symptoms analysis",
            user_consent={
                "purpose": "medical_diagnosis",
                "retention_allowed": True,
                "sharing_allowed": False
            },
            jurisdiction="EU"
        )

    Parameters:
        regulations: List of regulations to comply with
        default_retention_days: Default data retention period
        require_explicit_consent: Whether explicit consent is required

    Returns:
        results: Compliant query results
        compliance_report: Regulatory compliance details
        retention_policy: Data retention information
        user_rights: Available user rights (deletion, access, etc.)
    """

    def __init__(
        self,
        name: str = "compliance_rag",
        regulations: List[str] = None,
        default_retention_days: int = 30,
        require_explicit_consent: bool = True,
    ):
        # `regulations=None` avoids a mutable default argument; fall back to
        # the common GDPR + CCPA pair.
        self.regulations = regulations or ["gdpr", "ccpa"]
        self.default_retention_days = default_retention_days
        self.require_explicit_consent = require_explicit_consent
        super().__init__(name)

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Declare the inputs accepted by this node."""
        return {
            "query": NodeParameter(
                name="query", type=str, required=True, description="Query to process"
            ),
            "documents": NodeParameter(
                name="documents",
                type=list,
                required=True,
                description="Documents to search",
            ),
            "user_consent": NodeParameter(
                name="user_consent",
                type=dict,
                required=True,
                description="User consent information",
            ),
            "jurisdiction": NodeParameter(
                name="jurisdiction",
                type=str,
                required=False,
                description="User's jurisdiction",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute compliance-aware RAG.

        Validates consent, filters documents to those usable in the user's
        jurisdiction, performs retrieval with redaction, and attaches a
        compliance report plus the retention policy applied.
        """
        query = kwargs.get("query", "")
        documents = kwargs.get("documents", [])
        user_consent = kwargs.get("user_consent", {})
        jurisdiction = kwargs.get("jurisdiction", "US")

        # Refuse processing when consent is missing any field required by the
        # configured regulations.
        consent_valid = self._validate_consent(user_consent, jurisdiction)
        if not consent_valid["valid"]:
            return {
                "error": "Insufficient consent",
                "required_consent": consent_valid["required"],
                "user_rights": self._get_user_rights(jurisdiction),
            }

        # Apply compliance filters, then retrieve with redaction.
        compliant_docs = self._filter_compliant_documents(documents, jurisdiction)
        results = self._compliant_retrieval(query, compliant_docs)

        compliance_report = self._generate_compliance_report(
            query, results, user_consent, jurisdiction
        )

        # Fix: resolve the retention period once (consent override, else the
        # node default) and use it for BOTH the reported retention_days and
        # the deletion_date. Previously deletion_date always used the default,
        # which could contradict the retention_days reported alongside it.
        retention_days = user_consent.get(
            "retention_days", self.default_retention_days
        )
        return {
            "results": results,
            "compliance_report": compliance_report,
            "retention_policy": {
                "retention_days": retention_days,
                # Unix timestamp of the committed deletion date.
                "deletion_date": (
                    datetime.now().timestamp() + retention_days * 86400
                ),
            },
            "user_rights": self._get_user_rights(jurisdiction),
        }

    def _validate_consent(self, consent: Dict, jurisdiction: str) -> Dict[str, Any]:
        """Validate user consent against regulations.

        Returns a dict with `valid` (all required fields present, and explicit
        consent given when required), `required` (missing fields), and
        `regulations_checked`.
        """
        # Per-regulation required consent fields.
        required_fields = {
            "gdpr": [
                "purpose",
                "retention_allowed",
                "sharing_allowed",
                "explicit_consent",
            ],
            "ccpa": ["purpose", "opt_out_option", "data_categories"],
            "hipaa": ["purpose", "minimum_necessary", "authorization"],
        }

        valid = True
        missing = []

        for regulation in self.regulations:
            if regulation in required_fields:
                for field in required_fields[regulation]:
                    if field not in consent:
                        valid = False
                        missing.append(field)

        return {
            # Field completeness AND (when configured) explicit consent.
            "valid": valid
            and (
                not self.require_explicit_consent
                or consent.get("explicit_consent", False)
            ),
            "required": missing,
            "regulations_checked": self.regulations,
        }

    def _filter_compliant_documents(
        self, documents: List[Dict], jurisdiction: str
    ) -> List[Dict]:
        """Filter documents based on compliance requirements.

        Drops documents whose metadata restrictions or classification make
        them unusable in the user's jurisdiction.
        """
        compliant_docs = []

        for doc in documents:
            # Document compliance metadata (defaults: US jurisdiction, no
            # restrictions, public classification).
            doc_jurisdiction = doc.get("metadata", {}).get("jurisdiction", "US")
            doc_restrictions = doc.get("metadata", {}).get("restrictions", [])

            # EU users cannot receive documents flagged against EU transfer.
            if jurisdiction == "EU" and "no_eu_transfer" in doc_restrictions:
                continue

            # Restricted/confidential documents stay within their own
            # jurisdiction.
            classification = doc.get("metadata", {}).get("classification", "public")
            if (
                classification in ["restricted", "confidential"]
                and jurisdiction != doc_jurisdiction
            ):
                continue

            compliant_docs.append(doc)

        return compliant_docs

    def _compliant_retrieval(self, query: str, documents: List[Dict]) -> List[Dict]:
        """Perform retrieval with compliance considerations.

        Takes at most the first 10 documents and redacts content according to
        classification: public passes through, internal is truncated, anything
        else is fully restricted.
        """
        results = []

        for doc in documents[:10]:
            classification = doc.get("metadata", {}).get("classification", "public")

            if classification == "public":
                content = doc.get("content", "")
            elif classification == "internal":
                content = (
                    doc.get("content", "")[:200] + "... [Truncated for compliance]"
                )
            else:
                content = "[Content restricted due to classification]"

            results.append(
                {
                    "content": content,
                    "classification": classification,
                    "compliance_filtered": classification != "public",
                }
            )

        return results

    def _generate_compliance_report(
        self, query: str, results: List[Dict], consent: Dict, jurisdiction: str
    ) -> Dict[str, Any]:
        """Generate detailed compliance report for an executed query."""
        return {
            "regulations_applied": self.regulations,
            "jurisdiction": jurisdiction,
            "consent_status": {
                "explicit_consent": consent.get("explicit_consent", False),
                "purpose": consent.get("purpose", "not_specified"),
                "lawful_basis": self._determine_lawful_basis(consent, jurisdiction),
            },
            "data_minimization": {
                "applied": True,
                "documents_filtered": len(results),
                "fields_redacted": sum(
                    1 for r in results if r.get("compliance_filtered", False)
                ),
            },
            "audit_trail": {
                "timestamp": datetime.now().isoformat(),
                # Only a hash of the query is stored for auditing, not the
                # query text itself.
                "query_hash": hashlib.sha256(query.encode()).hexdigest()[:16],
                "retention_commitment": consent.get(
                    "retention_days", self.default_retention_days
                ),
            },
            "compliance_score": 0.95,  # High compliance
        }

    def _determine_lawful_basis(self, consent: Dict, jurisdiction: str) -> str:
        """Determine lawful basis for processing (GDPR Art. 6 style labels)."""
        if consent.get("explicit_consent"):
            return "consent"
        elif consent.get("purpose") == "medical_diagnosis":
            return "vital_interests"
        elif consent.get("purpose") == "legal_requirement":
            return "legal_obligation"
        else:
            return "legitimate_interests"

    def _get_user_rights(self, jurisdiction: str) -> Dict[str, bool]:
        """Get user rights based on jurisdiction.

        Merges the rights catalogs of every configured regulation; the
        `jurisdiction` argument is currently not used for selection.
        """
        rights = {
            "gdpr": {
                "access": True,
                "rectification": True,
                "erasure": True,
                "portability": True,
                "restriction": True,
                "objection": True,
            },
            "ccpa": {
                "access": True,
                "deletion": True,
                "opt_out": True,
                "non_discrimination": True,
            },
        }

        user_rights = {}
        for regulation in self.regulations:
            if regulation in rights:
                user_rights.update(rights[regulation])

        return user_rights
|
1056
|
+
|
1057
|
+
|
1058
|
+
# Public API of this module: the privacy-focused RAG node classes.
__all__ = ["PrivacyPreservingRAGNode", "SecureMultiPartyRAGNode", "ComplianceRAGNode"]
|