kailash-0.3.2-py3-none-any.whl → kailash-0.4.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. kailash/__init__.py +33 -1
  2. kailash/access_control/__init__.py +129 -0
  3. kailash/access_control/managers.py +461 -0
  4. kailash/access_control/rule_evaluators.py +467 -0
  5. kailash/access_control_abac.py +825 -0
  6. kailash/config/__init__.py +27 -0
  7. kailash/config/database_config.py +359 -0
  8. kailash/database/__init__.py +28 -0
  9. kailash/database/execution_pipeline.py +499 -0
  10. kailash/middleware/__init__.py +306 -0
  11. kailash/middleware/auth/__init__.py +33 -0
  12. kailash/middleware/auth/access_control.py +436 -0
  13. kailash/middleware/auth/auth_manager.py +422 -0
  14. kailash/middleware/auth/jwt_auth.py +477 -0
  15. kailash/middleware/auth/kailash_jwt_auth.py +616 -0
  16. kailash/middleware/communication/__init__.py +37 -0
  17. kailash/middleware/communication/ai_chat.py +989 -0
  18. kailash/middleware/communication/api_gateway.py +802 -0
  19. kailash/middleware/communication/events.py +470 -0
  20. kailash/middleware/communication/realtime.py +710 -0
  21. kailash/middleware/core/__init__.py +21 -0
  22. kailash/middleware/core/agent_ui.py +890 -0
  23. kailash/middleware/core/schema.py +643 -0
  24. kailash/middleware/core/workflows.py +396 -0
  25. kailash/middleware/database/__init__.py +63 -0
  26. kailash/middleware/database/base.py +113 -0
  27. kailash/middleware/database/base_models.py +525 -0
  28. kailash/middleware/database/enums.py +106 -0
  29. kailash/middleware/database/migrations.py +12 -0
  30. kailash/{api/database.py → middleware/database/models.py} +183 -291
  31. kailash/middleware/database/repositories.py +685 -0
  32. kailash/middleware/database/session_manager.py +19 -0
  33. kailash/middleware/mcp/__init__.py +38 -0
  34. kailash/middleware/mcp/client_integration.py +585 -0
  35. kailash/middleware/mcp/enhanced_server.py +576 -0
  36. kailash/nodes/__init__.py +27 -3
  37. kailash/nodes/admin/__init__.py +42 -0
  38. kailash/nodes/admin/audit_log.py +794 -0
  39. kailash/nodes/admin/permission_check.py +864 -0
  40. kailash/nodes/admin/role_management.py +823 -0
  41. kailash/nodes/admin/security_event.py +1523 -0
  42. kailash/nodes/admin/user_management.py +944 -0
  43. kailash/nodes/ai/a2a.py +24 -7
  44. kailash/nodes/ai/ai_providers.py +248 -40
  45. kailash/nodes/ai/embedding_generator.py +11 -11
  46. kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
  47. kailash/nodes/ai/llm_agent.py +436 -5
  48. kailash/nodes/ai/self_organizing.py +85 -10
  49. kailash/nodes/ai/vision_utils.py +148 -0
  50. kailash/nodes/alerts/__init__.py +26 -0
  51. kailash/nodes/alerts/base.py +234 -0
  52. kailash/nodes/alerts/discord.py +499 -0
  53. kailash/nodes/api/auth.py +287 -6
  54. kailash/nodes/api/rest.py +151 -0
  55. kailash/nodes/auth/__init__.py +17 -0
  56. kailash/nodes/auth/directory_integration.py +1228 -0
  57. kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
  58. kailash/nodes/auth/mfa.py +2338 -0
  59. kailash/nodes/auth/risk_assessment.py +872 -0
  60. kailash/nodes/auth/session_management.py +1093 -0
  61. kailash/nodes/auth/sso.py +1040 -0
  62. kailash/nodes/base.py +344 -13
  63. kailash/nodes/base_cycle_aware.py +4 -2
  64. kailash/nodes/base_with_acl.py +1 -1
  65. kailash/nodes/code/python.py +283 -10
  66. kailash/nodes/compliance/__init__.py +9 -0
  67. kailash/nodes/compliance/data_retention.py +1888 -0
  68. kailash/nodes/compliance/gdpr.py +2004 -0
  69. kailash/nodes/data/__init__.py +22 -2
  70. kailash/nodes/data/async_connection.py +469 -0
  71. kailash/nodes/data/async_sql.py +757 -0
  72. kailash/nodes/data/async_vector.py +598 -0
  73. kailash/nodes/data/readers.py +767 -0
  74. kailash/nodes/data/retrieval.py +360 -1
  75. kailash/nodes/data/sharepoint_graph.py +397 -21
  76. kailash/nodes/data/sql.py +94 -5
  77. kailash/nodes/data/streaming.py +68 -8
  78. kailash/nodes/data/vector_db.py +54 -4
  79. kailash/nodes/enterprise/__init__.py +13 -0
  80. kailash/nodes/enterprise/batch_processor.py +741 -0
  81. kailash/nodes/enterprise/data_lineage.py +497 -0
  82. kailash/nodes/logic/convergence.py +31 -9
  83. kailash/nodes/logic/operations.py +14 -3
  84. kailash/nodes/mixins/__init__.py +8 -0
  85. kailash/nodes/mixins/event_emitter.py +201 -0
  86. kailash/nodes/mixins/mcp.py +9 -4
  87. kailash/nodes/mixins/security.py +165 -0
  88. kailash/nodes/monitoring/__init__.py +7 -0
  89. kailash/nodes/monitoring/performance_benchmark.py +2497 -0
  90. kailash/nodes/rag/__init__.py +284 -0
  91. kailash/nodes/rag/advanced.py +1615 -0
  92. kailash/nodes/rag/agentic.py +773 -0
  93. kailash/nodes/rag/conversational.py +999 -0
  94. kailash/nodes/rag/evaluation.py +875 -0
  95. kailash/nodes/rag/federated.py +1188 -0
  96. kailash/nodes/rag/graph.py +721 -0
  97. kailash/nodes/rag/multimodal.py +671 -0
  98. kailash/nodes/rag/optimized.py +933 -0
  99. kailash/nodes/rag/privacy.py +1059 -0
  100. kailash/nodes/rag/query_processing.py +1335 -0
  101. kailash/nodes/rag/realtime.py +764 -0
  102. kailash/nodes/rag/registry.py +547 -0
  103. kailash/nodes/rag/router.py +837 -0
  104. kailash/nodes/rag/similarity.py +1854 -0
  105. kailash/nodes/rag/strategies.py +566 -0
  106. kailash/nodes/rag/workflows.py +575 -0
  107. kailash/nodes/security/__init__.py +19 -0
  108. kailash/nodes/security/abac_evaluator.py +1411 -0
  109. kailash/nodes/security/audit_log.py +103 -0
  110. kailash/nodes/security/behavior_analysis.py +1893 -0
  111. kailash/nodes/security/credential_manager.py +401 -0
  112. kailash/nodes/security/rotating_credentials.py +760 -0
  113. kailash/nodes/security/security_event.py +133 -0
  114. kailash/nodes/security/threat_detection.py +1103 -0
  115. kailash/nodes/testing/__init__.py +9 -0
  116. kailash/nodes/testing/credential_testing.py +499 -0
  117. kailash/nodes/transform/__init__.py +10 -2
  118. kailash/nodes/transform/chunkers.py +592 -1
  119. kailash/nodes/transform/processors.py +484 -14
  120. kailash/nodes/validation.py +321 -0
  121. kailash/runtime/access_controlled.py +1 -1
  122. kailash/runtime/async_local.py +41 -7
  123. kailash/runtime/docker.py +1 -1
  124. kailash/runtime/local.py +474 -55
  125. kailash/runtime/parallel.py +1 -1
  126. kailash/runtime/parallel_cyclic.py +1 -1
  127. kailash/runtime/testing.py +210 -2
  128. kailash/security.py +1 -1
  129. kailash/utils/migrations/__init__.py +25 -0
  130. kailash/utils/migrations/generator.py +433 -0
  131. kailash/utils/migrations/models.py +231 -0
  132. kailash/utils/migrations/runner.py +489 -0
  133. kailash/utils/secure_logging.py +342 -0
  134. kailash/workflow/__init__.py +16 -0
  135. kailash/workflow/cyclic_runner.py +3 -4
  136. kailash/workflow/graph.py +70 -2
  137. kailash/workflow/resilience.py +249 -0
  138. kailash/workflow/templates.py +726 -0
  139. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
  140. kailash-0.4.1.dist-info/RECORD +227 -0
  141. kailash/api/__init__.py +0 -17
  142. kailash/api/__main__.py +0 -6
  143. kailash/api/studio_secure.py +0 -893
  144. kailash/mcp/__main__.py +0 -13
  145. kailash/mcp/server_new.py +0 -336
  146. kailash/mcp/servers/__init__.py +0 -12
  147. kailash-0.3.2.dist-info/RECORD +0 -136
  148. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
  149. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
  150. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
  151. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
kailash/nodes/rag/realtime.py (new file)
@@ -0,0 +1,764 @@
+"""
+Real-time RAG Implementation
+
+Implements RAG with live data updates and streaming capabilities:
+- Dynamic index updates
+- Streaming data ingestion
+- Real-time relevance adjustments
+- Incremental retrieval
+- Live document monitoring
+
+Based on streaming architectures and real-time search research.
+"""
+
+import asyncio
+import json
+import logging
+import time
+from collections import deque
+from datetime import datetime, timedelta
+from typing import Any, AsyncIterator, Dict, List, Optional, Union
+
+from ...workflow.builder import WorkflowBuilder
+from ..base import Node, NodeParameter, register_node
+from ..code.python import PythonCodeNode
+from ..data.streaming import EventStreamNode
+from ..logic.workflow import WorkflowNode
+
+logger = logging.getLogger(__name__)
+
+
+@register_node()
+class RealtimeRAGNode(WorkflowNode):
+    """
+    Real-time RAG with Live Data Updates
+
+    Implements RAG that continuously updates its knowledge base and adjusts
+    to changing information in real-time.
+
+    When to use:
+    - Best for: News aggregation, monitoring systems, live documentation
+    - Not ideal for: Static knowledge bases, historical data
+    - Performance: <100ms for updates, <500ms for queries
+    - Freshness: Data updated within seconds
+
+    Key features:
+    - Incremental index updates
+    - Time-decay relevance scoring
+    - Live document monitoring
+    - Streaming ingestion pipeline
+    - Real-time cache invalidation
+
+    Example:
+        realtime_rag = RealtimeRAGNode(
+            update_interval=5.0,  # 5 seconds
+            relevance_decay_rate=0.95
+        )
+
+        # Start monitoring live data sources
+        await realtime_rag.start_monitoring([
+            {"type": "rss", "url": "https://news.site/feed"},
+            {"type": "api", "endpoint": "https://api.data/stream"},
+            {"type": "file", "path": "/data/live/*.json"}
+        ])
+
+        # Query with real-time data
+        result = await realtime_rag.run(
+            query="What are the latest developments in AI?"
+        )
+        # Returns most recent relevant information
+
+    Parameters:
+        update_interval: How often to check for updates (seconds)
+        relevance_decay_rate: How quickly old info loses relevance
+        max_buffer_size: Maximum documents in memory
+        enable_streaming: Support streaming responses
+
+    Returns:
+        results: Most recent relevant documents
+        timestamps: When each result was updated
+        freshness_scores: How recent each result is
+        update_stats: Real-time system statistics
+    """
+
+    def __init__(
+        self,
+        name: str = "realtime_rag",
+        update_interval: float = 10.0,
+        relevance_decay_rate: float = 0.95,
+        max_buffer_size: int = 1000,
+    ):
+        self.update_interval = update_interval
+        self.relevance_decay_rate = relevance_decay_rate
+        self.max_buffer_size = max_buffer_size
+        self.document_buffer = deque(maxlen=max_buffer_size)
+        self.last_update = datetime.now()
+        super().__init__(name, self._create_workflow())
+
+    def _create_workflow(self) -> WorkflowNode:
+        """Create real-time RAG workflow"""
+        builder = WorkflowBuilder()
+
+        # Live data monitor
+        monitor_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="live_monitor",
+            config={
+                "code": """
+import time
+from datetime import datetime, timedelta
+from collections import deque
+
+def check_for_updates(data_sources, last_check_time):
+    '''Check data sources for updates'''
+    new_documents = []
+    current_time = datetime.now()
+
+    for source in data_sources:
+        source_type = source.get("type", "unknown")
+
+        if source_type == "api":
+            # Simulated API check
+            if (current_time - last_check_time).seconds > 5:
+                new_documents.append({
+                    "id": f"api_{current_time.timestamp()}",
+                    "content": f"Latest API data at {current_time}",
+                    "source": source.get("endpoint", "unknown"),
+                    "timestamp": current_time.isoformat(),
+                    "type": "live_update"
+                })
+
+        elif source_type == "file":
+            # Simulated file monitoring
+            new_documents.append({
+                "id": f"file_{current_time.timestamp()}",
+                "content": f"New file content detected at {current_time}",
+                "source": source.get("path", "unknown"),
+                "timestamp": current_time.isoformat(),
+                "type": "file_update"
+            })
+
+        elif source_type == "stream":
+            # Simulated stream data
+            for i in range(3):  # Simulate 3 new items
+                new_documents.append({
+                    "id": f"stream_{current_time.timestamp()}_{i}",
+                    "content": f"Stream item {i} at {current_time}",
+                    "source": "data_stream",
+                    "timestamp": current_time.isoformat(),
+                    "type": "stream_item"
+                })
+
+    result = {
+        "new_documents": new_documents,
+        "check_time": current_time.isoformat(),
+        "update_count": len(new_documents)
+    }
+"""
+            },
+        )
+
+        # Incremental indexer
+        indexer_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="incremental_indexer",
+            config={
+                "code": f"""
+from datetime import datetime, timedelta
+from collections import deque
+
+def update_index(existing_buffer, new_documents, max_size={self.max_buffer_size}):
+    '''Update document buffer with new documents'''
+
+    # Convert to deque if needed
+    if not isinstance(existing_buffer, deque):
+        buffer = deque(existing_buffer or [], maxlen=max_size)
+    else:
+        buffer = existing_buffer
+
+    # Add new documents with metadata
+    for doc in new_documents:
+        indexed_doc = doc.copy()
+        indexed_doc["indexed_at"] = datetime.now().isoformat()
+        indexed_doc["initial_relevance"] = 1.0
+        buffer.append(indexed_doc)
+
+    # Calculate index statistics
+    current_time = datetime.now()
+    age_distribution = {{
+        "last_minute": 0,
+        "last_hour": 0,
+        "last_day": 0,
+        "older": 0
+    }}
+
+    for doc in buffer:
+        try:
+            doc_time = datetime.fromisoformat(doc.get("timestamp", doc.get("indexed_at", "")))
+            age = current_time - doc_time
+
+            if age < timedelta(minutes=1):
+                age_distribution["last_minute"] += 1
+            elif age < timedelta(hours=1):
+                age_distribution["last_hour"] += 1
+            elif age < timedelta(days=1):
+                age_distribution["last_day"] += 1
+            else:
+                age_distribution["older"] += 1
+        except:
+            age_distribution["older"] += 1
+
+    result = {{
+        "updated_buffer": list(buffer),
+        "buffer_size": len(buffer),
+        "age_distribution": age_distribution,
+        "newest_timestamp": new_documents[0]["timestamp"] if new_documents else None
+    }}
+"""
+            },
+        )
+
+        # Time-aware retriever
+        retriever_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="time_aware_retriever",
+            config={
+                "code": f"""
+from datetime import datetime, timedelta
+import math
+
+def calculate_time_decay(timestamp, current_time, decay_rate={self.relevance_decay_rate}):
+    '''Calculate relevance decay based on age'''
+    try:
+        doc_time = datetime.fromisoformat(timestamp)
+        age_hours = (current_time - doc_time).total_seconds() / 3600
+
+        # Exponential decay
+        decay_factor = decay_rate ** age_hours
+        return decay_factor
+    except:
+        return 0.5  # Default for unparseable timestamps
+
+def retrieve_with_freshness(query, document_buffer):
+    '''Retrieve documents with time-aware scoring'''
+    current_time = datetime.now()
+    query_words = set(query.lower().split())
+
+    scored_docs = []
+
+    for doc in document_buffer:
+        # Content relevance score
+        content = doc.get("content", "").lower()
+        content_words = set(content.split())
+
+        if not query_words:
+            relevance_score = 0
+        else:
+            overlap = len(query_words & content_words)
+            relevance_score = overlap / len(query_words)
+
+        # Time decay factor
+        timestamp = doc.get("timestamp", doc.get("indexed_at", ""))
+        time_factor = calculate_time_decay(timestamp, current_time)
+
+        # Combined score
+        final_score = relevance_score * time_factor
+
+        # Add metadata
+        scored_docs.append({{
+            "document": doc,
+            "relevance_score": relevance_score,
+            "time_factor": time_factor,
+            "final_score": final_score,
+            "age_hours": (current_time - datetime.fromisoformat(timestamp)).total_seconds() / 3600
+        }})
+
+    # Sort by final score
+    scored_docs.sort(key=lambda x: x["final_score"], reverse=True)
+
+    # Get top results
+    top_results = scored_docs[:10]
+
+    result = {{
+        "retrieval_results": {{
+            "documents": [r["document"] for r in top_results],
+            "scores": [r["final_score"] for r in top_results],
+            "metadata": {{
+                "avg_age_hours": sum(r["age_hours"] for r in top_results) / len(top_results) if top_results else 0,
+                "newest_result_age": min(r["age_hours"] for r in top_results) if top_results else float('inf'),
+                "time_decay_applied": True
+            }}
+        }}
+    }}
+"""
+            },
+        )
+
+        # Stream formatter
+        stream_formatter_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="stream_formatter",
+            config={
+                "code": """
+from datetime import datetime
+
+def format_realtime_results(retrieval_results, query, update_stats):
+    '''Format results for real-time consumption'''
+
+    documents = retrieval_results["documents"]
+    scores = retrieval_results["scores"]
+    metadata = retrieval_results["metadata"]
+
+    # Create response with freshness indicators
+    formatted_results = []
+
+    for doc, score in zip(documents, scores):
+        # Calculate freshness
+        doc_time = datetime.fromisoformat(doc.get("timestamp", doc.get("indexed_at", "")))
+        age_seconds = (datetime.now() - doc_time).total_seconds()
+
+        if age_seconds < 60:
+            freshness = "just now"
+        elif age_seconds < 3600:
+            freshness = f"{int(age_seconds/60)} minutes ago"
+        elif age_seconds < 86400:
+            freshness = f"{int(age_seconds/3600)} hours ago"
+        else:
+            freshness = f"{int(age_seconds/86400)} days ago"
+
+        formatted_results.append({
+            "content": doc.get("content", ""),
+            "source": doc.get("source", "unknown"),
+            "freshness": freshness,
+            "timestamp": doc.get("timestamp"),
+            "relevance": score,
+            "type": doc.get("type", "unknown")
+        })
+
+    result = {
+        "realtime_results": {
+            "query": query,
+            "results": formatted_results,
+            "timestamps": [r["timestamp"] for r in formatted_results],
+            "freshness_scores": scores,
+            "update_stats": {
+                "last_update": update_stats.get("check_time"),
+                "buffer_size": update_stats.get("buffer_size", 0),
+                "avg_age_hours": metadata.get("avg_age_hours", 0),
+                "newest_age_hours": metadata.get("newest_result_age", 0)
+            },
+            "response_time": datetime.now().isoformat()
+        }
+    }
+"""
+            },
+        )
+
+        # Connect workflow
+        builder.add_connection(monitor_id, "new_documents", indexer_id, "new_documents")
+        builder.add_connection(
+            indexer_id, "updated_buffer", retriever_id, "document_buffer"
+        )
+        builder.add_connection(
+            retriever_id, "retrieval_results", stream_formatter_id, "retrieval_results"
+        )
+        builder.add_connection(
+            indexer_id, "age_distribution", stream_formatter_id, "update_stats"
+        )
+
+        return builder.build(name="realtime_rag_workflow")
+
+    async def start_monitoring(self, data_sources: List[Dict[str, Any]]):
+        """Start monitoring data sources for updates"""
+        self.monitoring_active = True
+        self.data_sources = data_sources
+
+        # Start background monitoring task
+        asyncio.create_task(self._monitor_loop())
+
+        logger.info(f"Started monitoring {len(data_sources)} data sources")
+
+    async def _monitor_loop(self):
+        """Background monitoring loop"""
+        while self.monitoring_active:
+            try:
+                # Check for updates
+                # In production, would actually poll sources
+                await asyncio.sleep(self.update_interval)
+
+                # Trigger update
+                self.last_update = datetime.now()
+
+            except Exception as e:
+                logger.error(f"Monitoring error: {e}")
+
+    def stop_monitoring(self):
+        """Stop monitoring data sources"""
+        self.monitoring_active = False
+
+
+@register_node()
+class StreamingRAGNode(Node):
+    """
+    Streaming RAG Response Node
+
+    Provides RAG responses as a stream for real-time UIs.
+
+    When to use:
+    - Best for: Chat interfaces, live dashboards, progressive loading
+    - Not ideal for: Batch processing, complete results needed upfront
+    - Performance: First chunk in <100ms
+    - User experience: Immediate feedback
+
+    Example:
+        streaming_rag = StreamingRAGNode()
+
+        async for chunk in streaming_rag.stream(
+            query="Latest news on AI",
+            documents=live_documents
+        ):
+            print(chunk)  # Display progressively
+
+    Parameters:
+        chunk_size: Number of results per chunk
+        chunk_interval: Delay between chunks (ms)
+        enable_backpressure: Handle slow consumers
+
+    Yields:
+        chunks: Stream of result chunks
+        progress: Progress indicators
+        metadata: Streaming statistics
+    """
+
+    def __init__(
+        self,
+        name: str = "streaming_rag",
+        chunk_size: int = 50,
+        chunk_interval: int = 100,
+    ):
+        self.chunk_size = chunk_size
+        self.chunk_interval = chunk_interval
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "query": NodeParameter(
+                name="query", type=str, required=True, description="Search query"
+            ),
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=True,
+                description="Document collection",
+            ),
+            "max_chunks": NodeParameter(
+                name="max_chunks",
+                type=int,
+                required=False,
+                default=10,
+                description="Maximum chunks to stream",
+            ),
+        }
+
+    async def stream(self, **kwargs) -> AsyncIterator[Dict[str, Any]]:
+        """Stream RAG results progressively"""
+        query = kwargs.get("query", "")
+        documents = kwargs.get("documents", [])
+        max_chunks = kwargs.get("max_chunks", 10)
+
+        # Quick initial results
+        yield {
+            "type": "start",
+            "query": query,
+            "estimated_results": min(len(documents), max_chunks * self.chunk_size),
+        }
+
+        # Score all documents
+        scored_docs = []
+        query_words = set(query.lower().split())
+
+        for doc in documents:
+            content = doc.get("content", "").lower()
+            doc_words = set(content.split())
+            score = (
+                len(query_words & doc_words) / len(query_words) if query_words else 0
+            )
+
+            if score > 0:
+                scored_docs.append((doc, score))
+
+        # Sort by score
+        scored_docs.sort(key=lambda x: x[1], reverse=True)
+
+        # Stream in chunks
+        for chunk_idx in range(max_chunks):
+            start_idx = chunk_idx * self.chunk_size
+            end_idx = start_idx + self.chunk_size
+
+            chunk_docs = scored_docs[start_idx:end_idx]
+            if not chunk_docs:
+                break
+
+            # Yield chunk
+            yield {
+                "type": "chunk",
+                "chunk_id": chunk_idx,
+                "results": [
+                    {"document": doc, "score": score, "position": start_idx + i}
+                    for i, (doc, score) in enumerate(chunk_docs)
+                ],
+                "progress": min(100, (end_idx / len(scored_docs)) * 100),
+            }
+
+            # Simulate processing time
+            await asyncio.sleep(self.chunk_interval / 1000)
+
+        # Final metadata
+        yield {
+            "type": "complete",
+            "total_results": len(scored_docs),
+            "chunks_sent": min(
+                max_chunks, (len(scored_docs) + self.chunk_size - 1) // self.chunk_size
+            ),
+            "processing_time": chunk_idx * self.chunk_interval,
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Synchronous run method (returns first chunk)"""
+        # For compatibility, return first chunk synchronously
+        query = kwargs.get("query", "")
+        documents = kwargs.get("documents", [])
+
+        # Quick scoring
+        query_words = set(query.lower().split())
+        first_results = []
+
+        for doc in documents[: self.chunk_size]:
+            content = doc.get("content", "").lower()
+            doc_words = set(content.split())
+            score = (
+                len(query_words & doc_words) / len(query_words) if query_words else 0
+            )
+
+            if score > 0:
+                first_results.append({"document": doc, "score": score})
+
+        return {
+            "streaming_enabled": True,
+            "first_chunk": first_results,
+            "use_stream_method": "Call stream() for full results",
+        }
+
+
+@register_node()
+class IncrementalIndexNode(Node):
+    """
+    Incremental Index Update Node
+
+    Efficiently updates RAG indices without full rebuilds.
+
+    When to use:
+    - Best for: Frequently changing document sets
+    - Not ideal for: Static collections
+    - Performance: O(log n) updates
+    - Memory: Efficient incremental storage
+
+    Example:
+        index = IncrementalIndexNode()
+
+        # Add new documents
+        await index.add_documents(new_docs)
+
+        # Remove outdated
+        await index.remove_documents(old_ids)
+
+        # Update existing
+        await index.update_documents(changed_docs)
+
+    Parameters:
+        index_type: Type of index (inverted, vector, hybrid)
+        merge_strategy: How to merge updates
+        compaction_threshold: When to compact index
+
+    Returns:
+        update_stats: Statistics about the update
+        index_health: Current index status
+    """
+
+    def __init__(
+        self,
+        name: str = "incremental_index",
+        index_type: str = "hybrid",
+        merge_strategy: str = "immediate",
+    ):
+        self.index_type = index_type
+        self.merge_strategy = merge_strategy
+        self.index = {}
+        self.document_store = {}
+        self.update_log = deque(maxlen=1000)
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                required=True,
+                description="Operation: add, remove, update, search",
+            ),
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=False,
+                description="Documents to process",
+            ),
+            "document_ids": NodeParameter(
+                name="document_ids",
+                type=list,
+                required=False,
+                description="IDs for removal",
+            ),
+            "query": NodeParameter(
+                name="query", type=str, required=False, description="Search query"
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Execute incremental index operation"""
+        operation = kwargs.get("operation", "search")
+
+        if operation == "add":
+            return self._add_documents(kwargs.get("documents", []))
+        elif operation == "remove":
+            return self._remove_documents(kwargs.get("document_ids", []))
+        elif operation == "update":
+            return self._update_documents(kwargs.get("documents", []))
+        elif operation == "search":
+            return self._search(kwargs.get("query", ""))
+        else:
+            return {"error": f"Unknown operation: {operation}"}
+
+    def _add_documents(self, documents: List[Dict]) -> Dict[str, Any]:
+        """Add documents to index"""
+        added_count = 0
+
+        for doc in documents:
+            doc_id = doc.get("id", str(hash(doc.get("content", ""))))
+
+            # Store document
+            self.document_store[doc_id] = doc
+
+            # Update inverted index
+            content = doc.get("content", "").lower()
+            words = content.split()
+
+            for word in set(words):
+                if word not in self.index:
+                    self.index[word] = set()
+                self.index[word].add(doc_id)
+
+            added_count += 1
+
+            # Log update
+            self.update_log.append(
+                {
+                    "operation": "add",
+                    "doc_id": doc_id,
+                    "timestamp": datetime.now().isoformat(),
+                }
+            )
+
+        return {
+            "operation": "add",
+            "documents_added": added_count,
+            "total_documents": len(self.document_store),
+            "index_terms": len(self.index),
+            "update_time": datetime.now().isoformat(),
+        }
+
+    def _remove_documents(self, document_ids: List[str]) -> Dict[str, Any]:
+        """Remove documents from index"""
+        removed_count = 0
+
+        for doc_id in document_ids:
+            if doc_id in self.document_store:
+                # Get document
+                doc = self.document_store[doc_id]
+
+                # Remove from inverted index
+                content = doc.get("content", "").lower()
+                words = content.split()
+
+                for word in set(words):
+                    if word in self.index and doc_id in self.index[word]:
+                        self.index[word].discard(doc_id)
+                        if not self.index[word]:
+                            del self.index[word]
+
+                # Remove document
+                del self.document_store[doc_id]
+                removed_count += 1
+
+                # Log update
+                self.update_log.append(
+                    {
+                        "operation": "remove",
+                        "doc_id": doc_id,
+                        "timestamp": datetime.now().isoformat(),
+                    }
+                )
+
+        return {
+            "operation": "remove",
+            "documents_removed": removed_count,
+            "total_documents": len(self.document_store),
+            "index_terms": len(self.index),
+        }
+
+    def _update_documents(self, documents: List[Dict]) -> Dict[str, Any]:
+        """Update existing documents"""
+        updated_count = 0
+
+        for doc in documents:
+            doc_id = doc.get("id")
+            if doc_id and doc_id in self.document_store:
+                # Remove old version
+                self._remove_documents([doc_id])
+
+                # Add new version
+                self._add_documents([doc])
+                updated_count += 1
+
+        return {
+            "operation": "update",
+            "documents_updated": updated_count,
+            "total_documents": len(self.document_store),
+        }
+
+    def _search(self, query: str) -> Dict[str, Any]:
+        """Search the incremental index"""
+        query_words = set(query.lower().split())
+
+        # Find matching documents
+        matching_docs = set()
+        for word in query_words:
+            if word in self.index:
+                matching_docs.update(self.index[word])
+
+        # Retrieve documents
+        results = []
+        for doc_id in matching_docs:
+            if doc_id in self.document_store:
+                results.append(self.document_store[doc_id])
+
+        return {
+            "operation": "search",
+            "query": query,
+            "results": results[:10],
+            "total_matches": len(matching_docs),
+            "search_time": datetime.now().isoformat(),
+        }
+
+
+# Export all real-time nodes
+__all__ = ["RealtimeRAGNode", "StreamingRAGNode", "IncrementalIndexNode"]
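
For orientation, the following is a minimal usage sketch of the three nodes this new file introduces. It is not part of the release diff: it assumes the classes are importable from kailash.nodes.rag.realtime (the module path shown above) and that run(), start_monitoring(), and stream() behave as the docstrings in the diff describe. The endpoint URL and document payloads are placeholders.

# Illustrative sketch only; endpoint and documents below are hypothetical.
import asyncio

from kailash.nodes.rag.realtime import (
    IncrementalIndexNode,
    RealtimeRAGNode,
    StreamingRAGNode,
)


async def main():
    # Real-time RAG: poll (simulated) sources every 5s, with time-decayed relevance
    realtime_rag = RealtimeRAGNode(update_interval=5.0, relevance_decay_rate=0.95)
    await realtime_rag.start_monitoring(
        [{"type": "api", "endpoint": "https://api.example.com/stream"}]  # placeholder source
    )
    result = await realtime_rag.run(query="What are the latest developments in AI?")
    print(result)
    realtime_rag.stop_monitoring()

    # Streaming RAG: consume scored results chunk by chunk
    docs = [{"id": "d1", "content": "AI release notes ..."}]  # placeholder documents
    streaming_rag = StreamingRAGNode(chunk_size=10)
    async for chunk in streaming_rag.stream(query="AI release", documents=docs):
        print(chunk["type"], chunk.get("progress"))

    # Incremental index: add documents, then search, without a full rebuild
    index = IncrementalIndexNode()
    index.run(operation="add", documents=docs)
    print(index.run(operation="search", query="release"))


if __name__ == "__main__":
    asyncio.run(main())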