kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
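The hunk below corresponds to the new file kailash/nodes/rag/strategies.py (the only entry above adding exactly 566 lines). The indented, unprefixed snippets interleaved after each section are editorial sketches under stated assumptions, not part of the released file.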
@@ -0,0 +1,566 @@
+"""
+RAG Strategy Workflow Nodes
+
+RAG strategies implemented as WorkflowNodes that encapsulate complete
+RAG pipelines using existing Kailash components. Each strategy creates
+a workflow using WorkflowBuilder and delegates all execution to the SDK.
+"""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from ...workflow.builder import WorkflowBuilder
+from ..base import Node, NodeParameter, register_node
+from ..logic.workflow import WorkflowNode
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RAGConfig:
+    """Configuration for RAG strategies"""
+
+    chunk_size: int = 1000
+    chunk_overlap: int = 200
+    embedding_model: str = "text-embedding-3-small"
+    embedding_provider: str = "openai"
+    vector_db_provider: str = "postgresql"
+    retrieval_k: int = 5
+    similarity_threshold: float = 0.7
+
+
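RAGConfig is a plain dataclass, so a caller overrides only the fields that differ from the defaults above. A minimal sketch (values illustrative; the module path is assumed from the file list):

    from kailash.nodes.rag.strategies import RAGConfig

    # Only the overridden fields change; the rest keep the dataclass
    # defaults shown above (e.g. embedding_model="text-embedding-3-small").
    config = RAGConfig(chunk_size=512, chunk_overlap=64, retrieval_k=10)
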
+def create_semantic_rag_workflow(config: RAGConfig) -> WorkflowNode:
+    """
+    Create semantic RAG workflow using existing Kailash nodes.
+
+    Pipeline: Documents → SemanticChunker → EmbeddingGenerator → VectorDatabase → HybridRetriever
+    """
+    builder = WorkflowBuilder()
+
+    # Add chunking node
+    chunker_id = builder.add_node(
+        "SemanticChunkerNode",
+        node_id="semantic_chunker",
+        config={
+            "chunk_size": config.chunk_size,
+            "chunk_overlap": config.chunk_overlap,
+            "similarity_threshold": config.similarity_threshold,
+        },
+    )
+
+    # Add embedding generation
+    embedder_id = builder.add_node(
+        "EmbeddingGeneratorNode",
+        node_id="embedder",
+        config={"model": config.embedding_model, "provider": config.embedding_provider},
+    )
+
+    # Add vector database storage
+    vectordb_id = builder.add_node(
+        "VectorDatabaseNode",
+        node_id="vector_db",
+        config={
+            "provider": config.vector_db_provider,
+            "collection_name": "semantic_rag",
+        },
+    )
+
+    # Add retrieval node
+    retriever_id = builder.add_node(
+        "HybridRetrieverNode",
+        node_id="retriever",
+        config={
+            "k": config.retrieval_k,
+            "similarity_threshold": config.similarity_threshold,
+            "method": "dense",
+        },
+    )
+
+    # Connect the pipeline
+    builder.add_connection(chunker_id, "chunks", embedder_id, "texts")
+    builder.add_connection(embedder_id, "embeddings", vectordb_id, "embeddings")
+    builder.add_connection(chunker_id, "chunks", vectordb_id, "documents")
+    builder.add_connection(
+        vectordb_id, "stored_documents", retriever_id, "document_store"
+    )
+
+    # Build workflow
+    workflow = builder.build(name="semantic_rag_workflow")
+
+    # Return as WorkflowNode
+    return WorkflowNode(
+        workflow=workflow,
+        name="semantic_rag_node",
+        description="Semantic RAG with dense embeddings and semantic chunking",
+    )
+
+
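Each factory above returns a WorkflowNode wrapping a built workflow. A usage sketch, assuming WorkflowNode.run accepts the documents/query/operation parameters that the node classes later in this file declare; the exact routing of these parameters into the sub-workflow's input ports is an assumption:

    rag = create_semantic_rag_workflow(RAGConfig())
    rag.run(documents=[{"content": "Kailash builds workflows from nodes."}],
            operation="index")
    hits = rag.run(query="What does Kailash do?", operation="retrieve")
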
+def create_statistical_rag_workflow(config: RAGConfig) -> WorkflowNode:
+    """
+    Create statistical RAG workflow using existing Kailash nodes.
+
+    Pipeline: Documents → StatisticalChunker → EmbeddingGenerator → VectorDatabase → HybridRetriever (sparse)
+    """
+    builder = WorkflowBuilder()
+
+    # Add statistical chunking
+    chunker_id = builder.add_node(
+        "StatisticalChunkerNode",
+        node_id="statistical_chunker",
+        config={"chunk_size": config.chunk_size, "overlap": config.chunk_overlap},
+    )
+
+    # Add embedding generation (for backup dense retrieval)
+    embedder_id = builder.add_node(
+        "EmbeddingGeneratorNode",
+        node_id="embedder",
+        config={"model": config.embedding_model, "provider": config.embedding_provider},
+    )
+
+    # Add keyword extraction for sparse retrieval
+    keyword_extractor_id = builder.add_node(
+        "PythonCodeNode",
+        node_id="keyword_extractor",
+        config={
+            "code": """
+import re
+def extract_keywords(text):
+    words = re.findall(r'\\b[a-zA-Z]{3,}\\b', text.lower())
+    stop_words = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'man', 'new', 'now', 'old', 'see', 'two', 'way', 'who', 'boy', 'did', 'its', 'let', 'put', 'say', 'she', 'too', 'use'}
+    keywords = [word for word in set(words) if word not in stop_words]
+    return keywords[:20]
+
+result = {"keywords": [extract_keywords(chunk["content"]) for chunk in chunks]}
+"""
+        },
+    )
+
+    # Add vector database
+    vectordb_id = builder.add_node(
+        "VectorDatabaseNode",
+        node_id="vector_db",
+        config={
+            "provider": config.vector_db_provider,
+            "collection_name": "statistical_rag",
+        },
+    )
+
+    # Add sparse retrieval
+    retriever_id = builder.add_node(
+        "HybridRetrieverNode",
+        node_id="retriever",
+        config={"k": config.retrieval_k, "method": "sparse"},
+    )
+
+    # Connect pipeline
+    builder.add_connection(chunker_id, "chunks", keyword_extractor_id, "chunks")
+    builder.add_connection(chunker_id, "chunks", embedder_id, "texts")
+    builder.add_connection(keyword_extractor_id, "result", vectordb_id, "keywords")
+    builder.add_connection(embedder_id, "embeddings", vectordb_id, "embeddings")
+    builder.add_connection(chunker_id, "chunks", vectordb_id, "documents")
+    builder.add_connection(
+        vectordb_id, "stored_documents", retriever_id, "document_store"
+    )
+
+    workflow = builder.build(name="statistical_rag_workflow")
+
+    return WorkflowNode(
+        workflow=workflow,
+        name="statistical_rag_node",
+        description="Statistical RAG with sparse retrieval and keyword matching",
+    )
+
+
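The keyword extractor above runs as a PythonCodeNode, so its logic can be checked in isolation; note that slicing [:20] from a set-derived list yields a nondeterministic order. A standalone check (stop-word set abbreviated here):

    import re

    def extract_keywords(text):
        # Same logic as the embedded PythonCodeNode code above.
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
        stop_words = {'the', 'and', 'for', 'has', 'was'}  # abbreviated
        return [w for w in set(words) if w not in stop_words][:20]

    print(sorted(extract_keywords("The retriever has sparse and dense modes")))
    # ['dense', 'modes', 'retriever', 'sparse']
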
+def create_hybrid_rag_workflow(
+    config: RAGConfig, fusion_method: str = "rrf"
+) -> WorkflowNode:
+    """
+    Create hybrid RAG workflow combining semantic and statistical approaches.
+
+    Pipeline: Documents → [SemanticRAG + StatisticalRAG] → ResultFuser → HybridRetriever
+    """
+    builder = WorkflowBuilder()
+
+    # Create both semantic and statistical sub-workflows
+    semantic_workflow = create_semantic_rag_workflow(config)
+    statistical_workflow = create_statistical_rag_workflow(config)
+
+    # Add sub-workflows as nodes
+    semantic_id = builder.add_node(
+        "WorkflowNode",
+        node_id="semantic_rag",
+        config={"workflow": semantic_workflow.workflow},
+    )
+
+    statistical_id = builder.add_node(
+        "WorkflowNode",
+        node_id="statistical_rag",
+        config={"workflow": statistical_workflow.workflow},
+    )
+
+    # Add result fusion node
+    fusion_id = builder.add_node(
+        "PythonCodeNode",
+        node_id="result_fusion",
+        config={
+            "code": f"""
+def rrf_fusion(semantic_results, statistical_results, k=60):
+    '''Reciprocal Rank Fusion for combining results'''
+    doc_scores = {{}}
+
+    # Add semantic results
+    for i, doc in enumerate(semantic_results.get("results", [])):
+        doc_id = doc.get("id", f"semantic_{{i}}")
+        doc_scores[doc_id] = {{
+            "document": doc,
+            "score": 1 / (k + i + 1),
+            "sources": ["semantic"]
+        }}
+
+    # Add statistical results
+    for i, doc in enumerate(statistical_results.get("results", [])):
+        doc_id = doc.get("id", f"statistical_{{i}}")
+        if doc_id in doc_scores:
+            doc_scores[doc_id]["score"] += 1 / (k + i + 1)
+            doc_scores[doc_id]["sources"].append("statistical")
+        else:
+            doc_scores[doc_id] = {{
+                "document": doc,
+                "score": 1 / (k + i + 1),
+                "sources": ["statistical"]
+            }}
+
+    # Sort by fused score
+    sorted_results = sorted(doc_scores.items(), key=lambda x: x[1]["score"], reverse=True)
+
+    return {{
+        "documents": [item[1]["document"] for item in sorted_results[:5]],
+        "scores": [item[1]["score"] for item in sorted_results[:5]],
+        "fusion_method": "{fusion_method}"
+    }}
+
+# Execute fusion
+fusion_results = rrf_fusion(semantic_results, statistical_results)
+result = {{"fused_results": fusion_results}}
+"""
+        },
+    )
+
+    # Connect workflows to fusion
+    builder.add_connection(semantic_id, "output", fusion_id, "semantic_results")
+    builder.add_connection(statistical_id, "output", fusion_id, "statistical_results")
+
+    workflow = builder.build(name="hybrid_rag_workflow")
+
+    return WorkflowNode(
+        workflow=workflow,
+        name="hybrid_rag_node",
+        description=f"Hybrid RAG with {fusion_method} fusion combining semantic and statistical approaches",
+    )
+
+
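Reciprocal Rank Fusion scores a document 1/(k + rank + 1) in each result list where it appears and sums the contributions, so agreement between the two retrievers outweighs a high rank in just one. The same arithmetic as rrf_fusion above, isolated with the default k=60:

    def rrf_score(zero_based_ranks, k=60):
        # Sum one reciprocal-rank term per list the document appears in.
        return sum(1 / (k + r + 1) for r in zero_based_ranks)

    print(round(rrf_score([0, 0]), 4))  # top hit in both lists -> 0.0328
    print(round(rrf_score([0]), 4))     # top hit in one list   -> 0.0164
    print(round(rrf_score([4, 1]), 4))  # mid-ranked in both    -> 0.0315
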
+def create_hierarchical_rag_workflow(config: RAGConfig) -> WorkflowNode:
+    """
+    Create hierarchical RAG workflow for multi-level document processing.
+
+    Pipeline: Documents → HierarchicalChunker → Multi-level Embedding → Multi-collection Storage → Hierarchical Retrieval
+    """
+    builder = WorkflowBuilder()
+
+    # Add hierarchical chunking
+    chunker_id = builder.add_node(
+        "HierarchicalChunkerNode",
+        node_id="hierarchical_chunker",
+        config={"chunk_size": config.chunk_size, "overlap": config.chunk_overlap},
+    )
+
+    # Add embedding for each level
+    embedder_id = builder.add_node(
+        "EmbeddingGeneratorNode",
+        node_id="embedder",
+        config={"model": config.embedding_model, "provider": config.embedding_provider},
+    )
+
+    # Add level processor for organizing chunks by hierarchy
+    level_processor_id = builder.add_node(
+        "PythonCodeNode",
+        node_id="level_processor",
+        config={
+            "code": """
+levels = ["document", "section", "paragraph"]
+level_chunks = {}
+
+for level in levels:
+    level_chunks[level] = [chunk for chunk in chunks if chunk.get("hierarchy_level") == level]
+
+result = {"level_chunks": level_chunks, "levels": levels}
+"""
+        },
+    )
+
+    # Add vector databases for each level
+    doc_vectordb_id = builder.add_node(
+        "VectorDatabaseNode",
+        node_id="doc_vector_db",
+        config={
+            "provider": config.vector_db_provider,
+            "collection_name": "hierarchical_rag_document",
+        },
+    )
+
+    section_vectordb_id = builder.add_node(
+        "VectorDatabaseNode",
+        node_id="section_vector_db",
+        config={
+            "provider": config.vector_db_provider,
+            "collection_name": "hierarchical_rag_section",
+        },
+    )
+
+    para_vectordb_id = builder.add_node(
+        "VectorDatabaseNode",
+        node_id="para_vector_db",
+        config={
+            "provider": config.vector_db_provider,
+            "collection_name": "hierarchical_rag_paragraph",
+        },
+    )
+
+    # Add hierarchical retriever
+    retriever_id = builder.add_node(
+        "HybridRetrieverNode",
+        node_id="hierarchical_retriever",
+        config={"k": config.retrieval_k, "method": "hierarchical"},
+    )
+
+    # Connect pipeline
+    builder.add_connection(chunker_id, "chunks", level_processor_id, "chunks")
+    builder.add_connection(chunker_id, "chunks", embedder_id, "texts")
+    builder.add_connection(
+        level_processor_id, "result", doc_vectordb_id, "level_chunks"
+    )
+    builder.add_connection(
+        level_processor_id, "result", section_vectordb_id, "level_chunks"
+    )
+    builder.add_connection(
+        level_processor_id, "result", para_vectordb_id, "level_chunks"
+    )
+    builder.add_connection(embedder_id, "embeddings", doc_vectordb_id, "embeddings")
+    builder.add_connection(embedder_id, "embeddings", section_vectordb_id, "embeddings")
+    builder.add_connection(embedder_id, "embeddings", para_vectordb_id, "embeddings")
+
+    # Connect all vector DBs to retriever
+    builder.add_connection(
+        doc_vectordb_id, "stored_documents", retriever_id, "document_store"
+    )
+    builder.add_connection(
+        section_vectordb_id, "stored_documents", retriever_id, "section_store"
+    )
+    builder.add_connection(
+        para_vectordb_id, "stored_documents", retriever_id, "paragraph_store"
+    )
+
+    workflow = builder.build(name="hierarchical_rag_workflow")
+
+    return WorkflowNode(
+        workflow=workflow,
+        name="hierarchical_rag_node",
+        description="Hierarchical RAG with multi-level document processing and context aggregation",
+    )
+
+
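The level_processor above only buckets chunks by their hierarchy_level tag, so the shape it expects and produces can be shown directly (chunk contents illustrative):

    chunks = [
        {"content": "Title and abstract", "hierarchy_level": "document"},
        {"content": "Section 1 body", "hierarchy_level": "section"},
        {"content": "First paragraph", "hierarchy_level": "paragraph"},
        {"content": "Second paragraph", "hierarchy_level": "paragraph"},
    ]
    levels = ["document", "section", "paragraph"]
    level_chunks = {
        lvl: [c for c in chunks if c.get("hierarchy_level") == lvl] for lvl in levels
    }
    # Each level then feeds its own collection:
    # hierarchical_rag_document / _section / _paragraph.
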
+@register_node()
+class SemanticRAGNode(Node):
+    """
+    Semantic RAG Strategy Node
+
+    Wraps the semantic RAG workflow as a single node for easy integration.
+    Uses semantic chunking with dense embeddings for optimal semantic matching.
+    """
+
+    def __init__(self, name: str = "semantic_rag", config: Optional[RAGConfig] = None):
+        self.config = config or RAGConfig()
+        self.workflow_node = None
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=True,
+                description="Documents to process for semantic RAG",
+            ),
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=False,
+                description="Query for retrieval",
+            ),
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                default="index",
+                description="Operation: 'index' or 'retrieve'",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Run semantic RAG using WorkflowNode"""
+        if not self.workflow_node:
+            self.workflow_node = create_semantic_rag_workflow(self.config)
+
+        # Delegate to WorkflowNode
+        return self.workflow_node.run(**kwargs)
+
+
+@register_node()
+class StatisticalRAGNode(Node):
+    """
+    Statistical RAG Strategy Node
+
+    Wraps the statistical RAG workflow for sparse keyword-based retrieval.
+    Uses statistical chunking with keyword extraction for technical content.
+    """
+
+    def __init__(
+        self, name: str = "statistical_rag", config: Optional[RAGConfig] = None
+    ):
+        self.config = config or RAGConfig()
+        self.workflow_node = None
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=True,
+                description="Documents to process for statistical RAG",
+            ),
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=False,
+                description="Query for retrieval",
+            ),
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                default="index",
+                description="Operation: 'index' or 'retrieve'",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Run statistical RAG using WorkflowNode"""
+        if not self.workflow_node:
+            self.workflow_node = create_statistical_rag_workflow(self.config)
+
+        return self.workflow_node.run(**kwargs)
+
+
+@register_node()
+class HybridRAGNode(Node):
+    """
+    Hybrid RAG Strategy Node
+
+    Combines semantic and statistical approaches using result fusion.
+    Provides 20-30% better performance than individual methods.
+    """
+
+    def __init__(
+        self,
+        name: str = "hybrid_rag",
+        config: Optional[RAGConfig] = None,
+        fusion_method: str = "rrf",
+    ):
+        self.config = config or RAGConfig()
+        self.fusion_method = fusion_method
+        self.workflow_node = None
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=True,
+                description="Documents to process for hybrid RAG",
+            ),
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=False,
+                description="Query for retrieval",
+            ),
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                default="index",
+                description="Operation: 'index' or 'retrieve'",
+            ),
+            "fusion_method": NodeParameter(
+                name="fusion_method",
+                type=str,
+                default="rrf",
+                description="Fusion method: 'rrf', 'linear', 'weighted'",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Run hybrid RAG using WorkflowNode"""
+        fusion_method = kwargs.get("fusion_method", self.fusion_method)
+
+        if not self.workflow_node or fusion_method != self.fusion_method:
+            self.fusion_method = fusion_method
+            self.workflow_node = create_hybrid_rag_workflow(self.config, fusion_method)
+
+        return self.workflow_node.run(**kwargs)
+
+
+@register_node()
+class HierarchicalRAGNode(Node):
+    """
+    Hierarchical RAG Strategy Node
+
+    Multi-level document processing that preserves document structure.
+    Processes documents at document, section, and paragraph levels.
+    """
+
+    def __init__(
+        self, name: str = "hierarchical_rag", config: Optional[RAGConfig] = None
+    ):
+        self.config = config or RAGConfig()
+        self.workflow_node = None
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "documents": NodeParameter(
+                name="documents",
+                type=list,
+                required=True,
+                description="Documents to process hierarchically",
+            ),
+            "query": NodeParameter(
+                name="query",
+                type=str,
+                required=False,
+                description="Query for hierarchical retrieval",
+            ),
+            "operation": NodeParameter(
+                name="operation",
+                type=str,
+                default="index",
+                description="Operation: 'index' or 'retrieve'",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Run hierarchical RAG using WorkflowNode"""
+        if not self.workflow_node:
+            self.workflow_node = create_hierarchical_rag_workflow(self.config)
+
+        return self.workflow_node.run(**kwargs)
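
All four strategy classes are registered via @register_node(), and each lazily builds its workflow on first run. A closing sketch using HybridRAGNode, assuming the index-then-retrieve flow declared in get_parameters; the result keys depend on the wrapped workflow and are not specified here:

    from kailash.nodes.rag.strategies import HybridRAGNode, RAGConfig

    node = HybridRAGNode(config=RAGConfig(retrieval_k=3), fusion_method="rrf")
    docs = [
        {"id": "d1", "content": "WorkflowBuilder wires nodes into pipelines."},
        {"id": "d2", "content": "Hybrid retrieval fuses dense and sparse hits."},
    ]
    node.run(documents=docs, operation="index")      # build the index
    out = node.run(query="How are hits fused?", operation="retrieve")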