kailash 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +27 -3
- kailash/nodes/admin/__init__.py +42 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1523 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +248 -40
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +436 -5
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/ai/vision_utils.py +148 -0
- kailash/nodes/alerts/__init__.py +26 -0
- kailash/nodes/alerts/base.py +234 -0
- kailash/nodes/alerts/discord.py +499 -0
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +283 -10
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +103 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +133 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/security.py +1 -1
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
- kailash-0.4.1.dist-info/RECORD +227 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.2.dist-info/RECORD +0 -136
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,671 @@
+"""
+Multimodal RAG Implementation
+
+Implements RAG with support for multiple modalities:
+- Text + Image retrieval and generation
+- Cross-modal similarity search
+- Visual question answering
+- Image-augmented responses
+- Document understanding with visuals
+
+Based on CLIP, BLIP-2, and multimodal research from 2024.
+"""
+
+import base64
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...workflow.builder import WorkflowBuilder
+from ..ai.llm_agent import LLMAgentNode
+from ..base import Node, NodeParameter, register_node
+
+# from ..data.readers import ImageReaderNode  # TODO: Implement ImageReaderNode
+from ..code.python import PythonCodeNode
+from ..logic.workflow import WorkflowNode
+
+logger = logging.getLogger(__name__)
+
+
+@register_node()
+class MultimodalRAGNode(WorkflowNode):
+    """
+    Multimodal RAG with Text + Image Support
+
+    Implements RAG that can process and retrieve from both text and images,
+    enabling richer responses with visual context.
+
+    When to use:
+    - Best for: Technical documentation with diagrams, e-commerce, medical imaging
+    - Not ideal for: Audio/video heavy content, pure text scenarios
+    - Performance: 1-3 seconds for retrieval, 2-5 seconds for generation
+    - Quality improvement: 40-60% for visual questions
+
+    Key features:
+    - Cross-modal retrieval (text→image, image→text)
+    - Visual question answering
+    - Diagram and chart understanding
+    - Multi-modal fusion for responses
+    - Support for various image formats
+
+    Example:
+        multimodal_rag = MultimodalRAGNode(
+            image_encoder="clip-base",
+            enable_ocr=True
+        )
+
+        # Query: "Show me the architecture diagram for transformers"
+        # Will retrieve:
+        # 1. Text descriptions of transformer architecture
+        # 2. Architecture diagrams and visualizations
+        # 3. Code implementations with visual outputs
+        # 4. Combine into comprehensive answer with images
+
+        result = await multimodal_rag.run(
+            documents=mixed_media_docs,  # Contains text and image paths
+            query="Show me the architecture diagram for transformers"
+        )
+
+    Parameters:
+        image_encoder: Model for image encoding (clip, blip, etc.)
+        text_encoder: Model for text encoding
+        enable_ocr: Extract text from images
+        fusion_strategy: How to combine modalities
+
+    Returns:
+        text_results: Retrieved text documents
+        image_results: Retrieved images with captions
+        combined_answer: Multimodal response
+        modality_scores: Relevance per modality
+    """
+
+    def __init__(
+        self,
+        name: str = "multimodal_rag",
+        image_encoder: str = "clip-base",
+        enable_ocr: bool = True,
+        fusion_strategy: str = "weighted",
+    ):
+        self.image_encoder = image_encoder
+        self.enable_ocr = enable_ocr
+        self.fusion_strategy = fusion_strategy
+        super().__init__(name, self._create_workflow())
+
+    def _create_workflow(self) -> WorkflowNode:
+        """Create multimodal RAG workflow"""
+        builder = WorkflowBuilder()
+
+        # Query analyzer for modality detection
+        query_analyzer_id = builder.add_node(
+            "LLMAgentNode",
+            node_id="query_analyzer",
+            config={
+                "system_prompt": """Analyze the query to determine required modalities.
+
+Identify:
+1. Is visual information needed?
+2. What type of images would be helpful?
+3. Should we prioritize text or images?
+
+Return JSON:
+{
+    "needs_images": true/false,
+    "image_types": ["diagram", "photo", "chart", etc.],
+    "text_weight": 0.0-1.0,
+    "image_weight": 0.0-1.0,
+    "query_type": "visual|textual|mixed"
+}""",
+                "model": "gpt-4",
+            },
+        )
+
+        # Document preprocessor
+        doc_preprocessor_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="doc_preprocessor",
+            config={
+                "code": f"""
+import json
+import base64
+from pathlib import Path
+
+def preprocess_documents(documents):
+    '''Separate and prepare text and image documents'''
+    text_docs = []
+    image_docs = []
+
+    for doc in documents:
+        doc_type = doc.get("type", "text")
+
+        if doc_type == "text":
+            text_docs.append({{
+                "id": doc.get("id", f"text_{{len(text_docs)}}"),
+                "content": doc.get("content", ""),
+                "title": doc.get("title", ""),
+                "metadata": doc.get("metadata", {{}})
+            }})
+
+        elif doc_type in ["image", "multimodal"]:
+            # Handle image documents
+            image_path = doc.get("image_path") or doc.get("path")
+
+            image_doc = {{
+                "id": doc.get("id", f"image_{{len(image_docs)}}"),
+                "path": image_path,
+                "caption": doc.get("caption", ""),
+                "alt_text": doc.get("alt_text", ""),
+                "metadata": doc.get("metadata", {{}}),
+                "associated_text": doc.get("content", "")
+            }}
+
+            # If OCR is enabled, we'd extract text here
+            if {self.enable_ocr} and image_path:
+                # Simulated OCR result
+                image_doc["ocr_text"] = f"[OCR text from {{image_path}}]"
+
+            image_docs.append(image_doc)
+
+            # Also add associated text as separate doc
+            if doc.get("content"):
+                text_docs.append({{
+                    "id": f"{{doc.get('id', '')}}_text",
+                    "content": doc.get("content", ""),
+                    "title": doc.get("title", ""),
+                    "metadata": {{"from_multimodal": True}}
+                }})
+
+    result = {{
+        "preprocessed_docs": {{
+            "text_documents": text_docs,
+            "image_documents": image_docs,
+            "stats": {{
+                "total_text": len(text_docs),
+                "total_images": len(image_docs),
+                "multimodal_docs": len([d for d in documents if d.get("type") == "multimodal"])
+            }}
+        }}
+    }}
+"""
+            },
+        )
+
+        # Multimodal encoder
+        encoder_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="multimodal_encoder",
+            config={
+                "code": f"""
+import numpy as np
+from typing import List, Dict
+
+def encode_multimodal(text_docs, image_docs, query, modality_analysis):
+    '''Encode documents and query for multimodal retrieval'''
+
+    # Simulated encoding (would use CLIP/BLIP in production)
+    def text_encoder(text):
+        # Simple hash-based encoding for demo
+        return [float(ord(c)) / 100 for c in text[:10]]
+
+    def image_encoder(image_path):
+        # Simulated image encoding
+        if "architecture" in image_path.lower():
+            return [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
+        elif "diagram" in image_path.lower():
+            return [0.8, 0.9, 0.6, 0.7, 0.4, 0.5, 0.2, 0.3, 0.0, 0.1]
+        else:
+            return [0.5] * 10
+
+    # Encode query
+    query_embedding = text_encoder(query)
+
+    # Encode text documents
+    text_embeddings = []
+    for doc in text_docs:
+        content = doc.get("content", "") + " " + doc.get("title", "")
+        text_embeddings.append({{
+            "id": doc["id"],
+            "embedding": text_encoder(content),
+            "content": content[:200]
+        }})
+
+    # Encode images
+    image_embeddings = []
+    for doc in image_docs:
+        # Combine visual and textual features
+        visual_emb = image_encoder(doc.get("path", ""))
+
+        # If we have caption or OCR text, encode that too
+        text_content = doc.get("caption", "") + " " + doc.get("ocr_text", "")
+        if text_content.strip():
+            text_emb = text_encoder(text_content)
+            # Fusion of visual and textual
+            combined_emb = [(v + t) / 2 for v, t in zip(visual_emb, text_emb)]
+        else:
+            combined_emb = visual_emb
+
+        image_embeddings.append({{
+            "id": doc["id"],
+            "embedding": combined_emb,
+            "path": doc.get("path", ""),
+            "caption": doc.get("caption", "")
+        }})
+
+    result = {{
+        "encoded_data": {{
+            "query_embedding": query_embedding,
+            "text_embeddings": text_embeddings,
+            "image_embeddings": image_embeddings,
+            "encoding_method": "{self.image_encoder}"
+        }}
+    }}
+"""
+            },
+        )
+
+        # Cross-modal retriever
+        retriever_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="cross_modal_retriever",
+            config={
+                "code": """
+import numpy as np
+
+def compute_similarity(emb1, emb2):
+    '''Compute cosine similarity'''
+    if not emb1 or not emb2:
+        return 0.0
+
+    # Simple dot product for demo
+    return sum(a * b for a, b in zip(emb1, emb2)) / (len(emb1) * len(emb2))
+
+def retrieve_multimodal(encoded_data, modality_analysis):
+    '''Perform cross-modal retrieval'''
+
+    query_emb = encoded_data["query_embedding"]
+    text_embs = encoded_data["text_embeddings"]
+    image_embs = encoded_data["image_embeddings"]
+
+    # Get modality weights
+    text_weight = modality_analysis.get("response", {}).get("text_weight", 0.7)
+    image_weight = modality_analysis.get("response", {}).get("image_weight", 0.3)
+
+    # Score text documents
+    text_scores = []
+    for doc in text_embs:
+        score = compute_similarity(query_emb, doc["embedding"])
+        text_scores.append({
+            "id": doc["id"],
+            "score": score * text_weight,
+            "type": "text",
+            "preview": doc["content"]
+        })
+
+    # Score images
+    image_scores = []
+    for doc in image_embs:
+        score = compute_similarity(query_emb, doc["embedding"])
+
+        # Boost score if query mentions visual terms
+        query_lower = query.lower()
+        if any(term in query_lower for term in ["diagram", "image", "show", "picture", "visual"]):
+            score *= 1.5
+
+        image_scores.append({
+            "id": doc["id"],
+            "score": score * image_weight,
+            "type": "image",
+            "path": doc["path"],
+            "caption": doc["caption"]
+        })
+
+    # Combine and sort
+    all_scores = text_scores + image_scores
+    all_scores.sort(key=lambda x: x["score"], reverse=True)
+
+    # Separate top results by type
+    top_text = [s for s in all_scores if s["type"] == "text"][:5]
+    top_images = [s for s in all_scores if s["type"] == "image"][:3]
+
+    result = {
+        "retrieval_results": {
+            "text_results": top_text,
+            "image_results": top_images,
+            "combined_results": all_scores[:10],
+            "modality_distribution": {
+                "text_count": len([s for s in all_scores[:10] if s["type"] == "text"]),
+                "image_count": len([s for s in all_scores[:10] if s["type"] == "image"])
+            }
+        }
+    }
+"""
+            },
+        )
+
+        # Multimodal response generator
+        response_generator_id = builder.add_node(
+            "LLMAgentNode",
+            node_id="response_generator",
+            config={
+                "system_prompt": """Generate a comprehensive response using both text and image results.
+
+Structure your response to:
+1. Provide textual explanation
+2. Reference relevant images
+3. Describe what images show
+4. Integrate visual and textual information
+
+Format:
+[Text explanation]
+
+Relevant Images:
+- Image 1: [description and relevance]
+- Image 2: [description and relevance]
+
+[Integration of visual and textual insights]""",
+                "model": "gpt-4-vision",  # Vision-capable model
+            },
+        )
+
+        # Result formatter
+        result_formatter_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="result_formatter",
+            config={
+                "code": """
+# Format multimodal results
+retrieval_results = retrieval_results
+response = response.get("response", "") if isinstance(response, dict) else str(response)
+query = query
+modality_analysis = modality_analysis.get("response", {})
+
+# Structure final output
+multimodal_output = {
+    "text_results": retrieval_results["text_results"],
+    "image_results": retrieval_results["image_results"],
+    "combined_answer": response,
+    "modality_scores": {
+        "text_relevance": sum(r["score"] for r in retrieval_results["text_results"]) / max(1, len(retrieval_results["text_results"])),
+        "image_relevance": sum(r["score"] for r in retrieval_results["image_results"]) / max(1, len(retrieval_results["image_results"]))
+    },
+    "metadata": {
+        "query_type": modality_analysis.get("query_type", "mixed"),
+        "fusion_strategy": "{self.fusion_strategy}",
+        "total_results": len(retrieval_results["combined_results"])
+    }
+}
+
+result = {"multimodal_rag_output": multimodal_output}
+"""
+            },
+        )
+
+        # Connect workflow
+        builder.add_connection(
+            query_analyzer_id, "response", doc_preprocessor_id, "modality_requirements"
+        )
+        builder.add_connection(
+            doc_preprocessor_id, "preprocessed_docs", encoder_id, "documents_to_encode"
+        )
+        builder.add_connection(
+            query_analyzer_id, "response", encoder_id, "modality_analysis"
+        )
+        builder.add_connection(encoder_id, "encoded_data", retriever_id, "encoded_data")
+        builder.add_connection(
+            query_analyzer_id, "response", retriever_id, "modality_analysis"
+        )
+        builder.add_connection(
+            retriever_id, "retrieval_results", response_generator_id, "context"
+        )
+        builder.add_connection(
+            response_generator_id, "response", result_formatter_id, "response"
+        )
+        builder.add_connection(
+            retriever_id, "retrieval_results", result_formatter_id, "retrieval_results"
+        )
+        builder.add_connection(
+            query_analyzer_id, "response", result_formatter_id, "modality_analysis"
+        )
+
+        return builder.build(name="multimodal_rag_workflow")
+
+
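
A minimal usage sketch of MultimodalRAGNode, assembled from the docstring example and the doc_preprocessor schema above (type/content/image_path/caption). The sample documents, the direct module import, and the awaited run() call are illustrative assumptions rather than a verified end-to-end flow.

    # Illustrative sketch; document field names follow the doc_preprocessor code above.
    import asyncio

    from kailash.nodes.rag.multimodal import MultimodalRAGNode

    mixed_media_docs = [
        {"type": "text", "id": "t1", "title": "Transformers",
         "content": "The transformer architecture relies on self-attention..."},
        {"type": "image", "id": "i1",
         "image_path": "docs/transformer_architecture_diagram.png",
         "caption": "Transformer architecture diagram"},
    ]

    multimodal_rag = MultimodalRAGNode(image_encoder="clip-base", enable_ocr=True)

    async def ask():
        # Docstring-documented outputs: text_results, image_results,
        # combined_answer, modality_scores.
        return await multimodal_rag.run(
            documents=mixed_media_docs,
            query="Show me the architecture diagram for transformers",
        )

    result = asyncio.run(ask())
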
+@register_node()
+class VisualQuestionAnsweringNode(Node):
+    """
+    Visual Question Answering (VQA) Node
+
+    Specialized node for answering questions about images.
+
+    When to use:
+    - Best for: Direct questions about image content
+    - Not ideal for: Abstract reasoning about images
+    - Performance: 1-2 seconds per image
+    - Accuracy: High for descriptive questions
+
+    Example:
+        vqa = VisualQuestionAnsweringNode()
+
+        result = await vqa.run(
+            image_path="architecture_diagram.png",
+            question="What components are shown in this diagram?"
+        )
+
+    Parameters:
+        model: VQA model to use (blip2, flamingo, etc.)
+        enable_captioning: Generate image captions
+        confidence_threshold: Minimum confidence for answers
+
+    Returns:
+        answer: Answer to the visual question
+        confidence: Model confidence
+        image_caption: Generated caption if enabled
+        detected_objects: Objects found in image
+    """
+
+    def __init__(
+        self,
+        name: str = "vqa_node",
+        model: str = "blip2-base",
+        enable_captioning: bool = True,
+    ):
+        self.model = model
+        self.enable_captioning = enable_captioning
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "image_path": NodeParameter(
+                name="image_path",
+                type=str,
+                required=True,
+                description="Path to image file",
+            ),
+            "question": NodeParameter(
+                name="question",
+                type=str,
+                required=True,
+                description="Question about the image",
+            ),
+            "context": NodeParameter(
+                name="context",
+                type=dict,
+                required=False,
+                description="Additional context",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Answer questions about images"""
+        image_path = kwargs.get("image_path", "")
+        question = kwargs.get("question", "")
+
+        # Simulated VQA (would use real model in production)
+        # Analyze question type
+        question_lower = question.lower()
+
+        answer = "Based on visual analysis: "
+        confidence = 0.85
+        detected_objects = []
+
+        if "what" in question_lower:
+            if "components" in question_lower or "parts" in question_lower:
+                answer += "The image shows multiple interconnected components including input layers, processing units, and output connections."
+                detected_objects = [
+                    "input_layer",
+                    "hidden_units",
+                    "output_layer",
+                    "connections",
+                ]
+            elif "color" in question_lower:
+                answer += "The dominant colors in the image are blue and white with accent highlights."
+                detected_objects = ["blue_elements", "white_background"]
+
+        elif "how many" in question_lower:
+            answer += "I can identify 6 distinct elements in the visual representation."
+            detected_objects = [
+                "element_1",
+                "element_2",
+                "element_3",
+                "element_4",
+                "element_5",
+                "element_6",
+            ]
+
+        elif "where" in question_lower:
+            answer += "The requested element is located in the central portion of the diagram."
+
+        else:
+            answer += "The image contains visual information relevant to your query."
+            confidence = 0.7
+
+        # Generate caption if enabled
+        caption = ""
+        if self.enable_captioning:
+            caption = f"A technical diagram showing {len(detected_objects)} components in a structured layout"
+
+        return {
+            "answer": answer,
+            "confidence": confidence,
+            "image_caption": caption,
+            "detected_objects": detected_objects,
+            "model_used": self.model,
+        }
+
+
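
A short sketch of VisualQuestionAnsweringNode as defined above: run() is synchronous here (the docstring example awaits it), the image path is a placeholder, and the accessed keys follow the return dict in run().

    # Illustrative sketch; returned keys per run(): answer, confidence,
    # image_caption, detected_objects, model_used.
    from kailash.nodes.rag.multimodal import VisualQuestionAnsweringNode

    vqa = VisualQuestionAnsweringNode(model="blip2-base", enable_captioning=True)

    result = vqa.run(
        image_path="architecture_diagram.png",  # placeholder path
        question="What components are shown in this diagram?",
    )
    print(result["answer"], result["confidence"])
    print(result["detected_objects"])
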
+@register_node()
+class ImageTextMatchingNode(Node):
+    """
+    Image-Text Matching Node
+
+    Finds the best matching images for text queries or vice versa.
+
+    When to use:
+    - Best for: Finding relevant visuals for content
+    - Not ideal for: Exact image search
+    - Performance: 200-500ms per comparison
+    - Use cases: Documentation, e-commerce, content creation
+
+    Example:
+        matcher = ImageTextMatchingNode()
+
+        matches = await matcher.run(
+            query="neural network architecture",
+            image_collection=image_database
+        )
+
+    Parameters:
+        matching_model: Model for similarity (clip, align, etc.)
+        bidirectional: Support both text→image and image→text
+        top_k: Number of matches to return
+
+    Returns:
+        matches: Ranked list of matches
+        similarity_scores: Score for each match
+        match_type: Type of matching performed
+    """
+
+    def __init__(
+        self,
+        name: str = "image_text_matcher",
+        matching_model: str = "clip",
+        bidirectional: bool = True,
+    ):
+        self.matching_model = matching_model
+        self.bidirectional = bidirectional
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "query": NodeParameter(
+                name="query",
+                type=Union[str, dict],
+                required=True,
+                description="Text query or image reference",
+            ),
+            "collection": NodeParameter(
+                name="collection",
+                type=list,
+                required=True,
+                description="Collection to search",
+            ),
+            "top_k": NodeParameter(
+                name="top_k",
+                type=int,
+                required=False,
+                default=5,
+                description="Number of results",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Find matching images or text"""
+        query = kwargs.get("query")
+        collection = kwargs.get("collection", [])
+        top_k = kwargs.get("top_k", 5)
+
+        # Determine match type
+        if isinstance(query, str):
+            match_type = "text_to_image"
+        else:
+            match_type = "image_to_text"
+
+        # Perform matching (simplified)
+        matches = []
+
+        for i, item in enumerate(collection[:20]):  # Limit for demo
+            # Calculate similarity
+            if match_type == "text_to_image":
+                # Text query to image matching
+                if "architecture" in query.lower() and "diagram" in str(
+                    item.get("tags", [])
+                ):
+                    score = 0.9
+                elif any(
+                    word in query.lower()
+                    for word in str(item.get("caption", "")).lower().split()
+                ):
+                    score = 0.7
+                else:
+                    score = 0.3
+            else:
+                # Image to text matching
+                score = 0.5  # Simplified
+
+            matches.append({"item": item, "score": score, "index": i})
+
+        # Sort by score
+        matches.sort(key=lambda x: x["score"], reverse=True)
+        top_matches = matches[:top_k]
+
+        return {
+            "matches": [m["item"] for m in top_matches],
+            "similarity_scores": [m["score"] for m in top_matches],
+            "match_type": match_type,
+            "model": self.matching_model,
+            "total_searched": len(collection),
+        }
+
+
+# Export all multimodal nodes
+__all__ = ["MultimodalRAGNode", "VisualQuestionAnsweringNode", "ImageTextMatchingNode"]
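
To round out the module, a sketch of ImageTextMatchingNode: parameter names follow get_parameters() (query, collection, top_k) and the returned keys follow run() above; the sample collection and import are illustrative.

    # Illustrative sketch; the simplified scoring in run() favours items
    # whose tags mention "diagram" for architecture-style queries.
    from kailash.nodes.rag.multimodal import ImageTextMatchingNode

    matcher = ImageTextMatchingNode(matching_model="clip", bidirectional=True)

    image_database = [
        {"caption": "Neural network architecture diagram", "tags": ["diagram"]},
        {"caption": "Team photo from the company offsite", "tags": ["photo"]},
    ]

    matches = matcher.run(
        query="neural network architecture",
        collection=image_database,
        top_k=3,
    )
    for item, score in zip(matches["matches"], matches["similarity_scores"]):
        print(score, item.get("caption"))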