kailash 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
- kailash/__init__.py +33 -1
- kailash/access_control/__init__.py +129 -0
- kailash/access_control/managers.py +461 -0
- kailash/access_control/rule_evaluators.py +467 -0
- kailash/access_control_abac.py +825 -0
- kailash/config/__init__.py +27 -0
- kailash/config/database_config.py +359 -0
- kailash/database/__init__.py +28 -0
- kailash/database/execution_pipeline.py +499 -0
- kailash/middleware/__init__.py +306 -0
- kailash/middleware/auth/__init__.py +33 -0
- kailash/middleware/auth/access_control.py +436 -0
- kailash/middleware/auth/auth_manager.py +422 -0
- kailash/middleware/auth/jwt_auth.py +477 -0
- kailash/middleware/auth/kailash_jwt_auth.py +616 -0
- kailash/middleware/communication/__init__.py +37 -0
- kailash/middleware/communication/ai_chat.py +989 -0
- kailash/middleware/communication/api_gateway.py +802 -0
- kailash/middleware/communication/events.py +470 -0
- kailash/middleware/communication/realtime.py +710 -0
- kailash/middleware/core/__init__.py +21 -0
- kailash/middleware/core/agent_ui.py +890 -0
- kailash/middleware/core/schema.py +643 -0
- kailash/middleware/core/workflows.py +396 -0
- kailash/middleware/database/__init__.py +63 -0
- kailash/middleware/database/base.py +113 -0
- kailash/middleware/database/base_models.py +525 -0
- kailash/middleware/database/enums.py +106 -0
- kailash/middleware/database/migrations.py +12 -0
- kailash/{api/database.py → middleware/database/models.py} +183 -291
- kailash/middleware/database/repositories.py +685 -0
- kailash/middleware/database/session_manager.py +19 -0
- kailash/middleware/mcp/__init__.py +38 -0
- kailash/middleware/mcp/client_integration.py +585 -0
- kailash/middleware/mcp/enhanced_server.py +576 -0
- kailash/nodes/__init__.py +25 -3
- kailash/nodes/admin/__init__.py +35 -0
- kailash/nodes/admin/audit_log.py +794 -0
- kailash/nodes/admin/permission_check.py +864 -0
- kailash/nodes/admin/role_management.py +823 -0
- kailash/nodes/admin/security_event.py +1519 -0
- kailash/nodes/admin/user_management.py +944 -0
- kailash/nodes/ai/a2a.py +24 -7
- kailash/nodes/ai/ai_providers.py +1 -0
- kailash/nodes/ai/embedding_generator.py +11 -11
- kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
- kailash/nodes/ai/llm_agent.py +407 -2
- kailash/nodes/ai/self_organizing.py +85 -10
- kailash/nodes/api/auth.py +287 -6
- kailash/nodes/api/rest.py +151 -0
- kailash/nodes/auth/__init__.py +17 -0
- kailash/nodes/auth/directory_integration.py +1228 -0
- kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
- kailash/nodes/auth/mfa.py +2338 -0
- kailash/nodes/auth/risk_assessment.py +872 -0
- kailash/nodes/auth/session_management.py +1093 -0
- kailash/nodes/auth/sso.py +1040 -0
- kailash/nodes/base.py +344 -13
- kailash/nodes/base_cycle_aware.py +4 -2
- kailash/nodes/base_with_acl.py +1 -1
- kailash/nodes/code/python.py +293 -12
- kailash/nodes/compliance/__init__.py +9 -0
- kailash/nodes/compliance/data_retention.py +1888 -0
- kailash/nodes/compliance/gdpr.py +2004 -0
- kailash/nodes/data/__init__.py +22 -2
- kailash/nodes/data/async_connection.py +469 -0
- kailash/nodes/data/async_sql.py +757 -0
- kailash/nodes/data/async_vector.py +598 -0
- kailash/nodes/data/readers.py +767 -0
- kailash/nodes/data/retrieval.py +360 -1
- kailash/nodes/data/sharepoint_graph.py +397 -21
- kailash/nodes/data/sql.py +94 -5
- kailash/nodes/data/streaming.py +68 -8
- kailash/nodes/data/vector_db.py +54 -4
- kailash/nodes/enterprise/__init__.py +13 -0
- kailash/nodes/enterprise/batch_processor.py +741 -0
- kailash/nodes/enterprise/data_lineage.py +497 -0
- kailash/nodes/logic/convergence.py +31 -9
- kailash/nodes/logic/operations.py +14 -3
- kailash/nodes/mixins/__init__.py +8 -0
- kailash/nodes/mixins/event_emitter.py +201 -0
- kailash/nodes/mixins/mcp.py +9 -4
- kailash/nodes/mixins/security.py +165 -0
- kailash/nodes/monitoring/__init__.py +7 -0
- kailash/nodes/monitoring/performance_benchmark.py +2497 -0
- kailash/nodes/rag/__init__.py +284 -0
- kailash/nodes/rag/advanced.py +1615 -0
- kailash/nodes/rag/agentic.py +773 -0
- kailash/nodes/rag/conversational.py +999 -0
- kailash/nodes/rag/evaluation.py +875 -0
- kailash/nodes/rag/federated.py +1188 -0
- kailash/nodes/rag/graph.py +721 -0
- kailash/nodes/rag/multimodal.py +671 -0
- kailash/nodes/rag/optimized.py +933 -0
- kailash/nodes/rag/privacy.py +1059 -0
- kailash/nodes/rag/query_processing.py +1335 -0
- kailash/nodes/rag/realtime.py +764 -0
- kailash/nodes/rag/registry.py +547 -0
- kailash/nodes/rag/router.py +837 -0
- kailash/nodes/rag/similarity.py +1854 -0
- kailash/nodes/rag/strategies.py +566 -0
- kailash/nodes/rag/workflows.py +575 -0
- kailash/nodes/security/__init__.py +19 -0
- kailash/nodes/security/abac_evaluator.py +1411 -0
- kailash/nodes/security/audit_log.py +91 -0
- kailash/nodes/security/behavior_analysis.py +1893 -0
- kailash/nodes/security/credential_manager.py +401 -0
- kailash/nodes/security/rotating_credentials.py +760 -0
- kailash/nodes/security/security_event.py +132 -0
- kailash/nodes/security/threat_detection.py +1103 -0
- kailash/nodes/testing/__init__.py +9 -0
- kailash/nodes/testing/credential_testing.py +499 -0
- kailash/nodes/transform/__init__.py +10 -2
- kailash/nodes/transform/chunkers.py +592 -1
- kailash/nodes/transform/processors.py +484 -14
- kailash/nodes/validation.py +321 -0
- kailash/runtime/access_controlled.py +1 -1
- kailash/runtime/async_local.py +41 -7
- kailash/runtime/docker.py +1 -1
- kailash/runtime/local.py +474 -55
- kailash/runtime/parallel.py +1 -1
- kailash/runtime/parallel_cyclic.py +1 -1
- kailash/runtime/testing.py +210 -2
- kailash/utils/migrations/__init__.py +25 -0
- kailash/utils/migrations/generator.py +433 -0
- kailash/utils/migrations/models.py +231 -0
- kailash/utils/migrations/runner.py +489 -0
- kailash/utils/secure_logging.py +342 -0
- kailash/workflow/__init__.py +16 -0
- kailash/workflow/cyclic_runner.py +3 -4
- kailash/workflow/graph.py +70 -2
- kailash/workflow/resilience.py +249 -0
- kailash/workflow/templates.py +726 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/METADATA +253 -20
- kailash-0.4.0.dist-info/RECORD +223 -0
- kailash/api/__init__.py +0 -17
- kailash/api/__main__.py +0 -6
- kailash/api/studio_secure.py +0 -893
- kailash/mcp/__main__.py +0 -13
- kailash/mcp/server_new.py +0 -336
- kailash/mcp/servers/__init__.py +0 -12
- kailash-0.3.1.dist-info/RECORD +0 -136
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/WHEEL +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.3.1.dist-info → kailash-0.4.0.dist-info}/top_level.txt +0 -0
kailash/nodes/rag/evaluation.py (new file)
@@ -0,0 +1,875 @@
+"""
+RAG Evaluation and Benchmarking Framework
+
+Implements comprehensive evaluation metrics and benchmarking:
+- Retrieval quality metrics (precision, recall, MRR)
+- Generation quality assessment
+- End-to-end RAG evaluation
+- A/B testing framework
+- Performance benchmarking
+- Dataset generation for testing
+
+Based on RAGAS, BEIR, and evaluation research from 2024.
+"""
+
+import json
+import logging
+import random
+import statistics
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from ...workflow.builder import WorkflowBuilder
+from ..ai.llm_agent import LLMAgentNode
+from ..base import Node, NodeParameter, register_node
+from ..code.python import PythonCodeNode
+from ..logic.workflow import WorkflowNode
+
+logger = logging.getLogger(__name__)
+
+
+@register_node()
+class RAGEvaluationNode(WorkflowNode):
+    """
+    Comprehensive RAG Evaluation Framework
+
+    Evaluates RAG systems across multiple dimensions including retrieval
+    quality, generation accuracy, and end-to-end performance.
+
+    When to use:
+    - Best for: System optimization, quality assurance, model selection
+    - Not ideal for: Real-time evaluation during inference
+    - Performance: 5-30 seconds per evaluation (depends on metrics)
+    - Insights: Detailed breakdown of strengths and weaknesses
+
+    Key features:
+    - RAGAS-based evaluation metrics
+    - Retrieval and generation quality assessment
+    - Faithfulness and relevance scoring
+    - Comparative analysis across strategies
+    - Automated test dataset generation
+
+    Example:
+        evaluator = RAGEvaluationNode(
+            metrics=["faithfulness", "relevance", "context_precision", "answer_quality"],
+            use_reference_answers=True
+        )
+
+        # Evaluate a RAG system
+        results = await evaluator.run(
+            test_queries=[
+                {"query": "What is transformer architecture?",
+                 "reference": "Transformers use self-attention..."},
+                {"query": "Explain BERT",
+                 "reference": "BERT is a bidirectional..."}
+            ],
+            rag_system=my_rag_node
+        )
+
+        # Results include:
+        # - Per-query scores
+        # - Aggregate metrics
+        # - Failure analysis
+        # - Improvement recommendations
+
+    Parameters:
+        metrics: List of evaluation metrics to compute
+        use_reference_answers: Whether to use ground truth
+        llm_judge_model: Model for LLM-based evaluation
+        confidence_threshold: Minimum acceptable score
+
+    Returns:
+        scores: Detailed scores per metric
+        aggregate_metrics: Overall system performance
+        failure_analysis: Queries that performed poorly
+        recommendations: Suggested improvements
+    """
+
+    def __init__(
+        self,
+        name: str = "rag_evaluation",
+        metrics: List[str] = None,
+        use_reference_answers: bool = True,
+        llm_judge_model: str = "gpt-4",
+    ):
+        self.metrics = metrics or [
+            "faithfulness",
+            "relevance",
+            "context_precision",
+            "answer_quality",
+        ]
+        self.use_reference_answers = use_reference_answers
+        self.llm_judge_model = llm_judge_model
+        super().__init__(name, self._create_workflow())
+
+    def _create_workflow(self) -> WorkflowNode:
+        """Create RAG evaluation workflow"""
+        builder = WorkflowBuilder()
+
+        # Test executor - runs RAG on test queries
+        test_executor_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="test_executor",
+            config={
+                "code": """
+import time
+from datetime import datetime
+
+def execute_rag_tests(test_queries, rag_system):
+    '''Execute RAG system on test queries'''
+    test_results = []
+
+    for i, test_case in enumerate(test_queries):
+        query = test_case.get("query", "")
+        reference = test_case.get("reference", "")
+
+        # Time the execution
+        start_time = time.time()
+
+        # Execute RAG (simplified - would call actual system)
+        # In production, would use rag_system.run(query=query)
+        rag_response = {
+            "answer": f"Generated answer for: {query}",
+            "retrieved_contexts": [
+                {"content": "Context 1 about transformers...", "score": 0.9},
+                {"content": "Context 2 about attention...", "score": 0.85},
+                {"content": "Context 3 about architecture...", "score": 0.8}
+            ],
+            "confidence": 0.87
+        }
+
+        execution_time = time.time() - start_time
+
+        test_results.append({
+            "test_id": i,
+            "query": query,
+            "reference_answer": reference,
+            "generated_answer": rag_response["answer"],
+            "retrieved_contexts": rag_response["retrieved_contexts"],
+            "execution_time": execution_time,
+            "timestamp": datetime.now().isoformat()
+        })
+
+    result = {
+        "test_results": test_results,
+        "total_tests": len(test_queries),
+        "avg_execution_time": sum(r["execution_time"] for r in test_results) / len(test_results)
+    }
+"""
+            },
+        )
+
+        # Faithfulness evaluator
+        faithfulness_evaluator_id = builder.add_node(
+            "LLMAgentNode",
+            node_id="faithfulness_evaluator",
+            config={
+                "system_prompt": """Evaluate the faithfulness of the generated answer to the retrieved contexts.
+
+Faithfulness measures whether the answer is grounded in the retrieved information.
+
+For each statement in the answer:
+1. Check if it's supported by the contexts
+2. Identify any hallucinations
+3. Rate overall faithfulness
+
+Return JSON:
+{
+    "faithfulness_score": 0.0-1.0,
+    "supported_statements": ["list of supported claims"],
+    "unsupported_statements": ["list of unsupported claims"],
+    "hallucinations": ["list of hallucinated information"],
+    "reasoning": "explanation"
+}""",
+                "model": self.llm_judge_model,
+            },
+        )
+
+        # Relevance evaluator
+        relevance_evaluator_id = builder.add_node(
+            "LLMAgentNode",
+            node_id="relevance_evaluator",
+            config={
+                "system_prompt": """Evaluate the relevance of the answer to the query.
+
+Consider:
+1. Does the answer address the query?
+2. Is it complete?
+3. Is it focused without irrelevant information?
+
+Return JSON:
+{
+    "relevance_score": 0.0-1.0,
+    "addresses_query": true/false,
+    "completeness": 0.0-1.0,
+    "focus": 0.0-1.0,
+    "missing_aspects": ["list of missing elements"],
+    "irrelevant_content": ["list of irrelevant parts"]
+}""",
+                "model": self.llm_judge_model,
+            },
+        )
+
+        # Context precision evaluator
+        context_evaluator_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="context_evaluator",
+            config={
+                "code": """
+def evaluate_context_precision(test_result):
+    '''Evaluate the precision of retrieved contexts'''
+
+    contexts = test_result.get("retrieved_contexts", [])
+    query = test_result.get("query", "")
+
+    if not contexts:
+        return {
+            "context_precision": 0.0,
+            "context_recall": 0.0,
+            "context_ranking_quality": 0.0
+        }
+
+    # Calculate precision at different k values
+    precision_at_k = {}
+    relevant_count = 0
+
+    for k in [1, 3, 5, 10]:
+        if k <= len(contexts):
+            # Simulate relevance judgment (would use LLM in production)
+            relevant_at_k = sum(1 for c in contexts[:k] if c.get("score", 0) > 0.7)
+            precision_at_k[f"P@{k}"] = relevant_at_k / k
+
+    # Calculate MRR (Mean Reciprocal Rank)
+    first_relevant_rank = None
+    for i, ctx in enumerate(contexts):
+        if ctx.get("score", 0) > 0.7:
+            first_relevant_rank = i + 1
+            break
+
+    mrr = 1.0 / first_relevant_rank if first_relevant_rank else 0.0
+
+    # Context diversity
+    unique_terms = set()
+    for ctx in contexts:
+        unique_terms.update(ctx.get("content", "").lower().split()[:20])
+
+    diversity_score = len(unique_terms) / (len(contexts) * 20) if contexts else 0
+
+    result = {
+        "context_metrics": {
+            "precision_at_k": precision_at_k,
+            "mrr": mrr,
+            "diversity_score": diversity_score,
+            "avg_relevance_score": sum(c.get("score", 0) for c in contexts) / len(contexts),
+            "context_count": len(contexts)
+        }
+    }
+"""
+            },
+        )
+
+        # Answer quality evaluator (if reference available)
+        if self.use_reference_answers:
+            answer_quality_id = builder.add_node(
+                "LLMAgentNode",
+                node_id="answer_quality_evaluator",
+                config={
+                    "system_prompt": """Compare the generated answer with the reference answer.
+
+Evaluate:
+1. Factual accuracy
+2. Completeness
+3. Clarity and coherence
+4. Additional valuable information
+
+Return JSON:
+{
+    "accuracy_score": 0.0-1.0,
+    "completeness_score": 0.0-1.0,
+    "clarity_score": 0.0-1.0,
+    "additional_value": 0.0-1.0,
+    "overall_quality": 0.0-1.0,
+    "key_differences": ["list of major differences"],
+    "improvements_needed": ["list of improvements"]
+}""",
+                    "model": self.llm_judge_model,
+                },
+            )
+
+        # Metric aggregator
+        aggregator_id = builder.add_node(
+            "PythonCodeNode",
+            node_id="metric_aggregator",
+            config={
+                "code": f"""
+import statistics
+
+def aggregate_evaluation_metrics(test_results, faithfulness_scores, relevance_scores,
+                                 context_metrics, answer_quality_scores=None):
+    '''Aggregate all evaluation metrics'''
+
+    # Parse evaluation results
+    all_metrics = {{
+        "faithfulness": [],
+        "relevance": [],
+        "context_precision": [],
+        "answer_quality": [],
+        "execution_time": []
+    }}
+
+    for i, test in enumerate(test_results):
+        # Get scores for this test
+        faith_score = faithfulness_scores[i].get("response", {{}}).get("faithfulness_score", 0)
+        rel_score = relevance_scores[i].get("response", {{}}).get("relevance_score", 0)
+        ctx_score = context_metrics[i].get("context_metrics", {{}}).get("avg_relevance_score", 0)
+
+        all_metrics["faithfulness"].append(faith_score)
+        all_metrics["relevance"].append(rel_score)
+        all_metrics["context_precision"].append(ctx_score)
+        all_metrics["execution_time"].append(test.get("execution_time", 0))
+
+        if answer_quality_scores:
+            quality_score = answer_quality_scores[i].get("response", {{}}).get("overall_quality", 0)
+            all_metrics["answer_quality"].append(quality_score)
+
+    # Calculate aggregate statistics
+    aggregate_stats = {{}}
+    for metric, scores in all_metrics.items():
+        if scores:
+            aggregate_stats[metric] = {{
+                "mean": statistics.mean(scores),
+                "median": statistics.median(scores),
+                "std_dev": statistics.stdev(scores) if len(scores) > 1 else 0,
+                "min": min(scores),
+                "max": max(scores),
+                "scores": scores
+            }}
+
+    # Identify failure cases
+    failure_threshold = 0.6
+    failures = []
+
+    for i, test in enumerate(test_results):
+        overall_score = (all_metrics["faithfulness"][i] +
+                         all_metrics["relevance"][i] +
+                         all_metrics["context_precision"][i]) / 3
+
+        if overall_score < failure_threshold:
+            failures.append({{
+                "test_id": i,
+                "query": test["query"],
+                "overall_score": overall_score,
+                "weakest_metric": min(
+                    ("faithfulness", all_metrics["faithfulness"][i]),
+                    ("relevance", all_metrics["relevance"][i]),
+                    ("context_precision", all_metrics["context_precision"][i]),
+                    key=lambda x: x[1]
+                )[0]
+            }})
+
+    # Generate recommendations
+    recommendations = []
+
+    if aggregate_stats.get("faithfulness", {{}}).get("mean", 1) < 0.7:
+        recommendations.append("Improve grounding: Ensure answers strictly follow retrieved content")
+
+    if aggregate_stats.get("relevance", {{}}).get("mean", 1) < 0.7:
+        recommendations.append("Enhance relevance: Better query understanding and targeted responses")
+
+    if aggregate_stats.get("context_precision", {{}}).get("mean", 1) < 0.7:
+        recommendations.append("Optimize retrieval: Improve document ranking and selection")
+
+    if aggregate_stats.get("execution_time", {{}}).get("mean", 0) > 2.0:
+        recommendations.append("Reduce latency: Consider caching or parallel processing")
+
+    result = {{
+        "evaluation_summary": {{
+            "aggregate_metrics": aggregate_stats,
+            "overall_score": statistics.mean([
+                aggregate_stats.get("faithfulness", {{}}).get("mean", 0),
+                aggregate_stats.get("relevance", {{}}).get("mean", 0),
+                aggregate_stats.get("context_precision", {{}}).get("mean", 0)
+            ]),
+            "failure_analysis": {{
+                "failure_count": len(failures),
+                "failure_rate": len(failures) / len(test_results),
+                "failed_queries": failures
+            }},
+            "recommendations": recommendations,
+            "evaluation_config": {{
+                "metrics_used": {self.metrics},
+                "total_tests": len(test_results),
+                "timestamp": datetime.now().isoformat()
+            }}
+        }}
+    }}
+"""
+            },
+        )
+
+        # Connect workflow
+        builder.add_connection(
+            test_executor_id, "test_results", faithfulness_evaluator_id, "test_data"
+        )
+        builder.add_connection(
+            test_executor_id, "test_results", relevance_evaluator_id, "test_data"
+        )
+        builder.add_connection(
+            test_executor_id, "test_results", context_evaluator_id, "test_data"
+        )
+
+        if self.use_reference_answers:
+            builder.add_connection(
+                test_executor_id, "test_results", answer_quality_id, "test_data"
+            )
+            builder.add_connection(
+                answer_quality_id, "response", aggregator_id, "answer_quality_scores"
+            )
+
+        builder.add_connection(
+            test_executor_id, "test_results", aggregator_id, "test_results"
+        )
+        builder.add_connection(
+            faithfulness_evaluator_id, "response", aggregator_id, "faithfulness_scores"
+        )
+        builder.add_connection(
+            relevance_evaluator_id, "response", aggregator_id, "relevance_scores"
+        )
+        builder.add_connection(
+            context_evaluator_id, "context_metrics", aggregator_id, "context_metrics"
+        )
+
+        return builder.build(name="rag_evaluation_workflow")
+
+
+@register_node()
+class RAGBenchmarkNode(Node):
+    """
+    RAG Performance Benchmarking Node
+
+    Benchmarks RAG systems for performance characteristics.
+
+    When to use:
+    - Best for: System comparison, optimization, capacity planning
+    - Not ideal for: Quality evaluation (use RAGEvaluationNode)
+    - Metrics: Latency, throughput, resource usage, scalability
+
+    Example:
+        benchmark = RAGBenchmarkNode(
+            workload_sizes=[10, 100, 1000],
+            concurrent_users=[1, 5, 10]
+        )
+
+        results = await benchmark.run(
+            rag_systems={"system_a": rag_a, "system_b": rag_b},
+            test_queries=queries
+        )
+
+    Parameters:
+        workload_sizes: Different dataset sizes to test
+        concurrent_users: Concurrency levels to test
+        metrics_interval: How often to collect metrics
+
+    Returns:
+        latency_profiles: Response time distributions
+        throughput_curves: Requests/second at different loads
+        resource_usage: Memory and compute utilization
+        scalability_analysis: How performance scales
+    """
+
+    def __init__(
+        self,
+        name: str = "rag_benchmark",
+        workload_sizes: List[int] = None,
+        concurrent_users: List[int] = None,
+    ):
+        self.workload_sizes = workload_sizes or [10, 100, 1000]
+        self.concurrent_users = concurrent_users or [1, 5, 10]
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "rag_systems": NodeParameter(
+                name="rag_systems",
+                type=dict,
+                required=True,
+                description="RAG systems to benchmark",
+            ),
+            "test_queries": NodeParameter(
+                name="test_queries",
+                type=list,
+                required=True,
+                description="Queries for benchmarking",
+            ),
+            "duration": NodeParameter(
+                name="duration",
+                type=int,
+                required=False,
+                default=60,
+                description="Test duration in seconds",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Run performance benchmarks"""
+        rag_systems = kwargs.get("rag_systems", {})
+        test_queries = kwargs.get("test_queries", [])
+        duration = kwargs.get("duration", 60)
+
+        benchmark_results = {}
+
+        for system_name, system in rag_systems.items():
+            system_results = {
+                "latency_profiles": {},
+                "throughput_curves": {},
+                "resource_usage": {},
+                "scalability_analysis": {},
+            }
+
+            # Test different workload sizes
+            for size in self.workload_sizes:
+                workload = test_queries[:size]
+
+                # Measure latency
+                latencies = []
+                start_time = time.time()
+
+                for query in workload:
+                    query_start = time.time()
+                    # Would call system.run(query=query) in production
+                    # Simulate processing
+                    time.sleep(0.1 + random.random() * 0.1)
+                    latencies.append(time.time() - query_start)
+
+                system_results["latency_profiles"][f"size_{size}"] = {
+                    "p50": statistics.median(latencies),
+                    "p95": sorted(latencies)[int(len(latencies) * 0.95)],
+                    "p99": sorted(latencies)[int(len(latencies) * 0.99)],
+                    "mean": statistics.mean(latencies),
+                    "std_dev": statistics.stdev(latencies) if len(latencies) > 1 else 0,
+                }
+
+                # Calculate throughput
+                total_time = time.time() - start_time
+                throughput = len(workload) / total_time
+                system_results["throughput_curves"][f"size_{size}"] = throughput
+
+            # Test concurrency
+            for users in self.concurrent_users:
+                # Simulate concurrent load
+                concurrent_latencies = []
+
+                # Simplified - would use asyncio/threading in production
+                for _ in range(users * 10):
+                    query_start = time.time()
+                    time.sleep(0.1 + random.random() * 0.2 * users)
+                    concurrent_latencies.append(time.time() - query_start)
+
+                system_results["scalability_analysis"][f"users_{users}"] = {
+                    "avg_latency": statistics.mean(concurrent_latencies),
+                    "throughput_degradation": 1.0 / users,  # Simplified
+                }
+
+            # Simulate resource usage
+            system_results["resource_usage"] = {
+                "memory_mb": 100 + random.randint(0, 500),
+                "cpu_percent": 20 + random.randint(0, 60),
+                "gpu_memory_mb": (
+                    0
+                    if "gpu" not in system_name.lower()
+                    else 1000 + random.randint(0, 3000)
+                ),
+            }
+
+            benchmark_results[system_name] = system_results
+
+        # Comparative analysis
+        comparison = self._compare_systems(benchmark_results)
+
+        return {
+            "benchmark_results": benchmark_results,
+            "comparison": comparison,
+            "test_configuration": {
+                "workload_sizes": self.workload_sizes,
+                "concurrent_users": self.concurrent_users,
+                "duration": duration,
+                "num_queries": len(test_queries),
+            },
+        }
+
+    def _compare_systems(self, results: Dict) -> Dict[str, Any]:
+        """Compare benchmark results across systems"""
+        comparison = {
+            "fastest_system": None,
+            "most_scalable": None,
+            "most_efficient": None,
+            "recommendations": [],
+        }
+
+        # Find fastest system
+        avg_latencies = {}
+        for system, data in results.items():
+            latencies = [v["mean"] for v in data["latency_profiles"].values()]
+            avg_latencies[system] = (
+                statistics.mean(latencies) if latencies else float("inf")
+            )
+
+        comparison["fastest_system"] = min(avg_latencies, key=avg_latencies.get)
+
+        # Find most scalable
+        scalability_scores = {}
+        for system, data in results.items():
+            # Lower degradation = better scalability
+            degradations = [
+                v["throughput_degradation"]
+                for v in data["scalability_analysis"].values()
+            ]
+            scalability_scores[system] = (
+                statistics.mean(degradations) if degradations else 0
+            )
+
+        comparison["most_scalable"] = max(
+            scalability_scores, key=scalability_scores.get
+        )
+
+        # Find most efficient (performance per resource)
+        efficiency_scores = {}
+        for system, data in results.items():
+            throughput = (
+                statistics.mean(data["throughput_curves"].values())
+                if data["throughput_curves"]
+                else 1
+            )
+            memory = data["resource_usage"]["memory_mb"]
+            efficiency_scores[system] = throughput / memory * 1000
+
+        comparison["most_efficient"] = max(efficiency_scores, key=efficiency_scores.get)
+
+        # Generate recommendations
+        comparison["recommendations"] = [
+            f"Use {comparison['fastest_system']} for latency-critical applications",
+            f"Use {comparison['most_scalable']} for high-concurrency scenarios",
+            f"Use {comparison['most_efficient']} for resource-constrained environments",
+        ]
+
+        return comparison
+
+
+@register_node()
+class TestDatasetGeneratorNode(Node):
+    """
+    RAG Test Dataset Generator
+
+    Generates synthetic test datasets for RAG evaluation.
+
+    When to use:
+    - Best for: Creating evaluation benchmarks, testing edge cases
+    - Not ideal for: Production data generation
+    - Output: Queries with ground truth answers and contexts
+
+    Example:
+        generator = TestDatasetGeneratorNode(
+            categories=["factual", "analytical", "comparative"],
+            difficulty_levels=["easy", "medium", "hard"]
+        )
+
+        dataset = generator.run(
+            num_samples=100,
+            domain="machine learning"
+        )
+
+    Parameters:
+        categories: Types of questions to generate
+        difficulty_levels: Complexity levels
+        include_adversarial: Generate tricky cases
+
+    Returns:
+        test_queries: Generated queries with metadata
+        reference_answers: Ground truth answers
+        test_contexts: Relevant documents
+    """
+
+    def __init__(
+        self,
+        name: str = "test_dataset_generator",
+        categories: List[str] = None,
+        include_adversarial: bool = True,
+    ):
+        self.categories = categories or ["factual", "analytical", "comparative"]
+        self.include_adversarial = include_adversarial
+        super().__init__(name)
+
+    def get_parameters(self) -> Dict[str, NodeParameter]:
+        return {
+            "num_samples": NodeParameter(
+                name="num_samples",
+                type=int,
+                required=True,
+                description="Number of test samples",
+            ),
+            "domain": NodeParameter(
+                name="domain",
+                type=str,
+                required=False,
+                default="general",
+                description="Domain for questions",
+            ),
+            "seed": NodeParameter(
+                name="seed",
+                type=int,
+                required=False,
+                description="Random seed for reproducibility",
+            ),
+        }
+
+    def run(self, **kwargs) -> Dict[str, Any]:
+        """Generate test dataset"""
+        num_samples = kwargs.get("num_samples", 10)
+        domain = kwargs.get("domain", "general")
+        seed = kwargs.get("seed")
+
+        if seed:
+            random.seed(seed)
+
+        test_dataset = []
+
+        # Templates for different categories
+        templates = {
+            "factual": [
+                ("What is {concept}?", "Definition and explanation of {concept}"),
+                (
+                    "When was {event} discovered?",
+                    "Discovery date and context of {event}",
+                ),
+                ("Who invented {invention}?", "Inventor and history of {invention}"),
+            ],
+            "analytical": [
+                (
+                    "How does {system} work?",
+                    "Detailed explanation of {system} mechanics",
+                ),
+                (
+                    "What are the advantages of {method}?",
+                    "Benefits and strengths of {method}",
+                ),
+                (
+                    "Why is {principle} important?",
+                    "Significance and applications of {principle}",
+                ),
+            ],
+            "comparative": [
+                (
+                    "Compare {option1} and {option2}",
+                    "Comparison of {option1} vs {option2}",
+                ),
+                (
+                    "What's the difference between {concept1} and {concept2}?",
+                    "Distinctions between concepts",
+                ),
+                (
+                    "Which is better: {choice1} or {choice2}?",
+                    "Trade-offs and recommendations",
+                ),
+            ],
+        }
+
+        # Domain-specific concepts
+        domain_concepts = {
+            "machine learning": [
+                "neural networks",
+                "transformers",
+                "BERT",
+                "attention mechanism",
+                "backpropagation",
+            ],
+            "general": [
+                "democracy",
+                "photosynthesis",
+                "gravity",
+                "internet",
+                "climate change",
+            ],
+        }
+
+        concepts = domain_concepts.get(domain, domain_concepts["general"])
+
+        for i in range(num_samples):
+            category = random.choice(self.categories)
+            template_q, template_a = random.choice(templates[category])
+
+            # Generate specific question
+            if "{concept}" in template_q:
+                concept = random.choice(concepts)
+                query = template_q.format(concept=concept)
+                answer = template_a.format(concept=concept)
+            else:
+                # Handle other placeholders
+                query = template_q
+                answer = template_a
+
+            # Generate contexts
+            contexts = []
+            for j in range(3):
+                contexts.append(
+                    {
+                        "id": f"ctx_{i}_{j}",
+                        "content": f"Context {j+1} about {query}: {answer}",
+                        "relevance": 0.9 - j * 0.1,
+                    }
+                )
+
+            # Add adversarial examples if enabled
+            metadata = {"category": category, "difficulty": "medium"}
+
+            if self.include_adversarial and random.random() < 0.2:
+                # Make it adversarial
+                if random.random() < 0.5:
+                    # Negation
+                    query = f"Is it true that {query.lower()}"
+                    metadata["adversarial_type"] = "negation"
+                else:
+                    # Misleading context
+                    contexts.append(
+                        {
+                            "id": f"ctx_{i}_misleading",
+                            "content": f"Incorrect information: {query} is actually false because...",
+                            "relevance": 0.7,
+                        }
+                    )
+                    metadata["adversarial_type"] = "misleading_context"
+
+            test_dataset.append(
+                {
+                    "id": f"test_{i}",
+                    "query": query,
+                    "reference_answer": answer,
+                    "contexts": contexts,
+                    "metadata": metadata,
+                }
+            )
+
+        return {
+            "test_dataset": test_dataset,
+            "statistics": {
+                "total_samples": len(test_dataset),
+                "category_distribution": {
+                    cat: sum(
+                        1 for t in test_dataset if t["metadata"]["category"] == cat
+                    )
+                    for cat in self.categories
+                },
+                "adversarial_count": sum(
+                    1 for t in test_dataset if "adversarial_type" in t["metadata"]
+                ),
+            },
+            "generation_config": {
+                "domain": domain,
+                "categories": self.categories,
+                "seed": seed,
+            },
+        }
+
+
+# Export all evaluation nodes
+__all__ = ["RAGEvaluationNode", "RAGBenchmarkNode", "TestDatasetGeneratorNode"]