kailash 0.3.2__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. kailash/__init__.py +33 -1
  2. kailash/access_control/__init__.py +129 -0
  3. kailash/access_control/managers.py +461 -0
  4. kailash/access_control/rule_evaluators.py +467 -0
  5. kailash/access_control_abac.py +825 -0
  6. kailash/config/__init__.py +27 -0
  7. kailash/config/database_config.py +359 -0
  8. kailash/database/__init__.py +28 -0
  9. kailash/database/execution_pipeline.py +499 -0
  10. kailash/middleware/__init__.py +306 -0
  11. kailash/middleware/auth/__init__.py +33 -0
  12. kailash/middleware/auth/access_control.py +436 -0
  13. kailash/middleware/auth/auth_manager.py +422 -0
  14. kailash/middleware/auth/jwt_auth.py +477 -0
  15. kailash/middleware/auth/kailash_jwt_auth.py +616 -0
  16. kailash/middleware/communication/__init__.py +37 -0
  17. kailash/middleware/communication/ai_chat.py +989 -0
  18. kailash/middleware/communication/api_gateway.py +802 -0
  19. kailash/middleware/communication/events.py +470 -0
  20. kailash/middleware/communication/realtime.py +710 -0
  21. kailash/middleware/core/__init__.py +21 -0
  22. kailash/middleware/core/agent_ui.py +890 -0
  23. kailash/middleware/core/schema.py +643 -0
  24. kailash/middleware/core/workflows.py +396 -0
  25. kailash/middleware/database/__init__.py +63 -0
  26. kailash/middleware/database/base.py +113 -0
  27. kailash/middleware/database/base_models.py +525 -0
  28. kailash/middleware/database/enums.py +106 -0
  29. kailash/middleware/database/migrations.py +12 -0
  30. kailash/{api/database.py → middleware/database/models.py} +183 -291
  31. kailash/middleware/database/repositories.py +685 -0
  32. kailash/middleware/database/session_manager.py +19 -0
  33. kailash/middleware/mcp/__init__.py +38 -0
  34. kailash/middleware/mcp/client_integration.py +585 -0
  35. kailash/middleware/mcp/enhanced_server.py +576 -0
  36. kailash/nodes/__init__.py +27 -3
  37. kailash/nodes/admin/__init__.py +42 -0
  38. kailash/nodes/admin/audit_log.py +794 -0
  39. kailash/nodes/admin/permission_check.py +864 -0
  40. kailash/nodes/admin/role_management.py +823 -0
  41. kailash/nodes/admin/security_event.py +1523 -0
  42. kailash/nodes/admin/user_management.py +944 -0
  43. kailash/nodes/ai/a2a.py +24 -7
  44. kailash/nodes/ai/ai_providers.py +248 -40
  45. kailash/nodes/ai/embedding_generator.py +11 -11
  46. kailash/nodes/ai/intelligent_agent_orchestrator.py +99 -11
  47. kailash/nodes/ai/llm_agent.py +436 -5
  48. kailash/nodes/ai/self_organizing.py +85 -10
  49. kailash/nodes/ai/vision_utils.py +148 -0
  50. kailash/nodes/alerts/__init__.py +26 -0
  51. kailash/nodes/alerts/base.py +234 -0
  52. kailash/nodes/alerts/discord.py +499 -0
  53. kailash/nodes/api/auth.py +287 -6
  54. kailash/nodes/api/rest.py +151 -0
  55. kailash/nodes/auth/__init__.py +17 -0
  56. kailash/nodes/auth/directory_integration.py +1228 -0
  57. kailash/nodes/auth/enterprise_auth_provider.py +1328 -0
  58. kailash/nodes/auth/mfa.py +2338 -0
  59. kailash/nodes/auth/risk_assessment.py +872 -0
  60. kailash/nodes/auth/session_management.py +1093 -0
  61. kailash/nodes/auth/sso.py +1040 -0
  62. kailash/nodes/base.py +344 -13
  63. kailash/nodes/base_cycle_aware.py +4 -2
  64. kailash/nodes/base_with_acl.py +1 -1
  65. kailash/nodes/code/python.py +283 -10
  66. kailash/nodes/compliance/__init__.py +9 -0
  67. kailash/nodes/compliance/data_retention.py +1888 -0
  68. kailash/nodes/compliance/gdpr.py +2004 -0
  69. kailash/nodes/data/__init__.py +22 -2
  70. kailash/nodes/data/async_connection.py +469 -0
  71. kailash/nodes/data/async_sql.py +757 -0
  72. kailash/nodes/data/async_vector.py +598 -0
  73. kailash/nodes/data/readers.py +767 -0
  74. kailash/nodes/data/retrieval.py +360 -1
  75. kailash/nodes/data/sharepoint_graph.py +397 -21
  76. kailash/nodes/data/sql.py +94 -5
  77. kailash/nodes/data/streaming.py +68 -8
  78. kailash/nodes/data/vector_db.py +54 -4
  79. kailash/nodes/enterprise/__init__.py +13 -0
  80. kailash/nodes/enterprise/batch_processor.py +741 -0
  81. kailash/nodes/enterprise/data_lineage.py +497 -0
  82. kailash/nodes/logic/convergence.py +31 -9
  83. kailash/nodes/logic/operations.py +14 -3
  84. kailash/nodes/mixins/__init__.py +8 -0
  85. kailash/nodes/mixins/event_emitter.py +201 -0
  86. kailash/nodes/mixins/mcp.py +9 -4
  87. kailash/nodes/mixins/security.py +165 -0
  88. kailash/nodes/monitoring/__init__.py +7 -0
  89. kailash/nodes/monitoring/performance_benchmark.py +2497 -0
  90. kailash/nodes/rag/__init__.py +284 -0
  91. kailash/nodes/rag/advanced.py +1615 -0
  92. kailash/nodes/rag/agentic.py +773 -0
  93. kailash/nodes/rag/conversational.py +999 -0
  94. kailash/nodes/rag/evaluation.py +875 -0
  95. kailash/nodes/rag/federated.py +1188 -0
  96. kailash/nodes/rag/graph.py +721 -0
  97. kailash/nodes/rag/multimodal.py +671 -0
  98. kailash/nodes/rag/optimized.py +933 -0
  99. kailash/nodes/rag/privacy.py +1059 -0
  100. kailash/nodes/rag/query_processing.py +1335 -0
  101. kailash/nodes/rag/realtime.py +764 -0
  102. kailash/nodes/rag/registry.py +547 -0
  103. kailash/nodes/rag/router.py +837 -0
  104. kailash/nodes/rag/similarity.py +1854 -0
  105. kailash/nodes/rag/strategies.py +566 -0
  106. kailash/nodes/rag/workflows.py +575 -0
  107. kailash/nodes/security/__init__.py +19 -0
  108. kailash/nodes/security/abac_evaluator.py +1411 -0
  109. kailash/nodes/security/audit_log.py +103 -0
  110. kailash/nodes/security/behavior_analysis.py +1893 -0
  111. kailash/nodes/security/credential_manager.py +401 -0
  112. kailash/nodes/security/rotating_credentials.py +760 -0
  113. kailash/nodes/security/security_event.py +133 -0
  114. kailash/nodes/security/threat_detection.py +1103 -0
  115. kailash/nodes/testing/__init__.py +9 -0
  116. kailash/nodes/testing/credential_testing.py +499 -0
  117. kailash/nodes/transform/__init__.py +10 -2
  118. kailash/nodes/transform/chunkers.py +592 -1
  119. kailash/nodes/transform/processors.py +484 -14
  120. kailash/nodes/validation.py +321 -0
  121. kailash/runtime/access_controlled.py +1 -1
  122. kailash/runtime/async_local.py +41 -7
  123. kailash/runtime/docker.py +1 -1
  124. kailash/runtime/local.py +474 -55
  125. kailash/runtime/parallel.py +1 -1
  126. kailash/runtime/parallel_cyclic.py +1 -1
  127. kailash/runtime/testing.py +210 -2
  128. kailash/security.py +1 -1
  129. kailash/utils/migrations/__init__.py +25 -0
  130. kailash/utils/migrations/generator.py +433 -0
  131. kailash/utils/migrations/models.py +231 -0
  132. kailash/utils/migrations/runner.py +489 -0
  133. kailash/utils/secure_logging.py +342 -0
  134. kailash/workflow/__init__.py +16 -0
  135. kailash/workflow/cyclic_runner.py +3 -4
  136. kailash/workflow/graph.py +70 -2
  137. kailash/workflow/resilience.py +249 -0
  138. kailash/workflow/templates.py +726 -0
  139. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/METADATA +256 -20
  140. kailash-0.4.1.dist-info/RECORD +227 -0
  141. kailash/api/__init__.py +0 -17
  142. kailash/api/__main__.py +0 -6
  143. kailash/api/studio_secure.py +0 -893
  144. kailash/mcp/__main__.py +0 -13
  145. kailash/mcp/server_new.py +0 -336
  146. kailash/mcp/servers/__init__.py +0 -12
  147. kailash-0.3.2.dist-info/RECORD +0 -136
  148. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/WHEEL +0 -0
  149. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/entry_points.txt +0 -0
  150. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/licenses/LICENSE +0 -0
  151. {kailash-0.3.2.dist-info → kailash-0.4.1.dist-info}/top_level.txt +0 -0
kailash/nodes/rag/evaluation.py (new file, +875 -0)
@@ -0,0 +1,875 @@
+ """
+ RAG Evaluation and Benchmarking Framework
+
+ Implements comprehensive evaluation metrics and benchmarking:
+ - Retrieval quality metrics (precision, recall, MRR)
+ - Generation quality assessment
+ - End-to-end RAG evaluation
+ - A/B testing framework
+ - Performance benchmarking
+ - Dataset generation for testing
+
+ Based on RAGAS, BEIR, and evaluation research from 2024.
+ """
+
+ import json
+ import logging
+ import random
+ import statistics
+ import time
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ from ...workflow.builder import WorkflowBuilder
+ from ..ai.llm_agent import LLMAgentNode
+ from ..base import Node, NodeParameter, register_node
+ from ..code.python import PythonCodeNode
+ from ..logic.workflow import WorkflowNode
+
+ logger = logging.getLogger(__name__)
+
+
+ @register_node()
+ class RAGEvaluationNode(WorkflowNode):
+     """
+     Comprehensive RAG Evaluation Framework
+
+     Evaluates RAG systems across multiple dimensions including retrieval
+     quality, generation accuracy, and end-to-end performance.
+
+     When to use:
+     - Best for: System optimization, quality assurance, model selection
+     - Not ideal for: Real-time evaluation during inference
+     - Performance: 5-30 seconds per evaluation (depends on metrics)
+     - Insights: Detailed breakdown of strengths and weaknesses
+
+     Key features:
+     - RAGAS-based evaluation metrics
+     - Retrieval and generation quality assessment
+     - Faithfulness and relevance scoring
+     - Comparative analysis across strategies
+     - Automated test dataset generation
+
+     Example:
+         evaluator = RAGEvaluationNode(
+             metrics=["faithfulness", "relevance", "context_precision", "answer_quality"],
+             use_reference_answers=True
+         )
+
+         # Evaluate a RAG system
+         results = await evaluator.run(
+             test_queries=[
+                 {"query": "What is transformer architecture?",
+                  "reference": "Transformers use self-attention..."},
+                 {"query": "Explain BERT",
+                  "reference": "BERT is a bidirectional..."}
+             ],
+             rag_system=my_rag_node
+         )
+
+         # Results include:
+         # - Per-query scores
+         # - Aggregate metrics
+         # - Failure analysis
+         # - Improvement recommendations
+
+     Parameters:
+         metrics: List of evaluation metrics to compute
+         use_reference_answers: Whether to use ground truth
+         llm_judge_model: Model for LLM-based evaluation
+         confidence_threshold: Minimum acceptable score
+
+     Returns:
+         scores: Detailed scores per metric
+         aggregate_metrics: Overall system performance
+         failure_analysis: Queries that performed poorly
+         recommendations: Suggested improvements
+     """
+
+     def __init__(
+         self,
+         name: str = "rag_evaluation",
+         metrics: List[str] = None,
+         use_reference_answers: bool = True,
+         llm_judge_model: str = "gpt-4",
+     ):
+         self.metrics = metrics or [
+             "faithfulness",
+             "relevance",
+             "context_precision",
+             "answer_quality",
+         ]
+         self.use_reference_answers = use_reference_answers
+         self.llm_judge_model = llm_judge_model
+         super().__init__(name, self._create_workflow())
+
+     def _create_workflow(self) -> WorkflowNode:
+         """Create RAG evaluation workflow"""
+         builder = WorkflowBuilder()
+
+         # Test executor - runs RAG on test queries
+         test_executor_id = builder.add_node(
+             "PythonCodeNode",
+             node_id="test_executor",
+             config={
+                 "code": """
+ import time
+ from datetime import datetime
+
+ def execute_rag_tests(test_queries, rag_system):
+     '''Execute RAG system on test queries'''
+     test_results = []
+
+     for i, test_case in enumerate(test_queries):
+         query = test_case.get("query", "")
+         reference = test_case.get("reference", "")
+
+         # Time the execution
+         start_time = time.time()
+
+         # Execute RAG (simplified - would call actual system)
+         # In production, would use rag_system.run(query=query)
+         rag_response = {
+             "answer": f"Generated answer for: {query}",
+             "retrieved_contexts": [
+                 {"content": "Context 1 about transformers...", "score": 0.9},
+                 {"content": "Context 2 about attention...", "score": 0.85},
+                 {"content": "Context 3 about architecture...", "score": 0.8}
+             ],
+             "confidence": 0.87
+         }
+
+         execution_time = time.time() - start_time
+
+         test_results.append({
+             "test_id": i,
+             "query": query,
+             "reference_answer": reference,
+             "generated_answer": rag_response["answer"],
+             "retrieved_contexts": rag_response["retrieved_contexts"],
+             "execution_time": execution_time,
+             "timestamp": datetime.now().isoformat()
+         })
+
+     result = {
+         "test_results": test_results,
+         "total_tests": len(test_queries),
+         "avg_execution_time": sum(r["execution_time"] for r in test_results) / len(test_results)
+     }
+ """
+             },
+         )
+
+         # Faithfulness evaluator
+         faithfulness_evaluator_id = builder.add_node(
+             "LLMAgentNode",
+             node_id="faithfulness_evaluator",
+             config={
+                 "system_prompt": """Evaluate the faithfulness of the generated answer to the retrieved contexts.
+
+ Faithfulness measures whether the answer is grounded in the retrieved information.
+
+ For each statement in the answer:
+ 1. Check if it's supported by the contexts
+ 2. Identify any hallucinations
+ 3. Rate overall faithfulness
+
+ Return JSON:
+ {
+     "faithfulness_score": 0.0-1.0,
+     "supported_statements": ["list of supported claims"],
+     "unsupported_statements": ["list of unsupported claims"],
+     "hallucinations": ["list of hallucinated information"],
+     "reasoning": "explanation"
+ }""",
+                 "model": self.llm_judge_model,
+             },
+         )
+
+         # Relevance evaluator
+         relevance_evaluator_id = builder.add_node(
+             "LLMAgentNode",
+             node_id="relevance_evaluator",
+             config={
+                 "system_prompt": """Evaluate the relevance of the answer to the query.
+
+ Consider:
+ 1. Does the answer address the query?
+ 2. Is it complete?
+ 3. Is it focused without irrelevant information?
+
+ Return JSON:
+ {
+     "relevance_score": 0.0-1.0,
+     "addresses_query": true/false,
+     "completeness": 0.0-1.0,
+     "focus": 0.0-1.0,
+     "missing_aspects": ["list of missing elements"],
+     "irrelevant_content": ["list of irrelevant parts"]
+ }""",
+                 "model": self.llm_judge_model,
+             },
+         )
+
+         # Context precision evaluator
+         context_evaluator_id = builder.add_node(
+             "PythonCodeNode",
+             node_id="context_evaluator",
+             config={
+                 "code": """
+ def evaluate_context_precision(test_result):
+     '''Evaluate the precision of retrieved contexts'''
+
+     contexts = test_result.get("retrieved_contexts", [])
+     query = test_result.get("query", "")
+
+     if not contexts:
+         return {
+             "context_precision": 0.0,
+             "context_recall": 0.0,
+             "context_ranking_quality": 0.0
+         }
+
+     # Calculate precision at different k values
+     precision_at_k = {}
+     relevant_count = 0
+
+     for k in [1, 3, 5, 10]:
+         if k <= len(contexts):
+             # Simulate relevance judgment (would use LLM in production)
+             relevant_at_k = sum(1 for c in contexts[:k] if c.get("score", 0) > 0.7)
+             precision_at_k[f"P@{k}"] = relevant_at_k / k
+
+     # Calculate MRR (Mean Reciprocal Rank)
+     first_relevant_rank = None
+     for i, ctx in enumerate(contexts):
+         if ctx.get("score", 0) > 0.7:
+             first_relevant_rank = i + 1
+             break
+
+     mrr = 1.0 / first_relevant_rank if first_relevant_rank else 0.0
+
+     # Context diversity
+     unique_terms = set()
+     for ctx in contexts:
+         unique_terms.update(ctx.get("content", "").lower().split()[:20])
+
+     diversity_score = len(unique_terms) / (len(contexts) * 20) if contexts else 0
+
+     result = {
+         "context_metrics": {
+             "precision_at_k": precision_at_k,
+             "mrr": mrr,
+             "diversity_score": diversity_score,
+             "avg_relevance_score": sum(c.get("score", 0) for c in contexts) / len(contexts),
+             "context_count": len(contexts)
+         }
+     }
+ """
+             },
+         )
+
+         # Answer quality evaluator (if reference available)
+         if self.use_reference_answers:
+             answer_quality_id = builder.add_node(
+                 "LLMAgentNode",
+                 node_id="answer_quality_evaluator",
+                 config={
+                     "system_prompt": """Compare the generated answer with the reference answer.
+
+ Evaluate:
+ 1. Factual accuracy
+ 2. Completeness
+ 3. Clarity and coherence
+ 4. Additional valuable information
+
+ Return JSON:
+ {
+     "accuracy_score": 0.0-1.0,
+     "completeness_score": 0.0-1.0,
+     "clarity_score": 0.0-1.0,
+     "additional_value": 0.0-1.0,
+     "overall_quality": 0.0-1.0,
+     "key_differences": ["list of major differences"],
+     "improvements_needed": ["list of improvements"]
+ }""",
+                     "model": self.llm_judge_model,
+                 },
+             )
+
+         # Metric aggregator
+         aggregator_id = builder.add_node(
+             "PythonCodeNode",
+             node_id="metric_aggregator",
+             config={
+                 "code": f"""
+ import statistics
+ from datetime import datetime
+
+ def aggregate_evaluation_metrics(test_results, faithfulness_scores, relevance_scores,
+                                  context_metrics, answer_quality_scores=None):
+     '''Aggregate all evaluation metrics'''
+
+     # Parse evaluation results
+     all_metrics = {{
+         "faithfulness": [],
+         "relevance": [],
+         "context_precision": [],
+         "answer_quality": [],
+         "execution_time": []
+     }}
+
+     for i, test in enumerate(test_results):
+         # Get scores for this test
+         faith_score = faithfulness_scores[i].get("response", {{}}).get("faithfulness_score", 0)
+         rel_score = relevance_scores[i].get("response", {{}}).get("relevance_score", 0)
+         ctx_score = context_metrics[i].get("context_metrics", {{}}).get("avg_relevance_score", 0)
+
+         all_metrics["faithfulness"].append(faith_score)
+         all_metrics["relevance"].append(rel_score)
+         all_metrics["context_precision"].append(ctx_score)
+         all_metrics["execution_time"].append(test.get("execution_time", 0))
+
+         if answer_quality_scores:
+             quality_score = answer_quality_scores[i].get("response", {{}}).get("overall_quality", 0)
+             all_metrics["answer_quality"].append(quality_score)
+
+     # Calculate aggregate statistics
+     aggregate_stats = {{}}
+     for metric, scores in all_metrics.items():
+         if scores:
+             aggregate_stats[metric] = {{
+                 "mean": statistics.mean(scores),
+                 "median": statistics.median(scores),
+                 "std_dev": statistics.stdev(scores) if len(scores) > 1 else 0,
+                 "min": min(scores),
+                 "max": max(scores),
+                 "scores": scores
+             }}
+
+     # Identify failure cases
+     failure_threshold = 0.6
+     failures = []
+
+     for i, test in enumerate(test_results):
+         overall_score = (all_metrics["faithfulness"][i] +
+                          all_metrics["relevance"][i] +
+                          all_metrics["context_precision"][i]) / 3
+
+         if overall_score < failure_threshold:
+             failures.append({{
+                 "test_id": i,
+                 "query": test["query"],
+                 "overall_score": overall_score,
+                 "weakest_metric": min(
+                     ("faithfulness", all_metrics["faithfulness"][i]),
+                     ("relevance", all_metrics["relevance"][i]),
+                     ("context_precision", all_metrics["context_precision"][i]),
+                     key=lambda x: x[1]
+                 )[0]
+             }})
+
+     # Generate recommendations
+     recommendations = []
+
+     if aggregate_stats.get("faithfulness", {{}}).get("mean", 1) < 0.7:
+         recommendations.append("Improve grounding: Ensure answers strictly follow retrieved content")
+
+     if aggregate_stats.get("relevance", {{}}).get("mean", 1) < 0.7:
+         recommendations.append("Enhance relevance: Better query understanding and targeted responses")
+
+     if aggregate_stats.get("context_precision", {{}}).get("mean", 1) < 0.7:
+         recommendations.append("Optimize retrieval: Improve document ranking and selection")
+
+     if aggregate_stats.get("execution_time", {{}}).get("mean", 0) > 2.0:
+         recommendations.append("Reduce latency: Consider caching or parallel processing")
+
+     result = {{
+         "evaluation_summary": {{
+             "aggregate_metrics": aggregate_stats,
+             "overall_score": statistics.mean([
+                 aggregate_stats.get("faithfulness", {{}}).get("mean", 0),
+                 aggregate_stats.get("relevance", {{}}).get("mean", 0),
+                 aggregate_stats.get("context_precision", {{}}).get("mean", 0)
+             ]),
+             "failure_analysis": {{
+                 "failure_count": len(failures),
+                 "failure_rate": len(failures) / len(test_results),
+                 "failed_queries": failures
+             }},
+             "recommendations": recommendations,
+             "evaluation_config": {{
+                 "metrics_used": {self.metrics},
+                 "total_tests": len(test_results),
+                 "timestamp": datetime.now().isoformat()
+             }}
+         }}
+     }}
+ """
+             },
+         )
+
+         # Connect workflow
+         builder.add_connection(
+             test_executor_id, "test_results", faithfulness_evaluator_id, "test_data"
+         )
+         builder.add_connection(
+             test_executor_id, "test_results", relevance_evaluator_id, "test_data"
+         )
+         builder.add_connection(
+             test_executor_id, "test_results", context_evaluator_id, "test_data"
+         )
+
+         if self.use_reference_answers:
+             builder.add_connection(
+                 test_executor_id, "test_results", answer_quality_id, "test_data"
+             )
+             builder.add_connection(
+                 answer_quality_id, "response", aggregator_id, "answer_quality_scores"
+             )
+
+         builder.add_connection(
+             test_executor_id, "test_results", aggregator_id, "test_results"
+         )
+         builder.add_connection(
+             faithfulness_evaluator_id, "response", aggregator_id, "faithfulness_scores"
+         )
+         builder.add_connection(
+             relevance_evaluator_id, "response", aggregator_id, "relevance_scores"
+         )
+         builder.add_connection(
+             context_evaluator_id, "context_metrics", aggregator_id, "context_metrics"
+         )
+
+         return builder.build(name="rag_evaluation_workflow")
+
+
+ @register_node()
+ class RAGBenchmarkNode(Node):
+     """
+     RAG Performance Benchmarking Node
+
+     Benchmarks RAG systems for performance characteristics.
+
+     When to use:
+     - Best for: System comparison, optimization, capacity planning
+     - Not ideal for: Quality evaluation (use RAGEvaluationNode)
+     - Metrics: Latency, throughput, resource usage, scalability
+
+     Example:
+         benchmark = RAGBenchmarkNode(
+             workload_sizes=[10, 100, 1000],
+             concurrent_users=[1, 5, 10]
+         )
+
+         results = await benchmark.run(
+             rag_systems={"system_a": rag_a, "system_b": rag_b},
+             test_queries=queries
+         )
+
+     Parameters:
+         workload_sizes: Different dataset sizes to test
+         concurrent_users: Concurrency levels to test
+         metrics_interval: How often to collect metrics
+
+     Returns:
+         latency_profiles: Response time distributions
+         throughput_curves: Requests/second at different loads
+         resource_usage: Memory and compute utilization
+         scalability_analysis: How performance scales
+     """
+
+     def __init__(
+         self,
+         name: str = "rag_benchmark",
+         workload_sizes: List[int] = None,
+         concurrent_users: List[int] = None,
+     ):
+         self.workload_sizes = workload_sizes or [10, 100, 1000]
+         self.concurrent_users = concurrent_users or [1, 5, 10]
+         super().__init__(name)
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         return {
+             "rag_systems": NodeParameter(
+                 name="rag_systems",
+                 type=dict,
+                 required=True,
+                 description="RAG systems to benchmark",
+             ),
+             "test_queries": NodeParameter(
+                 name="test_queries",
+                 type=list,
+                 required=True,
+                 description="Queries for benchmarking",
+             ),
+             "duration": NodeParameter(
+                 name="duration",
+                 type=int,
+                 required=False,
+                 default=60,
+                 description="Test duration in seconds",
+             ),
+         }
+
+     def run(self, **kwargs) -> Dict[str, Any]:
+         """Run performance benchmarks"""
+         rag_systems = kwargs.get("rag_systems", {})
+         test_queries = kwargs.get("test_queries", [])
+         duration = kwargs.get("duration", 60)
+
+         benchmark_results = {}
+
+         for system_name, system in rag_systems.items():
+             system_results = {
+                 "latency_profiles": {},
+                 "throughput_curves": {},
+                 "resource_usage": {},
+                 "scalability_analysis": {},
+             }
+
+             # Test different workload sizes
+             for size in self.workload_sizes:
+                 workload = test_queries[:size]
+
+                 # Measure latency
+                 latencies = []
+                 start_time = time.time()
+
+                 for query in workload:
+                     query_start = time.time()
+                     # Would call system.run(query=query) in production
+                     # Simulate processing
+                     time.sleep(0.1 + random.random() * 0.1)
+                     latencies.append(time.time() - query_start)
+
+                 system_results["latency_profiles"][f"size_{size}"] = {
+                     "p50": statistics.median(latencies),
+                     "p95": sorted(latencies)[int(len(latencies) * 0.95)],
+                     "p99": sorted(latencies)[int(len(latencies) * 0.99)],
+                     "mean": statistics.mean(latencies),
+                     "std_dev": statistics.stdev(latencies) if len(latencies) > 1 else 0,
+                 }
+
+                 # Calculate throughput
+                 total_time = time.time() - start_time
+                 throughput = len(workload) / total_time
+                 system_results["throughput_curves"][f"size_{size}"] = throughput
+
+             # Test concurrency
+             for users in self.concurrent_users:
+                 # Simulate concurrent load
+                 concurrent_latencies = []
+
+                 # Simplified - would use asyncio/threading in production
+                 for _ in range(users * 10):
+                     query_start = time.time()
+                     time.sleep(0.1 + random.random() * 0.2 * users)
+                     concurrent_latencies.append(time.time() - query_start)
+
+                 system_results["scalability_analysis"][f"users_{users}"] = {
+                     "avg_latency": statistics.mean(concurrent_latencies),
+                     "throughput_degradation": 1.0 / users,  # Simplified
+                 }
+
+             # Simulate resource usage
+             system_results["resource_usage"] = {
+                 "memory_mb": 100 + random.randint(0, 500),
+                 "cpu_percent": 20 + random.randint(0, 60),
+                 "gpu_memory_mb": (
+                     0
+                     if "gpu" not in system_name.lower()
+                     else 1000 + random.randint(0, 3000)
+                 ),
+             }
+
+             benchmark_results[system_name] = system_results
+
+         # Comparative analysis
+         comparison = self._compare_systems(benchmark_results)
+
+         return {
+             "benchmark_results": benchmark_results,
+             "comparison": comparison,
+             "test_configuration": {
+                 "workload_sizes": self.workload_sizes,
+                 "concurrent_users": self.concurrent_users,
+                 "duration": duration,
+                 "num_queries": len(test_queries),
+             },
+         }
+
+     def _compare_systems(self, results: Dict) -> Dict[str, Any]:
+         """Compare benchmark results across systems"""
+         comparison = {
+             "fastest_system": None,
+             "most_scalable": None,
+             "most_efficient": None,
+             "recommendations": [],
+         }
+
+         # Find fastest system
+         avg_latencies = {}
+         for system, data in results.items():
+             latencies = [v["mean"] for v in data["latency_profiles"].values()]
+             avg_latencies[system] = (
+                 statistics.mean(latencies) if latencies else float("inf")
+             )
+
+         comparison["fastest_system"] = min(avg_latencies, key=avg_latencies.get)
+
+         # Find most scalable
+         scalability_scores = {}
+         for system, data in results.items():
+             # Lower degradation = better scalability
+             degradations = [
+                 v["throughput_degradation"]
+                 for v in data["scalability_analysis"].values()
+             ]
+             scalability_scores[system] = (
+                 statistics.mean(degradations) if degradations else 0
+             )
+
+         comparison["most_scalable"] = max(
+             scalability_scores, key=scalability_scores.get
+         )
+
+         # Find most efficient (performance per resource)
+         efficiency_scores = {}
+         for system, data in results.items():
+             throughput = (
+                 statistics.mean(data["throughput_curves"].values())
+                 if data["throughput_curves"]
+                 else 1
+             )
+             memory = data["resource_usage"]["memory_mb"]
+             efficiency_scores[system] = throughput / memory * 1000
+
+         comparison["most_efficient"] = max(efficiency_scores, key=efficiency_scores.get)
+
+         # Generate recommendations
+         comparison["recommendations"] = [
+             f"Use {comparison['fastest_system']} for latency-critical applications",
+             f"Use {comparison['most_scalable']} for high-concurrency scenarios",
+             f"Use {comparison['most_efficient']} for resource-constrained environments",
+         ]
+
+         return comparison
+
+
+ @register_node()
+ class TestDatasetGeneratorNode(Node):
+     """
+     RAG Test Dataset Generator
+
+     Generates synthetic test datasets for RAG evaluation.
+
+     When to use:
+     - Best for: Creating evaluation benchmarks, testing edge cases
+     - Not ideal for: Production data generation
+     - Output: Queries with ground truth answers and contexts
+
+     Example:
+         generator = TestDatasetGeneratorNode(
+             categories=["factual", "analytical", "comparative"],
+             difficulty_levels=["easy", "medium", "hard"]
+         )
+
+         dataset = generator.run(
+             num_samples=100,
+             domain="machine learning"
+         )
+
+     Parameters:
+         categories: Types of questions to generate
+         difficulty_levels: Complexity levels
+         include_adversarial: Generate tricky cases
+
+     Returns:
+         test_queries: Generated queries with metadata
+         reference_answers: Ground truth answers
+         test_contexts: Relevant documents
+     """
+
+     def __init__(
+         self,
+         name: str = "test_dataset_generator",
+         categories: List[str] = None,
+         include_adversarial: bool = True,
+     ):
+         self.categories = categories or ["factual", "analytical", "comparative"]
+         self.include_adversarial = include_adversarial
+         super().__init__(name)
+
+     def get_parameters(self) -> Dict[str, NodeParameter]:
+         return {
+             "num_samples": NodeParameter(
+                 name="num_samples",
+                 type=int,
+                 required=True,
+                 description="Number of test samples",
+             ),
+             "domain": NodeParameter(
+                 name="domain",
+                 type=str,
+                 required=False,
+                 default="general",
+                 description="Domain for questions",
+             ),
+             "seed": NodeParameter(
+                 name="seed",
+                 type=int,
+                 required=False,
+                 description="Random seed for reproducibility",
+             ),
+         }
+
+     def run(self, **kwargs) -> Dict[str, Any]:
+         """Generate test dataset"""
+         num_samples = kwargs.get("num_samples", 10)
+         domain = kwargs.get("domain", "general")
+         seed = kwargs.get("seed")
+
+         if seed:
+             random.seed(seed)
+
+         test_dataset = []
+
+         # Templates for different categories
+         templates = {
+             "factual": [
+                 ("What is {concept}?", "Definition and explanation of {concept}"),
+                 (
+                     "When was {event} discovered?",
+                     "Discovery date and context of {event}",
+                 ),
+                 ("Who invented {invention}?", "Inventor and history of {invention}"),
+             ],
+             "analytical": [
+                 (
+                     "How does {system} work?",
+                     "Detailed explanation of {system} mechanics",
+                 ),
+                 (
+                     "What are the advantages of {method}?",
+                     "Benefits and strengths of {method}",
+                 ),
+                 (
+                     "Why is {principle} important?",
+                     "Significance and applications of {principle}",
+                 ),
+             ],
+             "comparative": [
+                 (
+                     "Compare {option1} and {option2}",
+                     "Comparison of {option1} vs {option2}",
+                 ),
+                 (
+                     "What's the difference between {concept1} and {concept2}?",
+                     "Distinctions between concepts",
+                 ),
+                 (
+                     "Which is better: {choice1} or {choice2}?",
+                     "Trade-offs and recommendations",
+                 ),
+             ],
+         }
+
+         # Domain-specific concepts
+         domain_concepts = {
+             "machine learning": [
+                 "neural networks",
+                 "transformers",
+                 "BERT",
+                 "attention mechanism",
+                 "backpropagation",
+             ],
+             "general": [
+                 "democracy",
+                 "photosynthesis",
+                 "gravity",
+                 "internet",
+                 "climate change",
+             ],
+         }
+
+         concepts = domain_concepts.get(domain, domain_concepts["general"])
+
+         for i in range(num_samples):
+             category = random.choice(self.categories)
+             template_q, template_a = random.choice(templates[category])
+
+             # Generate specific question
+             if "{concept}" in template_q:
+                 concept = random.choice(concepts)
+                 query = template_q.format(concept=concept)
+                 answer = template_a.format(concept=concept)
+             else:
+                 # Handle other placeholders
+                 query = template_q
+                 answer = template_a
+
+             # Generate contexts
+             contexts = []
+             for j in range(3):
+                 contexts.append(
+                     {
+                         "id": f"ctx_{i}_{j}",
+                         "content": f"Context {j+1} about {query}: {answer}",
+                         "relevance": 0.9 - j * 0.1,
+                     }
+                 )
+
+             # Add adversarial examples if enabled
+             metadata = {"category": category, "difficulty": "medium"}
+
+             if self.include_adversarial and random.random() < 0.2:
+                 # Make it adversarial
+                 if random.random() < 0.5:
+                     # Negation
+                     query = f"Is it true that {query.lower()}"
+                     metadata["adversarial_type"] = "negation"
+                 else:
+                     # Misleading context
+                     contexts.append(
+                         {
+                             "id": f"ctx_{i}_misleading",
+                             "content": f"Incorrect information: {query} is actually false because...",
+                             "relevance": 0.7,
+                         }
+                     )
+                     metadata["adversarial_type"] = "misleading_context"
+
+             test_dataset.append(
+                 {
+                     "id": f"test_{i}",
+                     "query": query,
+                     "reference_answer": answer,
+                     "contexts": contexts,
+                     "metadata": metadata,
+                 }
+             )
+
+         return {
+             "test_dataset": test_dataset,
+             "statistics": {
+                 "total_samples": len(test_dataset),
+                 "category_distribution": {
+                     cat: sum(
+                         1 for t in test_dataset if t["metadata"]["category"] == cat
+                     )
+                     for cat in self.categories
+                 },
+                 "adversarial_count": sum(
+                     1 for t in test_dataset if "adversarial_type" in t["metadata"]
+                 ),
+             },
+             "generation_config": {
+                 "domain": domain,
+                 "categories": self.categories,
+                 "seed": seed,
+             },
+         }
+
+
+ # Export all evaluation nodes
+ __all__ = ["RAGEvaluationNode", "RAGBenchmarkNode", "TestDatasetGeneratorNode"]
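
The context_evaluator in the hunk above scores retrieval with precision@k and MRR. As a sanity check, the same computation can be run outside any workflow; the snippet below is a standalone sketch with made-up scores (not output from the package) that mirrors the thresholding logic shown above.

    # Standalone sketch of the P@k / MRR logic used in context_evaluator above.
    # The scores are illustrative placeholders, not kailash output.
    contexts = [{"score": 0.9}, {"score": 0.85}, {"score": 0.4}, {"score": 0.8}]

    precision_at_k = {}
    for k in (1, 3):
        if k <= len(contexts):
            relevant_at_k = sum(1 for c in contexts[:k] if c.get("score", 0) > 0.7)
            precision_at_k[f"P@{k}"] = relevant_at_k / k

    first_relevant_rank = next(
        (i + 1 for i, c in enumerate(contexts) if c.get("score", 0) > 0.7), None
    )
    mrr = 1.0 / first_relevant_rank if first_relevant_rank else 0.0

    print(precision_at_k, mrr)  # {'P@1': 1.0, 'P@3': 0.666...} 1.0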
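RAGBenchmarkNode and TestDatasetGeneratorNode both expose plain run(**kwargs) methods in the code above, so a minimal smoke test needs no workflow runtime. The following is a hedged sketch, not verified against the released wheel: the import path is taken from the file-list entry kailash/nodes/rag/evaluation.py, and it assumes the Node base class accepts construction as written here.

    # Hedged usage sketch based on the get_parameters()/run() signatures shown above.
    from kailash.nodes.rag.evaluation import RAGBenchmarkNode, TestDatasetGeneratorNode

    generator = TestDatasetGeneratorNode(categories=["factual", "comparative"])
    dataset = generator.run(num_samples=5, domain="machine learning", seed=42)
    queries = [case["query"] for case in dataset["test_dataset"]]

    # The benchmark above only simulates calls into each system, so a
    # placeholder value stands in for a real RAG node here.
    benchmark = RAGBenchmarkNode(workload_sizes=[5], concurrent_users=[1])
    report = benchmark.run(rag_systems={"baseline": None}, test_queries=queries)
    print(report["comparison"]["recommendations"])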