mantisdk-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mantisdk might be problematic.

Files changed (190)
  1. mantisdk/__init__.py +22 -0
  2. mantisdk/adapter/__init__.py +15 -0
  3. mantisdk/adapter/base.py +94 -0
  4. mantisdk/adapter/messages.py +270 -0
  5. mantisdk/adapter/triplet.py +1028 -0
  6. mantisdk/algorithm/__init__.py +39 -0
  7. mantisdk/algorithm/apo/__init__.py +5 -0
  8. mantisdk/algorithm/apo/apo.py +889 -0
  9. mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
  10. mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
  11. mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
  12. mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
  13. mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
  14. mantisdk/algorithm/base.py +162 -0
  15. mantisdk/algorithm/decorator.py +264 -0
  16. mantisdk/algorithm/fast.py +250 -0
  17. mantisdk/algorithm/gepa/__init__.py +59 -0
  18. mantisdk/algorithm/gepa/adapter.py +459 -0
  19. mantisdk/algorithm/gepa/gepa.py +364 -0
  20. mantisdk/algorithm/gepa/lib/__init__.py +18 -0
  21. mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
  22. mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
  23. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
  24. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
  25. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
  26. mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
  27. mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
  28. mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
  29. mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
  30. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
  31. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
  32. mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
  33. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
  34. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
  35. mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
  36. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
  37. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
  38. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
  39. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
  40. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
  41. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
  42. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
  43. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
  44. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
  45. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
  46. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
  47. mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
  48. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
  49. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
  50. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
  51. mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
  52. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
  53. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
  54. mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
  55. mantisdk/algorithm/gepa/lib/api.py +375 -0
  56. mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
  57. mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
  58. mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
  59. mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
  60. mantisdk/algorithm/gepa/lib/core/result.py +233 -0
  61. mantisdk/algorithm/gepa/lib/core/state.py +636 -0
  62. mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
  63. mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
  64. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
  65. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
  66. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
  67. mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
  68. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
  69. mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
  70. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
  71. mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
  72. mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
  73. mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
  74. mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
  75. mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
  76. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
  77. mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
  78. mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
  79. mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
  80. mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
  81. mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
  82. mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
  83. mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
  84. mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
  85. mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
  86. mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
  87. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
  88. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
  89. mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
  90. mantisdk/algorithm/gepa/lib/py.typed +0 -0
  91. mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
  92. mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
  93. mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
  94. mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
  95. mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
  96. mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
  97. mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
  98. mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
  99. mantisdk/algorithm/gepa/tracing.py +105 -0
  100. mantisdk/algorithm/utils.py +177 -0
  101. mantisdk/algorithm/verl/__init__.py +5 -0
  102. mantisdk/algorithm/verl/interface.py +202 -0
  103. mantisdk/cli/__init__.py +56 -0
  104. mantisdk/cli/prometheus.py +115 -0
  105. mantisdk/cli/store.py +131 -0
  106. mantisdk/cli/vllm.py +29 -0
  107. mantisdk/client.py +408 -0
  108. mantisdk/config.py +348 -0
  109. mantisdk/emitter/__init__.py +43 -0
  110. mantisdk/emitter/annotation.py +370 -0
  111. mantisdk/emitter/exception.py +54 -0
  112. mantisdk/emitter/message.py +61 -0
  113. mantisdk/emitter/object.py +117 -0
  114. mantisdk/emitter/reward.py +320 -0
  115. mantisdk/env_var.py +156 -0
  116. mantisdk/execution/__init__.py +15 -0
  117. mantisdk/execution/base.py +64 -0
  118. mantisdk/execution/client_server.py +443 -0
  119. mantisdk/execution/events.py +69 -0
  120. mantisdk/execution/inter_process.py +16 -0
  121. mantisdk/execution/shared_memory.py +282 -0
  122. mantisdk/instrumentation/__init__.py +119 -0
  123. mantisdk/instrumentation/agentops.py +314 -0
  124. mantisdk/instrumentation/agentops_langchain.py +45 -0
  125. mantisdk/instrumentation/litellm.py +83 -0
  126. mantisdk/instrumentation/vllm.py +81 -0
  127. mantisdk/instrumentation/weave.py +500 -0
  128. mantisdk/litagent/__init__.py +11 -0
  129. mantisdk/litagent/decorator.py +536 -0
  130. mantisdk/litagent/litagent.py +252 -0
  131. mantisdk/llm_proxy.py +1890 -0
  132. mantisdk/logging.py +370 -0
  133. mantisdk/reward.py +7 -0
  134. mantisdk/runner/__init__.py +11 -0
  135. mantisdk/runner/agent.py +845 -0
  136. mantisdk/runner/base.py +182 -0
  137. mantisdk/runner/legacy.py +309 -0
  138. mantisdk/semconv.py +170 -0
  139. mantisdk/server.py +401 -0
  140. mantisdk/store/__init__.py +23 -0
  141. mantisdk/store/base.py +897 -0
  142. mantisdk/store/client_server.py +2092 -0
  143. mantisdk/store/collection/__init__.py +30 -0
  144. mantisdk/store/collection/base.py +587 -0
  145. mantisdk/store/collection/memory.py +970 -0
  146. mantisdk/store/collection/mongo.py +1412 -0
  147. mantisdk/store/collection_based.py +1823 -0
  148. mantisdk/store/insight.py +648 -0
  149. mantisdk/store/listener.py +58 -0
  150. mantisdk/store/memory.py +396 -0
  151. mantisdk/store/mongo.py +165 -0
  152. mantisdk/store/sqlite.py +3 -0
  153. mantisdk/store/threading.py +357 -0
  154. mantisdk/store/utils.py +142 -0
  155. mantisdk/tracer/__init__.py +16 -0
  156. mantisdk/tracer/agentops.py +242 -0
  157. mantisdk/tracer/base.py +287 -0
  158. mantisdk/tracer/dummy.py +106 -0
  159. mantisdk/tracer/otel.py +555 -0
  160. mantisdk/tracer/weave.py +677 -0
  161. mantisdk/trainer/__init__.py +6 -0
  162. mantisdk/trainer/init_utils.py +263 -0
  163. mantisdk/trainer/legacy.py +367 -0
  164. mantisdk/trainer/registry.py +12 -0
  165. mantisdk/trainer/trainer.py +618 -0
  166. mantisdk/types/__init__.py +6 -0
  167. mantisdk/types/core.py +553 -0
  168. mantisdk/types/resources.py +204 -0
  169. mantisdk/types/tracer.py +515 -0
  170. mantisdk/types/tracing.py +218 -0
  171. mantisdk/utils/__init__.py +1 -0
  172. mantisdk/utils/id.py +18 -0
  173. mantisdk/utils/metrics.py +1025 -0
  174. mantisdk/utils/otel.py +578 -0
  175. mantisdk/utils/otlp.py +536 -0
  176. mantisdk/utils/server_launcher.py +1045 -0
  177. mantisdk/utils/system_snapshot.py +81 -0
  178. mantisdk/verl/__init__.py +8 -0
  179. mantisdk/verl/__main__.py +6 -0
  180. mantisdk/verl/async_server.py +46 -0
  181. mantisdk/verl/config.yaml +27 -0
  182. mantisdk/verl/daemon.py +1154 -0
  183. mantisdk/verl/dataset.py +44 -0
  184. mantisdk/verl/entrypoint.py +248 -0
  185. mantisdk/verl/trainer.py +549 -0
  186. mantisdk-0.1.0.dist-info/METADATA +119 -0
  187. mantisdk-0.1.0.dist-info/RECORD +190 -0
  188. mantisdk-0.1.0.dist-info/WHEEL +4 -0
  189. mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
  190. mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py
@@ -0,0 +1,496 @@
+ # Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
+ # https://github.com/gepa-ai/gepa
+
+ from typing import Any, TypedDict
+
+ from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.evaluation_metrics import RAGEvaluationMetrics
+ from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.rag_pipeline import RAGPipeline
+ from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.vector_store_interface import VectorStoreInterface
+ from mantisdk.algorithm.gepa.lib.core.adapter import EvaluationBatch, GEPAAdapter
+
+
+ class RAGDataInst(TypedDict):
+     """
+     Data instance for RAG evaluation and optimization.
+
+     This TypedDict defines the structure for training and validation examples
+     used in RAG system optimization with GEPA.
+
+     Attributes:
+         query (str): User query or question to be answered
+         ground_truth_answer (str): Expected/correct answer for evaluation
+         relevant_doc_ids (list[str]): List of document IDs that should ideally
+             be retrieved for this query (used for retrieval evaluation)
+         metadata (dict[str, Any]): Additional context, tags, or configuration
+             specific to this example (e.g., difficulty level, category)
+
+     Example:
+         .. code-block:: python
+
+             data_inst = RAGDataInst(
+                 query="What is machine learning?",
+                 ground_truth_answer="Machine learning is a subset of AI...",
+                 relevant_doc_ids=["doc_001", "doc_042"],
+                 metadata={"category": "AI", "difficulty": "beginner"}
+             )
+     """
+
+     query: str
+     ground_truth_answer: str
+     relevant_doc_ids: list[str]
+     metadata: dict[str, Any]
+
+
+ class RAGTrajectory(TypedDict):
+     """
+     Detailed trajectory capturing all RAG pipeline execution steps.
+
+     This TypedDict captures the complete execution trace of the RAG pipeline,
+     providing visibility into each step for analysis and optimization.
+
+     Attributes:
+         original_query (str): Original user query as provided
+         reformulated_query (str): Query after the reformulation step (if enabled)
+         retrieved_docs (list[dict[str, Any]]): Documents retrieved from the vector
+             store, with their content, metadata, and similarity scores
+         synthesized_context (str): Context after the document synthesis step
+         generated_answer (str): Final answer generated by the LLM
+         execution_metadata (dict[str, Any]): Pipeline execution metadata, including
+             retrieval metrics, generation metrics, token counts, and performance data
+
+     Note:
+         Trajectories are only captured when capture_traces=True is passed to
+         the evaluate() method, as they can be memory-intensive for large batches.
+     """
+
+     original_query: str
+     reformulated_query: str
+     retrieved_docs: list[dict[str, Any]]
+     synthesized_context: str
+     generated_answer: str
+     execution_metadata: dict[str, Any]
+
+
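RAGTrajectory, unlike the other two TypedDicts, ships without an inline example. For orientation, a minimal literal built from the fields above (all values hypothetical; the doc dicts reuse the content/score keys seen in the RAGOutput example below):

.. code-block:: python

    trajectory = RAGTrajectory(
        original_query="What is machine learning?",
        reformulated_query="machine learning definition and overview",
        retrieved_docs=[{"content": "...", "score": 0.91}],
        synthesized_context="Machine learning is a subset of AI that...",
        generated_answer="Machine learning is a subset of AI...",
        execution_metadata={"total_tokens": 450},
    )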
+ class RAGOutput(TypedDict):
+     """
+     Final output from RAG system execution.
+
+     This TypedDict represents the final result of RAG pipeline execution,
+     containing both the generated answer and associated metadata.
+
+     Attributes:
+         final_answer (str): The generated answer from the RAG system
+         confidence_score (float): Estimated confidence in the answer (0.0 to 1.0)
+             based on retrieval quality and generation metrics
+         retrieved_docs (list[dict[str, Any]]): Documents that were retrieved
+             and used for answer generation
+         total_tokens (int): Estimated total token usage for the pipeline execution
+
+     Example:
+         .. code-block:: python
+
+             output = RAGOutput(
+                 final_answer="Machine learning is a method of data analysis...",
+                 confidence_score=0.87,
+                 retrieved_docs=[{"content": "...", "score": 0.9}],
+                 total_tokens=450
+             )
+     """
+
+     final_answer: str
+     confidence_score: float
+     retrieved_docs: list[dict[str, Any]]
+     total_tokens: int
+
+
+ class GenericRAGAdapter(GEPAAdapter[RAGDataInst, RAGTrajectory, RAGOutput]):
+     """
+     Generic GEPA adapter for RAG system optimization with pluggable vector stores.
+
+     This adapter enables GEPA's evolutionary prompt optimization to work with any
+     vector store implementation through the VectorStoreInterface. It provides
+     comprehensive evaluation of both retrieval and generation quality.
+
+     Optimizable Components:
+         - Query reformulation prompts: Improve query understanding and reformulation
+         - Context synthesis prompts: Optimize document combination and summarization
+         - Answer generation prompts: Enhance final answer quality and formatting
+         - Reranking criteria: Improve document relevance ordering
+
+     Evaluation Metrics:
+         - Retrieval Quality: Precision, recall, F1, mean reciprocal rank (MRR)
+         - Generation Quality: Token F1, BLEU score, faithfulness, answer relevance
+         - Combined Score: Weighted combination for overall system performance
+
+     Vector Store Support:
+         Works with any vector store implementing VectorStoreInterface, including
+         ChromaDB, Weaviate, Qdrant, Pinecone, Milvus, and custom implementations.
+
+     Example:
+         .. code-block:: python
+
+             from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter import GenericRAGAdapter, ChromaVectorStore
+             import gepa
+
+             vector_store = ChromaVectorStore.create_local("./kb", "docs")
+             adapter = GenericRAGAdapter(vector_store=vector_store, llm_model="gpt-4")
+
+             result = gepa.optimize(
+                 seed_candidate={"answer_generation": "Answer based on context:"},
+                 trainset=train_data,
+                 valset=val_data,
+                 adapter=adapter,
+                 max_metric_calls=50
+             )
+             print(result.best_candidate)  # Optimized prompts
+     """
+
+     def __init__(
+         self,
+         vector_store: VectorStoreInterface,
+         llm_model,
+         embedding_model: str = "text-embedding-3-small",
+         embedding_function=None,
+         rag_config: dict[str, Any] | None = None,
+         failure_score: float = 0.0,
+     ):
+         """
+         Initialize the GenericRAGAdapter for RAG system optimization.
+
+         Args:
+             vector_store: Vector store implementation (ChromaDB, Weaviate, etc.).
+                 Must implement VectorStoreInterface for similarity search operations.
+             llm_model: LLM client for text generation. Can be:
+                 - String model name (uses litellm for inference)
+                 - Callable that takes messages and returns response text
+                 - Any object with a callable interface for LLM inference
+             embedding_model: Model name for text embeddings (default: "text-embedding-3-small").
+                 Used when embedding_function is not provided.
+             embedding_function: Optional custom embedding function that takes text
+                 and returns list[float]. If None, uses default litellm embeddings.
+             rag_config: RAG pipeline configuration dictionary. Keys include:
+                 - "retrieval_strategy": "similarity", "hybrid", or "vector"
+                 - "top_k": Number of documents to retrieve (default: 5)
+                 - "retrieval_weight": Weight for retrieval in combined score (default: 0.3)
+                 - "generation_weight": Weight for generation in combined score (default: 0.7)
+                 - "hybrid_alpha": Semantic vs keyword balance for hybrid search (default: 0.5)
+                 - "filters": Default metadata filters for retrieval
+             failure_score: Score assigned when evaluation fails (default: 0.0)
+
+         Example:
+             .. code-block:: python
+
+                 vector_store = WeaviateVectorStore.create_local(collection_name="docs")
+                 adapter = GenericRAGAdapter(
+                     vector_store=vector_store,
+                     llm_model="gpt-4",
+                     rag_config={
+                         "retrieval_strategy": "hybrid",
+                         "top_k": 5,
+                         "hybrid_alpha": 0.7
+                     }
+                 )
+         """
+         self.vector_store = vector_store
+         self.rag_pipeline = RAGPipeline(
+             vector_store=vector_store,
+             llm_client=llm_model,
+             embedding_model=embedding_model,
+             embedding_function=embedding_function,
+         )
+         self.evaluator = RAGEvaluationMetrics()
+         self.config = rag_config or self._default_config()
+         self.failure_score = failure_score
+
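The llm_model and embedding_function contracts described above are duck-typed; a minimal sketch of the callable forms, assuming a vector_store already exists (the stub bodies and the 1536-dim vector are placeholders, not mantisdk APIs):

.. code-block:: python

    def my_llm(messages) -> str:
        # Callable form: receives chat messages, returns response text.
        return "stub answer"

    def my_embed(text: str) -> list[float]:
        # Custom embedding function: text in, vector out (replaces the litellm default).
        return [0.0] * 1536  # placeholder zero vector

    adapter = GenericRAGAdapter(
        vector_store=vector_store,
        llm_model=my_llm,
        embedding_function=my_embed,
    )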
+     def evaluate(
+         self,
+         batch: list[RAGDataInst],
+         candidate: dict[str, str],
+         capture_traces: bool = False,
+     ) -> EvaluationBatch[RAGTrajectory, RAGOutput]:
+         """
+         Evaluate RAG system performance on a batch of query-answer examples.
+
+         This method runs the complete RAG pipeline on each example in the batch,
+         evaluating both retrieval and generation quality using the provided
+         prompt components.
+
+         Args:
+             batch: List of RAG evaluation examples, each containing:
+                 - query: Question to answer
+                 - ground_truth_answer: Expected correct answer
+                 - relevant_doc_ids: Documents that should be retrieved
+                 - metadata: Additional context for evaluation
+             candidate: Dictionary mapping prompt component names to their text.
+                 Supported components:
+                 - "query_reformulation": Prompt for improving user queries
+                 - "context_synthesis": Prompt for combining retrieved documents
+                 - "answer_generation": Prompt for generating final answers
+                 - "reranking_criteria": Criteria for reordering retrieved documents
+             capture_traces: If True, capture detailed execution trajectories
+                 for each example. Required for reflective dataset generation but
+                 increases memory usage.
+
+         Returns:
+             EvaluationBatch containing:
+                 - outputs: List of RAGOutput for each example
+                 - scores: List of combined quality scores (higher = better)
+                 - trajectories: List of detailed execution traces (if capture_traces=True)
+
+         Raises:
+             Exception: Only systemic failures (e.g., vector store unavailable)
+                 propagate; individual example failures are caught and assigned
+                 failure_score instead.
+
+         Example:
+             .. code-block:: python
+
+                 prompts = {
+                     "answer_generation": "Answer the question based on this context:"
+                 }
+                 result = adapter.evaluate(
+                     batch=validation_data,
+                     candidate=prompts,
+                     capture_traces=True
+                 )
+                 avg_score = sum(result.scores) / len(result.scores)
+                 print(f"Average RAG performance: {avg_score:.3f}")
+         """
+         outputs: list[RAGOutput] = []
+         scores: list[float] = []
+         trajectories: list[RAGTrajectory] | None = [] if capture_traces else None
+
+         for data_inst in batch:
+             try:
+                 # Execute RAG pipeline with candidate prompts
+                 rag_result = self.rag_pipeline.execute_rag(
+                     query=data_inst["query"], prompts=candidate, config=self.config
+                 )
+
+                 # Evaluate retrieval quality
+                 retrieval_metrics = self.evaluator.evaluate_retrieval(
+                     rag_result["retrieved_docs"], data_inst["relevant_doc_ids"]
+                 )
+
+                 # Evaluate generation quality
+                 generation_metrics = self.evaluator.evaluate_generation(
+                     rag_result["generated_answer"], data_inst["ground_truth_answer"], rag_result["synthesized_context"]
+                 )
+
+                 # Calculate combined score
+                 overall_score = self.evaluator.combined_rag_score(
+                     retrieval_metrics,
+                     generation_metrics,
+                     retrieval_weight=self.config.get("retrieval_weight", 0.3),
+                     generation_weight=self.config.get("generation_weight", 0.7),
+                 )
+
+                 # Prepare output
+                 output = RAGOutput(
+                     final_answer=rag_result["generated_answer"],
+                     confidence_score=generation_metrics.get("answer_confidence", 0.5),
+                     retrieved_docs=rag_result["retrieved_docs"],
+                     total_tokens=rag_result["metadata"]["total_tokens"],
+                 )
+
+                 outputs.append(output)
+                 scores.append(overall_score)
+
+                 # Capture trajectory if requested
+                 if capture_traces:
+                     trajectory = RAGTrajectory(
+                         original_query=rag_result["original_query"],
+                         reformulated_query=rag_result["reformulated_query"],
+                         retrieved_docs=rag_result["retrieved_docs"],
+                         synthesized_context=rag_result["synthesized_context"],
+                         generated_answer=rag_result["generated_answer"],
+                         execution_metadata={
+                             **rag_result["metadata"],
+                             "retrieval_metrics": retrieval_metrics,
+                             "generation_metrics": generation_metrics,
+                             "overall_score": overall_score,
+                         },
+                     )
+                     trajectories.append(trajectory)
+
+             except Exception as e:
+                 # Handle individual example failure
+                 error_output = RAGOutput(
+                     final_answer=f"Error: {e!s}", confidence_score=0.0, retrieved_docs=[], total_tokens=0
+                 )
+
+                 outputs.append(error_output)
+                 scores.append(self.failure_score)
+
+                 if capture_traces:
+                     error_trajectory = RAGTrajectory(
+                         original_query=data_inst["query"],
+                         reformulated_query=data_inst["query"],
+                         retrieved_docs=[],
+                         synthesized_context="",
+                         generated_answer=f"Error: {e!s}",
+                         execution_metadata={"error": str(e)},
+                     )
+                     trajectories.append(error_trajectory)
+
+         return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)
+
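Since per-example failures are folded into scores as failure_score rather than raised, a quick post-evaluation check can flag them; a sketch reusing the prompts/validation_data names from the docstring example (scores exactly at the failure floor are only a rough proxy, since a genuinely bad example can also score 0.0):

.. code-block:: python

    result = adapter.evaluate(batch=validation_data, candidate=prompts, capture_traces=True)
    suspect = [i for i, s in enumerate(result.scores) if s == adapter.failure_score]
    print(f"{len(suspect)} of {len(result.scores)} examples at the failure floor")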
+     def make_reflective_dataset(
+         self,
+         candidate: dict[str, str],
+         eval_batch: EvaluationBatch[RAGTrajectory, RAGOutput],
+         components_to_update: list[str],
+     ) -> dict[str, list[dict[str, Any]]]:
+         """
+         Generate a reflective dataset for evolutionary prompt optimization.
+
+         This method analyzes the evaluation results and creates training examples
+         that GEPA's proposer can use to improve the specified prompt components.
+         Each component gets a tailored dataset with input-output pairs and feedback.
+
+         Args:
+             candidate: Current prompt components that were evaluated
+             eval_batch: Evaluation results from evaluate() with capture_traces=True.
+                 Must contain trajectories for analysis.
+             components_to_update: List of component names to generate improvement
+                 suggestions for. Must be a subset of candidate.keys().
+
+         Returns:
+             Dictionary mapping component names to their reflective datasets.
+             Each dataset is a list of examples with structure:
+                 - "Inputs": Input data for the component (query, docs, etc.)
+                 - "Generated Outputs": What the component currently produces
+                 - "Feedback": Analysis of performance and suggestions for improvement
+
+         Example:
+             .. code-block:: python
+
+                 reflective_data = adapter.make_reflective_dataset(
+                     candidate=current_prompts,
+                     eval_batch=evaluation_results,  # with trajectories
+                     components_to_update=["answer_generation", "context_synthesis"]
+                 )
+                 print(reflective_data["answer_generation"][0]["Feedback"])
+                 # Output: "The generated answer lacks specific details from the context..."
+
+         Note:
+             This method requires eval_batch to have been created with
+             capture_traces=True; otherwise trajectories will be None.
+         """
+         reflective_data: dict[str, list[dict[str, Any]]] = {}
+
+         for component in components_to_update:
+             component_examples = []
+
+             # Process each trajectory to create examples for this component
+             for traj, output, score in zip(
+                 eval_batch.trajectories or [], eval_batch.outputs, eval_batch.scores, strict=False
+             ):
+                 example = self._create_component_example(component, traj, output, score, candidate)
+                 if example:
+                     component_examples.append(example)
+
+             # Only include components that have examples
+             if component_examples:
+                 reflective_data[component] = component_examples
+
+         return reflective_data
+
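Taken together with evaluate(), the intended round trip looks like the following sketch (names reuse the docstring examples; per the Note above, capture_traces=True is what populates the trajectories this method iterates over):

.. code-block:: python

    eval_batch = adapter.evaluate(batch=train_data, candidate=prompts, capture_traces=True)
    reflective = adapter.make_reflective_dataset(
        candidate=prompts,
        eval_batch=eval_batch,
        components_to_update=list(prompts.keys()),
    )
    for component, examples in reflective.items():
        print(component, len(examples))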
+     def _create_component_example(
+         self, component_name: str, trajectory: RAGTrajectory, output: RAGOutput, score: float, candidate: dict[str, str]
+     ) -> dict[str, Any] | None:
+         """Create a reflective example for a specific component."""
+
+         if component_name == "query_reformulation":
+             return {
+                 "Inputs": {
+                     "original_query": trajectory["original_query"],
+                     "current_prompt": candidate.get(component_name, ""),
+                 },
+                 "Generated Outputs": trajectory["reformulated_query"],
+                 "Feedback": self._generate_query_reformulation_feedback(trajectory, score),
+             }
+
+         elif component_name == "context_synthesis":
+             return {
+                 "Inputs": {
+                     "query": trajectory["original_query"],
+                     "retrieved_docs": [doc["content"] for doc in trajectory["retrieved_docs"]],
+                     "current_prompt": candidate.get(component_name, ""),
+                 },
+                 "Generated Outputs": trajectory["synthesized_context"],
+                 "Feedback": self._generate_context_synthesis_feedback(trajectory, score),
+             }
+
+         elif component_name == "answer_generation":
+             return {
+                 "Inputs": {
+                     "query": trajectory["original_query"],
+                     "context": trajectory["synthesized_context"],
+                     "current_prompt": candidate.get(component_name, ""),
+                 },
+                 "Generated Outputs": trajectory["generated_answer"],
+                 "Feedback": self._generate_answer_generation_feedback(trajectory, output, score),
+             }
+
+         elif component_name == "reranking_criteria":
+             return {
+                 "Inputs": {
+                     "query": trajectory["original_query"],
+                     "documents": [doc["content"] for doc in trajectory["retrieved_docs"]],
+                     "current_criteria": candidate.get(component_name, ""),
+                 },
+                 "Generated Outputs": "Document ranking applied",
+                 "Feedback": self._generate_reranking_feedback(trajectory, score),
+             }
+
+         return None
+
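Note the fall-through: a component name outside the four handled branches returns None, and make_reflective_dataset then drops it silently, so a typo in components_to_update yields a missing key rather than an error. A quick check, with a hypothetical misspelling:

.. code-block:: python

    reflective = adapter.make_reflective_dataset(
        candidate=prompts,
        eval_batch=eval_batch,
        components_to_update=["answer_generaton"],  # hypothetical typo
    )
    assert "answer_generaton" not in reflective  # dropped silently, no exception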
+     def _generate_query_reformulation_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
+         """Generate feedback for the query reformulation component."""
+         if score > 0.7:
+             return f"Good query reformulation. The reformulated query '{trajectory['reformulated_query']}' helped retrieve relevant documents and generated a good answer."
+         else:
+             return f"The query reformulation from '{trajectory['original_query']}' to '{trajectory['reformulated_query']}' may not have improved retrieval. Consider making the reformulated query more specific or preserving key terms."
+
+     def _generate_context_synthesis_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
+         """Generate feedback for the context synthesis component."""
+         if score > 0.7:
+             return "Context synthesis worked well - the synthesized context effectively supported answer generation."
+         else:
+             return "Context synthesis could be improved. The synthesized context may not have highlighted the most relevant information or may have been too verbose/concise."
+
+     def _generate_answer_generation_feedback(self, trajectory: RAGTrajectory, output: RAGOutput, score: float) -> str:
+         """Generate feedback for the answer generation component."""
+         if score > 0.7:
+             return f"Good answer generation. The generated answer '{trajectory['generated_answer']}' was accurate and well-supported by the context."
+         else:
+             return f"Answer generation needs improvement. The generated answer '{trajectory['generated_answer']}' may not be fully accurate or well-supported by the provided context."
+
+     def _generate_reranking_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
+         """Generate feedback for the reranking criteria component."""
+         if score > 0.7:
+             return "Document reranking appears to have helped surface more relevant documents for answer generation."
+         else:
+             return "Document reranking may not have improved relevance. Consider adjusting the criteria to better prioritize documents that contain the answer."
+
+     def _default_config(self) -> dict[str, Any]:
+         """
+         Get the default configuration for the RAG pipeline.
+
+         Returns:
+             Dictionary with default RAG configuration parameters:
+                 - retrieval_strategy: "similarity" (semantic search)
+                 - top_k: 5 (number of documents to retrieve)
+                 - retrieval_weight: 0.3 (30% weight for retrieval metrics)
+                 - generation_weight: 0.7 (70% weight for generation metrics)
+                 - hybrid_alpha: 0.5 (balanced semantic/keyword for hybrid search)
+                 - filters: None (no metadata filtering by default)
+         """
+         return {
+             "retrieval_strategy": "similarity",
+             "top_k": 5,
+             "retrieval_weight": 0.3,
+             "generation_weight": 0.7,
+             "hybrid_alpha": 0.5,
+             "filters": None,
+         }
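One caveat on these defaults: __init__ assigns self.config = rag_config or self._default_config(), so a partial rag_config replaces the defaults wholesale rather than merging with them (only the two weights have get() fallbacks in evaluate()). A caller overriding a single key should therefore restate the full mapping; a sketch:

.. code-block:: python

    adapter = GenericRAGAdapter(
        vector_store=vector_store,
        llm_model="gpt-4",
        rag_config={
            "retrieval_strategy": "similarity",
            "top_k": 10,  # the one real override; the rest restate the documented defaults
            "retrieval_weight": 0.3,
            "generation_weight": 0.7,
            "hybrid_alpha": 0.5,
            "filters": None,
        },
    )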