mantisdk-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mantisdk might be problematic.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py
@@ -0,0 +1,496 @@
# Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
# https://github.com/gepa-ai/gepa

from typing import Any, TypedDict

from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.evaluation_metrics import RAGEvaluationMetrics
from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.rag_pipeline import RAGPipeline
from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.vector_store_interface import VectorStoreInterface
from mantisdk.algorithm.gepa.lib.core.adapter import EvaluationBatch, GEPAAdapter


class RAGDataInst(TypedDict):
    """
    Data instance for RAG evaluation and optimization.

    This TypedDict defines the structure for training and validation examples
    used in RAG system optimization with GEPA.

    Attributes:
        query (str): User query or question to be answered
        ground_truth_answer (str): Expected/correct answer for evaluation
        relevant_doc_ids (List[str]): List of document IDs that should ideally
            be retrieved for this query (used for retrieval evaluation)
        metadata (Dict[str, Any]): Additional context, tags, or configuration
            specific to this example (e.g., difficulty level, category)

    Example:
        .. code-block:: python

            data_inst = RAGDataInst(
                query="What is machine learning?",
                ground_truth_answer="Machine learning is a subset of AI...",
                relevant_doc_ids=["doc_001", "doc_042"],
                metadata={"category": "AI", "difficulty": "beginner"}
            )
    """

    query: str
    ground_truth_answer: str
    relevant_doc_ids: list[str]
    metadata: dict[str, Any]


class RAGTrajectory(TypedDict):
    """
    Detailed trajectory capturing all RAG pipeline execution steps.

    This TypedDict captures the complete execution trace of the RAG pipeline,
    providing visibility into each step for analysis and optimization.

    Attributes:
        original_query (str): Original user query as provided
        reformulated_query (str): Query after reformulation step (if enabled)
        retrieved_docs (List[Dict[str, Any]]): Documents retrieved from vector store
            with their content, metadata, and similarity scores
        synthesized_context (str): Context after document synthesis step
        generated_answer (str): Final answer generated by the LLM
        execution_metadata (Dict[str, Any]): Pipeline execution metadata including
            retrieval metrics, generation metrics, token counts, and performance data

    Note:
        Trajectories are only captured when capture_traces=True is passed to
        the evaluate() method, as they can be memory-intensive for large batches.
    """

    original_query: str
    reformulated_query: str
    retrieved_docs: list[dict[str, Any]]
    synthesized_context: str
    generated_answer: str
    execution_metadata: dict[str, Any]


class RAGOutput(TypedDict):
    """
    Final output from RAG system execution.

    This TypedDict represents the final result of RAG pipeline execution,
    containing both the generated answer and associated metadata.

    Attributes:
        final_answer (str): The generated answer from the RAG system
        confidence_score (float): Estimated confidence in the answer (0.0 to 1.0)
            based on retrieval quality and generation metrics
        retrieved_docs (List[Dict[str, Any]]): Documents that were retrieved
            and used for answer generation
        total_tokens (int): Estimated total token usage for the pipeline execution

    Example:
        .. code-block:: python

            output = RAGOutput(
                final_answer="Machine learning is a method of data analysis...",
                confidence_score=0.87,
                retrieved_docs=[{"content": "...", "score": 0.9}],
                total_tokens=450
            )
    """

    final_answer: str
    confidence_score: float
    retrieved_docs: list[dict[str, Any]]
    total_tokens: int


class GenericRAGAdapter(GEPAAdapter[RAGDataInst, RAGTrajectory, RAGOutput]):
    """
    Generic GEPA adapter for RAG system optimization with pluggable vector stores.

    This adapter enables GEPA's evolutionary prompt optimization to work with any
    vector store implementation through the VectorStoreInterface. It provides
    comprehensive evaluation of both retrieval and generation quality.

    Optimizable Components:
        - Query reformulation prompts: Improve query understanding and reformulation
        - Context synthesis prompts: Optimize document combination and summarization
        - Answer generation prompts: Enhance final answer quality and formatting
        - Reranking criteria: Improve document relevance ordering

    Evaluation Metrics:
        - Retrieval Quality: Precision, recall, F1, mean reciprocal rank (MRR)
        - Generation Quality: Token F1, BLEU score, faithfulness, answer relevance
        - Combined Score: Weighted combination for overall system performance

    Vector Store Support:
        Works with any vector store implementing VectorStoreInterface, including:
        ChromaDB, Weaviate, Qdrant, Pinecone, Milvus, and custom implementations.

    Example:
        .. code-block:: python

            from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter import GenericRAGAdapter, ChromaVectorStore
            import gepa

            vector_store = ChromaVectorStore.create_local("./kb", "docs")
            adapter = GenericRAGAdapter(vector_store=vector_store, llm_model="gpt-4")

            result = gepa.optimize(
                seed_candidate={"answer_generation": "Answer based on context:"},
                trainset=train_data,
                valset=val_data,
                adapter=adapter,
                max_metric_calls=50
            )
            print(result.best_candidate)  # Optimized prompts
    """

    def __init__(
        self,
        vector_store: VectorStoreInterface,
        llm_model,
        embedding_model: str = "text-embedding-3-small",
        embedding_function=None,
        rag_config: dict[str, Any] | None = None,
        failure_score: float = 0.0,
    ):
        """
        Initialize the GenericRAGAdapter for RAG system optimization.

        Args:
            vector_store: Vector store implementation (ChromaDB, Weaviate, etc.)
                Must implement VectorStoreInterface for similarity search operations.
            llm_model: LLM client for text generation. Can be:
                - String model name (uses litellm for inference)
                - Callable that takes messages and returns response text
                - Any object with a callable interface for LLM inference
            embedding_model: Model name for text embeddings (default: "text-embedding-3-small").
                Used when embedding_function is not provided.
            embedding_function: Optional custom embedding function that takes text
                and returns List[float]. If None, uses default litellm embeddings.
            rag_config: RAG pipeline configuration dictionary. Keys include:
                - "retrieval_strategy": "similarity", "hybrid", or "vector"
                - "top_k": Number of documents to retrieve (default: 5)
                - "retrieval_weight": Weight for retrieval in combined score (default: 0.3)
                - "generation_weight": Weight for generation in combined score (default: 0.7)
                - "hybrid_alpha": Semantic vs keyword balance for hybrid search (default: 0.5)
                - "filters": Default metadata filters for retrieval
            failure_score: Score assigned when evaluation fails (default: 0.0)

        Example:
            .. code-block:: python

                vector_store = WeaviateVectorStore.create_local(collection_name="docs")
                adapter = GenericRAGAdapter(
                    vector_store=vector_store,
                    llm_model="gpt-4",
                    rag_config={
                        "retrieval_strategy": "hybrid",
                        "top_k": 5,
                        "hybrid_alpha": 0.7
                    }
                )
        """
        self.vector_store = vector_store
        self.rag_pipeline = RAGPipeline(
            vector_store=vector_store,
            llm_client=llm_model,
            embedding_model=embedding_model,
            embedding_function=embedding_function,
        )
        self.evaluator = RAGEvaluationMetrics()
        self.config = rag_config or self._default_config()
        self.failure_score = failure_score

    def evaluate(
        self,
        batch: list[RAGDataInst],
        candidate: dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch[RAGTrajectory, RAGOutput]:
        """
        Evaluate RAG system performance on a batch of query-answer examples.

        This method runs the complete RAG pipeline on each example in the batch,
        evaluating both retrieval and generation quality using the provided
        prompt components.

        Args:
            batch: List of RAG evaluation examples, each containing:
                - query: Question to answer
                - ground_truth_answer: Expected correct answer
                - relevant_doc_ids: Documents that should be retrieved
                - metadata: Additional context for evaluation
            candidate: Dictionary mapping prompt component names to their text.
                Supported components:
                - "query_reformulation": Prompt for improving user queries
                - "context_synthesis": Prompt for combining retrieved documents
                - "answer_generation": Prompt for generating final answers
                - "reranking_criteria": Criteria for reordering retrieved documents
            capture_traces: If True, capture detailed execution trajectories
                for each example. Required for reflective dataset generation but
                increases memory usage.

        Returns:
            EvaluationBatch containing:
            - outputs: List of RAGOutput for each example
            - scores: List of combined quality scores (higher = better)
            - trajectories: List of detailed execution traces (if capture_traces=True)

        Raises:
            Exception: Individual example failures are caught and assigned failure_score.
                Only systemic failures (e.g., vector store unavailable) raise exceptions.

        Example:
            .. code-block:: python

                prompts = {
                    "answer_generation": "Answer the question based on this context:"
                }
                result = adapter.evaluate(
                    batch=validation_data,
                    candidate=prompts,
                    capture_traces=True
                )
                avg_score = sum(result.scores) / len(result.scores)
                print(f"Average RAG performance: {avg_score:.3f}")
        """
        outputs: list[RAGOutput] = []
        scores: list[float] = []
        trajectories: list[RAGTrajectory] | None = [] if capture_traces else None

        for data_inst in batch:
            try:
                # Execute RAG pipeline with candidate prompts
                rag_result = self.rag_pipeline.execute_rag(
                    query=data_inst["query"], prompts=candidate, config=self.config
                )

                # Evaluate retrieval quality
                retrieval_metrics = self.evaluator.evaluate_retrieval(
                    rag_result["retrieved_docs"], data_inst["relevant_doc_ids"]
                )

                # Evaluate generation quality
                generation_metrics = self.evaluator.evaluate_generation(
                    rag_result["generated_answer"], data_inst["ground_truth_answer"], rag_result["synthesized_context"]
                )

                # Calculate combined score
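                # (with the default config: overall = 0.3 * retrieval + 0.7 * generation;
                # the weights are overridable via rag_config)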
                overall_score = self.evaluator.combined_rag_score(
                    retrieval_metrics,
                    generation_metrics,
                    retrieval_weight=self.config.get("retrieval_weight", 0.3),
                    generation_weight=self.config.get("generation_weight", 0.7),
                )

                # Prepare output
                output = RAGOutput(
                    final_answer=rag_result["generated_answer"],
                    confidence_score=generation_metrics.get("answer_confidence", 0.5),
                    retrieved_docs=rag_result["retrieved_docs"],
                    total_tokens=rag_result["metadata"]["total_tokens"],
                )

                outputs.append(output)
                scores.append(overall_score)

                # Capture trajectory if requested
                if capture_traces:
                    trajectory = RAGTrajectory(
                        original_query=rag_result["original_query"],
                        reformulated_query=rag_result["reformulated_query"],
                        retrieved_docs=rag_result["retrieved_docs"],
                        synthesized_context=rag_result["synthesized_context"],
                        generated_answer=rag_result["generated_answer"],
                        execution_metadata={
                            **rag_result["metadata"],
                            "retrieval_metrics": retrieval_metrics,
                            "generation_metrics": generation_metrics,
                            "overall_score": overall_score,
                        },
                    )
                    trajectories.append(trajectory)

            except Exception as e:
                # Handle individual example failure
                error_output = RAGOutput(
                    final_answer=f"Error: {e!s}", confidence_score=0.0, retrieved_docs=[], total_tokens=0
                )

                outputs.append(error_output)
                scores.append(self.failure_score)

                if capture_traces:
                    error_trajectory = RAGTrajectory(
                        original_query=data_inst["query"],
                        reformulated_query=data_inst["query"],
                        retrieved_docs=[],
                        synthesized_context="",
                        generated_answer=f"Error: {e!s}",
                        execution_metadata={"error": str(e)},
                    )
                    trajectories.append(error_trajectory)

        return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)

    def make_reflective_dataset(
        self,
        candidate: dict[str, str],
        eval_batch: EvaluationBatch[RAGTrajectory, RAGOutput],
        components_to_update: list[str],
    ) -> dict[str, list[dict[str, Any]]]:
        """
        Generate reflective dataset for evolutionary prompt optimization.

        This method analyzes the evaluation results and creates training examples
        that GEPA's proposer can use to improve the specified prompt components.
        Each component gets a tailored dataset with input-output pairs and feedback.

        Args:
            candidate: Current prompt components that were evaluated
            eval_batch: Evaluation results from evaluate() with capture_traces=True.
                Must contain trajectories for analysis.
            components_to_update: List of component names to generate improvement
                suggestions for. Must be subset of candidate.keys().

        Returns:
            Dictionary mapping component names to their reflective datasets.
            Each dataset is a list of examples with structure:
            - "Inputs": Input data for the component (query, docs, etc.)
            - "Generated Outputs": What the component currently produces
            - "Feedback": Analysis of performance and suggestions for improvement

        Example:
            .. code-block:: python

                reflective_data = adapter.make_reflective_dataset(
                    candidate=current_prompts,
                    eval_batch=evaluation_results,  # with trajectories
                    components_to_update=["answer_generation", "context_synthesis"]
                )
                print(reflective_data["answer_generation"][0]["Feedback"])
                # Output: "The generated answer lacks specific details from the context..."

        Note:
            This method requires eval_batch to have been created with
            capture_traces=True, otherwise trajectories will be None.
        """
        reflective_data: dict[str, list[dict[str, Any]]] = {}

        for component in components_to_update:
            component_examples = []

            # Process each trajectory to create examples for this component
            for traj, output, score in zip(
                eval_batch.trajectories or [], eval_batch.outputs, eval_batch.scores, strict=False
            ):
                example = self._create_component_example(component, traj, output, score, candidate)
                if example:
                    component_examples.append(example)

            # Only include components that have examples
            if component_examples:
                reflective_data[component] = component_examples

        return reflective_data

    def _create_component_example(
        self, component_name: str, trajectory: RAGTrajectory, output: RAGOutput, score: float, candidate: dict[str, str]
    ) -> dict[str, Any] | None:
        """Create a reflective example for a specific component."""

        if component_name == "query_reformulation":
            return {
                "Inputs": {
                    "original_query": trajectory["original_query"],
                    "current_prompt": candidate.get(component_name, ""),
                },
                "Generated Outputs": trajectory["reformulated_query"],
                "Feedback": self._generate_query_reformulation_feedback(trajectory, score),
            }

        elif component_name == "context_synthesis":
            return {
                "Inputs": {
                    "query": trajectory["original_query"],
                    "retrieved_docs": [doc["content"] for doc in trajectory["retrieved_docs"]],
                    "current_prompt": candidate.get(component_name, ""),
                },
                "Generated Outputs": trajectory["synthesized_context"],
                "Feedback": self._generate_context_synthesis_feedback(trajectory, score),
            }

        elif component_name == "answer_generation":
            return {
                "Inputs": {
                    "query": trajectory["original_query"],
                    "context": trajectory["synthesized_context"],
                    "current_prompt": candidate.get(component_name, ""),
                },
                "Generated Outputs": trajectory["generated_answer"],
                "Feedback": self._generate_answer_generation_feedback(trajectory, output, score),
            }

        elif component_name == "reranking_criteria":
            return {
                "Inputs": {
                    "query": trajectory["original_query"],
                    "documents": [doc["content"] for doc in trajectory["retrieved_docs"]],
                    "current_criteria": candidate.get(component_name, ""),
                },
                "Generated Outputs": "Document ranking applied",
                "Feedback": self._generate_reranking_feedback(trajectory, score),
            }

        return None

    def _generate_query_reformulation_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
        """Generate feedback for query reformulation component."""
        if score > 0.7:
            return f"Good query reformulation. The reformulated query '{trajectory['reformulated_query']}' helped retrieve relevant documents and generated a good answer."
        else:
            return f"The query reformulation from '{trajectory['original_query']}' to '{trajectory['reformulated_query']}' may not have improved retrieval. Consider making the reformulated query more specific or preserving key terms."

    def _generate_context_synthesis_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
        """Generate feedback for context synthesis component."""
        if score > 0.7:
            return "Context synthesis worked well - the synthesized context effectively supported answer generation."
        else:
            return "Context synthesis could be improved. The synthesized context may not have highlighted the most relevant information or may have been too verbose/concise."

    def _generate_answer_generation_feedback(self, trajectory: RAGTrajectory, output: RAGOutput, score: float) -> str:
        """Generate feedback for answer generation component."""
        if score > 0.7:
            return f"Good answer generation. The generated answer '{trajectory['generated_answer']}' was accurate and well-supported by the context."
        else:
            return f"Answer generation needs improvement. The generated answer '{trajectory['generated_answer']}' may not be fully accurate or well-supported by the provided context."

    def _generate_reranking_feedback(self, trajectory: RAGTrajectory, score: float) -> str:
        """Generate feedback for reranking criteria component."""
        if score > 0.7:
            return "Document reranking appears to have helped surface more relevant documents for answer generation."
        else:
            return "Document reranking may not have improved relevance. Consider adjusting the criteria to better prioritize documents that contain the answer."

    def _default_config(self) -> dict[str, Any]:
        """
        Get default configuration for RAG pipeline.

        Returns:
            Dictionary with default RAG configuration parameters:
            - retrieval_strategy: "similarity" (semantic search)
            - top_k: 5 (number of documents to retrieve)
            - retrieval_weight: 0.3 (30% weight for retrieval metrics)
            - generation_weight: 0.7 (70% weight for generation metrics)
            - hybrid_alpha: 0.5 (balanced semantic/keyword for hybrid search)
            - filters: None (no metadata filtering by default)
        """
        return {
            "retrieval_strategy": "similarity",
            "top_k": 5,
            "retrieval_weight": 0.3,
            "generation_weight": 0.7,
            "hybrid_alpha": 0.5,
            "filters": None,
        }
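A minimal sketch of driving one evaluate-then-reflect step by hand, assuming the ChromaVectorStore shown in the class docstring and a populated local "./kb" collection (store path, model name, and sample data are illustrative, not part of the package); in normal use gepa.optimize runs this loop internally:

from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter import (
    ChromaVectorStore,
    GenericRAGAdapter,
)
from mantisdk.algorithm.gepa.lib.adapters.generic_rag_adapter.generic_rag_adapter import RAGDataInst

# Illustrative local knowledge base; any VectorStoreInterface implementation works.
vector_store = ChromaVectorStore.create_local("./kb", "docs")
adapter = GenericRAGAdapter(vector_store=vector_store, llm_model="gpt-4")

batch = [
    RAGDataInst(
        query="What is machine learning?",
        ground_truth_answer="Machine learning is a subset of AI...",
        relevant_doc_ids=["doc_001"],
        metadata={},
    )
]
candidate = {"answer_generation": "Answer based only on the provided context:"}

# Score the candidate prompts; capture_traces=True is required for reflection.
result = adapter.evaluate(batch=batch, candidate=candidate, capture_traces=True)
print(sum(result.scores) / len(result.scores))

# Turn the traces into per-component feedback for GEPA's proposer.
reflective = adapter.make_reflective_dataset(
    candidate=candidate,
    eval_batch=result,
    components_to_update=["answer_generation"],
)
for example in reflective.get("answer_generation", []):
    print(example["Feedback"])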