mantisdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mantisdk might be problematic. Click here for more details.
- mantisdk/__init__.py +22 -0
- mantisdk/adapter/__init__.py +15 -0
- mantisdk/adapter/base.py +94 -0
- mantisdk/adapter/messages.py +270 -0
- mantisdk/adapter/triplet.py +1028 -0
- mantisdk/algorithm/__init__.py +39 -0
- mantisdk/algorithm/apo/__init__.py +5 -0
- mantisdk/algorithm/apo/apo.py +889 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant01.poml +22 -0
- mantisdk/algorithm/apo/prompts/apply_edit_variant02.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant01.poml +18 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant02.poml +16 -0
- mantisdk/algorithm/apo/prompts/text_gradient_variant03.poml +107 -0
- mantisdk/algorithm/base.py +162 -0
- mantisdk/algorithm/decorator.py +264 -0
- mantisdk/algorithm/fast.py +250 -0
- mantisdk/algorithm/gepa/__init__.py +59 -0
- mantisdk/algorithm/gepa/adapter.py +459 -0
- mantisdk/algorithm/gepa/gepa.py +364 -0
- mantisdk/algorithm/gepa/lib/__init__.py +18 -0
- mantisdk/algorithm/gepa/lib/adapters/README.md +12 -0
- mantisdk/algorithm/gepa/lib/adapters/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/README.md +341 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/__init__.py +1 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/anymaths_adapter.py +174 -0
- mantisdk/algorithm/gepa/lib/adapters/anymaths_adapter/requirements.txt +1 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/README.md +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/default_adapter/default_adapter.py +209 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/README.md +7 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_adapter/dspy_adapter.py +307 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/README.md +99 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/dspy_program_proposal_signature.py +137 -0
- mantisdk/algorithm/gepa/lib/adapters/dspy_full_program_adapter/full_program_adapter.py +266 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/GEPA_RAG.md +621 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/__init__.py +56 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/evaluation_metrics.py +226 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/generic_rag_adapter.py +496 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/rag_pipeline.py +238 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_store_interface.py +212 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/__init__.py +2 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/chroma_store.py +196 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/lancedb_store.py +422 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/milvus_store.py +409 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/qdrant_store.py +368 -0
- mantisdk/algorithm/gepa/lib/adapters/generic_rag_adapter/vector_stores/weaviate_store.py +418 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/README.md +552 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/__init__.py +37 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_adapter.py +705 -0
- mantisdk/algorithm/gepa/lib/adapters/mcp_adapter/mcp_client.py +364 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/README.md +9 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/adapters/terminal_bench_adapter/terminal_bench_adapter.py +217 -0
- mantisdk/algorithm/gepa/lib/api.py +375 -0
- mantisdk/algorithm/gepa/lib/core/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/core/adapter.py +180 -0
- mantisdk/algorithm/gepa/lib/core/data_loader.py +74 -0
- mantisdk/algorithm/gepa/lib/core/engine.py +356 -0
- mantisdk/algorithm/gepa/lib/core/result.py +233 -0
- mantisdk/algorithm/gepa/lib/core/state.py +636 -0
- mantisdk/algorithm/gepa/lib/examples/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/examples/aime.py +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/eval_default.py +111 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/instruction_prompt.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/prompt-templates/optimal_prompt.txt +24 -0
- mantisdk/algorithm/gepa/lib/examples/anymaths-bench/train_anymaths.py +177 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/arc_agi.ipynb +25705 -0
- mantisdk/algorithm/gepa/lib/examples/dspy_full_program_evolution/example.ipynb +348 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/__init__.py +4 -0
- mantisdk/algorithm/gepa/lib/examples/mcp_adapter/mcp_optimization_example.py +455 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/RAG_GUIDE.md +613 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/__init__.py +9 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/rag_optimization.py +824 -0
- mantisdk/algorithm/gepa/lib/examples/rag_adapter/requirements-rag.txt +29 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/instruction_prompt.txt +16 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/prompt-templates/terminus.txt +9 -0
- mantisdk/algorithm/gepa/lib/examples/terminal-bench/train_terminus.py +161 -0
- mantisdk/algorithm/gepa/lib/gepa_utils.py +117 -0
- mantisdk/algorithm/gepa/lib/logging/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/logging/experiment_tracker.py +187 -0
- mantisdk/algorithm/gepa/lib/logging/logger.py +75 -0
- mantisdk/algorithm/gepa/lib/logging/utils.py +103 -0
- mantisdk/algorithm/gepa/lib/proposer/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/base.py +31 -0
- mantisdk/algorithm/gepa/lib/proposer/merge.py +357 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/base.py +49 -0
- mantisdk/algorithm/gepa/lib/proposer/reflective_mutation/reflective_mutation.py +176 -0
- mantisdk/algorithm/gepa/lib/py.typed +0 -0
- mantisdk/algorithm/gepa/lib/strategies/__init__.py +0 -0
- mantisdk/algorithm/gepa/lib/strategies/batch_sampler.py +77 -0
- mantisdk/algorithm/gepa/lib/strategies/candidate_selector.py +50 -0
- mantisdk/algorithm/gepa/lib/strategies/component_selector.py +36 -0
- mantisdk/algorithm/gepa/lib/strategies/eval_policy.py +64 -0
- mantisdk/algorithm/gepa/lib/strategies/instruction_proposal.py +127 -0
- mantisdk/algorithm/gepa/lib/utils/__init__.py +10 -0
- mantisdk/algorithm/gepa/lib/utils/stop_condition.py +196 -0
- mantisdk/algorithm/gepa/tracing.py +105 -0
- mantisdk/algorithm/utils.py +177 -0
- mantisdk/algorithm/verl/__init__.py +5 -0
- mantisdk/algorithm/verl/interface.py +202 -0
- mantisdk/cli/__init__.py +56 -0
- mantisdk/cli/prometheus.py +115 -0
- mantisdk/cli/store.py +131 -0
- mantisdk/cli/vllm.py +29 -0
- mantisdk/client.py +408 -0
- mantisdk/config.py +348 -0
- mantisdk/emitter/__init__.py +43 -0
- mantisdk/emitter/annotation.py +370 -0
- mantisdk/emitter/exception.py +54 -0
- mantisdk/emitter/message.py +61 -0
- mantisdk/emitter/object.py +117 -0
- mantisdk/emitter/reward.py +320 -0
- mantisdk/env_var.py +156 -0
- mantisdk/execution/__init__.py +15 -0
- mantisdk/execution/base.py +64 -0
- mantisdk/execution/client_server.py +443 -0
- mantisdk/execution/events.py +69 -0
- mantisdk/execution/inter_process.py +16 -0
- mantisdk/execution/shared_memory.py +282 -0
- mantisdk/instrumentation/__init__.py +119 -0
- mantisdk/instrumentation/agentops.py +314 -0
- mantisdk/instrumentation/agentops_langchain.py +45 -0
- mantisdk/instrumentation/litellm.py +83 -0
- mantisdk/instrumentation/vllm.py +81 -0
- mantisdk/instrumentation/weave.py +500 -0
- mantisdk/litagent/__init__.py +11 -0
- mantisdk/litagent/decorator.py +536 -0
- mantisdk/litagent/litagent.py +252 -0
- mantisdk/llm_proxy.py +1890 -0
- mantisdk/logging.py +370 -0
- mantisdk/reward.py +7 -0
- mantisdk/runner/__init__.py +11 -0
- mantisdk/runner/agent.py +845 -0
- mantisdk/runner/base.py +182 -0
- mantisdk/runner/legacy.py +309 -0
- mantisdk/semconv.py +170 -0
- mantisdk/server.py +401 -0
- mantisdk/store/__init__.py +23 -0
- mantisdk/store/base.py +897 -0
- mantisdk/store/client_server.py +2092 -0
- mantisdk/store/collection/__init__.py +30 -0
- mantisdk/store/collection/base.py +587 -0
- mantisdk/store/collection/memory.py +970 -0
- mantisdk/store/collection/mongo.py +1412 -0
- mantisdk/store/collection_based.py +1823 -0
- mantisdk/store/insight.py +648 -0
- mantisdk/store/listener.py +58 -0
- mantisdk/store/memory.py +396 -0
- mantisdk/store/mongo.py +165 -0
- mantisdk/store/sqlite.py +3 -0
- mantisdk/store/threading.py +357 -0
- mantisdk/store/utils.py +142 -0
- mantisdk/tracer/__init__.py +16 -0
- mantisdk/tracer/agentops.py +242 -0
- mantisdk/tracer/base.py +287 -0
- mantisdk/tracer/dummy.py +106 -0
- mantisdk/tracer/otel.py +555 -0
- mantisdk/tracer/weave.py +677 -0
- mantisdk/trainer/__init__.py +6 -0
- mantisdk/trainer/init_utils.py +263 -0
- mantisdk/trainer/legacy.py +367 -0
- mantisdk/trainer/registry.py +12 -0
- mantisdk/trainer/trainer.py +618 -0
- mantisdk/types/__init__.py +6 -0
- mantisdk/types/core.py +553 -0
- mantisdk/types/resources.py +204 -0
- mantisdk/types/tracer.py +515 -0
- mantisdk/types/tracing.py +218 -0
- mantisdk/utils/__init__.py +1 -0
- mantisdk/utils/id.py +18 -0
- mantisdk/utils/metrics.py +1025 -0
- mantisdk/utils/otel.py +578 -0
- mantisdk/utils/otlp.py +536 -0
- mantisdk/utils/server_launcher.py +1045 -0
- mantisdk/utils/system_snapshot.py +81 -0
- mantisdk/verl/__init__.py +8 -0
- mantisdk/verl/__main__.py +6 -0
- mantisdk/verl/async_server.py +46 -0
- mantisdk/verl/config.yaml +27 -0
- mantisdk/verl/daemon.py +1154 -0
- mantisdk/verl/dataset.py +44 -0
- mantisdk/verl/entrypoint.py +248 -0
- mantisdk/verl/trainer.py +549 -0
- mantisdk-0.1.0.dist-info/METADATA +119 -0
- mantisdk-0.1.0.dist-info/RECORD +190 -0
- mantisdk-0.1.0.dist-info/WHEEL +4 -0
- mantisdk-0.1.0.dist-info/entry_points.txt +2 -0
- mantisdk-0.1.0.dist-info/licenses/LICENSE +19 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# Copyright (c) 2025 Lakshya A Agrawal and the GEPA contributors
|
|
2
|
+
# https://github.com/gepa-ai/gepa
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RAGEvaluationMetrics:
    """
    Evaluation metrics for RAG systems.

    Provides both retrieval and generation quality metrics
    for comprehensive RAG system evaluation. All metrics are simple,
    dependency-free lexical heuristics (set overlap of IDs, tokens and
    n-grams); no model calls are made.
    """

    # Compiled once at class creation so that every normalization call reuses them.
    _PUNCTUATION_RE = re.compile(r"[^\w\s]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def evaluate_retrieval(self, retrieved_docs: list[dict[str, Any]], relevant_doc_ids: list[str]) -> dict[str, float]:
        """
        Evaluate retrieval quality metrics.

        Args:
            retrieved_docs: List of retrieved documents, in rank order. Each
                document's ID is read from ``doc["metadata"]["doc_id"]`` and,
                if that is missing, from ``doc["metadata"]["id"]``. Documents
                without an ID are ignored.
            relevant_doc_ids: List of ground truth relevant document IDs.
                IDs are compared as strings, so ``1`` and ``"1"`` match.

        Returns:
            Dictionary with the keys ``retrieval_precision``,
            ``retrieval_recall``, ``retrieval_f1`` and ``retrieval_mrr``.
            All values are 0.0 when either input is empty.
        """
        if not retrieved_docs or not relevant_doc_ids:
            return {"retrieval_precision": 0.0, "retrieval_recall": 0.0, "retrieval_f1": 0.0, "retrieval_mrr": 0.0}

        # Keep rank order (needed for MRR) and drop documents that carry no ID.
        retrieved_ids = []
        for doc in retrieved_docs:
            doc_id = self._extract_doc_id(doc)
            if doc_id is not None:
                retrieved_ids.append(doc_id)

        # Retrieved IDs are stringified, so the ground truth must be too;
        # otherwise integer ground-truth IDs could never match.
        relevant_set = {str(doc_id) for doc_id in relevant_doc_ids}
        retrieved_set = set(retrieved_ids)
        hit_count = len(relevant_set & retrieved_set)

        # Precision and recall are computed over unique IDs.
        precision = hit_count / len(retrieved_set) if retrieved_set else 0.0
        recall = hit_count / len(relevant_set) if relevant_set else 0.0

        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        # Reciprocal rank of the first relevant document (0.0 if none was retrieved).
        mrr = next(
            (1.0 / rank for rank, retrieved_id in enumerate(retrieved_ids, start=1) if retrieved_id in relevant_set),
            0.0,
        )

        return {"retrieval_precision": precision, "retrieval_recall": recall, "retrieval_f1": f1, "retrieval_mrr": mrr}

    def _extract_doc_id(self, doc: dict[str, Any]):
        """Return the document ID from ``doc["metadata"]`` as a string, or None if absent."""
        # ``or {}`` also covers an explicit ``"metadata": None`` entry.
        metadata = doc.get("metadata") or {}
        for key in ("doc_id", "id"):
            value = metadata.get(key)
            # Only None and the empty string count as "missing"; an ID of 0 is valid.
            if value is not None and value != "":
                return str(value)
        return None

    def evaluate_generation(self, generated_answer: str, ground_truth: str, context: str) -> dict[str, float]:
        """
        Evaluate generation quality metrics.

        Args:
            generated_answer: Generated answer text
            ground_truth: Ground truth answer
            context: Retrieved context used for generation

        Returns:
            Dictionary with the keys ``exact_match``, ``token_f1``,
            ``bleu_score``, ``answer_relevance``, ``faithfulness`` and
            ``answer_confidence`` (the unweighted mean of token F1,
            relevance and faithfulness).
        """
        # Exact match (case-insensitive)
        exact_match = self._exact_match(generated_answer, ground_truth)

        # F1 score based on token overlap
        f1_score = self._token_f1(generated_answer, ground_truth)

        # BLEU-like score
        bleu_score = self._simple_bleu(generated_answer, ground_truth)

        # Answer relevance (simple keyword overlap with context)
        relevance_score = self._answer_relevance(generated_answer, context)

        # Faithfulness (how well the answer is supported by context)
        faithfulness_score = self._faithfulness_score(generated_answer, context)

        return {
            "exact_match": float(exact_match),
            "token_f1": f1_score,
            "bleu_score": bleu_score,
            "answer_relevance": relevance_score,
            "faithfulness": faithfulness_score,
            "answer_confidence": (f1_score + relevance_score + faithfulness_score) / 3.0,
        }

    def combined_rag_score(
        self,
        retrieval_metrics: dict[str, float],
        generation_metrics: dict[str, float],
        retrieval_weight: float = 0.3,
        generation_weight: float = 0.7,
    ) -> float:
        """
        Combine retrieval and generation metrics into a single score.

        Args:
            retrieval_metrics: Output from evaluate_retrieval
            generation_metrics: Output from evaluate_generation
            retrieval_weight: Weight for retrieval score
            generation_weight: Weight for generation score

        Returns:
            Weighted sum of the retrieval and generation scores. Missing
            metric keys count as 0.0. The result lies between 0 and 1 when
            the input metrics do and the two weights are non-negative and
            sum to 1 (as the defaults do); the weights are not normalized.
        """
        # Primary retrieval metric: F1 score
        retrieval_score = retrieval_metrics.get("retrieval_f1", 0.0)

        # Primary generation metric: weighted combination
        generation_score = (
            generation_metrics.get("token_f1", 0.0) * 0.4
            + generation_metrics.get("answer_relevance", 0.0) * 0.3
            + generation_metrics.get("faithfulness", 0.0) * 0.3
        )

        return retrieval_weight * retrieval_score + generation_weight * generation_score

    def _exact_match(self, prediction: str, ground_truth: str) -> bool:
        """Check if prediction exactly matches ground truth (case-insensitive, ignoring outer whitespace)."""
        return prediction.strip().lower() == ground_truth.strip().lower()

    def _token_f1(self, prediction: str, ground_truth: str) -> float:
        """
        Calculate F1 score based on token overlap.

        Tokens are compared as sets of normalized words, so repeated words
        count once. Two empty texts score 1.0; exactly one empty text scores 0.0.
        """
        pred_tokens = set(self._normalize_text(prediction).split())
        truth_tokens = set(self._normalize_text(ground_truth).split())

        if not pred_tokens and not truth_tokens:
            return 1.0
        if not pred_tokens or not truth_tokens:
            return 0.0

        shared = pred_tokens & truth_tokens
        precision = len(shared) / len(pred_tokens)
        recall = len(shared) / len(truth_tokens)

        if precision + recall == 0:
            return 0.0

        return 2 * (precision * recall) / (precision + recall)

    def _simple_bleu(self, prediction: str, ground_truth: str, n: int = 2) -> float:
        """
        Simple BLEU-like score for n-gram overlap.

        Returns the fraction of distinct prediction n-grams that also occur
        in the ground truth. Falls back to token F1 when either text has
        fewer than ``n`` words.
        """
        pred_words = self._normalize_text(prediction).split()
        truth_words = self._normalize_text(ground_truth).split()

        if len(pred_words) < n or len(truth_words) < n:
            return self._token_f1(prediction, ground_truth)

        pred_ngrams = {tuple(pred_words[i : i + n]) for i in range(len(pred_words) - n + 1)}
        truth_ngrams = {tuple(truth_words[i : i + n]) for i in range(len(truth_words) - n + 1)}

        if not pred_ngrams or not truth_ngrams:
            return 0.0

        return len(pred_ngrams & truth_ngrams) / len(pred_ngrams)

    def _answer_relevance(self, answer: str, context: str) -> float:
        """
        Measure how well the answer relates to the provided context.

        Returns the fraction of distinct answer words that appear in the
        context; 0.0 for an answer with no words.
        """
        answer_words = set(self._normalize_text(answer).split())
        context_words = set(self._normalize_text(context).split())

        if not answer_words:
            return 0.0

        return len(answer_words & context_words) / len(answer_words)

    def _faithfulness_score(self, answer: str, context: str) -> float:
        """
        Measure how well the answer is supported by the context.

        Simple implementation based on shared key phrases: the fraction of
        the answer's phrases (see ``_extract_phrases``) that also occur in
        the context.
        """
        answer_phrases = self._extract_phrases(answer)
        context_phrases = self._extract_phrases(context)

        if not answer_phrases:
            return 1.0  # Empty answer is technically faithful

        return len(answer_phrases & context_phrases) / len(answer_phrases)

    def _extract_phrases(self, text: str, min_length: int = 2) -> set[str]:
        """
        Extract meaningful phrases from text.

        The result contains every normalized word longer than 3 characters,
        plus every n-gram of ``min_length`` up to 3 words whose joined text
        is longer than 5 characters.
        """
        words = self._normalize_text(text).split()

        # Individual significant words (length > 3)
        phrases = {word for word in words if len(word) > 3}

        # Bi-grams and tri-grams (upper bound 4 is exclusive)
        for n in range(min_length, min(4, len(words) + 1)):
            for i in range(len(words) - n + 1):
                phrase = " ".join(words[i : i + n])
                if len(phrase) > 5:  # Only meaningful phrases
                    phrases.add(phrase)

        return phrases

    def _normalize_text(self, text: str) -> str:
        """
        Normalize text for comparison.

        Lowercases the text, replaces every character that is neither a word
        character nor whitespace with a space, collapses whitespace runs to a
        single space and strips the ends.
        """
        text = self._PUNCTUATION_RE.sub(" ", text.lower())
        # Strip last: replacing leading/trailing punctuation leaves stray spaces behind.
        return self._WHITESPACE_RE.sub(" ", text).strip()
|