levelapp-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of levelapp has been flagged as potentially problematic; details are available on the registry's package page.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +614 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +119 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/ionos.py +116 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +102 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +271 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +190 -0
- levelapp/config/prompts.py +35 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/session.py +214 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +265 -0
- levelapp/metrics/__init__.py +67 -0
- levelapp/metrics/embedding.py +2 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/firestore.py +282 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +89 -0
- levelapp/simulator/simulator.py +441 -0
- levelapp/simulator/utils.py +201 -0
- levelapp/workflow/__init__.py +5 -0
- levelapp/workflow/base.py +113 -0
- levelapp/workflow/factory.py +51 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/schemas.py +121 -0
- levelapp-0.1.0.dist-info/METADATA +254 -0
- levelapp-0.1.0.dist-info/RECORD +46 -0
- levelapp-0.1.0.dist-info/WHEEL +4 -0
- levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,265 @@
+"""levelapp/evaluator/evaluator.py"""
+from functools import lru_cache
+from typing import List, Dict, Any
+from collections import defaultdict
+from pydantic import BaseModel, Field
+
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+    AsyncRetrying,
+    RetryError,
+)
+
+from levelapp.clients import ClientRegistry
+from levelapp.comparator import MetricsManager, MetadataComparator
+from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
+from levelapp.core.base import BaseEvaluator, BaseChatClient
+from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader
+
+
+class Evidence(BaseModel):
+    """Evidence details for evaluation."""
+    covered_points: List[str] = Field(
+        default_factory=list,
+        description="Key points the agent reply covered (<= 3 items)"
+    )
+    missing_or_wrong: List[str] = Field(
+        default_factory=list,
+        description="Key points the agent reply missed or contradicted (<= 3 items)"
+    )
+
+
+class JudgeEvaluationResults(BaseModel):
+    """Structured result of an interaction evaluation."""
+    provider: str = Field(..., description="The provider name, e.g., 'openai', 'ionos'")
+    score: int = Field(..., ge=0, le=3, description="Evaluation score between 0 and 3")
+    label: str = Field(..., description="The label of the evaluation result")
+    justification: str = Field(..., description="Short explanation of the evaluation result")
+    evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
+    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response")
+    metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")
+
+    @classmethod
+    def from_parsed(cls, provider: str, parsed: Dict[str, Any], raw: Dict[str, Any]) -> "JudgeEvaluationResults":
+        """
+        Build a model instance from the provided data.
+
+        Args:
+            provider (str): The provider name.
+            parsed (Dict[str, Any]): The parsed response data.
+            raw (Dict[str, Any]): The raw response data.
+
+        Returns:
+            JudgeEvaluationResults: The constructed evaluation result instance.
+        """
+        content = parsed.get("output", {})
+        metadata = parsed.get("metadata", {})
+        return cls(
+            provider=provider,
+            score=content.get("score", 0),
+            label=content.get("label", "N/A"),
+            justification=content.get("justification", "N/A"),
+            evidence=Evidence(**content.get("evidence", {})),
+            raw_response=raw,
+            metadata=metadata,
+        )
+
+
+class JudgeEvaluator(BaseEvaluator):
+    def __init__(self):
+        self.prompt_template = EVAL_PROMPT_TEMPLATE
+        self.clients = defaultdict(BaseChatClient)
+
+    def register_client(self, provider: str, client: BaseChatClient) -> None:
+        """
+        Register LLM clients used for the evaluation.
+
+        Args:
+            provider (str): The provider name.
+            client (BaseChatClient): The LLM client to register.
+        """
+        self.clients[provider] = client
+
+    @lru_cache(maxsize=1024)
+    def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
+        """
+        Build the prompt used for the evaluation.
+
+        Args:
+            user_input (str): The user input.
+            generated_text (str): The generated text.
+            reference_text (str): The reference text.
+
+        Returns:
+            A string containing the prompt.
+        """
+        return self.prompt_template.format(
+            user_input=user_input,
+            generated_text=generated_text,
+            reference_text=reference_text
+        )
+
+    @retry(
+        retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=2, max=10),
+        reraise=True,
+    )
+    def evaluate(
+        self,
+        generated_data: str,
+        reference_data: str,
+        user_input: str,
+        provider: str,
+    ) -> JudgeEvaluationResults | None:
+        """
+        Synchronous evaluation of the generated data.
+
+        Args:
+            generated_data (str): The generated data.
+            reference_data (str): The reference data.
+            user_input (str): The user input.
+            provider (str): The LLM provider used for evaluation.
+
+        Returns:
+            JudgeEvaluationResults instance containing the evaluation results.
+
+        Raises:
+            Exception: If the evaluation failed.
+        """
+        prompt = self._build_prompt(
+            user_input=user_input,
+            generated_text=generated_data,
+            reference_text=reference_data
+        )
+        client = ClientRegistry.get(provider=provider)
+
+        try:
+            response = client.call(message=prompt)
+            logger.info(f"[{provider}] Evaluation: {response}\n{'---' * 10}")
+            parsed = client.parse_response(response=response)
+            return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+        except Exception as e:
+            logger.error(f"[{provider}] Evaluation failed: {e}", exc_info=True)
+            return JudgeEvaluationResults(
+                provider=provider,
+                score=0,
+                label="N/A",
+                justification="N/A",
+                evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                raw_response={},
+                metadata={}
+            )
+
+    @MonitoringAspect.monitor(name="judge_evaluation", category=MetricType.API_CALL)
+    async def async_evaluate(
+        self,
+        generated_data: str,
+        reference_data: str,
+        user_input: str,
+        provider: str,
+    ) -> JudgeEvaluationResults | None:
+        """
+        Asynchronous evaluation of the generated data.
+
+        Args:
+            generated_data (str): The generated data.
+            reference_data (str): The reference data.
+            user_input (str): The user input.
+            provider (str): The LLM provider used for evaluation.
+
+        Returns:
+            JudgeEvaluationResults instance containing the evaluation results.
+
+        Raises:
+            RetryError: If the evaluation failed.
+        """
+        prompt = self._build_prompt(
+            user_input=user_input,
+            generated_text=generated_data,
+            reference_text=reference_data
+        )
+        client = ClientRegistry.get(provider=provider)
+
+        try:
+            async for attempt in AsyncRetrying(
+                retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+                stop=stop_after_attempt(3),
+                wait=wait_exponential(multiplier=1, min=2, max=10),
+                reraise=True,
+            ):
+                with attempt:
+                    response = await client.acall(message=prompt)
+                    logger.info(f"[{provider}] Async evaluation:\n{response}\n{'---' * 10}")
+                    parsed = client.parse_response(response=response)
+                    return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+        except RetryError as e:
+            logger.error(f"[{provider}] Async evaluation failed after retries: {e}", exc_info=True)
+            return JudgeEvaluationResults(
+                provider=provider,
+                score=0,
+                label="N/A",
+                justification="N/A",
+                evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                raw_response={},
+                metadata={}
+            )
+
+
+class MetadataEvaluator(BaseEvaluator):
+    def __init__(self):
+        self.data_loader = DataLoader()
+        self.comparator = MetadataComparator()
+        self.metrics_manager = MetricsManager()
+
+    def evaluate(
+        self,
+        generated_data: str | Dict[str, Any],
+        reference_data: str | Dict[str, Any],
+        metrics_mapping: Any | None = None,
+    ) -> Dict[str, float]:
+        """
+        Synchronous evaluation of the generated data.
+
+        Args:
+            generated_data (str): The generated data.
+            reference_data (str): The reference data.
+            metrics_mapping (dict): A dictionary mapping metric names to metrics.
+
+        Returns:
+            A dict containing the evaluation results.
+        """
+        gen_data = self.data_loader.create_dynamic_model(data=generated_data, model_name="GeneratedMetadata")
+        ref_data = self.data_loader.create_dynamic_model(data=reference_data, model_name="ReferenceMetadata")
+
+        if metrics_mapping:
+            self.comparator.metrics_manager = metrics_mapping
+        else:
+            self.comparator.metrics_manager = self.metrics_manager
+        self.comparator.generated_data = gen_data
+        self.comparator.reference_data = ref_data
+
+        output = self.comparator.run(indexed_mode=False)
+        logger.info(f"Comparison results:\n{output}\n---")
+        results: Dict[str, float] = {}
+
+        for k, v in output.items():
+            field = v.get("field_name", "N/A")
+            score = v.get("set_scores", -1)
+            results[field] = int(score[0]) if isinstance(score, list) else int(score)
+
+        return results
+
+    async def async_evaluate(
+        self,
+        generated_data: str | Dict[str, Any],
+        reference_data: str | Dict[str, Any],
+        **kwargs
+    ):
+        """Not implemented yet."""
+        raise NotImplementedError()
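For orientation, here is a minimal usage sketch of the two evaluators added above. It is not taken from the package documentation: the import path follows the file listing (levelapp/evaluator/evaluator.py), the "openai" provider is assumed to have been registered with ClientRegistry beforehand (ClientRegistry lives in levelapp/clients, which is not part of this hunk, so the registration step itself is assumed rather than shown), and EVAL_PROMPT_TEMPLATE is assumed to accept the three placeholders used by _build_prompt.

import asyncio

from levelapp.evaluator.evaluator import JudgeEvaluator, MetadataEvaluator

judge = JudgeEvaluator()

# Synchronous LLM-as-judge call; the provider string is resolved via ClientRegistry.get().
result = judge.evaluate(
    generated_data="The invoice total is 42 EUR.",
    reference_data="The invoice totals 42 euros.",
    user_input="What is the invoice total?",
    provider="openai",  # assumed to be registered with ClientRegistry beforehand
)
print(result.score, result.label, result.justification)

# Asynchronous variant, wrapped in tenacity retries and the monitoring aspect.
async def main() -> None:
    async_result = await judge.async_evaluate(
        generated_data="The invoice total is 42 EUR.",
        reference_data="The invoice totals 42 euros.",
        user_input="What is the invoice total?",
        provider="openai",
    )
    print(async_result.evidence.covered_points)

asyncio.run(main())

# Field-by-field metadata comparison using the default MetricsManager.
meta = MetadataEvaluator()
scores = meta.evaluate(
    generated_data={"amount": "42 EUR", "currency": "EUR"},
    reference_data={"amount": "42 EUR", "currency": "eur"},
)
print(scores)  # e.g. {"amount": 1, "currency": 1}, depending on the configured metrics

Note that on failure both judge paths return a zeroed JudgeEvaluationResults rather than raising, so callers should treat score == 0 with an empty raw_response as an error sentinel.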
@@ -0,0 +1,67 @@
+"""levelapp/metrics/__init__.py"""
+import logging
+
+from typing import List, Dict, Type, Any
+
+from levelapp.core.base import BaseMetric
+from levelapp.metrics.exact import EXACT_METRICS
+from levelapp.metrics.fuzzy import FUZZY_METRICS
+
+logger = logging.getLogger(__name__)
+
+
+class MetricRegistry:
+    """Registry for metric classes."""
+    _metrics: Dict[str, Type[BaseMetric]] = {}
+
+    @classmethod
+    def register(cls, name: str, metric_class: Type[BaseMetric]) -> None:
+        """
+        Register a metric class under a given name.
+
+        Args:
+            name (str): Unique identifier for the metric.
+            metric_class (Type[BaseMetric]): The metric class to register.
+        """
+        if name in cls._metrics:
+            raise KeyError(f"Metric '{name}' is already registered")
+
+        cls._metrics[name] = metric_class
+        logger.info(f"Metric '{name}' registered successfully.")
+
+    @classmethod
+    def get(cls, name: str, **kwargs: Any) -> BaseMetric:
+        """
+        Retrieve an instance of a registered metric by its name.
+
+        Args:
+            name (str): The name of the metric to retrieve.
+
+        Returns:
+            BaseMetric: An instance of the metric class registered under the given name.
+
+        Raises:
+            KeyError: If the metric is not found.
+        """
+        if name not in cls._metrics:
+            raise KeyError(f"Metric '{name}' is not registered")
+
+        return cls._metrics[name](**kwargs)
+
+    @classmethod
+    def list_metrics(cls) -> List[str]:
+        return list(cls._metrics.keys())
+
+    @classmethod
+    def unregister(cls, name: str) -> None:
+        cls._metrics.pop(name, None)
+
+
+METRICS = FUZZY_METRICS | EXACT_METRICS
+
+for name_, metric_class_ in METRICS.items():
+    try:
+        MetricRegistry.register(name_, metric_class_)
+
+    except Exception as e:
+        logger.info(f"Failed to register metric {name_}: {e}")
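A short, hypothetical sketch of how the registry above is meant to be used. It assumes the package is importable as levelapp and that BaseMetric (defined in levelapp/core/base.py, which is not in this hunk) can be subclassed by implementing compute() and instantiated without extra constructor arguments.

from levelapp.core.base import BaseMetric
from levelapp.metrics import MetricRegistry

# Built-in metrics are auto-registered at import time from FUZZY_METRICS | EXACT_METRICS.
print(MetricRegistry.list_metrics())

# get() returns an instance; extra keyword arguments are forwarded to the metric constructor.
metric = MetricRegistry.get("levenshtein")
print(metric.compute(generated="hello world", reference="hello wrld")["score"])

# Custom metrics register under a unique name; duplicate names raise KeyError.
class AlwaysZero(BaseMetric):
    """Toy metric for illustration only."""

    def compute(self, generated: str, reference: str):
        return {"score": 0.0, "metadata": {}}

MetricRegistry.register("always_zero", AlwaysZero)
print(MetricRegistry.get("always_zero").compute(generated="a", reference="b"))
MetricRegistry.unregister("always_zero")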
@@ -0,0 +1,182 @@
+"""levelapp/metrics/exact.py"""
+from typing import Dict, Any
+
+from rapidfuzz import distance
+
+from levelapp.core.base import BaseMetric
+from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+class ExactMatch(BaseMetric):
+    """Binary exact match comparison (1.0 for exact match, 0.0 otherwise)"""
+
+    @MonitoringAspect.monitor(name="exact_match", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the exact match score between generated and reference strings.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the exact match score and metadata.
+        """
+        self._validate_inputs(generated=generated, reference=reference)
+
+        score = distance.Levenshtein.normalized_similarity(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=1.0
+        )
+
+        return {
+            "score": score,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class Levenshtein(BaseMetric):
+    """Levenshtein edit distance (number of insertions, deletions, substitutions)"""
+
+    @MonitoringAspect.monitor(name="levenshtein", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the Levenshtein distance score between generated and reference strings.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the Levenshtein score and metadata.
+        """
+        self._validate_inputs(generated=generated, reference=reference)
+
+        score = distance.Levenshtein.normalized_similarity(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff or 1.0
+        )
+
+        return {
+            "score": score,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class JaroWinkler(BaseMetric):
+    """Jaro-Winkler distance (similarity measure for strings)"""
+
+    @MonitoringAspect.monitor(name="jaro-winkler", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the Jaro-Winkler distance score between generated and reference strings.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the Jaro-Winkler score and metadata.
+        """
+        self._validate_inputs(generated=generated, reference=reference)
+
+        score = distance.JaroWinkler.normalized_similarity(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class Hamming(BaseMetric):
+    """Hamming distance (character substitutions only, for equal-length strings)"""
+
+    @MonitoringAspect.monitor(name="hamming", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the Hamming distance score between generated and reference strings.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the Hamming score and metadata.
+        """
+        self._validate_inputs(generated=generated, reference=reference)
+
+        score = distance.Hamming.normalized_similarity(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class PrefixMatch(BaseMetric):
+    """Prefix similarity (1.0 if generated starts with reference)"""
+
+    @MonitoringAspect.monitor(name="prefix-match", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the Prefix similarity score between generated and reference strings.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the Prefix similarity and metadata.
+        """
+        self._validate_inputs(generated=generated, reference=reference)
+
+        score = distance.Prefix.normalized_similarity(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+# Registry of all exact metrics
+EXACT_METRICS = {
+    "exact_match": ExactMatch,
+    "levenshtein": Levenshtein,
+    "jaro_winkler": JaroWinkler,
+    "hamming": Hamming,
+    "prefix_match": PrefixMatch
+}
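A hypothetical sketch of the exact-match family in use. It assumes BaseMetric supplies defaults for the processor and score_cutoff attributes read in compute(); those defaults live in levelapp/core/base.py, which is not part of this hunk, so the exact values printed below depend on that configuration.

from levelapp.metrics.exact import ExactMatch, Levenshtein, PrefixMatch

gen, ref = "Hello World", "Hello Wrld"

# ExactMatch pins score_cutoff=1.0, so anything short of a perfect normalized
# Levenshtein similarity is reported as 0.0.
print(ExactMatch().compute(generated=gen, reference=ref)["score"])

# Levenshtein falls back to the same 1.0 cutoff unless score_cutoff is configured
# on the instance, in which case near-misses keep their fractional score.
print(Levenshtein().compute(generated=gen, reference=ref)["score"])

# PrefixMatch scores the normalized length of the common prefix.
print(PrefixMatch().compute(generated="Hello World!", reference="Hello")["score"])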
@@ -0,0 +1,80 @@
+"""levelapp/metrics/fuzzy.py"""
+from rapidfuzz import fuzz
+
+from typing import Dict, Any
+
+from levelapp.core.base import BaseMetric
+from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+class FuzzyRatio(BaseMetric):
+    """A metric that computes the fuzzy ratio between two texts."""
+
+    @MonitoringAspect.monitor(name="fuzzy-ratio", category=MetricType.API_CALL, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the fuzzy ratio between the generated text and the reference text.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the fuzzy ratio score and metadata.
+        """
+        score = fuzz.ratio(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        # TODO-0: Return results as Pydantic model.
+        return {
+            "score": score / 100,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class PartialRatio(BaseMetric):
+    """
+    A metric that computes the partial fuzzy ratio between two texts.
+    This is useful for evaluating how similar two pieces of text are,
+    allowing for partial matches.
+    """
+
+    @MonitoringAspect.monitor(name="partial-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the partial fuzzy ratio between the generated text and the reference text.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the partial fuzzy ratio.
+        """
+        score = fuzz.partial_ratio(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score / 100,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+FUZZY_METRICS = {
+    "fuzzy_ratio": FuzzyRatio,
+    "partial_ratio": PartialRatio,
+}
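Both fuzzy metrics wrap rapidfuzz.fuzz, which returns scores on a 0-100 scale; the division by 100 normalizes them to the 0-1 range used elsewhere in the package. A hedged usage sketch follows, assuming the default score_cutoff configured by BaseMetric does not suppress mid-range scores.

from levelapp.metrics import MetricRegistry

gen = "the quick brown fox jumps over the lazy dog"
ref = "quick brown fox"

# Full-string ratio: penalized because the generated text is much longer than the reference.
print(MetricRegistry.get("fuzzy_ratio").compute(generated=gen, reference=ref)["score"])

# Partial ratio: matches the best-aligned substring, so it scores near 1.0 here.
print(MetricRegistry.get("partial_ratio").compute(generated=gen, reference=ref)["score"])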
@@ -0,0 +1,103 @@
+"""levelapp/metrics/token.py"""
+from rapidfuzz import fuzz
+
+from typing import Dict, Any
+
+from levelapp.core.base import BaseMetric
+from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+class WeightedRatio(BaseMetric):
+    """A metric that calculates a weighted ratio based on the other ratio algorithms"""
+
+    @MonitoringAspect.monitor(name="weighted-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the token-based metric between the generated text and the reference text.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the score and metadata.
+        """
+        score = fuzz.WRatio(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score / 100,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class TokenSetRatio(BaseMetric):
+    """
+    A metric that compares the words in the strings based
+    on unique and common words between them using fuzz.ratio.
+    """
+
+    @MonitoringAspect.monitor(name="token-set-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the token-based metric between the generated text and the reference text.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the score and metadata.
+        """
+        score = fuzz.token_set_ratio(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score / 100,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
+
+
+class TokenSortRatio(BaseMetric):
+    """A metric that sorts the words in the strings and calculates the fuzz.ratio between them."""
+
+    @MonitoringAspect.monitor(name="token-sort-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+        """
+        Compute the token-based metric between the generated text and the reference text.
+
+        Args:
+            generated (str): The text generated by the agent.
+            reference (str): The expected reference text.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the score and metadata.
+        """
+        score = fuzz.token_sort_ratio(
+            s1=generated,
+            s2=reference,
+            processor=self.processor,
+            score_cutoff=self.score_cutoff
+        )
+
+        return {
+            "score": score / 100,
+            "metadata": self._build_metadata(
+                generated_length=len(generated),
+                reference_length=len(reference)
+            )
+        }
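The token-based metrics also rescale rapidfuzz's 0-100 output to the 0-1 range. Note that, unlike the exact and fuzzy families, this module exposes no TOKEN_METRICS mapping and is not pulled into MetricRegistry by levelapp/metrics/__init__.py, so the classes have to be imported or registered directly. A hypothetical sketch, again assuming BaseMetric provides defaults for processor and score_cutoff:

from levelapp.metrics.token import TokenSortRatio, TokenSetRatio, WeightedRatio

ref = "the quick brown fox"

# Sorting tokens first makes the comparison insensitive to word order.
print(TokenSortRatio().compute(generated="fox brown quick the", reference=ref)["score"])

# The set-based variant additionally ignores duplicated tokens.
print(TokenSetRatio().compute(generated="the quick quick brown fox fox", reference=ref)["score"])

# WRatio blends several of rapidfuzz's ratio heuristics into one weighted score.
print(WeightedRatio().compute(generated="quick brown fox!", reference=ref)["score"])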