levelapp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of levelapp has been flagged as potentially problematic.

Files changed (46)
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +614 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +119 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/ionos.py +116 -0
  10. levelapp/clients/mistral.py +106 -0
  11. levelapp/clients/openai.py +102 -0
  12. levelapp/comparator/__init__.py +5 -0
  13. levelapp/comparator/comparator.py +232 -0
  14. levelapp/comparator/extractor.py +108 -0
  15. levelapp/comparator/schemas.py +61 -0
  16. levelapp/comparator/scorer.py +271 -0
  17. levelapp/comparator/utils.py +136 -0
  18. levelapp/config/__init__.py +5 -0
  19. levelapp/config/endpoint.py +190 -0
  20. levelapp/config/prompts.py +35 -0
  21. levelapp/core/__init__.py +0 -0
  22. levelapp/core/base.py +386 -0
  23. levelapp/core/session.py +214 -0
  24. levelapp/evaluator/__init__.py +3 -0
  25. levelapp/evaluator/evaluator.py +265 -0
  26. levelapp/metrics/__init__.py +67 -0
  27. levelapp/metrics/embedding.py +2 -0
  28. levelapp/metrics/exact.py +182 -0
  29. levelapp/metrics/fuzzy.py +80 -0
  30. levelapp/metrics/token.py +103 -0
  31. levelapp/plugins/__init__.py +0 -0
  32. levelapp/repository/__init__.py +3 -0
  33. levelapp/repository/firestore.py +282 -0
  34. levelapp/simulator/__init__.py +3 -0
  35. levelapp/simulator/schemas.py +89 -0
  36. levelapp/simulator/simulator.py +441 -0
  37. levelapp/simulator/utils.py +201 -0
  38. levelapp/workflow/__init__.py +5 -0
  39. levelapp/workflow/base.py +113 -0
  40. levelapp/workflow/factory.py +51 -0
  41. levelapp/workflow/registration.py +6 -0
  42. levelapp/workflow/schemas.py +121 -0
  43. levelapp-0.1.0.dist-info/METADATA +254 -0
  44. levelapp-0.1.0.dist-info/RECORD +46 -0
  45. levelapp-0.1.0.dist-info/WHEEL +4 -0
  46. levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,265 @@
+ """levelapp/evaluator/evaluator.py"""
+ from functools import lru_cache
+ from typing import List, Dict, Any
+ from pydantic import BaseModel, Field
+
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+     AsyncRetrying,
+     RetryError,
+ )
+
+ from levelapp.clients import ClientRegistry
+ from levelapp.comparator import MetricsManager, MetadataComparator
+ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
+ from levelapp.core.base import BaseEvaluator, BaseChatClient
+ from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader
+
+
+ class Evidence(BaseModel):
+     """Evidence details for evaluation."""
+     covered_points: List[str] = Field(
+         default_factory=list,
+         description="Key points the agent reply covered (<= 3 items)"
+     )
+     missing_or_wrong: List[str] = Field(
+         default_factory=list,
+         description="Key points the agent reply missed or contradicted (<= 3 items)"
+     )
+
+
+ class JudgeEvaluationResults(BaseModel):
+     """Structured result of an interaction evaluation."""
+     provider: str = Field(..., description="The provider name, e.g., 'openai', 'ionos'")
+     score: int = Field(..., ge=0, le=3, description="Evaluation score between 0 and 3")
+     label: str = Field(..., description="The label of the evaluation result")
+     justification: str = Field(..., description="Short explanation of the evaluation result")
+     evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
+     raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response")
+     metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")
+
+     @classmethod
+     def from_parsed(cls, provider: str, parsed: Dict[str, Any], raw: Dict[str, Any]) -> "JudgeEvaluationResults":
+         """
+         Build a model instance from the provided data.
+
+         Args:
+             provider (str): The provider name.
+             parsed (Dict[str, Any]): The parsed response data.
+             raw (Dict[str, Any]): The raw response data.
+
+         Returns:
+             JudgeEvaluationResults: The constructed evaluation result instance.
+         """
+         content = parsed.get("output", {})
+         metadata = parsed.get("metadata", {})
+         return cls(
+             provider=provider,
+             score=content.get("score", 0),
+             label=content.get("label", "N/A"),
+             justification=content.get("justification", "N/A"),
+             evidence=Evidence(**content.get("evidence", {})),
+             raw_response=raw,
+             metadata=metadata,
+         )
+
+
+ class JudgeEvaluator(BaseEvaluator):
+     def __init__(self):
+         self.prompt_template = EVAL_PROMPT_TEMPLATE
+         # A plain dict: a defaultdict(BaseChatClient) would try to instantiate the abstract base on missing keys.
+         self.clients: Dict[str, BaseChatClient] = {}
+
+     def register_client(self, provider: str, client: BaseChatClient) -> None:
+         """
+         Register an LLM client used for the evaluation.
+
+         Args:
+             provider (str): The provider name.
+             client (BaseChatClient): The LLM client to register.
+         """
+         self.clients[provider] = client
+
+     @lru_cache(maxsize=1024)
+     def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
+         """
+         Build the prompt used for the evaluation.
+
+         Args:
+             user_input (str): The user input.
+             generated_text (str): The generated text.
+             reference_text (str): The reference text.
+
+         Returns:
+             A string containing the prompt.
+         """
+         return self.prompt_template.format(
+             user_input=user_input,
+             generated_text=generated_text,
+             reference_text=reference_text
+         )
+
+     @retry(
+         retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=2, max=10),
+         reraise=True,
+     )
+     def evaluate(
+         self,
+         generated_data: str,
+         reference_data: str,
+         user_input: str,
+         provider: str,
+     ) -> JudgeEvaluationResults | None:
+         """
+         Synchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str): The generated data.
+             reference_data (str): The reference data.
+             user_input (str): The user input.
+             provider (str): The LLM provider used for evaluation.
+
+         Returns:
+             JudgeEvaluationResults instance containing the evaluation results.
+
+         Raises:
+             Exception: If the evaluation failed.
+         """
+         prompt = self._build_prompt(
+             user_input=user_input,
+             generated_text=generated_data,
+             reference_text=reference_data
+         )
+         client = ClientRegistry.get(provider=provider)
+
+         try:
+             response = client.call(message=prompt)
+             logger.info(f"[{provider}] Evaluation: {response}\n{'---' * 10}")
+             parsed = client.parse_response(response=response)
+             return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+         except Exception as e:
+             logger.error(f"[{provider}] Evaluation failed: {e}", exc_info=True)
+             return JudgeEvaluationResults(
+                 provider=provider,
+                 score=0,
+                 label="N/A",
+                 justification="N/A",
+                 evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                 raw_response={},
+                 metadata={}
+             )
+
+     @MonitoringAspect.monitor(name="judge_evaluation", category=MetricType.API_CALL)
+     async def async_evaluate(
+         self,
+         generated_data: str,
+         reference_data: str,
+         user_input: str,
+         provider: str,
+     ) -> JudgeEvaluationResults | None:
+         """
+         Asynchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str): The generated data.
+             reference_data (str): The reference data.
+             user_input (str): The user input.
+             provider (str): The LLM provider used for evaluation.
+
+         Returns:
+             JudgeEvaluationResults instance containing the evaluation results.
+
+         Raises:
+             RetryError: If the evaluation failed.
+         """
+         prompt = self._build_prompt(
+             user_input=user_input,
+             generated_text=generated_data,
+             reference_text=reference_data
+         )
+         client = ClientRegistry.get(provider=provider)
+
+         try:
+             async for attempt in AsyncRetrying(
+                 retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+                 stop=stop_after_attempt(3),
+                 wait=wait_exponential(multiplier=1, min=2, max=10),
+                 reraise=True,
+             ):
+                 with attempt:
+                     response = await client.acall(message=prompt)
+                     logger.info(f"[{provider}] Async evaluation:\n{response}\n{'---' * 10}")
+                     parsed = client.parse_response(response=response)
+                     return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+         except RetryError as e:
+             logger.error(f"[{provider}] Async evaluation failed after retries: {e}", exc_info=True)
+             return JudgeEvaluationResults(
+                 provider=provider,
+                 score=0,
+                 label="N/A",
+                 justification="N/A",
+                 evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                 raw_response={},
+                 metadata={}
+             )
+
+
+ class MetadataEvaluator(BaseEvaluator):
+     def __init__(self):
+         self.data_loader = DataLoader()
+         self.comparator = MetadataComparator()
+         self.metrics_manager = MetricsManager()
+
+     def evaluate(
+         self,
+         generated_data: str | Dict[str, Any],
+         reference_data: str | Dict[str, Any],
+         metrics_mapping: Any | None = None,
+     ) -> Dict[str, float]:
+         """
+         Synchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str | Dict[str, Any]): The generated data.
+             reference_data (str | Dict[str, Any]): The reference data.
+             metrics_mapping (dict): A dictionary mapping metric names to metrics.
+
+         Returns:
+             A dict containing the evaluation results.
+         """
+         gen_data = self.data_loader.create_dynamic_model(data=generated_data, model_name="GeneratedMetadata")
+         ref_data = self.data_loader.create_dynamic_model(data=reference_data, model_name="ReferenceMetadata")
+
+         # Use the provided mapping when given; otherwise fall back to the default metrics manager.
+         self.comparator.metrics_manager = metrics_mapping if metrics_mapping else self.metrics_manager
+         self.comparator.generated_data = gen_data
+         self.comparator.reference_data = ref_data
+
+         output = self.comparator.run(indexed_mode=False)
+         logger.info(f"Comparison results:\n{output}\n---")
+         results: Dict[str, float] = {}
+
+         for k, v in output.items():
+             field = v.get("field_name", "N/A")
+             score = v.get("set_scores", -1)
+             results[field] = int(score[0]) if isinstance(score, list) else int(score)
+
+         return results
+
+     async def async_evaluate(
+         self,
+         generated_data: str | Dict[str, Any],
+         reference_data: str | Dict[str, Any],
+         **kwargs
+     ):
+         """Not implemented yet."""
+         raise NotImplementedError()
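For orientation, here is a minimal usage sketch of the two evaluators above. It assumes the chosen provider (here "openai") has already been registered with ClientRegistry and that credentials are configured; that setup, and the import path via the package's __init__ modules, are not shown in this diff and are assumptions.

# Hypothetical usage sketch; provider setup is assumed, not taken from this diff.
from levelapp.evaluator.evaluator import JudgeEvaluator, MetadataEvaluator

judge = JudgeEvaluator()
# LLM-as-judge scoring of a single agent reply against a reference answer.
result = judge.evaluate(
    generated_data="You can reset your password from the account settings page.",
    reference_data="Password resets are done under Settings > Account.",
    user_input="How do I reset my password?",
    provider="openai",  # assumes this provider is registered in ClientRegistry
)
print(result.score, result.label, result.justification)

meta = MetadataEvaluator()
# Field-by-field comparison of extracted metadata, scored with the default MetricsManager.
scores = meta.evaluate(
    generated_data={"city": "Berlin", "guests": 2},
    reference_data={"city": "Berlin", "guests": 3},
)
print(scores)  # e.g. {"city": 1, "guests": 0}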
@@ -0,0 +1,67 @@
+ """levelapp/metrics/__init__.py"""
+ import logging
+
+ from typing import List, Dict, Type, Any
+
+ from levelapp.core.base import BaseMetric
+ from levelapp.metrics.exact import EXACT_METRICS
+ from levelapp.metrics.fuzzy import FUZZY_METRICS
+
+ logger = logging.getLogger(__name__)
+
+
+ class MetricRegistry:
+     """Registry for metric classes."""
+     _metrics: Dict[str, Type[BaseMetric]] = {}
+
+     @classmethod
+     def register(cls, name: str, metric_class: Type[BaseMetric]) -> None:
+         """
+         Register a metric class under a given name.
+
+         Args:
+             name (str): Unique identifier for the metric.
+             metric_class (Type[BaseMetric]): The metric class to register.
+         """
+         if name in cls._metrics:
+             raise KeyError(f"Metric '{name}' is already registered")
+
+         cls._metrics[name] = metric_class
+         logger.info(f"Metric '{name}' registered successfully.")
+
+     @classmethod
+     def get(cls, name: str, **kwargs: Any) -> BaseMetric:
+         """
+         Retrieve an instance of a registered metric by its name.
+
+         Args:
+             name (str): The name of the metric to retrieve.
+             **kwargs: Keyword arguments forwarded to the metric constructor.
+
+         Returns:
+             BaseMetric: An instance of the metric class registered under the given name.
+
+         Raises:
+             KeyError: If the metric is not found.
+         """
+         if name not in cls._metrics:
+             raise KeyError(f"Metric '{name}' is not registered")
+
+         return cls._metrics[name](**kwargs)
+
+     @classmethod
+     def list_metrics(cls) -> List[str]:
+         return list(cls._metrics.keys())
+
+     @classmethod
+     def unregister(cls, name: str) -> None:
+         cls._metrics.pop(name, None)
+
+
+ METRICS = FUZZY_METRICS | EXACT_METRICS
+
+ for name_, metric_class_ in METRICS.items():
+     try:
+         MetricRegistry.register(name_, metric_class_)
+
+     except Exception as e:
+         logger.info(f"Failed to register metric {name_}: {e}")
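A short sketch of how the registry might be used. Metric names come from the FUZZY_METRICS and EXACT_METRICS dictionaries shown later in this diff; instantiating a metric with no arguments assumes BaseMetric supplies defaults for processor and score_cutoff, which this hunk does not show.

from levelapp.metrics import MetricRegistry

# Names registered at import time; order follows FUZZY_METRICS | EXACT_METRICS.
print(MetricRegistry.list_metrics())

# Instantiate a registered metric; extra keyword arguments are forwarded to its constructor.
metric = MetricRegistry.get("levenshtein")
result = metric.compute(generated="hello world", reference="hello wrld")
print(result["score"], result["metadata"])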
@@ -0,0 +1,2 @@
+ """levelapp/metrics/embedding.py"""
+ # TODO-0: Implement Cosine Similarity, BERTScore, and other embedding-based metrics.
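The file above is only a stub. As a rough illustration of what the TODO might look like when filled in, here is a hypothetical bag-of-words cosine similarity metric that follows the BaseMetric pattern used by the other metric modules in this diff. It is not part of the released package, and it omits the MonitoringAspect decorator and registry wiring.

# Hypothetical sketch only; not part of levelapp 0.1.0.
import math
from collections import Counter
from typing import Dict, Any

from levelapp.core.base import BaseMetric


class CosineSimilarity(BaseMetric):
    """Bag-of-words cosine similarity between the generated and reference texts."""

    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
        self._validate_inputs(generated=generated, reference=reference)

        gen_counts = Counter(generated.lower().split())
        ref_counts = Counter(reference.lower().split())

        # Dot product over shared tokens, normalized by the two vector magnitudes.
        dot = sum(gen_counts[t] * ref_counts[t] for t in gen_counts.keys() & ref_counts.keys())
        norm = math.sqrt(sum(c * c for c in gen_counts.values())) * math.sqrt(sum(c * c for c in ref_counts.values()))
        score = dot / norm if norm else 0.0

        return {
            "score": score,
            "metadata": self._build_metadata(
                generated_length=len(generated),
                reference_length=len(reference)
            )
        }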
@@ -0,0 +1,182 @@
+ """levelapp/metrics/exact.py"""
+ from typing import Dict, Any
+
+ from rapidfuzz import distance
+
+ from levelapp.core.base import BaseMetric
+ from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+ class ExactMatch(BaseMetric):
+     """Binary exact match comparison (1.0 for exact match, 0.0 otherwise)"""
+
+     @MonitoringAspect.monitor(name="exact_match", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the exact match score between generated and reference strings.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the exact match score and metadata.
+         """
+         self._validate_inputs(generated=generated, reference=reference)
+
+         score = distance.Levenshtein.normalized_similarity(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=1.0
+         )
+
+         return {
+             "score": score,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class Levenshtein(BaseMetric):
+     """Levenshtein edit distance (number of insertions, deletions, substitutions)"""
+
+     @MonitoringAspect.monitor(name="levenshtein", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the Levenshtein distance score between generated and reference strings.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the Levenshtein score and metadata.
+         """
+         self._validate_inputs(generated=generated, reference=reference)
+
+         score = distance.Levenshtein.normalized_similarity(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff or 1.0
+         )
+
+         return {
+             "score": score,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class JaroWinkler(BaseMetric):
+     """Jaro-Winkler distance (similarity measure for strings)"""
+
+     @MonitoringAspect.monitor(name="jaro-winkler", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the Jaro-Winkler distance score between generated and reference strings.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the Jaro-Winkler score and metadata.
+         """
+         self._validate_inputs(generated=generated, reference=reference)
+
+         score = distance.JaroWinkler.normalized_similarity(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class Hamming(BaseMetric):
+     """Hamming distance (character substitutions only, for equal-length strings)"""
+
+     @MonitoringAspect.monitor(name="hamming", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the Hamming distance score between generated and reference strings.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the Hamming score and metadata.
+         """
+         self._validate_inputs(generated=generated, reference=reference)
+
+         score = distance.Hamming.normalized_similarity(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class PrefixMatch(BaseMetric):
+     """Prefix similarity (1.0 if generated starts with reference)"""
+
+     @MonitoringAspect.monitor(name="prefix-match", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the Prefix similarity score between generated and reference strings.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the Prefix similarity and metadata.
+         """
+         self._validate_inputs(generated=generated, reference=reference)
+
+         score = distance.Prefix.normalized_similarity(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ # Registry of all exact metrics
+ EXACT_METRICS = {
+     "exact_match": ExactMatch,
+     "levenshtein": Levenshtein,
+     "jaro_winkler": JaroWinkler,
+     "hamming": Hamming,
+     "prefix_match": PrefixMatch
+ }
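A brief sketch of using the exact-match metrics directly rather than through MetricRegistry. Constructor arguments such as processor and score_cutoff are assumed to be handled by BaseMetric defaults, which are not shown in this hunk.

from levelapp.metrics.exact import ExactMatch, Levenshtein

exact = ExactMatch()
print(exact.compute(generated="Berlin", reference="Berlin")["score"])  # expected 1.0 for an identical pair
print(exact.compute(generated="Berlin", reference="Berlín")["score"])  # expected 0.0 once the strings differ (score_cutoff=1.0)

lev = Levenshtein()
# Normalized similarity in [0, 1]; the exact value depends on the configured score_cutoff.
print(lev.compute(generated="kitten", reference="sitting")["score"])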
@@ -0,0 +1,80 @@
+ """levelapp/metrics/fuzzy.py"""
+ from rapidfuzz import fuzz
+
+ from typing import Dict, Any
+
+ from levelapp.core.base import BaseMetric
+ from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+ class FuzzyRatio(BaseMetric):
+     """A metric that computes the fuzzy ratio between two texts."""
+
+     @MonitoringAspect.monitor(name="fuzzy-ratio", category=MetricType.API_CALL, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the fuzzy ratio between the generated text and the reference text.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the fuzzy ratio score and metadata.
+         """
+         score = fuzz.ratio(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         # TODO-0: Return results as Pydantic model.
+         return {
+             "score": score / 100,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class PartialRatio(BaseMetric):
+     """
+     A metric that computes the partial fuzzy ratio between two texts.
+     This is useful for evaluating how similar two pieces of text are,
+     allowing for partial matches.
+     """
+
+     @MonitoringAspect.monitor(name="partial-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the partial fuzzy ratio between the generated text and the reference text.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the partial fuzzy ratio.
+         """
+         score = fuzz.partial_ratio(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score / 100,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ FUZZY_METRICS = {
+     "fuzzy_ratio": FuzzyRatio,
+     "partial_ratio": PartialRatio,
+ }
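The fuzzy metrics wrap rapidfuzz.fuzz and rescale its 0-100 ratio to 0-1. A minimal sketch, assuming BaseMetric's default processor and score_cutoff settings:

from levelapp.metrics.fuzzy import FuzzyRatio, PartialRatio

ratio = FuzzyRatio().compute(
    generated="The booking was confirmed for two guests.",
    reference="Booking confirmed for 2 guests.",
)
print(ratio["score"])  # a value in [0, 1], since compute() divides the 0-100 ratio by 100

partial = PartialRatio().compute(
    generated="Sure! Your reservation is confirmed.",
    reference="reservation is confirmed",
)
print(partial["score"])  # partial_ratio rewards substring-style matches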
@@ -0,0 +1,103 @@
+ """levelapp/metrics/token.py"""
+ from rapidfuzz import fuzz
+
+ from typing import Dict, Any
+
+ from levelapp.core.base import BaseMetric
+ from levelapp.aspects.monitor import MonitoringAspect, MetricType
+
+
+ class WeightedRatio(BaseMetric):
+     """A metric that calculates a weighted ratio based on the other ratio algorithms"""
+
+     @MonitoringAspect.monitor(name="weighted-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the token-based metric between the generated text and the reference text.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the score and metadata.
+         """
+         score = fuzz.WRatio(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score / 100,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class TokenSetRatio(BaseMetric):
+     """
+     A metric that compares the words in the strings based
+     on unique and common words between them using fuzz.ratio.
+     """
+
+     @MonitoringAspect.monitor(name="token-set-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the token-based metric between the generated text and the reference text.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the score and metadata.
+         """
+         score = fuzz.token_set_ratio(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score / 100,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
+
+
+ class TokenSortRatio(BaseMetric):
+     """A metric that sorts the words in the strings and calculates the fuzz.ratio between them."""
+
+     @MonitoringAspect.monitor(name="token-sort-ratio", category=MetricType.SCORING, cached=True, enable_timing=True)
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """
+         Compute the token-based metric between the generated text and the reference text.
+
+         Args:
+             generated (str): The text generated by the agent.
+             reference (str): The expected reference text.
+
+         Returns:
+             Dict[str, Any]: A dictionary containing the score and metadata.
+         """
+         score = fuzz.token_sort_ratio(
+             s1=generated,
+             s2=reference,
+             processor=self.processor,
+             score_cutoff=self.score_cutoff
+         )
+
+         return {
+             "score": score / 100,
+             "metadata": self._build_metadata(
+                 generated_length=len(generated),
+                 reference_length=len(reference)
+             )
+         }
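Unlike the exact and fuzzy modules, token.py does not expose a module-level dictionary, so metrics/__init__.py does not auto-register these classes. A minimal sketch of registering and using one manually; the registration name here is illustrative, and default BaseMetric settings are assumed:

from levelapp.metrics import MetricRegistry
from levelapp.metrics.token import TokenSortRatio

# Manual registration under an illustrative name, since token metrics are not part of METRICS.
MetricRegistry.register("token_sort_ratio", TokenSortRatio)

metric = MetricRegistry.get("token_sort_ratio")
result = metric.compute(
    generated="confirmed booking for two guests",
    reference="booking for two guests confirmed",
)
print(result["score"])  # should score high: word order is ignored, assuming no score_cutoff filters it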