levelapp 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +617 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +122 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/gemini.py +130 -0
  10. levelapp/clients/groq.py +101 -0
  11. levelapp/clients/huggingface.py +162 -0
  12. levelapp/clients/ionos.py +126 -0
  13. levelapp/clients/mistral.py +106 -0
  14. levelapp/clients/openai.py +116 -0
  15. levelapp/comparator/__init__.py +5 -0
  16. levelapp/comparator/comparator.py +232 -0
  17. levelapp/comparator/extractor.py +108 -0
  18. levelapp/comparator/schemas.py +61 -0
  19. levelapp/comparator/scorer.py +269 -0
  20. levelapp/comparator/utils.py +136 -0
  21. levelapp/config/__init__.py +5 -0
  22. levelapp/config/endpoint.py +199 -0
  23. levelapp/config/prompts.py +57 -0
  24. levelapp/core/__init__.py +0 -0
  25. levelapp/core/base.py +386 -0
  26. levelapp/core/schemas.py +24 -0
  27. levelapp/core/session.py +336 -0
  28. levelapp/endpoint/__init__.py +0 -0
  29. levelapp/endpoint/client.py +188 -0
  30. levelapp/endpoint/client_test.py +41 -0
  31. levelapp/endpoint/manager.py +114 -0
  32. levelapp/endpoint/parsers.py +119 -0
  33. levelapp/endpoint/schemas.py +38 -0
  34. levelapp/endpoint/tester.py +52 -0
  35. levelapp/evaluator/__init__.py +3 -0
  36. levelapp/evaluator/evaluator.py +307 -0
  37. levelapp/metrics/__init__.py +63 -0
  38. levelapp/metrics/embedding.py +56 -0
  39. levelapp/metrics/embeddings/__init__.py +0 -0
  40. levelapp/metrics/embeddings/sentence_transformer.py +30 -0
  41. levelapp/metrics/embeddings/torch_based.py +56 -0
  42. levelapp/metrics/exact.py +182 -0
  43. levelapp/metrics/fuzzy.py +80 -0
  44. levelapp/metrics/token.py +103 -0
  45. levelapp/plugins/__init__.py +0 -0
  46. levelapp/repository/__init__.py +3 -0
  47. levelapp/repository/filesystem.py +203 -0
  48. levelapp/repository/firestore.py +291 -0
  49. levelapp/simulator/__init__.py +3 -0
  50. levelapp/simulator/schemas.py +116 -0
  51. levelapp/simulator/simulator.py +531 -0
  52. levelapp/simulator/utils.py +134 -0
  53. levelapp/visualization/__init__.py +7 -0
  54. levelapp/visualization/charts.py +358 -0
  55. levelapp/visualization/dashboard.py +240 -0
  56. levelapp/visualization/exporter.py +167 -0
  57. levelapp/visualization/templates/base.html +158 -0
  58. levelapp/visualization/templates/comparator_dashboard.html +57 -0
  59. levelapp/visualization/templates/simulator_dashboard.html +111 -0
  60. levelapp/workflow/__init__.py +6 -0
  61. levelapp/workflow/base.py +192 -0
  62. levelapp/workflow/config.py +96 -0
  63. levelapp/workflow/context.py +64 -0
  64. levelapp/workflow/factory.py +42 -0
  65. levelapp/workflow/registration.py +6 -0
  66. levelapp/workflow/runtime.py +19 -0
  67. levelapp-0.1.15.dist-info/METADATA +571 -0
  68. levelapp-0.1.15.dist-info/RECORD +70 -0
  69. levelapp-0.1.15.dist-info/WHEEL +4 -0
  70. levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,119 @@
+ """levelapp/endpoint/parsers.py"""
+ from typing import List, Dict, Any
+
+ from levelapp.endpoint.schemas import RequestSchemaConfig, ResponseMappingConfig
+
+
+ class RequestPayloadBuilder:
+     def build(self, schema: List[RequestSchemaConfig], context: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Builds nested JSON payloads using dot-notation paths.
+
+         Args:
+             schema (List[RequestSchemaConfig]): List of request schema configurations.
+             context (Dict[str, Any]): Context for building the payload.
+
+         Returns:
+             payload (Dict[str, Any]): Request payload.
+         """
+         payload = {}
+
+         for field_config in schema:
+             value = self._resolve_value(config=field_config, context=context)
+             if value is None and field_config.required:
+                 raise ValueError(f"Required field '{field_config.field_path}' has no value")
+
+             self._set_nested_value(obj=payload, path=field_config.field_path, value=value)
+
+         return payload
+
+     @staticmethod
+     def _resolve_value(config: RequestSchemaConfig, context: Dict[str, Any]) -> Any:
+         """
+         Resolve value based on type: static, env, or dynamic.
+
+         Args:
+             config (RequestSchemaConfig): Request schema configuration.
+             context (Dict[str, Any]): Context for building the payload.
+
+         Returns:
+             Any: Value resolved.
+         """
+         if config.value_type == "static":
+             return config.value
+         elif config.value_type == "env":
+             import os
+             return os.getenv(config.value)
+         elif config.value_type == "dynamic":
+             return context.get(config.value, None)
+
+         return config.value
+
+     @staticmethod
+     def _set_nested_value(obj: Dict, path: str, value: Any) -> None:
+         parts: List[str] = path.split(".")
+         for part in parts[:-1]:
+             obj = obj.setdefault(part, {})
+
+         obj[parts[-1]] = value
+
+
+ class ResponseDataExtractor:
+     """Extracts data from API response using mapping-based config."""
+     def extract(
+         self,
+         response_data: Dict[str, Any],
+         mappings: List[ResponseMappingConfig]
+     ) -> Dict[str, Any]:
+         """
+         Extracts data from API response using mapping-based config.
+
+         Args:
+             response_data (Dict[str, Any]): API response data.
+             mappings (List[ResponseMappingConfig]): List of response mappings.
+
+         Returns:
+             Dict[str, Any]: Extracted data.
+         """
+         result: Dict[str, Any] = {}
+
+         for mapping in mappings:
+             try:
+                 value = self._extract_by_path(obj=response_data, path=mapping.field_path, default=mapping.default)
+                 result[mapping.extract_as] = value
+
+             except Exception as e:
+                 print(f"Failed to extract '{mapping.field_path}':\n{e}")
+                 result[mapping.extract_as] = mapping.default
+
+         return result
+
+     @staticmethod
+     def _extract_by_path(obj: Dict, path: str, default: Any = "N/A") -> Any:
+         """
+         Extracts value using JSON path-like notation.
+         """
+         parts = path.split(".")
+         current = obj
+
+         for part in parts:
+             if not isinstance(current, dict):
+                 print("[extract_by_path][WARNING] the response data is not a dict.")
+                 return default
+
+             try:
+                 if '[' in part and ']' in part:
+                     key, idx = part.split('[')
+                     idx = int(idx.rstrip(']'))
+                     current = current[key][idx] if key else current[idx]
+                 else:
+                     if part not in current:
+                         print(f"[extract_by_path][WARNING] Key '{part}' is missing from response.")
+                         return default
+                     current = current.get(part)
+
+             except (KeyError, IndexError, TypeError, AttributeError) as e:
+                 print(f"[extract_by_path][ERROR] Error type <{e.__class__.__name__}> : {e.args[0]}")
+                 return default
+
+         return current
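A small usage sketch of the bracketed index syntax handled by `_extract_by_path` above; the response dict and field names are made up for illustration:

from levelapp.endpoint.parsers import ResponseDataExtractor
from levelapp.endpoint.schemas import ResponseMappingConfig

# Hypothetical chat-completion style response, to show the "key[index]" path syntax.
response = {"choices": [{"message": {"content": "hello"}}], "usage": {"total_tokens": 7}}
mappings = [
    ResponseMappingConfig(field_path="choices[0].message.content", extract_as="reply", default="N/A"),
    ResponseMappingConfig(field_path="usage.total_tokens", extract_as="tokens", default=0),
]
print(ResponseDataExtractor().extract(response_data=response, mappings=mappings))
# -> {"reply": "hello", "tokens": 7}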
@@ -0,0 +1,38 @@
+ """levelapp/endpoint/schemas.py"""
+ from enum import Enum
+ from typing import Any
+
+ from pydantic import BaseModel
+
+
+ class HttpMethod(str, Enum):
+     GET = "GET"
+     POST = "POST"
+     PUT = "PUT"
+     PATCH = "PATCH"
+     DELETE = "DELETE"
+
+
+ class HeaderConfig(BaseModel):
+     """Secure header configuration with environment variable support."""
+     name: str
+     value: str
+     secure: bool = False
+
+     class Config:
+         frozen = True
+
+
+ class RequestSchemaConfig(BaseModel):
+     """Schema definition for request payload population."""
+     field_path: str  # JSON path-like: "data.user.id"
+     value: Any
+     value_type: str = "static"
+     required: bool = True
+
+
+ class ResponseMappingConfig(BaseModel):
+     """Response data extraction mapping."""
+     field_path: str
+     extract_as: str
+     default: Any = None
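A minimal sketch of how these schemas drive payload construction via `RequestPayloadBuilder`; the field paths and values below are illustrative only:

from levelapp.endpoint.parsers import RequestPayloadBuilder
from levelapp.endpoint.schemas import RequestSchemaConfig

# "dynamic" values are looked up in the context dict; "static" values are used as-is.
schema = [
    RequestSchemaConfig(field_path="data.user.id", value="user_id", value_type="dynamic"),
    RequestSchemaConfig(field_path="data.message", value="hello", value_type="static"),
]
payload = RequestPayloadBuilder().build(schema=schema, context={"user_id": 42})
# -> {"data": {"user": {"id": 42}, "message": "hello"}}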
@@ -0,0 +1,52 @@
+ """levelapp/endpoint/tester.py"""
+ import logging
+ from typing import Dict, Any
+
+ from levelapp.endpoint.client import EndpointConfig, APIClient
+ from levelapp.endpoint.parsers import RequestPayloadBuilder, ResponseDataExtractor
+
+
+ class ConnectivityTester:
+     """Tests REST endpoint connectivity with configurable behavior."""
+     def __init__(self, config: EndpointConfig):
+         self.config = config
+         self.client = APIClient(config=config)
+         self.payload_builder = RequestPayloadBuilder()
+         self.response_extractor = ResponseDataExtractor()
+         self.logger = logging.getLogger(f"ConnectivityTester.{self.config.name}")
+
+     async def test(self, context: Dict[str, Any] = None) -> Dict[str, Any]:
+         """Execute connectivity test (template method)."""
+         context = context or {}
+
+         self.logger.info(f"Starting connectivity test for '{self.config.name}'")
+
+         try:
+             payload = None
+             if self.config.request_schema:
+                 payload = self.payload_builder.build(schema=self.config.request_schema, context=context)
+                 self.logger.debug(f"Request payload: {payload}")
+
+             response = await self.client.execute(payload=payload)
+             self.logger.debug(f"Response status: {response.status_code}")
+
+             response_data = response.json() if response.text else {}
+             extracted = self.response_extractor.extract(
+                 response_data=response_data,
+                 mappings=self.config.response_mapping,
+             )
+
+             return {
+                 "success": True,
+                 "status_code": response.status_code,
+                 "extracted_data": extracted,
+                 "raw_response": response,
+             }
+
+         except Exception as e:
+             self.logger.error(f"Connectivity test failed: {e}", exc_info=e)
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "error_type": type(e).__name__,
+             }
@@ -0,0 +1,3 @@
+ from .evaluator import JudgeEvaluator, MetadataEvaluator
+
+ __all__ = ['JudgeEvaluator', 'MetadataEvaluator']
@@ -0,0 +1,307 @@
+ """levelapp/evaluator/evaluator.py"""
+ from functools import lru_cache
+ from typing import List, Dict, Any, TYPE_CHECKING
+ from pydantic import BaseModel, Field
+
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+     AsyncRetrying,
+     RetryError,
+ )
+
+ from levelapp.clients import ClientRegistry
+ from levelapp.comparator import MetricsManager, MetadataComparator
+ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
+ from levelapp.core.base import BaseEvaluator, BaseChatClient
+ from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader
+
+ if TYPE_CHECKING:
+     from levelapp.workflow.config import WorkflowConfig
+
+
+ class Evidence(BaseModel):
+     """Evidence details for evaluation."""
+     covered_points: List[str] = Field(
+         default_factory=list,
+         description="Key points the agent reply covered (<= 3 items)"
+     )
+     missing_or_wrong: List[str] = Field(
+         default_factory=list,
+         description="Key points the agent reply missed or contradicted (<= 3 items)"
+     )
+
+
+ class JudgeEvaluationResults(BaseModel):
+     """Structured result of an interaction evaluation."""
+     provider: str = Field(..., description="The provider name, e.g., 'openai', 'ionos'")
+     score: int = Field(..., ge=0, le=3, description="Evaluation score between 0 and 3")
+     label: str = Field(..., description="The label of the evaluation result")
+     justification: str = Field(..., description="Short explanation of the evaluation result")
+     evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
+     raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response", exclude=True)
+     metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")
+
+     @classmethod
+     def from_parsed(cls, provider: str, parsed: Dict[str, Any], raw: Dict[str, Any]) -> "JudgeEvaluationResults":
+         """
+         Build a model instance from the provided data.
+
+         Args:
+             provider (str): The provider name.
+             parsed (Dict[str, Any]): The parsed response data.
+             raw (Dict[str, Any]): The raw response data.
+
+         Returns:
+             JudgeEvaluationResults: The constructed evaluation result instance.
+         """
+         content = parsed.get("output", {})
+         metadata = parsed.get("metadata", {})
+         return cls(
+             provider=provider,
+             score=content.get("score", 0),
+             label=content.get("label", "N/A"),
+             justification=content.get("justification", "N/A"),
+             evidence=Evidence(**content.get("evidence", {})),
+             raw_response=raw,
+             metadata=metadata,
+         )
+
+
+ class JudgeEvaluator(BaseEvaluator):
+     """LLM-as-a-judge evaluator class"""
+     def __init__(self, config: "WorkflowConfig | None" = None):
+         """
+         Initialize the JudgeEvaluator.
+
+         Args:
+             config (WorkflowConfig | None): The configuration of the workflow.
+         """
+         if config:
+             self.config = config
+             self.providers = config.evaluation.providers
+
+         self.prompt_template = EVAL_PROMPT_TEMPLATE
+         self.client_registry = ClientRegistry
+
+     def select_client(self, provider: str) -> BaseChatClient:
+         """
+         Select an LLM client to use for the evaluation.
+
+         Args:
+             provider (str): The provider name.
+
+         Returns:
+             client (BaseChatClient): The LLM client to use for the evaluation.
+         """
+         if provider not in self.client_registry.list_providers():
+             logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
+             return self.client_registry.get(provider="openai")
+
+         return self.client_registry.get(provider=provider)
+
+     @lru_cache(maxsize=1024)
+     def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
+         """
+         Build the prompt used for the evaluation.
+
+         Args:
+             user_input (str): The user input.
+             generated_text (str): The generated text.
+             reference_text (str): The reference text.
+
+         Returns:
+             A string containing the prompt.
+         """
+         return self.prompt_template.format(
+             user_input=user_input,
+             generated_text=generated_text,
+             reference_text=reference_text
+         )
+
+     @retry(
+         retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=2, max=10),
+         reraise=True,
+     )
+     def evaluate(
+         self,
+         generated_data: str,
+         reference_data: str,
+         user_input: str,
+         provider: str,
+     ) -> JudgeEvaluationResults | None:
+         """
+         Synchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str): The generated data.
+             reference_data (str): The reference data.
+             user_input (str): The user input.
+             provider (str): The LLM provider used for evaluation.
+
+         Returns:
+             JudgeEvaluationResults instance containing the evaluation results.
+
+         Raises:
+             Exception: If the evaluation failed.
+         """
+         prompt = self._build_prompt(
+             user_input=user_input,
+             generated_text=generated_data,
+             reference_text=reference_data
+         )
+         client = self.select_client(provider=provider)
+
+         try:
+             response = client.call(message=prompt)
+             logger.info(f"[{provider}] Evaluation: {response}\n{'---' * 10}")
+             parsed = client.parse_response(response=response)
+             return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+         except Exception as e:
+             logger.error(f"[{provider}] Evaluation failed: {e}", exc_info=True)
+             return JudgeEvaluationResults(
+                 provider=provider,
+                 score=0,
+                 label="N/A",
+                 justification="N/A",
+                 evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                 raw_response={},
+                 metadata={}
+             )
+
+     @MonitoringAspect.monitor(name="judge_evaluation", category=MetricType.API_CALL)
+     async def async_evaluate(
+         self,
+         generated_data: str,
+         reference_data: str,
+         user_input: str,
+         provider: str,
+     ) -> JudgeEvaluationResults | None:
+         """
+         Asynchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str): The generated data.
+             reference_data (str): The reference data.
+             user_input (str): The user input.
+             provider (str): The LLM provider used for evaluation.
+
+         Returns:
+             JudgeEvaluationResults instance containing the evaluation results.
+
+         Raises:
+             RetryError: If the evaluation failed.
+         """
+         prompt = self._build_prompt(
+             user_input=user_input,
+             generated_text=generated_data,
+             reference_text=reference_data
+         )
+         client = self.select_client(provider=provider)
+
+         try:
+             async for attempt in AsyncRetrying(
+                 retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
+                 stop=stop_after_attempt(3),
+                 wait=wait_exponential(multiplier=1, min=2, max=10),
+                 reraise=True,
+             ):
+                 with attempt:
+                     response = await client.acall(message=prompt)
+                     parsed = client.parse_response(response=response)
+                     return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)
+
+         except RetryError as e:
+             logger.error(f"[{provider}] Async evaluation failed after retries: {e}", exc_info=True)
+             return JudgeEvaluationResults(
+                 provider=provider,
+                 score=0,
+                 label="N/A",
+                 justification="N/A",
+                 evidence=Evidence(covered_points=[], missing_or_wrong=[]),
+                 raw_response={},
+                 metadata={}
+             )
+
+
+ class MetadataEvaluator(BaseEvaluator):
+     """Metadata evaluator class."""
+     def __init__(self, config: "WorkflowConfig | None" = None):
+         """
+         Initialize the MetadataEvaluator.
+
+         Args:
+             config (WorkflowConfig | None): The workflow configuration.
+         """
+         if config:
+             self.config = config
+             self.metrics_map = config.evaluation.metrics_map
+
+         self.data_loader = DataLoader()
+         self.comparator = MetadataComparator()
+         self.metrics_manager = MetricsManager()
+
+     def evaluate(
+         self,
+         generated_data: str | Dict[str, Any],
+         reference_data: str | Dict[str, Any],
+         metrics_mapping: Any | None = None,
+     ) -> Dict[str, float]:
+         """
+         Synchronous evaluation of the generated data.
+
+         Args:
+             generated_data (str | Dict[str, Any]): The generated data.
+             reference_data (str | Dict[str, Any]): The reference data.
+             metrics_mapping (dict): A dictionary mapping metric names to metrics.
+
+         Returns:
+             A dict containing the evaluation results.
+         """
+         gen_data = self.data_loader.create_dynamic_model(data=generated_data, model_name="GeneratedMetadata")
+         ref_data = self.data_loader.create_dynamic_model(data=reference_data, model_name="ReferenceMetadata")
+
+         if metrics_mapping:
+             self.comparator.metrics_manager = metrics_mapping
+         else:
+             logger.info(f"[MetadataEvaluator] Metric map: {self.metrics_map}")
+             self.comparator.metrics_manager = self.metrics_map
+
+         self.comparator.metrics_manager = self.metrics_manager
+         self.comparator.generated_data = gen_data
+         self.comparator.reference_data = ref_data
+
+         output = self.comparator.run(indexed_mode=False)
+         results: Dict[str, float] = {}
+         logger.info(f"[MetadataEvaluator] Metadata Evaluation Output:\n{output}")
+
+         for k, v in output.items():
+             field = v.get("field_name", "N/A")
+             score = v.get("set_scores", -1)
+
+             if score is None:
+                 results[field] = -1
+                 continue
+
+             try:
+                 val = score[0] if isinstance(score, list) else score
+                 results[field] = float(val)
+
+             except (TypeError, ValueError):
+                 results[field] = -1
+
+         return results
+
+     async def async_evaluate(
+         self,
+         generated_data: str | Dict[str, Any],
+         reference_data: str | Dict[str, Any],
+         **kwargs
+     ):
+         """Not implemented yet."""
+         raise NotImplementedError()
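A minimal usage sketch of the judge evaluator above, assuming an 'openai' client is registered in ClientRegistry and its credentials are available at runtime; the inputs are illustrative only:

from levelapp.evaluator import JudgeEvaluator

# Assumes the 'openai' provider is registered and configured in the environment.
judge = JudgeEvaluator()
result = judge.evaluate(
    generated_data="The invoice total is 42 EUR.",
    reference_data="The total amount due is 42 EUR.",
    user_input="What is the invoice total?",
    provider="openai",
)
print(result.score, result.label, result.justification)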
@@ -0,0 +1,63 @@
+ """levelapp/metrics/__init__.py"""
+ from typing import List, Dict, Type, Any
+
+ from levelapp.aspects import logger
+ from levelapp.core.base import BaseMetric
+ from levelapp.metrics.exact import EXACT_METRICS
+ from levelapp.metrics.fuzzy import FUZZY_METRICS
+
+
+ class MetricRegistry:
+     """Registry for metric classes."""
+     _metrics: Dict[str, Type[BaseMetric]] = {}
+
+     @classmethod
+     def register(cls, name: str, metric_class: Type[BaseMetric]) -> None:
+         """
+         Register a metric class under a given name.
+
+         Args:
+             name (str): Unique identifier for the metric.
+             metric_class (Type[BaseMetric]): The metric class to register.
+         """
+         if name in cls._metrics:
+             raise KeyError(f"Metric '{name}' is already registered")
+
+         cls._metrics[name] = metric_class
+
+     @classmethod
+     def get(cls, name: str, **kwargs: Any) -> BaseMetric:
+         """
+         Retrieve an instance of a registered metric by its name.
+
+         Args:
+             name (str): The name of the metric to retrieve.
+
+         Returns:
+             BaseMetric: An instance of the metric class registered under the given name.
+
+         Raises:
+             KeyError: If the metric is not found.
+         """
+         if name not in cls._metrics:
+             raise KeyError(f"Metric '{name}' is not registered")
+
+         return cls._metrics[name](**kwargs)
+
+     @classmethod
+     def list_metrics(cls) -> List[str]:
+         return list(cls._metrics.keys())
+
+     @classmethod
+     def unregister(cls, name: str) -> None:
+         cls._metrics.pop(name, None)
+
+
+ METRICS = FUZZY_METRICS | EXACT_METRICS
+
+ for name_, metric_class_ in METRICS.items():
+     try:
+         MetricRegistry.register(name_, metric_class_)
+
+     except Exception as e:
+         logger.info(f"Failed to register metric {name_}: {e}")
@@ -0,0 +1,56 @@
+ """levelapp/metrics/embedding.py"""
+ from __future__ import annotations
+
+ import importlib
+
+ from importlib import util
+ from typing import Any, Dict
+
+ from levelapp.core.base import BaseMetric
+
+
+ class EmbeddingMetric(BaseMetric):
+     """
+     Abstract embeddings metric that dynamically delegates to a backend implementation (Torch or scikit-learn).
+     """
+     def __init__(self, backend: str | None = None, **kwargs: Any):
+         """
+         Initialize the embeddings metric.
+
+         Args:
+             backend (str, optional): Embeddings backend, either 'torch' or 'scikit'. Defaults to None (auto-detect).
+         """
+         super().__init__(processor=kwargs.get("processor"), score_cutoff=kwargs.get("score_cutoff"))
+         self.backend_name = backend or self._detect_backend()
+         self.backend = self._load_backend(self.backend_name)(**kwargs)
+
+     @staticmethod
+     def _detect_backend() -> str:
+         """Auto-detect which embeddings backend to use."""
+         if util.find_spec("torch") and util.find_spec("transformers"):
+             return "torch"
+
+         elif util.find_spec("sklearn"):
+             return "scikit"
+
+         raise ImportError(
+             "No embeddings backend available. Install with 'pip install levelapp[embeddings]' "
+             "for Torch support, or ensure scikit-learn is installed."
+         )
+
+     @staticmethod
+     def _load_backend(backend: str):
+         if backend == "torch":
+             module = importlib.import_module("levelapp.metrics.embeddings.torch_based")
+             return getattr(module, "TorchEmbeddingMetric")
+
+         elif backend == "scikit":
+             module = importlib.import_module("levelapp.metrics.embeddings.sentence_transformer")
+             return getattr(module, "SentenceEmbeddingMetric")
+
+         else:
+             raise ValueError(f"Unknown embeddings backend: {backend}")
+
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         """Delegate to the selected backend implementation."""
+         return self.backend.compute(generated, reference)
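A brief sketch of the backend delegation above; the import path follows the file-list entry levelapp/metrics/embedding.py, and which backend is usable depends on the optional dependencies installed:

from levelapp.metrics.embedding import EmbeddingMetric

# With no backend argument, _detect_backend() picks 'torch' when torch and
# transformers are importable, otherwise 'scikit'. Forcing the lightweight backend:
metric = EmbeddingMetric(backend="scikit")
result = metric.compute("The total is 42 EUR.", "The amount due is 42 EUR.")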
File without changes
@@ -0,0 +1,30 @@
+ """levelapp/metrics/embeddings/sentence_transformer.py"""
+ import numpy as np
+
+ from typing import Any, Dict
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from levelapp.core.base import BaseMetric
+
+
+ class SentenceEmbeddingMetric(BaseMetric):
+     """Lightweight embeddings similarity using TF-IDF cosine similarity."""
+     def __init__(self, **kwargs):
+         super().__init__(processor=kwargs.get("processor"), score_cutoff=kwargs.get("score_cutoff"))
+         self.vectorizer = TfidfVectorizer()
+
+     def compute(self, generated: str, reference: str) -> Dict[str, Any]:
+         self._validate_inputs(generated=generated, reference=reference)
+
+         corpus = [reference, generated]
+         tfidf_matrix = self.vectorizer.fit_transform(corpus)
+         similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
+         # clamping for numerical stability
+         similarity = float(np.clip(similarity, 0.0, 1.0))
+
+         return {
+             "similarity": similarity,
+             "metadata": self._build_metadata(backend="scikit", vectorizer="TF-IDF"),
+         }
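For reference, the scikit backend above can also be used directly; the returned score is the TF-IDF cosine similarity clipped to [0.0, 1.0], and the example strings are made up:

from levelapp.metrics.embeddings.sentence_transformer import SentenceEmbeddingMetric

metric = SentenceEmbeddingMetric()
out = metric.compute(generated="The total is 42 EUR.", reference="The amount due is 42 EUR.")
print(out["similarity"], out["metadata"])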