levelapp 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +617 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +122 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/gemini.py +130 -0
- levelapp/clients/groq.py +101 -0
- levelapp/clients/huggingface.py +162 -0
- levelapp/clients/ionos.py +126 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +116 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +269 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +199 -0
- levelapp/config/prompts.py +57 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +336 -0
- levelapp/endpoint/__init__.py +0 -0
- levelapp/endpoint/client.py +188 -0
- levelapp/endpoint/client_test.py +41 -0
- levelapp/endpoint/manager.py +114 -0
- levelapp/endpoint/parsers.py +119 -0
- levelapp/endpoint/schemas.py +38 -0
- levelapp/endpoint/tester.py +52 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +307 -0
- levelapp/metrics/__init__.py +63 -0
- levelapp/metrics/embedding.py +56 -0
- levelapp/metrics/embeddings/__init__.py +0 -0
- levelapp/metrics/embeddings/sentence_transformer.py +30 -0
- levelapp/metrics/embeddings/torch_based.py +56 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/filesystem.py +203 -0
- levelapp/repository/firestore.py +291 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +116 -0
- levelapp/simulator/simulator.py +531 -0
- levelapp/simulator/utils.py +134 -0
- levelapp/visualization/__init__.py +7 -0
- levelapp/visualization/charts.py +358 -0
- levelapp/visualization/dashboard.py +240 -0
- levelapp/visualization/exporter.py +167 -0
- levelapp/visualization/templates/base.html +158 -0
- levelapp/visualization/templates/comparator_dashboard.html +57 -0
- levelapp/visualization/templates/simulator_dashboard.html +111 -0
- levelapp/workflow/__init__.py +6 -0
- levelapp/workflow/base.py +192 -0
- levelapp/workflow/config.py +96 -0
- levelapp/workflow/context.py +64 -0
- levelapp/workflow/factory.py +42 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/runtime.py +19 -0
- levelapp-0.1.15.dist-info/METADATA +571 -0
- levelapp-0.1.15.dist-info/RECORD +70 -0
- levelapp-0.1.15.dist-info/WHEEL +4 -0
- levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,119 @@
"""levelapp/endpoint/parsers.py"""
from typing import List, Dict, Any

from levelapp.endpoint.schemas import RequestSchemaConfig, ResponseMappingConfig


class RequestPayloadBuilder:
    def build(self, schema: List[RequestSchemaConfig], context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Builds nested JSON payloads using dot-notation paths.

        Args:
            schema (List[RequestSchemaConfig]): List of request schema configurations.
            context (Dict[str, Any]): Context for building the payload.

        Returns:
            payload (Dict[str, Any]): Request payload.
        """
        payload = {}

        for field_config in schema:
            value = self._resolve_value(config=field_config, context=context)
            if value is None and field_config.required:
                raise ValueError(f"Required field '{field_config.field_path}' has no value")

            self._set_nested_value(obj=payload, path=field_config.field_path, value=value)

        return payload

    @staticmethod
    def _resolve_value(config: RequestSchemaConfig, context: Dict[str, Any]) -> Any:
        """
        Resolve value based on type: static, env, or dynamic.

        Args:
            config (RequestSchemaConfig): Request schema configuration.
            context (Dict[str, Any]): Context for building the payload.

        Returns:
            Any: Value resolved.
        """
        if config.value_type == "static":
            return config.value
        elif config.value_type == "env":
            import os
            return os.getenv(config.value)
        elif config.value_type == "dynamic":
            return context.get(config.value, None)

        return config.value

    @staticmethod
    def _set_nested_value(obj: Dict, path: str, value: Any) -> None:
        parts: List[str] = path.split(".")
        for part in parts[:-1]:
            obj = obj.setdefault(part, {})

        obj[parts[-1]] = value


class ResponseDataExtractor:
    """Extracts data from API response using mapping-based config."""
    def extract(
            self,
            response_data: Dict[str, Any],
            mappings: List[ResponseMappingConfig]
    ) -> Dict[str, Any]:
        """
        Extracts data from API response using mapping-based config.

        Args:
            response_data (Dict[str, Any]): API response data.
            mappings (List[ResponseMappingConfig]): List of response mappings.

        Returns:
            Dict[str, Any]: Extracted data.
        """
        result: Dict[str, Any] = {}

        for mapping in mappings:
            try:
                value = self._extract_by_path(obj=response_data, path=mapping.field_path, default=mapping.default)
                result[mapping.extract_as] = value

            except Exception as e:
                print(f"Failed to extract '{mapping.field_path}':\n{e}")
                result[mapping.extract_as] = mapping.default

        return result

    @staticmethod
    def _extract_by_path(obj: Dict, path: str, default: Any = "N/A") -> Any:
        """
        Extracts value using JSON path-like notation.
        """
        parts = path.split(".")
        current = obj

        for part in parts:
            if not isinstance(current, dict):
                print("[extract_by_path][WARNING] the response data is not a dict.")
                return default

            try:
                if '[' in part and ']' in part:
                    key, idx = part.split('[')
                    idx = int(idx.rstrip(']'))
                    current = current[key][idx] if key else current[idx]
                else:
                    if part not in current:
                        print(f"[extract_by_path][WARNING] Key '{part}' is missing from response.")
                        return default
                    current = current.get(part)

            except (KeyError, IndexError, TypeError, AttributeError) as e:
                print(f"[extract_by_path][ERROR] Error type <{e.__class__.__name__}> : {e.args[0]}")
                return default

        return current
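Illustrative usage of ResponseDataExtractor (a minimal sketch, not part of the package): it relies on the ResponseMappingConfig model from levelapp/endpoint/schemas.py shown in the next hunk, and the response payload and field paths below are invented for the example.

from levelapp.endpoint.parsers import ResponseDataExtractor
from levelapp.endpoint.schemas import ResponseMappingConfig

# Hypothetical API response; the field names are illustrative only.
response_data = {
    "choices": [{"message": {"content": "Hello!"}}],
    "usage": {"total_tokens": 42},
}

mappings = [
    ResponseMappingConfig(field_path="choices[0].message.content", extract_as="reply", default=""),
    ResponseMappingConfig(field_path="usage.total_tokens", extract_as="tokens", default=0),
    ResponseMappingConfig(field_path="usage.cost", extract_as="cost", default=None),  # missing key falls back to default
]

extracted = ResponseDataExtractor().extract(response_data=response_data, mappings=mappings)
# extracted == {"reply": "Hello!", "tokens": 42, "cost": None}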
@@ -0,0 +1,38 @@
"""levelapp/endpoint/schemas.py"""
from enum import Enum
from typing import Any

from pydantic import BaseModel


class HttpMethod(str, Enum):
    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    PATCH = "PATCH"
    DELETE = "DELETE"


class HeaderConfig(BaseModel):
    """Secure header configuration with environment variables support."""
    name: str
    value: str
    secure: bool = False

    class Config:
        frozen = True


class RequestSchemaConfig(BaseModel):
    """Schema Definition for request payload population."""
    field_path: str  # JSON path-like: "data.user.id"
    value: Any
    value_type: str = "static"
    required: bool = True


class ResponseMappingConfig(BaseModel):
    """Response data extraction mapping."""
    field_path: str
    extract_as: str
    default: Any = None
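Illustrative use of RequestPayloadBuilder with the three value_type modes accepted by RequestSchemaConfig ("static", "env", "dynamic"). This is a sketch, not part of the package; the field paths, model identifier, and environment variable name are invented.

import os

from levelapp.endpoint.parsers import RequestPayloadBuilder
from levelapp.endpoint.schemas import RequestSchemaConfig

os.environ["API_TOKEN"] = "secret"  # illustrative environment variable

schema = [
    # "static": the configured value is used verbatim.
    RequestSchemaConfig(field_path="model", value="some-model-id", value_type="static"),
    # "env": the configured value names an environment variable to read.
    RequestSchemaConfig(field_path="auth.token", value="API_TOKEN", value_type="env"),
    # "dynamic": the configured value is a key looked up in the runtime context.
    RequestSchemaConfig(field_path="message.user.content", value="user_input", value_type="dynamic"),
]

payload = RequestPayloadBuilder().build(schema=schema, context={"user_input": "ping"})
# payload == {
#     "model": "some-model-id",
#     "auth": {"token": "secret"},
#     "message": {"user": {"content": "ping"}},
# }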
@@ -0,0 +1,52 @@
"""levelapp/endpoint/tester.py"""
import logging
from typing import Dict, Any

from levelapp.endpoint.client import EndpointConfig, APIClient
from levelapp.endpoint.parsers import RequestPayloadBuilder, ResponseDataExtractor


class ConnectivityTester:
    """Tests REST endpoint connectivity with configurable behavior."""
    def __init__(self, config: EndpointConfig):
        self.config = config
        self.client = APIClient(config=config)
        self.payload_builder = RequestPayloadBuilder()
        self.response_extractor = ResponseDataExtractor()
        self.logger = logging.getLogger(f"ConnectivityTester.{self.config.name}")

    async def test(self, context: Dict[str, Any] = None) -> Dict[str, Any]:
        """Execute connectivity test (template method)."""
        context = context or {}

        self.logger.info(f"Starting connectivity test for '{self.config.name}'")

        try:
            payload = None
            if self.config.request_schema:
                payload = self.payload_builder.build(schema=self.config.request_schema, context=context)
                self.logger.debug(f"Request payload: {payload}")

            response = await self.client.execute(payload=payload)
            self.logger.debug(f"Response status: {response.status_code}")

            response_data = response.json() if response.text else {}
            extracted = self.response_extractor.extract(
                response_data=response_data,
                mappings=self.config.response_mapping,
            )

            return {
                "success": True,
                "status_code": response.status_code,
                "extracted_data": extracted,
                "raw_response": response,
            }

        except Exception as e:
            self.logger.error(f"Connectivity test failed: {e}", exc_info=e)
            return {
                "success": False,
                "error": str(e),
                "error_type": type(e).__name__,
            }
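A minimal async usage sketch for ConnectivityTester (not part of the package): it assumes an EndpointConfig instance built elsewhere, since levelapp/endpoint/client.py (EndpointConfig, APIClient) is not shown in this excerpt of the diff.

import asyncio

from levelapp.endpoint.client import EndpointConfig
from levelapp.endpoint.tester import ConnectivityTester


async def check_endpoint(config: EndpointConfig) -> None:
    # The tester wires APIClient, RequestPayloadBuilder and ResponseDataExtractor
    # around the given endpoint configuration.
    tester = ConnectivityTester(config=config)
    report = await tester.test(context={"user_input": "ping"})  # context keys feed "dynamic" schema fields

    if report["success"]:
        print(report["status_code"], report["extracted_data"])
    else:
        print(report["error_type"], report["error"])

# asyncio.run(check_endpoint(my_endpoint_config))  # my_endpoint_config constructed per client.py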
@@ -0,0 +1,307 @@
"""levelapp/core/evaluator.py"""
from functools import lru_cache
from typing import List, Dict, Any, TYPE_CHECKING
from pydantic import BaseModel, Field

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
    AsyncRetrying,
    RetryError,
)

from levelapp.clients import ClientRegistry
from levelapp.comparator import MetricsManager, MetadataComparator
from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
from levelapp.core.base import BaseEvaluator, BaseChatClient
from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader

if TYPE_CHECKING:
    from levelapp.workflow.config import WorkflowConfig


class Evidence(BaseModel):
    """Evidence details for evaluation."""
    covered_points: List[str] = Field(
        default_factory=list,
        description="Key points the agent reply covered (<= 3 items)"
    )
    missing_or_wrong: List[str] = Field(
        default_factory=list,
        description="Key points the agent reply missed or contradicted (<= 3 items)"
    )


class JudgeEvaluationResults(BaseModel):
    """Structured result of an interaction evaluation."""
    provider: str = Field(..., description="The provider name, e.g., 'openai', 'ionos'")
    score: int = Field(..., ge=0, le=3, description="Evaluation score between 0 and 3")
    label: str = Field(..., description="The label of the evaluation result")
    justification: str = Field(..., description="Short explanation of the evaluation result")
    evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response", exclude=True)
    metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")

    @classmethod
    def from_parsed(cls, provider: str, parsed: Dict[str, Any], raw: Dict[str, Any]) -> "JudgeEvaluationResults":
        """
        Build a model instance from the provided data.

        Args:
            provider (str): The provider name.
            parsed (Dict[str, Any]): The parsed response data.
            raw (Dict[str, Any]): The raw response data.

        Returns:
            JudgeEvaluationResults: The constructed evaluation result instance.
        """
        content = parsed.get("output", {})
        metadata = parsed.get("metadata", {})
        return cls(
            provider=provider,
            score=content.get("score", 0),
            label=content.get("label", "N/A"),
            justification=content.get("justification", "N/A"),
            evidence=Evidence(**content.get("evidence", {})),
            raw_response=raw,
            metadata=metadata,
        )


class JudgeEvaluator(BaseEvaluator):
    """LLM-as-a-judge evaluator class"""
    def __init__(self, config: "WorkflowConfig | None" = None):
        """
        Initialize the JudgeEvaluator.

        Args:
            config (WorkflowConfig | None): The configuration of the workflow.
        """
        if config:
            self.config = config
            self.providers = config.evaluation.providers

        self.prompt_template = EVAL_PROMPT_TEMPLATE
        self.client_registry = ClientRegistry

    def select_client(self, provider: str) -> BaseChatClient:
        """
        Select an LLM client to use for the evaluation.

        Args:
            provider (str): The provider name.

        Returns:
            client (BaseChatClient): The LLM client to use for the evaluation.
        """
        if provider not in self.client_registry.list_providers():
            logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
            return self.client_registry.get(provider="openai")

        return self.client_registry.get(provider=provider)

    @lru_cache(maxsize=1024)
    def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
        """
        Build the prompt used for the evaluation.

        Args:
            user_input (str): The user input.
            generated_text (str): The generated text.
            reference_text (str): The reference text.

        Returns:
            A string containing the prompt.
        """
        return self.prompt_template.format(
            user_input=user_input,
            generated_text=generated_text,
            reference_text=reference_text
        )

    @retry(
        retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        reraise=True,
    )
    def evaluate(
        self,
        generated_data: str,
        reference_data: str,
        user_input: str,
        provider: str,
    ) -> JudgeEvaluationResults | None:
        """
        Synchronous evaluation of the generated data.

        Args:
            generated_data (str): The generated data.
            reference_data (str): The reference data.
            user_input (str): The user input.
            provider (str): The LLM provider used for evaluation.

        Returns:
            JudgeEvaluationResults instance containing the evaluation results.

        Raises:
            Exception: If the evaluation failed.
        """
        prompt = self._build_prompt(
            user_input=user_input,
            generated_text=generated_data,
            reference_text=reference_data
        )
        client = self.select_client(provider=provider)

        try:
            response = client.call(message=prompt)
            logger.info(f"[{provider}] Evaluation: {response}\n{'---' * 10}")
            parsed = client.parse_response(response=response)
            return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)

        except Exception as e:
            logger.error(f"[{provider}] Evaluation failed: {e}", exc_info=True)
            return JudgeEvaluationResults(
                provider=provider,
                score=0,
                label="N/A",
                justification="N/A",
                evidence=Evidence(covered_points=[], missing_or_wrong=[]),
                raw_response={},
                metadata={}
            )

    @MonitoringAspect.monitor(name="judge_evaluation", category=MetricType.API_CALL)
    async def async_evaluate(
        self,
        generated_data: str,
        reference_data: str,
        user_input: str,
        provider: str,
    ) -> JudgeEvaluationResults | None:
        """
        Asynchronous evaluation of the generated data.

        Args:
            generated_data (str): The generated data.
            reference_data (str): The reference data.
            user_input (str): The user input.
            provider (str): The LLM provider used for evaluation.

        Returns:
            JudgeEvaluationResults instance containing the evaluation results.

        Raises:
            RetryError: If the evaluation failed.
        """
        prompt = self._build_prompt(
            user_input=user_input,
            generated_text=generated_data,
            reference_text=reference_data
        )
        client = self.select_client(provider=provider)

        try:
            async for attempt in AsyncRetrying(
                retry=retry_if_exception_type((TimeoutError, ValueError, RuntimeError)),
                stop=stop_after_attempt(3),
                wait=wait_exponential(multiplier=1, min=2, max=10),
                reraise=True,
            ):
                with attempt:
                    response = await client.acall(message=prompt)
                    parsed = client.parse_response(response=response)
                    return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)

        except RetryError as e:
            logger.error(f"[{provider}] Async evaluation failed after retries: {e}", exc_info=True)
            return JudgeEvaluationResults(
                provider=provider,
                score=0,
                label="N/A",
                justification="N/A",
                evidence=Evidence(covered_points=[], missing_or_wrong=[]),
                raw_response={},
                metadata={}
            )


class MetadataEvaluator(BaseEvaluator):
    """Metadata evaluator class."""
    def __init__(self, config: "WorkflowConfig | None" = None):
        """
        Initialize the MetadataEvaluator.

        Args:
            config (WorkflowConfig | None): The workflow configuration.
        """
        if config:
            self.config = config
            self.metrics_map = config.evaluation.metrics_map

        self.data_loader = DataLoader()
        self.comparator = MetadataComparator()
        self.metrics_manager = MetricsManager()

    def evaluate(
        self,
        generated_data: str | Dict[str, Any],
        reference_data: str | Dict[str, Any],
        metrics_mapping: Any | None = None,
    ) -> Dict[str, float]:
        """
        Synchronous evaluation of the generated data.

        Args:
            generated_data (str): The generated data.
            reference_data (str): The reference data.
            metrics_mapping (dict): A dictionary mapping metric names to metrics.

        Returns:
            A dict containing the evaluation results.
        """
        gen_data = self.data_loader.create_dynamic_model(data=generated_data, model_name="GeneratedMetadata")
        ref_data = self.data_loader.create_dynamic_model(data=reference_data, model_name="ReferenceMetadata")

        if metrics_mapping:
            self.comparator.metrics_manager = metrics_mapping
        else:
            logger.info(f"[MetadataEvaluator] Metric map: {self.metrics_map}")
            self.comparator.metrics_manager = self.metrics_map

        self.comparator.metrics_manager = self.metrics_manager
        self.comparator.generated_data = gen_data
        self.comparator.reference_data = ref_data

        output = self.comparator.run(indexed_mode=False)
        results: Dict[str, float] = {}
        logger.info(f"[MetadataEvaluator] Metadata Evaluation Output:\n{output}]")

        for k, v in output.items():
            field = v.get("field_name", "N/A")
            score = v.get("set_scores", -1)

            if score is None:
                results[field] = -1
                continue

            try:
                val = score[0] if isinstance(score, list) else score
                results[field] = float(val)

            except (TypeError, ValueError):
                results[field] = -1

        return results

    async def async_evaluate(
        self,
        generated_data: str | Dict[str, Any],
        reference_data: str | Dict[str, Any],
        **kwargs
    ):
        """Not implemented yet."""
        raise NotImplementedError()
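A hedged usage sketch for JudgeEvaluator (not part of the package): the import path follows the package file listing (levelapp/evaluator/evaluator.py), while the in-file docstring says levelapp/core/evaluator.py; it also assumes a WorkflowConfig with evaluation providers and a registered, credentialed LLM client.

from levelapp.evaluator.evaluator import JudgeEvaluator

evaluator = JudgeEvaluator(config=workflow_config)  # workflow_config: a WorkflowConfig built elsewhere

result = evaluator.evaluate(
    generated_data="The invoice total is 120 EUR.",
    reference_data="Total amount due: 120 EUR.",
    user_input="What is the invoice total?",
    provider="openai",  # unregistered providers fall back to "openai"
)

print(result.score, result.label, result.justification)
# On provider or parsing failure, a zero-score placeholder result is returned instead of raising.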
@@ -0,0 +1,63 @@
"""levelapp/metrics/__init__.py"""
from typing import List, Dict, Type, Any

from levelapp.aspects import logger
from levelapp.core.base import BaseMetric
from levelapp.metrics.exact import EXACT_METRICS
from levelapp.metrics.fuzzy import FUZZY_METRICS


class MetricRegistry:
    """Registry for metric classes."""
    _metrics: Dict[str, Type[BaseMetric]] = {}

    @classmethod
    def register(cls, name: str, metric_class: Type[BaseMetric]) -> None:
        """
        Register a metric class under a given name.

        Args:
            name (str): Unique identifier for the metric.
            metric_class (Type[BaseMetric]): The metric class to register.
        """
        if name in cls._metrics:
            raise KeyError(f"Metric '{name}' is already registered")

        cls._metrics[name] = metric_class

    @classmethod
    def get(cls, name: str, **kwargs: Any) -> BaseMetric:
        """
        Retrieve an instance of a registered metric by its name.

        Args:
            name (str): The name of the metric to retrieve.
            **kwargs: Keyword arguments forwarded to the metric constructor.

        Returns:
            BaseMetric: An instance of the metric class associated with the given name.

        Raises:
            KeyError: If the metric is not found.
        """
        if name not in cls._metrics:
            raise KeyError(f"Metric '{name}' is not registered")

        return cls._metrics[name](**kwargs)

    @classmethod
    def list_metrics(cls) -> List[str]:
        return list(cls._metrics.keys())

    @classmethod
    def unregister(cls, name: str) -> None:
        cls._metrics.pop(name, None)


METRICS = FUZZY_METRICS | EXACT_METRICS

for name_, metric_class_ in METRICS.items():
    try:
        MetricRegistry.register(name_, metric_class_)

    except Exception as e:
        logger.info(f"Failed to register metric {name_}: {e}")
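Registering a custom metric with MetricRegistry (a sketch, not part of the package): the BaseMetric constructor keywords mirror those used by the built-in metrics elsewhere in the package and are an assumption here; the metric itself is a toy example.

from typing import Any, Dict

from levelapp.core.base import BaseMetric
from levelapp.metrics import MetricRegistry


class LengthRatioMetric(BaseMetric):
    """Toy metric: length ratio of generated vs. reference text (illustrative only)."""

    def __init__(self, **kwargs: Any):
        # Assumes BaseMetric accepts these keyword arguments, as the built-in metrics do.
        super().__init__(processor=kwargs.get("processor"), score_cutoff=kwargs.get("score_cutoff"))

    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
        ratio = len(generated) / max(len(reference), 1)
        return {"similarity": min(ratio, 1.0)}


MetricRegistry.register("length_ratio", LengthRatioMetric)  # raises KeyError if the name is taken

metric = MetricRegistry.get("length_ratio")
print(metric.compute(generated="hello", reference="hello world"))
print(MetricRegistry.list_metrics())  # built-in fuzzy/exact metrics plus "length_ratio"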
@@ -0,0 +1,56 @@
"""levelapp/metrics/embeddings.py"""
from __future__ import annotations

import importlib

from importlib import util
from typing import Any, Dict

from levelapp.core.base import BaseMetric


class EmbeddingMetric(BaseMetric):
    """
    Abstract embeddings metric that dynamically delegates to a backend implementation (Torch or Scikit).
    """
    def __init__(self, backend: str | None = None, **kwargs: Any):
        """
        Initialize the embeddings metric.

        Args:
            backend (str, optional): Embedding metric backend 'torch' or 'scikit'. Defaults to None.
        """
        super().__init__(processor=kwargs.get("processor"), score_cutoff=kwargs.get("score_cutoff"))
        self.backend_name = backend or self._detect_backend()
        self.backend = self._load_backend(self.backend_name)(**kwargs)

    @staticmethod
    def _detect_backend() -> str:
        """Auto-detect which embeddings backend to use."""
        if util.find_spec("torch") and util.find_spec("transformers"):
            return "torch"

        elif util.find_spec("sklearn"):
            return "scikit"

        raise ImportError(
            "No embeddings backend available. Install with 'pip install levelapp[embeddings]' "
            "for Torch support, or ensure scikit-learn is installed."
        )

    @staticmethod
    def _load_backend(backend: str):
        if backend == "torch":
            module = importlib.import_module("levelapp.metrics.embeddings.torch_based")
            return getattr(module, "TorchEmbeddingMetric")

        elif backend == "scikit":
            module = importlib.import_module("levelapp.metrics.embeddings.sentence_transformer")
            return getattr(module, "SentenceEmbeddingMetric")

        else:
            raise ValueError(f"Unknown embeddings backend: {backend}")

    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
        """Delegate to selected backend implementation."""
        return self.backend.compute(generated, reference)
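Usage sketch for EmbeddingMetric (not part of the package): the module path follows the package file listing (levelapp/metrics/embedding.py) even though the in-file docstring says embeddings.py; the input texts are invented, and the result keys depend on the selected backend.

from levelapp.metrics.embedding import EmbeddingMetric

# Backend resolution: "torch" if torch and transformers are importable,
# otherwise "scikit" when scikit-learn is available; ImportError if neither.
metric = EmbeddingMetric()            # or EmbeddingMetric(backend="scikit")
print(metric.backend_name)

result = metric.compute(
    generated="The parcel arrives on Monday.",
    reference="Delivery is scheduled for Monday.",
)
print(result)  # backend-specific dict, e.g. {"similarity": ..., "metadata": {...}}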
File without changes: levelapp/metrics/embeddings/__init__.py
@@ -0,0 +1,30 @@
"""levelapp/metrics/embeddings/sentence_transformer.py"""
import numpy as np

from typing import Any, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from levelapp.core.base import BaseMetric


class SentenceEmbeddingMetric(BaseMetric):
    """Lightweight embeddings similarity using TF-IDF cosine similarity."""
    def __init__(self, **kwargs):
        super().__init__(processor=kwargs.get("processor"), score_cutoff=kwargs.get("score_cutoff"))
        self.vectorizer = TfidfVectorizer()

    def compute(self, generated: str, reference: str) -> Dict[str, Any]:
        self._validate_inputs(generated=generated, reference=reference)

        corpus = [reference, generated]
        tfidf_matrix = self.vectorizer.fit_transform(corpus)
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        # clamping for numerical stability
        similarity = float(np.clip(similarity, 0.0, 1.0))

        return {
            "similarity": similarity,
            "metadata": self._build_metadata(backend="scikit", vectorizer="TF-IDF"),
        }
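Direct usage of the scikit backend (a sketch, not part of the package): it requires scikit-learn and numpy, assumes BaseMetric's default processor and score_cutoff handling is sufficient, and the input texts are invented.

from levelapp.metrics.embeddings.sentence_transformer import SentenceEmbeddingMetric

metric = SentenceEmbeddingMetric()
result = metric.compute(
    generated="The order ships tomorrow.",
    reference="Your order will be shipped tomorrow.",
)
print(result["similarity"])  # TF-IDF cosine similarity, clamped to [0.0, 1.0]
print(result["metadata"])    # backend/vectorizer info from BaseMetric._build_metadata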