levelapp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic.
- levelapp/aspects/monitor.py +3 -1
- levelapp/clients/__init__.py +0 -1
- levelapp/comparator/scorer.py +0 -2
- levelapp/config/endpoint.py +22 -13
- levelapp/config/endpoint_.py +62 -0
- levelapp/config/prompts.py +22 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +97 -59
- levelapp/evaluator/evaluator.py +42 -14
- levelapp/metrics/__init__.py +1 -5
- levelapp/repository/firestore.py +15 -6
- levelapp/simulator/schemas.py +15 -21
- levelapp/simulator/simulator.py +124 -55
- levelapp/simulator/utils.py +40 -78
- levelapp/workflow/__init__.py +3 -2
- levelapp/workflow/base.py +64 -17
- levelapp/workflow/config.py +92 -0
- levelapp/workflow/context.py +62 -0
- levelapp/workflow/factory.py +32 -41
- levelapp/workflow/registration.py +1 -1
- levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/METADATA +102 -39
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/RECORD +25 -21
- levelapp/workflow/schemas.py +0 -121
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/WHEEL +0 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/licenses/LICENSE +0 -0
levelapp/aspects/monitor.py
CHANGED
@@ -343,6 +343,7 @@ class FunctionMonitor:
            category: MetricType,
            enable_timing: bool,
            track_memory: bool,
+            verbose=False
    ) -> Callable[P, T]:
        """
        Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
            name: Unique identifier for the function
            enable_timing: Enable execution time logging
            track_memory: Enable memory tracking
+            verbose: Enable verbose logging

        Returns:
            Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:

            self._aggregated_stats[name].update(metrics=metrics)

-            if enable_timing and metrics.duration is not None:
+            if verbose and enable_timing and metrics.duration is not None:
                log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
                if metrics.cache_hit:
                    log_message += " (cache hit)"

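Note: the net effect of this change is that timing is still collected but the per-call log line is now opt-in via `verbose`. The following is a minimal, self-contained sketch of that behaviour; it deliberately does not use the levelapp API, and the names `monitored` and `demo_step` are illustrative only.

import functools
import time


def monitored(name: str, enable_timing: bool = True, verbose: bool = False):
    """Toy analogue of FunctionMonitor.monitor: the duration is always measured,
    but the log line is only emitted when verbose is True, mirroring the
    `verbose and enable_timing and ...` guard added in this release."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            result = func(*args, **kwargs)
            duration = time.perf_counter() - start
            if verbose and enable_timing and duration is not None:
                print(f"[FunctionMonitor] Executed '{name}' in {duration:.4f}s")
            return result
        return wrapper
    return decorator


@monitored(name="demo-step", verbose=True)
def demo_step():
    return sum(range(1000))


demo_step()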
levelapp/clients/__init__.py
CHANGED
@@ -44,7 +44,6 @@ class ClientRegistry:

        cls._wrap_client_methods(client_class)
        cls._clients[provider] = client_class
-        logger.info(f"[ClientRegistry] Registered client for provider: {provider}")

    @classmethod
    def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None:

levelapp/comparator/scorer.py
CHANGED
@@ -78,7 +78,6 @@ class MetricsManager:
            ValueError: if the scorer is not a callable.
        """
        self._scorers[name] = scorer
-        logger.info(f"[MetricsManager] Registered scorer: {name}")

    def get_scorer(self, name: str) -> Callable:
        """
@@ -95,7 +94,6 @@ class MetricsManager:
        """
        try:
            scorer = self._scorers.get(name)
-            logger.info(f"[get_scorer] Retrieved scorer: {name}")
            return scorer

        except KeyError:

levelapp/config/endpoint.py
CHANGED
@@ -29,7 +29,6 @@ class EndpointConfig(BaseModel):
        bearer_token (SecretStr): The Bearer token to use.
        model_id (str): The model to use (if applicable).
        default_request_payload_template (Dict[str, Any]): The payload template to use.
-        generated_request_payload_template (Dict[str, Any]): The generated payload template from a provided file.
        variables (Dict[str, Any]): The variables to populate the payload template.

    Note:
@@ -40,11 +39,10 @@ class EndpointConfig(BaseModel):
        - bearer_token (SecretStr): The Bearer token to use.
        - model_id (str): The model to use (if applicable).
        - default_payload_template (Dict[str, Any]): The payload template to use.
-        - generated_payload_template (Dict[str, Any]): The generated payload template from a provided file.
        - variables (Dict[str, Any]): The variables to populate the payload template.

    Or manually configure the model instance by assigning the proper values to the model fields.\n
-    You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH)
+    You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH/)
    and the response template (ENDPOINT_RESPONSE_PATH) separately. The files can be either YAML or JSON only.
    """
    load_dotenv()
@@ -61,9 +59,7 @@ class EndpointConfig(BaseModel):

    # Data
    default_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
-    generated_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
    default_response_payload_template: Dict[str, Any] = Field(default_factory=dict)
-    generated_response_payload_template: Dict[str, Any] = Field(default_factory=dict)

    # Variables
    variables: Dict[str, Any] = Field(default_factory=dict)
@@ -88,14 +84,18 @@ class EndpointConfig(BaseModel):
    @computed_field
    @property
    def request_payload(self) -> Dict[str, Any]:
-        """
-
+        """
+        Return fully prepared payload depending on template or full payload.
+
+        Returns:
+            request payload (Dict[str, Any]): Populated request payload template.
+        """
+        # First, we check if we have variables to populate the template with. If not, we return the template as is.
        if not self.variables:
            return self.default_request_payload_template

        if not self.default_request_payload_template:
-            self.load_template(template_type=TemplateType.REQUEST)
-            base_template = self.generated_request_payload_template
+            base_template = self.load_template(template_type=TemplateType.REQUEST)
        else:
            base_template = self.default_request_payload_template

@@ -118,8 +118,7 @@ class EndpointConfig(BaseModel):
            return self.default_response_payload_template

        if not self.default_response_payload_template:
-            self.load_template(template_type=TemplateType.RESPONSE)
-            base_template = self.generated_response_payload_template
+            base_template = self.load_template(template_type=TemplateType.RESPONSE)
        else:
            base_template = self.default_response_payload_template

@@ -148,12 +147,23 @@ class EndpointConfig(BaseModel):

        return _replace(obj)

+    @staticmethod
    def load_template(
-            self,
            template_type: TemplateType = TemplateType.REQUEST,
            path: str | None = None
    ) -> Dict[str, Any]:
+        """
+        Load request/response payload template from JSON/YAML file.
+
+        Args:
+            template_type (TemplateType): The type of template to load (REQUEST or RESPONSE).
+            path (str): The path of the payload template file to load.
+
+        Returns:
+            Payload template (Dict[str, Any]): Payload template.
+        """
        try:
+            # If no path was provided, we check the env. variables.
            if not path:
                env_var = "ENDPOINT_PAYLOAD_PATH" if template_type == TemplateType.REQUEST else "ENDPOINT_RESPONSE_PATH"
                path = os.getenv(env_var, '')
@@ -171,7 +181,6 @@ class EndpointConfig(BaseModel):
            else:
                raise ValueError("[EndpointConfig] Unsupported file format.")

-            self.generated_request_payload_template = data
            return data

        except FileNotFoundError as e:

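Note: with the `generated_*_payload_template` fields removed, `load_template` is now a static method that returns the parsed template rather than caching it on the instance. A hedged usage sketch follows; it assumes `TemplateType` is importable from the same module, and "payload.yaml" is a hypothetical local file.

from levelapp.config.endpoint import EndpointConfig, TemplateType  # TemplateType import path assumed

# In 0.1.2, load_template returns the parsed dict directly.
template = EndpointConfig.load_template(
    template_type=TemplateType.REQUEST,
    path="payload.yaml",  # when omitted, ENDPOINT_PAYLOAD_PATH / ENDPOINT_RESPONSE_PATH are consulted
)
print(template)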
levelapp/config/endpoint_.py
ADDED
@@ -0,0 +1,62 @@
+from abc import ABC
+from enum import Enum
+from typing import Any, List
+
+from pydantic import BaseModel, Field
+
+
+class HttpMethod(str, Enum):
+    GET = "GET"
+    POST = "POST"
+    PUT = "PUT"
+    Patch = "PATCH"
+    DELETE = "DELETE"
+
+
+class HeaderConfig(BaseModel):
+    """Secure header configuration with environment variables support."""
+    name: str
+    value: str
+    secure: bool = False
+
+    class Config:
+        frozen = True
+
+
+class RequestSchemaConfig(BaseModel):
+    """Schema definition for request payload population."""
+    field_path: str  # JSON path-like: "data.user.id"
+    value: Any
+    value_type: str = "static"  # static, env, dynamic
+    required: bool = True
+
+
+class ResponseMappingConfig(BaseModel):
+    """Response data extraction mapping."""
+    field_path: str  # JSON path-like: "data.results[0].id"
+    extract_as: str  # Name to extract as
+    default: Any = None
+
+
+class EndpointConfig(BaseModel):
+    """Complete endpoint configuration."""
+    name: str
+    base_url: str
+    path: str
+    method: HttpMethod
+    headers: List[HeaderConfig] = Field(default_factory=list)
+    request_schema: List[RequestSchemaConfig] = Field(default_factory=list)
+    response_mapping: List[ResponseMappingConfig] = Field(default_factory=list)
+    timeout: int = 30
+    retry_count: int = 3
+    retry_backoff: float = 1.0
+
+    @classmethod
+    def validate_path(cls, v: str) -> str:
+        if not v.startswith('/'):
+            return f'/{v}'
+        return v
+
+
+class PayloadBuilder(ABC):
+    """Abstract base for payload construction strategies."""

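Note: a sketch of constructing the new endpoint configuration model added by this file. The field values are hypothetical, and `model_dump()` assumes Pydantic v2, which the package's use of `computed_field` elsewhere suggests.

from levelapp.config.endpoint_ import (
    EndpointConfig,
    HeaderConfig,
    HttpMethod,
    ResponseMappingConfig,
)

# Hypothetical values for illustration only.
endpoint = EndpointConfig(
    name="chat-endpoint",
    base_url="https://api.example.com",
    path="/v1/chat",
    method=HttpMethod.POST,
    headers=[HeaderConfig(name="Authorization", value="API_TOKEN", secure=True)],
    response_mapping=[ResponseMappingConfig(field_path="data.reply", extract_as="reply")],
)
print(endpoint.model_dump())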
levelapp/config/prompts.py
CHANGED
@@ -33,3 +33,25 @@ Return ONLY a single JSON object on one line with exactly these keys:

Do NOT include any additional text, explanations, or formatting (e.g., "JSON object:", ```json or ```, or markdown).
"""
+
+
+SUMMARIZATION_PROMPT_TEMPLATE = """
+You are reviewing evaluation justifications from LLM judges about replies generated by a virtual assistant.
+Interpret the context from the verdicts: (e.g., real-estate leasing, medical appointment scheduling, etc.).
+
+Each justification contains the judge's assessment of how well the assistant's response matched the expected reply.
+Your task is to **identify and summarize only the negative points**, such as:
+- Errors or inaccuracies
+- Misunderstandings or misinterpretations
+- Missing or incomplete information
+- Failure to meet expectations or requirements
+
+**Instructions:**
+- Return up to {max_bullets} concise bullet points.
+- Start each point with "- " and focus on clarity and relevance.
+- Avoid redundancy and prioritize actionable feedback.
+
+---
+- Judge: {judge}
+- Verdicts: {verdicts}
+"""

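Note: the new template exposes three placeholders ({max_bullets}, {judge}, {verdicts}). How the package fills them is not shown in this diff; the sketch below simply assumes `str.format` with hypothetical verdict data.

from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE

# Hypothetical verdicts; only the placeholders defined by the template are filled.
prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(
    max_bullets=3,
    judge="openai",
    verdicts=["Missed the requested move-in date.", "Quoted an incorrect fee."],
)
print(prompt)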
levelapp/core/schemas.py
ADDED
@@ -0,0 +1,24 @@
+from enum import Enum
+
+
+class ExtendedEnum(Enum):
+    @classmethod
+    def list(cls):
+        return [e.value for e in cls]
+
+
+class WorkflowType(ExtendedEnum):
+    SIMULATOR = "SIMULATOR"
+    COMPARATOR = "COMPARATOR"
+    ASSESSOR = "ASSESSOR"
+
+
+class RepositoryType(ExtendedEnum):
+    FIRESTORE = "FIRESTORE"
+    FILESYSTEM = "FILESYSTEM"
+
+
+class EvaluatorType(ExtendedEnum):
+    JUDGE = "JUDGE"
+    REFERENCE = "REFERENCE"
+    RAG = "RAG"

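Note: `ExtendedEnum.list()` returns the raw string values of an enum's members, for example:

from levelapp.core.schemas import RepositoryType, WorkflowType

print(WorkflowType.list())    # ['SIMULATOR', 'COMPARATOR', 'ASSESSOR']
print(RepositoryType.list())  # ['FIRESTORE', 'FILESYSTEM']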
levelapp/core/session.py
CHANGED
@@ -1,27 +1,23 @@
"""levelapp/core/session.py"""
import threading

+from abc import ABC
+
from dataclasses import dataclass, field
from typing import Dict, List, Any

from datetime import datetime
from humanize import precisedelta

-from levelapp.workflow import MainFactory
+from levelapp.workflow import MainFactory, WorkflowConfig
from levelapp.workflow.base import BaseWorkflow
-from levelapp.
-from levelapp.
+from levelapp.aspects import MetricType, ExecutionMetrics, MonitoringAspect, logger
+from levelapp.workflow.context import WorkflowContextBuilder


-
-
-
-    session_name: str
-    started_at: datetime | None = None
-    ended_at: datetime | None = None
-    total_executions: int = 0
-    total_duration: float = 0.0
-    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+class TemporalStatusMixin(ABC):
+    started_at: datetime | None
+    ended_at: datetime | None

    @property
    def is_active(self) -> bool:
@@ -37,7 +33,18 @@ class SessionMetadata:


@dataclass
-class
+class SessionMetadata(TemporalStatusMixin):
+    """Metadata for an evaluation session."""
+    session_name: str
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+    total_executions: int = 0
+    total_duration: float = 0.0
+    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+
+
+@dataclass
+class StepMetadata(TemporalStatusMixin):
    """Metadata for a specific step within an evaluation session."""
    step_name: str
    session_name: str
@@ -47,27 +54,29 @@ class StepMetadata:
    error_count: int = 0
    procedures_stats: List[ExecutionMetrics] | None = None

-    @property
-    def is_active(self) -> bool:
-        """Check if the step is currently active."""
-        return self.ended_at is None
-
-    @property
-    def duration(self) -> float | None:
-        """Calculate the duration of the step in seconds."""
-        if not self.is_active:
-            return (self.ended_at - self.started_at).total_seconds()
-        return None
-

class StepContext:
    """Context manager for an evaluation step within an EvaluationSession."""
-    def __init__(
+    def __init__(
+            self,
+            session: "EvaluationSession",
+            step_name: str,
+            category: MetricType,
+    ):
+        """
+        Initialize StepContext.
+
+        Args:
+            session (EvaluationSession): Evaluation session.
+            step_name (str): Step name.
+            category (MetricType): Metric type.
+        """
        self.session = session
        self.step_name = step_name
        self.category = category
+
        self.step_meta: StepMetadata | None = None
-        self.full_step_name = f"{session.session_name}
+        self.full_step_name = f"<{session.session_name}:{step_name}>"
        self._monitored_func = None
        self._func_gen = None

@@ -80,35 +89,50 @@ class StepContext:
        )
        self.session.session_metadata.steps[self.step_name] = self.step_meta

-
-
-
-
-
-
-
-
-
-
-
+        if self.session.enable_monitoring:
+            # Wrap with FunctionMonitor
+            self._monitored_func = self.session.monitor.monitor(
+                name=self.full_step_name,
+                category=self.category,
+                enable_timing=True,
+                track_memory=True,
+                verbose=self.session.verbose,
+            )(self._step_wrapper)
+
+            # Start monitoring
+            try:
+                self._func_gen = self._monitored_func()
+                next(self._func_gen)  # Enter monitoring
+            except Exception as e:
+                logger.error(f"[StepContext] Failed to initialize monitoring for {self.full_step_name}:\n{e}")
+                raise
+
        return self  # returning self allows nested instrumentation

+    # noinspection PyMethodMayBeStatic
    def _step_wrapper(self):
        yield  # Actual user step execution happens here

    def __exit__(self, exc_type, exc_val, exc_tb):
-
-
-
-
+        if self.session.enable_monitoring:
+            try:
+                next(self._func_gen)  # Exit monitoring
+            except StopIteration:
+                pass

        with self.session.lock:
            self.step_meta.ended_at = datetime.now()
+
            if exc_type:
                self.step_meta.error_count += 1
+
            self.session.session_metadata.total_executions += 1
-
-
+
+            if self.session.enable_monitoring and self.step_meta.duration:
+                self.session.monitor.update_procedure_duration(
+                    name=self.full_step_name,
+                    value=self.step_meta.duration
+                )
            self.session.session_metadata.total_duration += self.step_meta.duration

        return False
@@ -119,29 +143,34 @@ class EvaluationSession:
    def __init__(
            self,
            session_name: str = "test-session",
-
-
+            workflow_config: WorkflowConfig | None = None,
+            enable_monitoring: bool = True,
+            verbose: bool = False
    ):
        """
        Initialize Evaluation Session.

        Args:
            session_name (str): Name of the session
-            monitor (FunctionMonitor): Function monitoring aspect
            workflow_config (WorkflowConfig): Workflow configuration.
+            enable_monitoring (bool): Switch monitoring on. Defaults to True.
+            verbose (bool): Verbose mode. Defaults to False.
        """
        self._NAME = self.__class__.__name__

        self.session_name = session_name
-        self.monitor = monitor or MonitoringAspect
        self.workflow_config = workflow_config
-        self.
+        self.enable_monitoring = enable_monitoring
+        self.verbose = verbose

        self.workflow: BaseWorkflow | None = None

        self.session_metadata = SessionMetadata(session_name=session_name)
+        self.monitor = MonitoringAspect if enable_monitoring else None
        self._lock = threading.RLock()

+        logger.info("[EvaluationSession] Evaluation session initialized.")
+
    @property
    def lock(self):
        return self._lock
@@ -154,17 +183,13 @@ class EvaluationSession:
        if not self.workflow_config:
            raise ValueError(f"{self._NAME}: Workflow configuration must be provided")

-
-
-
-
-            endpoint_config=self.workflow_config.endpoint_config,
-            inputs=self.workflow_config.inputs
-        )
-        self.workflow = MainFactory.create_workflow(self.workflow_type, context)
+        context_builder = WorkflowContextBuilder(self.workflow_config)
+        context = context_builder.build()
+
+        self.workflow = MainFactory.create_workflow(context=context)

        logger.info(
-            f"[{self._NAME}] Starting evaluation session: {self.session_name}
+            f"[{self._NAME}] Starting evaluation session: {self.session_name} - "
            f"Workflow: '{self.workflow.name}'"
        )
        return self
@@ -178,6 +203,7 @@ class EvaluationSession:

        if exc_type:
            logger.error(f"[{self._NAME}] Session ended with error: {exc_val}", exc_info=True)
+
        return False

    def step(self, step_name: str, category: MetricType = MetricType.CUSTOM) -> StepContext:
@@ -201,6 +227,19 @@ class EvaluationSession:
        self.workflow.collect_results()

    def get_stats(self) -> Dict[str, Any]:
+        if self.enable_monitoring:
+            return {
+                "session": {
+                    "name": self.session_name,
+                    "duration": precisedelta(self.session_metadata.duration, suppress=['minutes']),
+                    "start_time": self.session_metadata.started_at.isoformat(),
+                    "end_time": self.session_metadata.ended_at.isoformat(),
+                    "steps": len(self.session_metadata.steps),
+                    "errors": sum(s.error_count for s in self.session_metadata.steps.values())
+                },
+                "stats": self.monitor.get_all_stats()
+            }
+
        return {
            "session": {
                "name": self.session_name,
@@ -210,5 +249,4 @@
                "steps": len(self.session_metadata.steps),
                "errors": sum(s.error_count for s in self.session_metadata.steps.values())
            },
-            "stats": self.monitor.get_all_stats()
        }

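Note: putting the session changes together, here is a hedged usage sketch of the 0.1.2 constructor and step context. Building a `WorkflowConfig` is not shown in this diff, so `my_workflow_config` below is a hypothetical placeholder.

from levelapp.core.session import EvaluationSession

session = EvaluationSession(
    session_name="smoke-test",
    workflow_config=my_workflow_config,  # hypothetical WorkflowConfig instance
    enable_monitoring=True,   # keeps MonitoringAspect attached and adds the "stats" block to get_stats()
    verbose=False,            # new in 0.1.2: per-step timing log lines stay quiet
)

with session:                        # builds the workflow via WorkflowContextBuilder + MainFactory
    with session.step("simulate"):   # category defaults to MetricType.CUSTOM
        ...                          # the user's step body runs inside the monitored wrapper

print(session.get_stats())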
levelapp/evaluator/evaluator.py
CHANGED
@@ -1,7 +1,6 @@
"""levelapp/core/evaluator.py"""
from functools import lru_cache
-from typing import List, Dict, Any
-from collections import defaultdict
+from typing import List, Dict, Any, TYPE_CHECKING
from pydantic import BaseModel, Field

from tenacity import (
@@ -19,6 +18,9 @@ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
from levelapp.core.base import BaseEvaluator, BaseChatClient
from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader

+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+

class Evidence(BaseModel):
    """Evidence details for evaluation."""
@@ -39,7 +41,7 @@ class JudgeEvaluationResults(BaseModel):
    label: str = Field(..., description="The label of the evaluation result")
    justification: str = Field(..., description="Short explanation of the evaluation result")
    evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
-    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response")
+    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response", exclude=True)
    metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")

    @classmethod
@@ -69,19 +71,36 @@ class JudgeEvaluationResults(BaseModel):


class JudgeEvaluator(BaseEvaluator):
-
+    """LLM-as-a-judge evaluator class"""
+    def __init__(self, config: "WorkflowConfig | None" = None):
+        """
+        Initialize the JudgeEvaluator.
+
+        Args:
+            config (WorkflowConfig | None): The configuration of the workflow.
+        """
+        if config:
+            self.config = config
+            self.providers = config.evaluation.providers
+
        self.prompt_template = EVAL_PROMPT_TEMPLATE
-        self.
+        self.client_registry = ClientRegistry

-    def
+    def select_client(self, provider: str) -> BaseChatClient:
        """
-
+        Select an LLM client to use for the evaluation.

        Args:
            provider (str): The provider name.
-
+
+        Returns:
+            client (BaseChatClient): The LLM client to use for the evaluation.
        """
-
+        if provider not in self.client_registry.list_providers():
+            logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
+            return self.client_registry.get(provider="openai")
+
+        return self.client_registry.get(provider=provider)

    @lru_cache(maxsize=1024)
    def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
@@ -135,7 +154,7 @@ class JudgeEvaluator(BaseEvaluator):
            generated_text=generated_data,
            reference_text=reference_data
        )
-        client =
+        client = self.select_client(provider=provider)

        try:
            response = client.call(message=prompt)
@@ -183,7 +202,7 @@ class JudgeEvaluator(BaseEvaluator):
            generated_text=generated_data,
            reference_text=reference_data
        )
-        client =
+        client = self.select_client(provider=provider)

        try:
            async for attempt in AsyncRetrying(
@@ -194,7 +213,6 @@ class JudgeEvaluator(BaseEvaluator):
            ):
                with attempt:
                    response = await client.acall(message=prompt)
-                    logger.info(f"[{provider}] Async evaluation:\n{response}\n{'---' * 10}")
                    parsed = client.parse_response(response=response)
                    return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)

@@ -212,7 +230,18 @@ class JudgeEvaluator(BaseEvaluator):


class MetadataEvaluator(BaseEvaluator):
-
+    """Metadata evaluator class."""
+    def __init__(self, config: "WorkflowConfig | None" = None):
+        """
+        Initialize the MetadataEvaluator.
+
+        Args:
+            config (WorkflowConfig | None): The workflow configuration.
+        """
+        if config:
+            self.config = config
+            self.metics_map = config.evaluation.metrics_map
+
        self.data_loader = DataLoader()
        self.comparator = MetadataComparator()
        self.metrics_manager = MetricsManager()
@@ -245,7 +274,6 @@ class MetadataEvaluator(BaseEvaluator):
        self.comparator.reference_data = ref_data

        output = self.comparator.run(indexed_mode=False)
-        logger.info(f"Comparison results:\n{output}\n---")
        results: Dict[str, float] = {}

        for k, v in output.items():

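Note: a brief sketch of the new `select_client` fallback. The provider name is hypothetical, and the fallback to "openai" assumes an OpenAI client has already been registered with `ClientRegistry`.

from levelapp.evaluator.evaluator import JudgeEvaluator

evaluator = JudgeEvaluator()                           # config is optional in 0.1.2
client = evaluator.select_client(provider="mistral")   # hypothetical provider name
# If "mistral" is not in ClientRegistry, a warning is logged and the "openai"
# client is returned instead of raising an error.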
levelapp/metrics/__init__.py
CHANGED
@@ -1,14 +1,11 @@
"""levelapp/metrics/__init__.py"""
-import logging
-
from typing import List, Dict, Type, Any

+from levelapp.aspects import logger
from levelapp.core.base import BaseMetric
from levelapp.metrics.exact import EXACT_METRICS
from levelapp.metrics.fuzzy import FUZZY_METRICS

-logger = logging.getLogger(__name__)
-

class MetricRegistry:
    """Registry for metric classes."""
@@ -27,7 +24,6 @@ class MetricRegistry:
            raise KeyError(f"Metric '{name}' is already registered")

        cls._metrics[name] = metric_class
-        logger.info(f"Metric '{name}' registered successfully.")

    @classmethod
    def get(cls, name: str, **kwargs: Any) -> BaseMetric: