levelapp 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of levelapp might be problematic.
- levelapp/aspects/monitor.py +3 -1
- levelapp/clients/__init__.py +0 -1
- levelapp/comparator/scorer.py +0 -2
- levelapp/config/endpoint.py +22 -13
- levelapp/config/endpoint_.py +62 -0
- levelapp/config/prompts.py +22 -0
- levelapp/core/schemas.py +0 -2
- levelapp/core/session.py +29 -3
- levelapp/evaluator/evaluator.py +16 -4
- levelapp/metrics/__init__.py +1 -5
- levelapp/simulator/schemas.py +7 -13
- levelapp/simulator/simulator.py +21 -18
- levelapp/simulator/utils.py +40 -78
- levelapp/workflow/base.py +38 -3
- levelapp/workflow/config.py +31 -4
- levelapp/workflow/context.py +0 -1
- levelapp/workflow/factory.py +16 -3
- {levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/METADATA +8 -11
- {levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/RECORD +21 -20
- {levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/WHEEL +0 -0
- {levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/licenses/LICENSE +0 -0
levelapp/aspects/monitor.py
CHANGED
@@ -343,6 +343,7 @@ class FunctionMonitor:
         category: MetricType,
         enable_timing: bool,
         track_memory: bool,
+        verbose=False
     ) -> Callable[P, T]:
         """
         Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
             name: Unique identifier for the function
             enable_timing: Enable execution time logging
             track_memory: Enable memory tracking
+            verbose: Enable verbose logging

         Returns:
             Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:

             self._aggregated_stats[name].update(metrics=metrics)

-            if enable_timing and metrics.duration is not None:
+            if verbose and enable_timing and metrics.duration is not None:
                 log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
                 if metrics.cache_hit:
                     log_message += " (cache hit)"
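A quick sketch of what the new flag changes in practice. It assumes MonitoringAspect.monitor forwards enable_timing, track_memory, and verbose to FunctionMonitor the way StepContext does in core/session.py below; the decorated function is purely illustrative.

```python
from levelapp.aspects import MonitoringAspect, MetricType

# Hedged sketch: with verbose=False (the new default) the
# "[FunctionMonitor] Executed '<name>' in ...s" line is no longer logged,
# while timing/memory metrics are still collected.
@MonitoringAspect.monitor(
    name="demo_step",            # illustrative name
    category=MetricType.CUSTOM,
    enable_timing=True,
    track_memory=True,
    verbose=False,
)
def demo_step(x: int) -> int:
    return x * 2
```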
levelapp/clients/__init__.py
CHANGED
@@ -44,7 +44,6 @@ class ClientRegistry:

         cls._wrap_client_methods(client_class)
         cls._clients[provider] = client_class
-        logger.info(f"[ClientRegistry] Registered client for provider: {provider}")

     @classmethod
     def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None:
levelapp/comparator/scorer.py
CHANGED
@@ -78,7 +78,6 @@ class MetricsManager:
             ValueError: if the scorer is not a callable.
         """
         self._scorers[name] = scorer
-        logger.info(f"[MetricsManager] Registered scorer: {name}")

     def get_scorer(self, name: str) -> Callable:
         """
@@ -95,7 +94,6 @@ class MetricsManager:
         """
         try:
             scorer = self._scorers.get(name)
-            logger.info(f"[get_scorer] Retrieved scorer: {name}")
             return scorer

         except KeyError:
levelapp/config/endpoint.py
CHANGED
@@ -29,7 +29,6 @@ class EndpointConfig(BaseModel):
         bearer_token (SecretStr): The Bearer token to use.
         model_id (str): The model to use (if applicable).
         default_request_payload_template (Dict[str, Any]): The payload template to use.
-        generated_request_payload_template (Dict[str, Any]): The generated payload template from a provided file.
         variables (Dict[str, Any]): The variables to populate the payload template.

     Note:
@@ -40,11 +39,10 @@ class EndpointConfig(BaseModel):
         - bearer_token (SecretStr): The Bearer token to use.
         - model_id (str): The model to use (if applicable).
         - default_payload_template (Dict[str, Any]): The payload template to use.
-        - generated_payload_template (Dict[str, Any]): The generated payload template from a provided file.
         - variables (Dict[str, Any]): The variables to populate the payload template.

         Or manually configure the model instance by assigning the proper values to the model fields.\n
-        You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH)
+        You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH/)
         and the response template (ENDPOINT_RESPONSE_PATH) separately. The files can be either YAML or JSON only.
     """
     load_dotenv()
@@ -61,9 +59,7 @@ class EndpointConfig(BaseModel):

     # Data
     default_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
-    generated_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
     default_response_payload_template: Dict[str, Any] = Field(default_factory=dict)
-    generated_response_payload_template: Dict[str, Any] = Field(default_factory=dict)

     # Variables
     variables: Dict[str, Any] = Field(default_factory=dict)
@@ -88,14 +84,18 @@ class EndpointConfig(BaseModel):
     @computed_field
     @property
     def request_payload(self) -> Dict[str, Any]:
-        """
-
+        """
+        Return fully prepared payload depending on template or full payload.
+
+        Returns:
+            request payload (Dict[str, Any]): Populated request payload template.
+        """
+        # First, we check if we have variables to populate the template with. If not, we return the template as is.
         if not self.variables:
             return self.default_request_payload_template

         if not self.default_request_payload_template:
-            self.load_template(template_type=TemplateType.REQUEST)
-            base_template = self.generated_request_payload_template
+            base_template = self.load_template(template_type=TemplateType.REQUEST)
         else:
             base_template = self.default_request_payload_template

@@ -118,8 +118,7 @@ class EndpointConfig(BaseModel):
             return self.default_response_payload_template

         if not self.default_response_payload_template:
-            self.load_template(template_type=TemplateType.RESPONSE)
-            base_template = self.generated_response_payload_template
+            base_template = self.load_template(template_type=TemplateType.RESPONSE)
         else:
             base_template = self.default_response_payload_template

@@ -148,12 +147,23 @@ class EndpointConfig(BaseModel):

         return _replace(obj)

+    @staticmethod
     def load_template(
-        self,
         template_type: TemplateType = TemplateType.REQUEST,
         path: str | None = None
     ) -> Dict[str, Any]:
+        """
+        Load request/response payload template from JSON/YAML file.
+
+        Args:
+            template_type (TemplateType): The type of template to load (REQUEST or RESPONSE).
+            path (str): The path of the payload template file to load.
+
+        Returns:
+            Payload template (Dict[str, Any]): Payload template.
+        """
         try:
+            # If no path was provided, we check the env. variables.
            if not path:
                env_var = "ENDPOINT_PAYLOAD_PATH" if template_type == TemplateType.REQUEST else "ENDPOINT_RESPONSE_PATH"
                path = os.getenv(env_var, '')
@@ -171,7 +181,6 @@ class EndpointConfig(BaseModel):
            else:
                raise ValueError("[EndpointConfig] Unsupported file format.")

-           self.generated_request_payload_template = data
            return data

        except FileNotFoundError as e:
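Because load_template is now a static method that returns the parsed template instead of caching it on the instance, it can be called directly. A sketch; the file path is illustrative and it assumes TemplateType is exposed by the same module.

```python
from levelapp.config.endpoint import EndpointConfig, TemplateType

# Falls back to the ENDPOINT_PAYLOAD_PATH / ENDPOINT_RESPONSE_PATH
# environment variables when no path is given.
template = EndpointConfig.load_template(
    template_type=TemplateType.REQUEST,
    path="data/request_template.yaml",  # hypothetical YAML/JSON file
)
print(template)
```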
levelapp/config/endpoint_.py
ADDED
@@ -0,0 +1,62 @@
+from abc import ABC
+from enum import Enum
+from typing import Any, List
+
+from pydantic import BaseModel, Field
+
+
+class HttpMethod(str, Enum):
+    GET = "GET"
+    POST = "POST"
+    PUT = "PUT"
+    Patch = "PATCH"
+    DELETE = "DELETE"
+
+
+class HeaderConfig(BaseModel):
+    """Secure header configuration with environment variables support."""
+    name: str
+    value: str
+    secure: bool = False
+
+    class Config:
+        frozen = True
+
+
+class RequestSchemaConfig(BaseModel):
+    """Schema definition for request payload population."""
+    field_path: str  # JSON path-like: "data.user.id"
+    value: Any
+    value_type: str = "static"  # static, env, dynamic
+    required: bool = True
+
+
+class ResponseMappingConfig(BaseModel):
+    """Response data extraction mapping."""
+    field_path: str  # JSON path-like: "data.results[0].id"
+    extract_as: str  # Name to extract as
+    default: Any = None
+
+
+class EndpointConfig(BaseModel):
+    """Complete endpoint configuration."""
+    name: str
+    base_url: str
+    path: str
+    method: HttpMethod
+    headers: List[HeaderConfig] = Field(default_factory=list)
+    request_schema: List[RequestSchemaConfig] = Field(default_factory=list)
+    response_mapping: List[ResponseMappingConfig] = Field(default_factory=list)
+    timeout: int = 30
+    retry_count: int = 3
+    retry_backoff: float = 1.0
+
+    @classmethod
+    def validate_path(cls, v: str) -> str:
+        if not v.startswith('/'):
+            return f'/{v}'
+        return v
+
+
+class PayloadBuilder(ABC):
+    """Abstract base for payload construction strategies."""
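A minimal construction sketch for the new declarative config classes above; every value is made up and only the field names come from the file.

```python
from levelapp.config.endpoint_ import (
    EndpointConfig,
    HeaderConfig,
    HttpMethod,
    RequestSchemaConfig,
)

# Hypothetical endpoint description built from the new models.
endpoint = EndpointConfig(
    name="chat",
    base_url="http://127.0.0.1:8000",
    path="/v1/chat",
    method=HttpMethod.POST,
    headers=[HeaderConfig(name="Authorization", value="API_KEY", secure=True)],
    request_schema=[RequestSchemaConfig(field_path="data.user.id", value="42")],
)
print(endpoint.model_dump())
```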
levelapp/config/prompts.py
CHANGED
@@ -33,3 +33,25 @@ Return ONLY a single JSON object on one line with exactly these keys:

 Do NOT include any additional text, explanations, or formatting (e.g., "JSON object:", ```json or ```, or markdown).
 """
+
+
+SUMMARIZATION_PROMPT_TEMPLATE = """
+You are reviewing evaluation justifications from LLM judges about replies generated by a virtual assistant.
+Interpret the context from the verdicts: (e.g., real-estate leasing, medical appointment scheduling, etc.).
+
+Each justification contains the judge's assessment of how well the assistant's response matched the expected reply.
+Your task is to **identify and summarize only the negative points**, such as:
+- Errors or inaccuracies
+- Misunderstandings or misinterpretations
+- Missing or incomplete information
+- Failure to meet expectations or requirements
+
+**Instructions:**
+- Return up to {max_bullets} concise bullet points.
+- Start each point with "- " and focus on clarity and relevance.
+- Avoid redundancy and prioritize actionable feedback.
+
+---
+- Judge: {judge}
+- Verdicts: {verdicts}
+"""
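The template is consumed with str.format in simulator/utils.py; a small sketch with invented verdicts.

```python
from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE

# Sample verdicts are invented; only the placeholder names come from the template.
verdicts = [
    "The assistant confirmed the wrong appointment date.",
    "The reply omitted the cancellation policy the user asked about.",
]
prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(
    max_bullets=5,
    judge="openai",
    verdicts=chr(10).join(verdicts),
)
print(prompt)
```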
levelapp/core/schemas.py
CHANGED
levelapp/core/session.py
CHANGED
@@ -63,6 +63,14 @@ class StepContext:
         step_name: str,
         category: MetricType,
     ):
+        """
+        Initialize StepContext.
+
+        Args:
+            session (EvaluationSession): Evaluation session.
+            step_name (str): Step name.
+            category (MetricType): Metric type.
+        """
         self.session = session
         self.step_name = step_name
         self.category = category
@@ -88,6 +96,7 @@ class StepContext:
             category=self.category,
             enable_timing=True,
             track_memory=True,
+            verbose=self.session.verbose,
         )(self._step_wrapper)

         # Start monitoring
@@ -119,7 +128,7 @@ class StepContext:

         self.session.session_metadata.total_executions += 1

-        if self.step_meta.duration:
+        if self.session.enable_monitoring and self.step_meta.duration:
             self.session.monitor.update_procedure_duration(
                 name=self.full_step_name,
                 value=self.step_meta.duration
@@ -136,6 +145,7 @@ class EvaluationSession:
         session_name: str = "test-session",
         workflow_config: WorkflowConfig | None = None,
         enable_monitoring: bool = True,
+        verbose: bool = False
     ):
         """
         Initialize Evaluation Session.
@@ -143,12 +153,15 @@ class EvaluationSession:
         Args:
             session_name (str): Name of the session
             workflow_config (WorkflowConfig): Workflow configuration.
+            enable_monitoring (bool): Switch monitoring on. Defaults to True.
+            verbose (bool): Verbose mode. Defaults to False.
         """
         self._NAME = self.__class__.__name__

         self.session_name = session_name
         self.workflow_config = workflow_config
         self.enable_monitoring = enable_monitoring
+        self.verbose = verbose

         self.workflow: BaseWorkflow | None = None

@@ -176,7 +189,7 @@ class EvaluationSession:
         self.workflow = MainFactory.create_workflow(context=context)

         logger.info(
-            f"[{self._NAME}] Starting evaluation session: {self.session_name}
+            f"[{self._NAME}] Starting evaluation session: {self.session_name} - "
             f"Workflow: '{self.workflow.name}'"
         )
         return self
@@ -190,6 +203,7 @@ class EvaluationSession:

         if exc_type:
             logger.error(f"[{self._NAME}] Session ended with error: {exc_val}", exc_info=True)
+
         return False

     def step(self, step_name: str, category: MetricType = MetricType.CUSTOM) -> StepContext:
@@ -213,6 +227,19 @@ class EvaluationSession:
         self.workflow.collect_results()

     def get_stats(self) -> Dict[str, Any]:
+        if self.enable_monitoring:
+            return {
+                "session": {
+                    "name": self.session_name,
+                    "duration": precisedelta(self.session_metadata.duration, suppress=['minutes']),
+                    "start_time": self.session_metadata.started_at.isoformat(),
+                    "end_time": self.session_metadata.ended_at.isoformat(),
+                    "steps": len(self.session_metadata.steps),
+                    "errors": sum(s.error_count for s in self.session_metadata.steps.values())
+                },
+                "stats": self.monitor.get_all_stats()
+            }
+
         return {
             "session": {
                 "name": self.session_name,
@@ -222,5 +249,4 @@ class EvaluationSession:
                 "steps": len(self.session_metadata.steps),
                 "errors": sum(s.error_count for s in self.session_metadata.steps.values())
             },
-            "stats": self.monitor.get_all_stats()
         }
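Putting the new constructor arguments together, a session can now opt into verbose monitor logs explicitly. Sketch only; the config path follows the README example further down.

```python
from levelapp.workflow import WorkflowConfig
from levelapp.core.session import EvaluationSession

config = WorkflowConfig.load(path="../data/workflow_config.yaml")  # illustrative path

# verbose=True restores the per-step timing lines that are now silenced by default;
# enable_monitoring=False would also make get_stats() skip monitor.get_all_stats().
with EvaluationSession(
    session_name="test-session-1",
    workflow_config=config,
    enable_monitoring=True,
    verbose=True,
) as session:
    session.run()
    print(session.get_stats())
```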
levelapp/evaluator/evaluator.py
CHANGED
@@ -41,7 +41,7 @@ class JudgeEvaluationResults(BaseModel):
     label: str = Field(..., description="The label of the evaluation result")
     justification: str = Field(..., description="Short explanation of the evaluation result")
     evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
-    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response")
+    raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response", exclude=True)
     metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")

     @classmethod
@@ -71,7 +71,14 @@ class JudgeEvaluationResults(BaseModel):


 class JudgeEvaluator(BaseEvaluator):
+    """LLM-as-a-judge evaluator class"""
     def __init__(self, config: "WorkflowConfig | None" = None):
+        """
+        Initialize the JudgeEvaluator.
+
+        Args:
+            config (WorkflowConfig | None): The configuration of the workflow.
+        """
         if config:
             self.config = config
             self.providers = config.evaluation.providers
@@ -206,7 +213,6 @@ class JudgeEvaluator(BaseEvaluator):
         ):
             with attempt:
                 response = await client.acall(message=prompt)
-                logger.info(f"[{provider}] Async evaluation:\n{response}\n{'---' * 10}")
                 parsed = client.parse_response(response=response)
                 return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)

@@ -224,7 +230,14 @@ class JudgeEvaluator(BaseEvaluator):


 class MetadataEvaluator(BaseEvaluator):
-
+    """Metadata evaluator class."""
+    def __init__(self, config: "WorkflowConfig | None" = None):
+        """
+        Initialize the MetadataEvaluator.
+
+        Args:
+            config (WorkflowConfig | None): The workflow configuration.
+        """
         if config:
             self.config = config
             self.metics_map = config.evaluation.metrics_map
@@ -261,7 +274,6 @@ class MetadataEvaluator(BaseEvaluator):
         self.comparator.reference_data = ref_data

         output = self.comparator.run(indexed_mode=False)
-        logger.info(f"Comparison results:\n{output}\n---")
         results: Dict[str, float] = {}

         for k, v in output.items():
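The exclude=True on raw_response means the raw provider payload stays available on the instance but is dropped from serialized results. A generic pydantic illustration, not the package's own class:

```python
from typing import Any, Dict
from pydantic import BaseModel, Field

class Result(BaseModel):
    justification: str
    raw_response: Dict[str, Any] = Field(..., exclude=True)

r = Result(justification="matches reference", raw_response={"id": "abc"})
print(r.model_dump())   # {'justification': 'matches reference'}
print(r.raw_response)   # still accessible in memory
```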
levelapp/metrics/__init__.py
CHANGED
@@ -1,14 +1,11 @@
 """levelapp/metrics/__init__.py"""
-import logging
-
 from typing import List, Dict, Type, Any

+from levelapp.aspects import logger
 from levelapp.core.base import BaseMetric
 from levelapp.metrics.exact import EXACT_METRICS
 from levelapp.metrics.fuzzy import FUZZY_METRICS

-logger = logging.getLogger(__name__)
-

 class MetricRegistry:
     """Registry for metric classes."""
@@ -27,7 +24,6 @@ class MetricRegistry:
             raise KeyError(f"Metric '{name}' is already registered")

         cls._metrics[name] = metric_class
-        logger.info(f"Metric '{name}' registered successfully.")

     @classmethod
     def get(cls, name: str, **kwargs: Any) -> BaseMetric:
levelapp/simulator/schemas.py
CHANGED
@@ -9,7 +9,7 @@ from uuid import UUID, uuid4
 from datetime import datetime

 from typing import Dict, Any, List
-from pydantic import BaseModel, Field, computed_field
+from pydantic import BaseModel, Field, computed_field, field_validator

 from levelapp.evaluator.evaluator import JudgeEvaluationResults

@@ -64,26 +64,20 @@ class InteractionEvaluationResults(BaseModel):


 class SimulationResults(BaseModel):
-    # Initial data
-    project_id: str = Field(default_factory=uuid4, description="Project identifier")
-    user_id: str = Field(default_factory=uuid4, description="User identifier")
-    batch_id: str = Field(default_factory=uuid4, description="Batch identifier")
     # Collected data
     started_at: datetime = datetime.now()
     finished_at: datetime
     # Collected Results
     evaluation_summary: Dict[str, Any] | None = Field(default_factory=dict, description="Evaluation result")
     average_scores: Dict[str, Any] | None = Field(default_factory=dict, description="Average scores")
+    interaction_results: List[Dict[str, Any]] | None = Field(default_factory=list, description="detailed results")
+
+    @computed_field
+    @property
+    def batch_id(self) -> str:
+        return str(uuid4())

     @computed_field
     @property
     def elapsed_time(self) -> float:
         return (self.finished_at - self.started_at).total_seconds()
-
-
-class TestResults(BaseModel):
-    api_host: str = Field(..., alias="apiHost")
-    ionos_model_name: str = Field(..., alias="ionosModelName")
-    test_name: str = Field(..., alias="testName")
-    test_type: str = Field(..., alias="testType")
-    batch_details: SimulationResults | None = Field(..., alias="results")
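With project_id/user_id gone and batch_id turned into a computed field, a result object only needs the collected data. A sketch assuming the fields shown in this hunk are the only required ones:

```python
from datetime import datetime, timedelta
from levelapp.simulator.schemas import SimulationResults

results = SimulationResults(
    started_at=datetime.now() - timedelta(seconds=3),
    finished_at=datetime.now(),
    average_scores={"openai": 0.82},  # invented score
)
# batch_id and elapsed_time are computed, so they appear in dumps
# without being passed to the constructor.
print(results.model_dump()["batch_id"], results.elapsed_time)
```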
levelapp/simulator/simulator.py
CHANGED
@@ -1,6 +1,7 @@
 """
 'simulators/service.py': Service layer to manage conversation simulation and evaluation.
 """
+import json
 import time
 import asyncio

@@ -92,6 +93,15 @@ class ConversationSimulator(BaseProcess):
         self._headers = endpoint_config.headers

     def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+        """
+        Retrieve an evaluator by name.
+
+        Args:
+            name (EvaluatorType): Name of evaluator.
+
+        Returns:
+            An evaluator object.
+        """
         _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"

         if name not in self.evaluators:
@@ -103,7 +113,7 @@ class ConversationSimulator(BaseProcess):
         self,
         test_batch: ScriptsBatch,
         attempts: int = 1,
-    ) ->
+    ) -> Any:
         """
         Run a batch test for the given batch name and details.

@@ -129,9 +139,10 @@ class ConversationSimulator(BaseProcess):
             finished_at=finished_at,
             evaluation_summary=self.verdict_summaries,
             average_scores=results.get("average_scores", {}),
+            interaction_results=results.get("results")
         )

-        return
+        return results.model_dump_json(indent=2)

     async def simulate_conversation(self, attempts: int = 1) -> Dict[str, Any]:
         """
@@ -171,10 +182,11 @@ class ConversationSimulator(BaseProcess):
                 verdicts=verdicts, judge=judge
             )

-        return {"
+        return {"results": results, "average_scores": overall_average_scores}

     async def simulate_single_scenario(
-        self, script: ConversationScript,
+        self, script: ConversationScript,
+        attempts: int = 1
     ) -> Dict[str, Any]:
         """
         Simulate a single scenario with the given number of attempts, concurrently.
@@ -193,19 +205,18 @@ class ConversationSimulator(BaseProcess):
         all_attempts_verdicts: Dict[str, List[str]] = defaultdict(list)

         async def simulate_attempt(attempt_number: int) -> Dict[str, Any]:
-            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}")
+            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}\n---")
             start_time = time.time()

             collected_scores: Dict[str, List[Any]] = defaultdict(list)
             collected_verdicts: Dict[str, List[str]] = defaultdict(list)

-
+            interaction_results = await self.simulate_interactions(
                 script=script,
                 evaluation_verdicts=collected_verdicts,
                 collected_scores=collected_scores,
             )

-            logger.info(f"{_LOG} collected_scores: {collected_scores}\n---")
             single_attempt_scores = calculate_average_scores(collected_scores)

             for target, scores in single_attempt_scores.items():
@@ -225,7 +236,7 @@ class ConversationSimulator(BaseProcess):
                 "attempt": attempt_number + 1,
                 "script_id": script.id,
                 "total_duration": elapsed_time,
-                "interaction_results":
+                "interaction_results": interaction_results,
                 "evaluation_verdicts": collected_verdicts,
                 "average_scores": single_attempt_scores,
             }
@@ -238,10 +249,6 @@ class ConversationSimulator(BaseProcess):
         for judge_, verdicts_ in all_attempts_verdicts.items():
             self.evaluation_verdicts[judge_].extend(verdicts_)

-        logger.info(
-            f"{_LOG} average scores:\n{average_scores}\n---"
-        )
-
         return {
             "script_id": script.id,
             "attempts": attempt_results,
@@ -324,8 +331,6 @@ class ConversationSimulator(BaseProcess):
             reference_guardrail=reference_guardrail_flag,
         )

-        logger.info(f"{_LOG} Evaluation results:\n{evaluation_results.model_dump()}\n")
-
         self.store_evaluation_results(
             results=evaluation_results,
             evaluation_verdicts=evaluation_verdicts,
@@ -333,9 +338,7 @@ class ConversationSimulator(BaseProcess):
         )

         elapsed_time = time.time() - start_time
-        logger.info(
-            f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---"
-        )
+        logger.info(f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---")

         result = {
             "user_message": user_message,
@@ -494,7 +497,7 @@ class ConversationSimulator(BaseProcess):
             collected_scores (Dict[str, List[Any]]): The collected scores.
         """
         for provider in results.judge_evaluations.keys():
-            evaluation_verdicts[f"{provider}
+            evaluation_verdicts[f"{provider}"].append(
                 results.judge_evaluations.get(provider, "").justification
             )

levelapp/simulator/utils.py
CHANGED
@@ -1,17 +1,18 @@
 """
 'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
 """
+import ast
 import json
-
 import httpx
-import arrow

 from uuid import UUID
-from
+from string import Template
+from typing import Any, Dict, List, Union

-from openai import OpenAI
 from pydantic import ValidationError
-
+from levelapp.clients import ClientRegistry
+from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
 from levelapp.simulator.schemas import InteractionResults
 from levelapp.aspects import MonitoringAspect, MetricType, logger

@@ -48,7 +49,14 @@ def extract_interaction_details(
         missing_keys = required_keys - response_dict.keys()
         logger.warning(f"[extract_interaction_details] Missing data: {missing_keys}]")

-
+        output = {}
+        for k, v in template.items():
+            output[k] = Template(v).safe_substitute(response_dict)
+
+        raw_value = output.get("generated_metadata", {})
+        output["generated_metadata"] = ast.literal_eval(raw_value) if isinstance(raw_value, str) else raw_value
+
+        return InteractionResults.model_validate(output)

     except json.JSONDecodeError as e:
         logger.error(f"[extract_interaction_details] Failed to extract details:\n{e}")
@@ -64,7 +72,7 @@ async def async_interaction_request(
     url: str,
     headers: Dict[str, str],
     payload: Dict[str, Any],
-) ->
+) -> httpx.Response | None:
     """
     Perform an asynchronous interaction request.

@@ -74,7 +82,7 @@ async def async_interaction_request(
         payload (Dict[str, Any]): The payload to send in the request.

     Returns:
-
+        httpx.Response: The response from the interaction request, or None if an error occurred.
     """
     try:
         async with httpx.AsyncClient(timeout=180) as client:
@@ -92,42 +100,6 @@ async def async_interaction_request(
             return None


-def parse_date_value(raw_date_value: Optional[str], default_date_value: Optional[str] = "") -> str:
-    """
-    Cleans and parses a dehumanized relative date string to ISO format.
-
-    Args:
-        raw_date_value (Optional[str]): The raw date value to parse.
-        default_date_value (Optional[str]): The default value to return if parsing fails. Defaults to an empty string.
-
-    Returns:
-        str: The parsed date in ISO format, or the default value if parsing fails.
-    """
-    if not raw_date_value:
-        logger.info(f"[parse_date_value] No raw value provided. returning default: '{default_date_value}'")
-        return default_date_value
-
-    clean = raw_date_value.replace("{{", "").replace("}}", "").replace("_", " ").strip()
-    clean += 's' if not clean.endswith('s') else clean
-
-    try:
-        arw = arrow.utcnow()
-        parsed_date = arw.dehumanize(clean).utcnow().format('YYYY-MM-DD')
-        return parsed_date
-
-    except arrow.parser.ParserError as e:
-        logger.error(f"[parse_date_value] Failed to parse date: '{clean}'\nParserError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except ValueError as e:
-        logger.error(f"[parse_date_value] Invalid date value: '{clean}'\nValueError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except Exception as e:
-        logger.error(f"[parse_date_value] Unexpected error.\nException: {str(e)}", exc_info=True)
-        return default_date_value
-
-
 @MonitoringAspect.monitor(
     name="average_calc",
     category=MetricType.SCORING,
@@ -157,45 +129,35 @@ def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Di


 @MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
-def summarize_verdicts(
-
-
-
-
-
-        max_bullets (int): The maximum number of bullets allowed per judge.
-
-    Returns:
-        List[str]: The summarized justifications.
-    """
-    if not verdicts:
-        return []
-
-    prompt = f"""
-    You are reviewing evaluation justifications from LL judges about replies generated by a virtual leasing agent.\n
-    Each justification contains the judge's assessment of how well the agent's response matched the expected reply.\n
-    Your task is to identify and summarize only the **negative points**, such as errors, misunderstandings,
-    missing information, or failure to meet expectations.\n
-    Return up to {max_bullets} bullet points. Be concise and start each point with '- '\n\n
-    ---
-    - Judge: {judge}
-    - Justifications:\n{chr(10).join(verdicts)}\n
-    """
-
-    client = OpenAI()
+def summarize_verdicts(
+        verdicts: List[str],
+        judge: str,
+        max_bullets: int = 5
+) -> List[str]:
+    client_registry = ClientRegistry()
+    client = client_registry.get(provider=judge)

     try:
-
-
-
-
-    ).
+        verdicts = chr(10).join(verdicts)
+        prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=verdicts)
+        response = client.call(message=prompt)
+        parsed = client.parse_response(response=response)
+        striped = parsed.get("output", "").strip("")
+        bullet_points = [point.strip() for point in striped.split("- ") if point.strip()]

-        bullet_points
-
-        return bullet_points
+        return bullet_points[:max_bullets]

     except Exception as e:
         logger.error(f"[summarize_justifications] Error during summarization: {str(e)}", exc_info=True)
         return []
+
+
+# if __name__ == '__main__':
+#     template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
+#     response_dict = {
+#         'agent_reply': "I'd be happy to help you book something for 10 AM.",
+#         'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
+#     }
+#
+#     result = extract_interaction_details(response_dict, template)
+#     print(f"result: {result.model_dump()}")
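summarize_verdicts now routes through ClientRegistry instead of a hard-coded OpenAI client. A usage sketch; the verdicts are invented and the judge name must resolve to a registered provider, such as those configured for evaluation.

```python
from levelapp.simulator.utils import summarize_verdicts

bullets = summarize_verdicts(
    verdicts=[
        "The assistant confirmed the wrong appointment date.",
        "The reply omitted the requested cancellation policy.",
    ],
    judge="openai",   # must match a registered chat client provider
    max_bullets=3,
)
print(bullets)
```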
levelapp/workflow/base.py
CHANGED
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from pydantic import ValidationError
 from functools import partial
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict

 from levelapp.core.base import BaseProcess
 from levelapp.simulator.schemas import ScriptsBatch
@@ -60,15 +60,32 @@ class BaseWorkflow(ABC):
         self._results = await loop.run_in_executor(None, func, None)

     def collect_results(self) -> Any:
-        """
+        """
+        Return unified results structure.
+
+        Returns:
+            The simulation results.
+        """
         return self._results

     @abstractmethod
     def _setup_process(self, context: WorkflowContext) -> BaseProcess:
+        """
+        Abstract method for setting up the configured process.
+
+        Args:
+            context (WorkflowContext): The workflow context.
+        """
         raise NotImplementedError

     @abstractmethod
     def _load_input_data(self, context: WorkflowContext) -> Any:
+        """
+        Abstract method for loading reference data.
+
+        Args:
+            context (WorkflowContext): The workflow context.
+        """
         raise NotImplementedError


@@ -77,6 +94,15 @@ class SimulatorWorkflow(BaseWorkflow):
         super().__init__(name="ConversationSimulator", context=context)

     def _setup_process(self, context: WorkflowContext) -> BaseProcess:
+        """
+        Concrete implementation for setting up the simulation workflow.
+
+        Args:
+            context (WorkflowContext): The workflow context for the simulation workflow.
+
+        Returns:
+            ConversationSimulator instance.
+        """
         simulator = ConversationSimulator()
         simulator.setup(
             repository=context.repository,
@@ -86,7 +112,16 @@ class SimulatorWorkflow(BaseWorkflow):
         )
         return simulator

-    def _load_input_data(self, context: WorkflowContext) -> Any:
+    def _load_input_data(self, context: WorkflowContext) -> Dict[str, Any]:
+        """
+        Concrete implementation for loading the reference data.
+
+        Args:
+            context (WorkflowContext): The workflow context for the simulation workflow.
+
+        Returns:
+            Dict[str, Any]: The reference data.
+        """
         loader = DataLoader()
         if "reference_data" in context.inputs:
             data_config = context.inputs["reference_data"]
levelapp/workflow/config.py
CHANGED
@@ -2,6 +2,7 @@
 from typing import List, Dict, Any, Optional
 from pydantic import BaseModel, Field

+from levelapp.aspects import logger
 from levelapp.config.endpoint import EndpointConfig
 from levelapp.core.schemas import WorkflowType, RepositoryType, EvaluatorType

@@ -47,19 +48,45 @@ class WorkflowConfig(BaseModel):
         extra = "allow"

     @classmethod
-    def load(cls, path:
-        """
+    def load(cls, path: str | None = None) -> "WorkflowConfig":
+        """
+        Load workflow configuration from a YAML/JSON file.
+
+        Args:
+            path (str): YAML/JSON configuration file path.
+
+        Returns:
+            WorkflowConfig: An instance of WorkflowConfig.
+        """
         from levelapp.aspects.loader import DataLoader

         loader = DataLoader()
         config_dict = loader.load_raw_data(path=path)
+        logger.info(f"[{cls.__name__}] Workflow configuration loaded from '{path}' file content")
         return cls.model_validate(config_dict)

     @classmethod
     def from_dict(cls, content: Dict[str, Any]) -> "WorkflowConfig":
-        """
+        """
+        Load workflow configuration from an in-memory dict.
+
+        Args:
+            content (dict): Workflow configuration content.
+
+        Returns:
+            WorkflowConfig: An instance of WorkflowConfig.
+        """
+        logger.info(f"[{cls.__name__}] Workflow configuration loaded from provided content")
         return cls.model_validate(content)

     def set_reference_data(self, content: Dict[str, Any]) -> None:
-        """
+        """
+        Load referer data from an in-memory dict.
+
+        Args:
+            content (dict): Workflow configuration content.
+
+        """
         self.reference_data.data = content
+        logger.info(f"[{self.__class__.__name__}] Reference data loaded from provided content")
+
levelapp/workflow/context.py
CHANGED
@@ -51,7 +51,6 @@ class WorkflowContextBuilder:
             inputs["reference_data"] = self.config.reference_data.data
         else:
             inputs["reference_data_path"] = self.config.reference_data.path
-            print(f"[WorkflowContextBuilder] reference data path: {inputs['reference_data_path']}")

         return WorkflowContext(
             config=self.config,
levelapp/workflow/factory.py
CHANGED
@@ -8,7 +8,6 @@ from levelapp.workflow.runtime import WorkflowContext

 class MainFactory:
     """Central factory for workflows."""
-
     _workflow_map: Dict[WorkflowType, Callable[[WorkflowContext], BaseWorkflow]] = {
         WorkflowType.SIMULATOR: lambda ctx: SimulatorWorkflow(ctx),
         WorkflowType.COMPARATOR: lambda ctx: ComparatorWorkflow(ctx),
@@ -16,7 +15,15 @@ class MainFactory:

     @classmethod
     def create_workflow(cls, context: WorkflowContext) -> BaseWorkflow:
-        """
+        """
+        Create workflow using the given runtime context.
+
+        Args:
+            context (WorkflowContext): the provided workflow context.
+
+        Returns:
+            BaseWorkflow: the built workflow instance from the provided context.
+        """
         wf_type = context.config.process.workflow_type
         builder = cls._workflow_map.get(wf_type)
         if not builder:
@@ -25,5 +32,11 @@ class MainFactory:

     @classmethod
     def register_workflow(cls, wf_type: WorkflowType, builder: Callable[[WorkflowContext], BaseWorkflow]) -> None:
-        """
+        """
+        Register a new workflow implementation.
+
+        Args:
+            wf_type (WorkflowType): the workflow type.
+            builder (Callable[[WorkflowContext], BaseWorkflow]): the workflow builder.
+        """
         cls._workflow_map[wf_type] = builder
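register_workflow lets callers extend the factory map; a sketch that simply re-registers the built-in simulator builder (a real plugin would pass its own BaseWorkflow subclass).

```python
from levelapp.core.schemas import WorkflowType
from levelapp.workflow.base import SimulatorWorkflow
from levelapp.workflow.factory import MainFactory

MainFactory.register_workflow(
    wf_type=WorkflowType.SIMULATOR,
    builder=lambda ctx: SimulatorWorkflow(ctx),
)
```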
{levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: levelapp
-Version: 0.1.1
+Version: 0.1.2
 Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
 Project-URL: Homepage, https://github.com/levelapp-org
 Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
@@ -17,14 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Testing
 Requires-Python: >=3.12
-Requires-Dist: arrow>=1.3.0
 Requires-Dist: google-api-core>=2.25.1
 Requires-Dist: google-auth>=2.40.3
 Requires-Dist: google-cloud-firestore>=2.21.0
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: humanize>=4.13.0
 Requires-Dist: numpy>=2.3.2
-Requires-Dist: openai>=1.99.9
 Requires-Dist: pandas-stubs==2.3.0.250703
 Requires-Dist: pandas>=2.3.1
 Requires-Dist: pydantic>=2.11.7
@@ -133,7 +131,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"

 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE  # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -220,14 +218,14 @@ To run an evaluation:

 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession

     # Load configuration from YAML
     config = WorkflowConfig.load(path="../data/workflow_config.yaml")

-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -243,14 +241,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession

-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
         "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }

     content = {
{levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/RECORD
CHANGED
@@ -2,9 +2,9 @@ levelapp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 levelapp/aspects/__init__.py,sha256=_OaPcjTWBizqcUdDVj5aYue7lG9ytjQGLhPvReriKnU,326
 levelapp/aspects/loader.py,sha256=xWpcWtS25zbVhZ0UnIJEcQA9klajKk10TLK4j1IStH0,9543
 levelapp/aspects/logger.py,sha256=MJ9HphyHYkTE5-ajA_WuMUTM0qQzd0WIP243vF-pj3M,1698
-levelapp/aspects/monitor.py,sha256=
+levelapp/aspects/monitor.py,sha256=KREhG_KH4f7okyheexaeKykwhzzQsFl1USC-k9YPXfQ,22065
 levelapp/aspects/sanitizer.py,sha256=zUqgb76tXJ8UUYtHp0Rz7q9PZjAHpSpHPPFfGTjjQNg,5229
-levelapp/clients/__init__.py,sha256=
+levelapp/clients/__init__.py,sha256=of3Zdkag634COXH_ca4hxXkERT8X44QS0IgfWu2yWqY,4084
 levelapp/clients/anthropic.py,sha256=Bxp-HffcIPLwM9BLcTR7n-D8ZXYVWCmbr2oH33fKV04,4030
 levelapp/clients/ionos.py,sha256=GFkLSeu8epFZV44GbNO3h1fRCKcfxscHMTFY0kPfh3o,4267
 levelapp/clients/mistral.py,sha256=e1NRvP9qN7O2zWAzBbgdQmmUDHQfCRLtVKDJCrh0DNA,3777
@@ -13,18 +13,19 @@ levelapp/comparator/__init__.py,sha256=ynmc0mrx-JbcCqLH-z4hOVezqGocDbDQGqgbhWy2x
 levelapp/comparator/comparator.py,sha256=yk0FWREnWKhIbXlsYpieqPJPqrlWXzyBMjVSznGqKY8,8183
 levelapp/comparator/extractor.py,sha256=vJ9iEoWAtXo2r9r7X72uUQPKW3UZE9Kx3uIjCufEp9k,3910
 levelapp/comparator/schemas.py,sha256=lUAQzEyStidt2ePQgV2zq-An5MLBrVSw6t8fB0FQKJs,1803
-levelapp/comparator/scorer.py,sha256=
+levelapp/comparator/scorer.py,sha256=LBRy8H11rXulSa-k40BcycPcMAHgdUm13qS7ibWHq6I,9032
 levelapp/comparator/utils.py,sha256=Eu48nDrNzFr0lwAJJS0aNhKsAWQ72syTEWYMNYfg764,4331
 levelapp/config/__init__.py,sha256=9oaajE5zW-OVWOszUzMAG6nHDSbLQWa3KT6bVoSvzRA,137
-levelapp/config/endpoint.py,sha256=
-levelapp/config/
+levelapp/config/endpoint.py,sha256=B-uIEKF-0_Y6Vo8MZ8eoCZocRkghijrdpwT3zq0FDLk,7647
+levelapp/config/endpoint_.py,sha256=-abrIYKbFPLxTqNst-zbCI4MpMCmCMe9VZ6O8OwNRiE,1629
+levelapp/config/prompts.py,sha256=NXOKRp5l1VQ9LO0pUojVH6TDJhWyZImsAvZEz2QiD9k,2206
 levelapp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 levelapp/core/base.py,sha256=oh4OkKgwGxmw_jgjX6wrBoK0KPc1JvCMZfbZP_mGmIg,12453
-levelapp/core/schemas.py,sha256=
-levelapp/core/session.py,sha256=
+levelapp/core/schemas.py,sha256=E47d93MMOj4eRYZIqUyLBiE5Ye7WgwkOJPOWQ6swRmo,465
+levelapp/core/session.py,sha256=6utDbLdg6DjwHL5dP-4wGe4_f7gFgEukuNNeOnbCbtA,9035
 levelapp/evaluator/__init__.py,sha256=K-P75Q1FXXLCNqH1wyhT9sf4y2R9a1qR5449AXEsY1k,109
-levelapp/evaluator/evaluator.py,sha256=
-levelapp/metrics/__init__.py,sha256=
+levelapp/evaluator/evaluator.py,sha256=JCRgQps9GKlJBDYw9xzVrC2_aGy0GhGAJ0ZkSC_IWWA,10806
+levelapp/metrics/__init__.py,sha256=x8iTaeDezJyQ9-NFe8GGvzwIBhyAJHWSRfBE3JRX-PE,1878
 levelapp/metrics/embedding.py,sha256=wvlT8Q5DjDT6GrAIFtc5aFbA_80hDLUXMP4RbSpSwHE,115
 levelapp/metrics/exact.py,sha256=Kb13nD2OVLrl3iYHaXrxDfrxDuhW0SMVvLAEXPaJtlY,6235
 levelapp/metrics/fuzzy.py,sha256=Rg8ashzMxtQwKO-z_LLzdj2PDIRqL4CBw6PGRf9IBrI,2598
@@ -33,17 +34,17 @@ levelapp/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 levelapp/repository/__init__.py,sha256=hNmFRZ7kKJN1mMlOHeW9xf0j9Q7gqTXYJ3hMCzk9to4,79
 levelapp/repository/firestore.py,sha256=K9JgxsNCelAKtzTDv19c1dHRlitMeRzo7H3caTlKuF8,10369
 levelapp/simulator/__init__.py,sha256=8Dz8g7rbpBZX3WoknVmMVoWm_VT72ZL9BABOF1xFpqs,83
-levelapp/simulator/schemas.py,sha256=
-levelapp/simulator/simulator.py,sha256=
-levelapp/simulator/utils.py,sha256=
+levelapp/simulator/schemas.py,sha256=YGprtuRZ4m33WBD35xj1Ib5EbMTdDCOp-wCykf-Iz-4,3700
+levelapp/simulator/simulator.py,sha256=ytgjUE9G8z2oW6rEzSdVkyncPiJcZCLWyvLR0cI5rk8,19895
+levelapp/simulator/utils.py,sha256=d1O4Q4Yl1lAAJWLJDiwNjwt0hD9bGlCan4a2G21E7yw,5930
 levelapp/workflow/__init__.py,sha256=27b2obG7ObhR43yd2uH-R0koRB7-DG8Emnvrq8EjsTA,193
-levelapp/workflow/base.py,sha256=
-levelapp/workflow/config.py,sha256=
-levelapp/workflow/context.py,sha256=
-levelapp/workflow/factory.py,sha256=
+levelapp/workflow/base.py,sha256=1A_xKSBOmVjfMbRBcNhDK6G17SEjqRIm-XjMw45IPC4,5596
+levelapp/workflow/config.py,sha256=MlHt1PsXD09aukB93fvKTew0D8WD4_jdnO93Nn6b2U0,2923
+levelapp/workflow/context.py,sha256=gjAZXHEdlsXqWY6DbXOfKXNbxQbahRPSnNzyWDqryPU,2559
+levelapp/workflow/factory.py,sha256=z1ttJmI59sU9HgOvPo3ixUJ_oPv838XgehfuOorlTt8,1634
 levelapp/workflow/registration.py,sha256=VHUHjLHXad5kjcKukaEOIf7hBZ09bT3HAzVmIT08aLo,359
 levelapp/workflow/runtime.py,sha256=cFyXNWXSuURKbrMDHdkTcjeItM9wHP-5DPljntwYL5g,686
-levelapp-0.1.
-levelapp-0.1.
-levelapp-0.1.
-levelapp-0.1.
+levelapp-0.1.2.dist-info/METADATA,sha256=d0ZoS0BTpHpq65Q1KLeqlN12XQiCAyJ-zPP17T22D6o,12446
+levelapp-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+levelapp-0.1.2.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+levelapp-0.1.2.dist-info/RECORD,,
{levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/WHEEL
File without changes
{levelapp-0.1.1.dist-info → levelapp-0.1.2.dist-info}/licenses/LICENSE
File without changes