levelapp-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +614 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +119 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/ionos.py +116 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +102 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +271 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +190 -0
- levelapp/config/prompts.py +35 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/session.py +214 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +265 -0
- levelapp/metrics/__init__.py +67 -0
- levelapp/metrics/embedding.py +2 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/firestore.py +282 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +89 -0
- levelapp/simulator/simulator.py +441 -0
- levelapp/simulator/utils.py +201 -0
- levelapp/workflow/__init__.py +5 -0
- levelapp/workflow/base.py +113 -0
- levelapp/workflow/factory.py +51 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/schemas.py +121 -0
- levelapp-0.1.0.dist-info/METADATA +254 -0
- levelapp-0.1.0.dist-info/RECORD +46 -0
- levelapp-0.1.0.dist-info/WHEEL +4 -0
- levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
levelapp/simulator/simulator.py
@@ -0,0 +1,441 @@
+"""
+'simulators/service.py': Service layer to manage conversation simulation and evaluation.
+"""
+import time
+import asyncio
+
+from datetime import datetime
+from collections import defaultdict
+from typing import Dict, Any, List
+
+from levelapp.core.base import BaseRepository, BaseProcess, BaseEvaluator
+from levelapp.config.endpoint import EndpointConfig
+from levelapp.simulator.schemas import (
+    InteractionEvaluationResults,
+    ScriptsBatch,
+    ConversationScript,
+    SimulationResults
+)
+from levelapp.simulator.utils import (
+    extract_interaction_details,
+    async_interaction_request,
+    calculate_average_scores,
+    summarize_verdicts,
+)
+from levelapp.aspects import logger
+from levelapp.workflow.schemas import EvaluatorType
+
+
+class ConversationSimulator(BaseProcess):
+    """Conversation simulator component."""
+
+    def __init__(
+            self,
+            repository: BaseRepository | None = None,
+            evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+            endpoint_config: EndpointConfig | None = None,
+    ):
+        """
+        Initialize the ConversationSimulator.
+
+        Args:
+            repository (BaseRepository): Service for saving simulation results.
+            evaluators (EvaluationService): Service for evaluating interactions.
+            endpoint_config (EndpointConfig): Configuration object for VLA.
+        """
+        self._CLASS_NAME = self.__class__.__name__
+
+        self.repository = repository
+        self.evaluators = evaluators
+        self.endpoint_config = endpoint_config
+
+        self._url: str | None = None
+        self._credentials: str | None = None
+        self._headers: Dict[str, Any] | None = None
+
+        self.test_batch: ScriptsBatch | None = None
+        self.evaluation_verdicts: Dict[str, List[str]] = defaultdict(list)
+        self.verdict_summaries: Dict[str, List[str]] = defaultdict(list)
+
+    def setup(
+            self,
+            repository: BaseRepository,
+            evaluators: Dict[str, BaseEvaluator],
+            endpoint_config: EndpointConfig,
+    ) -> None:
+        """
+        Initialize the ConversationSimulator.
+
+        Args:
+            repository (BaseRepository): Repository object for storing simulation results.
+            evaluators (Dict[str, BaseEvaluator]): List of evaluator objects for evaluating interactions.
+            endpoint_config (EndpointConfig): Configuration object for VLA.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
+        logger.info(f"{_LOG} Setting up the Conversation Simulator..")
+
+        self.repository = repository
+        self.evaluators = evaluators
+        self.endpoint_config = endpoint_config
+
+        self._url = endpoint_config.full_url
+        self._credentials = endpoint_config.api_key.get_secret_value()
+        self._headers = endpoint_config.headers
+
+    def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+        _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
+
+        if name not in self.evaluators:
+            raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+        return self.evaluators[name]
+
+    async def run(
+            self,
+            test_batch: ScriptsBatch,
+            attempts: int = 1,
+    ) -> Dict[str, Any]:
+        """
+        Run a batch test for the given batch name and details.
+
+        Args:
+            test_batch (ScriptsBatch): Scenario batch object.
+            attempts (int): Number of attempts to run the simulation.
+
+        Returns:
+            Dict[str, Any]: The results of the batch test.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.run.__name__}]"
+        logger.info(f"{_LOG} Starting batch test (attempts: {attempts}).")
+
+        started_at = datetime.now()
+
+        self.test_batch = test_batch
+        results = await self.simulate_conversation(attempts=attempts)
+
+        finished_at = datetime.now()
+
+        results = SimulationResults(
+            started_at=started_at,
+            finished_at=finished_at,
+            evaluation_summary=self.verdict_summaries,
+            average_scores=results.get("average_scores", {}),
+        )
+
+        return {"results": results, "status": "COMPLETE"}
+
+    async def simulate_conversation(self, attempts: int = 1) -> Dict[str, Any]:
+        """
+        Simulate conversations for all scenarios in the batch.
+
+        Args:
+            attempts (int): Number of attempts to run the simulation.
+
+        Returns:
+            Dict[str, Any]: The results of the conversation simulation.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_conversation.__name__}]"
+        logger.info(f"{_LOG} starting conversation simulation..")
+
+        semaphore = asyncio.Semaphore(value=len(self.test_batch.scripts))
+
+        async def run_with_semaphore(script: ConversationScript) -> Dict[str, Any]:
+            async with semaphore:
+                return await self.simulate_single_scenario(
+                    script=script, attempts=attempts
+                )
+
+        results = await asyncio.gather(
+            *(run_with_semaphore(s) for s in self.test_batch.scripts)
+        )
+
+        aggregate_scores: Dict[str, List[float]] = defaultdict(list)
+        for result in results:
+            for key, value in result.get("average_scores", {}).items():
+                if isinstance(value, (int, float)):
+                    aggregate_scores[key].append(value)
+
+        overall_average_scores = calculate_average_scores(aggregate_scores)
+
+        for judge, verdicts in self.evaluation_verdicts.items():
+            self.verdict_summaries[judge] = summarize_verdicts(
+                verdicts=verdicts, judge=judge
+            )
+
+        return {"scripts": results, "average_scores": overall_average_scores}
+
+    async def simulate_single_scenario(
+            self, script: ConversationScript, attempts: int = 1
+    ) -> Dict[str, Any]:
+        """
+        Simulate a single scenario with the given number of attempts, concurrently.
+
+        Args:
+            script (SimulationScenario): The scenario to simulate.
+            attempts (int): Number of attempts to run the simulation.
+
+        Returns:
+            Dict[str, Any]: The results of the scenario simulation.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_single_scenario.__name__}]"
+
+        logger.info(f"{_LOG} Starting simulation for script: {script.id}")
+        all_attempts_scores: Dict[str, List[float]] = defaultdict(list)
+        all_attempts_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+        async def simulate_attempt(attempt_number: int) -> Dict[str, Any]:
+            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}")
+            start_time = time.time()
+
+            collected_scores: Dict[str, List[Any]] = defaultdict(list)
+            collected_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+            initial_interaction_results = await self.simulate_interactions(
+                script=script,
+                evaluation_verdicts=collected_verdicts,
+                collected_scores=collected_scores,
+            )
+
+            logger.info(f"{_LOG} collected_scores: {collected_scores}\n---")
+            single_attempt_scores = calculate_average_scores(collected_scores)
+
+            for target, scores in single_attempt_scores.items():
+                all_attempts_scores[target].append(scores)
+
+            for judge, verdicts in collected_verdicts.items():
+                all_attempts_verdicts[judge].extend(verdicts)
+
+            elapsed_time = time.time() - start_time
+            all_attempts_scores["processing_time"].append(elapsed_time)
+
+            logger.info(
+                f"{_LOG} Attempt {attempt_number + 1} completed in {elapsed_time:.2f}s\n---"
+            )
+
+            return {
+                "attempt": attempt_number + 1,
+                "script_id": script.id,
+                "total_duration": elapsed_time,
+                "interaction_results": initial_interaction_results,
+                "evaluation_verdicts": collected_verdicts,
+                "average_scores": single_attempt_scores,
+            }
+
+        attempt_tasks = [simulate_attempt(i) for i in range(attempts)]
+        attempt_results = await asyncio.gather(*attempt_tasks, return_exceptions=False)
+
+        average_scores = calculate_average_scores(all_attempts_scores)
+
+        for judge_, verdicts_ in all_attempts_verdicts.items():
+            self.evaluation_verdicts[judge_].extend(verdicts_)
+
+        logger.info(
+            f"{_LOG} average scores:\n{average_scores}\n---"
+        )
+
+        return {
+            "script_id": script.id,
+            "attempts": attempt_results,
+            "average_scores": average_scores,
+        }
+
+    async def simulate_interactions(
+            self,
+            script: ConversationScript,
+            evaluation_verdicts: Dict[str, List[str]],
+            collected_scores: Dict[str, List[Any]],
+    ) -> List[Dict[str, Any]]:
+        """
+        Simulate inbound interactions for a scenario.
+
+        Args:
+            script (ConversationScript): The script to simulate.
+            evaluation_verdicts(Dict[str, List[str]]): evaluation verdict for each evaluator.
+            collected_scores(Dict[str, List[Any]]): collected scores for each target.
+
+        Returns:
+            List[Dict[str, Any]]: The results of the inbound interactions simulation.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_interactions.__name__}]"
+
+        logger.info(f"{_LOG} Starting interactions simulation..")
+        start_time = time.time()
+
+        results = []
+        interactions = script.interactions
+
+        for interaction in interactions:
+            user_message = interaction.user_message
+            request_payload = interaction.request_payload
+            self.endpoint_config.variables = {
+                "user_message": user_message,
+                "request_payload": request_payload
+            }
+
+            response = await async_interaction_request(
+                url=self.endpoint_config.full_url,
+                headers=self.endpoint_config.headers,
+                payload=self.endpoint_config.request_payload,
+            )
+
+            reference_reply = interaction.reference_reply
+            reference_metadata = interaction.reference_metadata
+            reference_guardrail_flag: bool = interaction.guardrail_flag
+
+            if not response or response.status_code != 200:
+                logger.error(f"{_LOG} Interaction request failed.")
+                result = {
+                    "user_message": user_message,
+                    "generated_reply": "Interaction Request failed",
+                    "reference_reply": reference_reply,
+                    "generated_metadata": {},
+                    "reference_metadata": reference_metadata,
+                    "guardrail_details": None,
+                    "evaluation_results": {},
+                }
+                results.append(result)
+                continue
+
+            interaction_details = extract_interaction_details(
+                response=response.text,
+                template=self.endpoint_config.response_payload,
+            )
+
+            generated_reply = interaction_details.generated_reply
+            generated_metadata = interaction_details.generated_metadata
+            extracted_guardrail_flag: bool = interaction_details.guardrail_flag
+
+            evaluation_results = await self.evaluate_interaction(
+                user_input=user_message,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                generated_guardrail=extracted_guardrail_flag,
+                reference_guardrail=reference_guardrail_flag,
+            )
+
+            logger.info(f"{_LOG} Evaluation results:\n{evaluation_results.model_dump()}\n")
+
+            self.store_evaluation_results(
+                results=evaluation_results,
+                evaluation_verdicts=evaluation_verdicts,
+                collected_scores=collected_scores,
+            )
+
+            elapsed_time = time.time() - start_time
+            logger.info(
+                f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---"
+            )
+
+            result = {
+                "user_message": user_message,
+                "generated_reply": generated_reply,
+                "reference_reply": reference_reply,
+                "generated_metadata": generated_metadata,
+                "reference_metadata": reference_metadata,
+                "guardrail_details": interaction_details.guardrail_flag,
+                "evaluation_results": evaluation_results.model_dump(),
+            }
+
+            results.append(result)
+
+        return results
+
+    async def evaluate_interaction(
+            self,
+            user_input: str,
+            generated_reply: str,
+            reference_reply: str,
+            generated_metadata: Dict[str, Any],
+            reference_metadata: Dict[str, Any],
+            generated_guardrail: bool,
+            reference_guardrail: bool,
+    ) -> InteractionEvaluationResults:
+        """
+        Evaluate an interaction using OpenAI and Ionos evaluation services.
+
+        Args:
+            user_input (str): user input to evaluate.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            generated_guardrail (bool): generated handoff/guardrail flag.
+            reference_guardrail (bool): reference handoff/guardrail flag.
+
+        Returns:
+            InteractionEvaluationResults: The evaluation results.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
+
+        judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
+        metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+
+        if not judge_evaluator:
+            raise ValueError(f"{_LOG} No Judge Evaluator found.")
+
+        openai_eval_task = judge_evaluator.async_evaluate(
+            generated_data=generated_reply,
+            reference_data=reference_reply,
+            user_input=user_input,
+            provider="openai"
+        )
+
+        ionos_eval_task = judge_evaluator.async_evaluate(
+            provider="ionos",
+            user_input=user_input,
+            generated_data=generated_reply,
+            reference_data=reference_reply,
+        )
+
+        openai_judge_evaluation, ionos_judge_evaluation = await asyncio.gather(
+            openai_eval_task, ionos_eval_task
+        )
+
+        if not metadata_evaluator:
+            raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+
+        metadata_evaluation = {}
+        if reference_metadata:
+            metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
+            )
+
+        guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+
+        return InteractionEvaluationResults(
+            judge_evaluations={
+                openai_judge_evaluation.provider: openai_judge_evaluation,
+                ionos_judge_evaluation.provider: ionos_judge_evaluation
+            },
+            metadata_evaluation=metadata_evaluation,
+            guardrail_flag=guardrail_flag,
+        )
+
+    @staticmethod
+    def store_evaluation_results(
+            results: InteractionEvaluationResults,
+            evaluation_verdicts: Dict[str, List[str]],
+            collected_scores: Dict[str, List[Any]],
+    ) -> None:
+        """
+        Store the evaluation results in the evaluation summary.
+
+        Args:
+            results (InteractionEvaluationResults): The evaluation results to store.
+            evaluation_verdicts (Dict[str, List[str]]): The evaluation summary.
+            collected_scores (Dict[str, List[Any]]): The collected scores.
+        """
+        for provider in results.judge_evaluations.keys():
+            evaluation_verdicts[f"{provider}_verdicts_summary"].append(
+                results.judge_evaluations.get(provider, "").justification
+            )
+
+            collected_scores[provider].append(results.judge_evaluations.get(provider, "").score)
+
+        average_metadata_score = calculate_average_scores(scores=results.metadata_evaluation)
+        for field, score in average_metadata_score.items():
+            collected_scores["metadata"].append(score)
+
+        collected_scores["guardrail"].append(results.guardrail_flag)
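
For orientation, here is a minimal sketch of how the ConversationSimulator above is driven: setup() wires a repository, an evaluator registry keyed by EvaluatorType, and an endpoint configuration, then run() is awaited with a ScriptsBatch. This sketch is not part of the wheel; the concrete repository, evaluator, config, and batch objects are placeholders whose construction lives elsewhere in the package (for example levelapp/repository/firestore.py and levelapp/evaluator/evaluator.py) and is assumed here.

import asyncio

from levelapp.simulator.simulator import ConversationSimulator
from levelapp.workflow.schemas import EvaluatorType


async def run_batch(repository, judge_evaluator, reference_evaluator,
                    endpoint_config, test_batch):
    # Placeholders: the caller supplies objects satisfying BaseRepository,
    # BaseEvaluator, EndpointConfig, and ScriptsBatch as defined in the package.
    simulator = ConversationSimulator()
    simulator.setup(
        repository=repository,
        evaluators={
            EvaluatorType.JUDGE: judge_evaluator,          # LLM-judge scoring of replies
            EvaluatorType.REFERENCE: reference_evaluator,  # metadata comparison
        },
        endpoint_config=endpoint_config,
    )
    # run() simulates every script in the batch `attempts` times and returns
    # {"results": SimulationResults, "status": "COMPLETE"}.
    outcome = await simulator.run(test_batch=test_batch, attempts=2)
    return outcome["results"]

# Executed with concrete objects supplied by the caller:
# asyncio.run(run_batch(...))
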
levelapp/simulator/utils.py
@@ -0,0 +1,201 @@
+"""
+'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
+"""
+import json
+
+import httpx
+import arrow
+
+from uuid import UUID
+from typing import Dict, Any, Optional, List, Union
+
+from openai import OpenAI
+from pydantic import ValidationError
+
+from levelapp.simulator.schemas import InteractionResults
+from levelapp.aspects import MonitoringAspect, MetricType, logger
+
+
+class UUIDEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, UUID):
+            return str(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+def extract_interaction_details(
+        response: str | Dict[str, Any],
+        template: Dict[str, Any],
+) -> InteractionResults:
+    """
+    Extract interaction details from a VLA response.
+
+    Args:
+        response (str): The response text from the VLA.
+        template (Dict[str, Any]): The response schema/template.
+
+    Returns:
+        InteractionResults: The extracted interaction details.
+    """
+    try:
+        response_dict = response if isinstance(response, dict) else json.loads(response)
+
+        if not isinstance(response_dict, dict):
+            raise ValueError("Response is not a valid dictionary")
+
+        required_keys = {value.strip("${}") for value in template.values()}
+        if not required_keys.issubset(response_dict.keys()):
+            missing_keys = required_keys - response_dict.keys()
+            logger.warning(f"[extract_interaction_details] Missing data: {missing_keys}]")
+
+        return InteractionResults.model_validate(response_dict)
+
+    except json.JSONDecodeError as e:
+        logger.error(f"[extract_interaction_details] Failed to extract details:\n{e}")
+        return InteractionResults()
+
+    except ValidationError as e:
+        logger.exception(f"[extract_interaction_details] Failed to create an InteractionResults instance:\n{e}")
+        return InteractionResults()
+
+
+@MonitoringAspect.monitor(name="interaction_request", category=MetricType.API_CALL)
+async def async_interaction_request(
+        url: str,
+        headers: Dict[str, str],
+        payload: Dict[str, Any],
+) -> Optional[httpx.Response]:
+    """
+    Perform an asynchronous interaction request.
+
+    Args:
+        url (str): The URL to send the request to.
+        headers (Dict[str, str]): The headers to include in the request.
+        payload (Dict[str, Any]): The payload to send in the request.
+
+    Returns:
+        Optional[httpx.Response]: The response from the interaction request, or None if an error occurred.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=180) as client:
+            response = await client.post(url=url, headers=headers, json=payload)
+            response.raise_for_status()
+
+            return response
+
+    except httpx.HTTPStatusError as http_err:
+        logger.error(f"[async_interaction_request] HTTP error: {http_err.response.text}", exc_info=True)
+
+    except httpx.RequestError as req_err:
+        logger.error(f"[async_interaction_request] Request error: {str(req_err)}", exc_info=True)
+
+    return None
+
+
+def parse_date_value(raw_date_value: Optional[str], default_date_value: Optional[str] = "") -> str:
+    """
+    Cleans and parses a dehumanized relative date string to ISO format.
+
+    Args:
+        raw_date_value (Optional[str]): The raw date value to parse.
+        default_date_value (Optional[str]): The default value to return if parsing fails. Defaults to an empty string.
+
+    Returns:
+        str: The parsed date in ISO format, or the default value if parsing fails.
+    """
+    if not raw_date_value:
+        logger.info(f"[parse_date_value] No raw value provided. returning default: '{default_date_value}'")
+        return default_date_value
+
+    clean = raw_date_value.replace("{{", "").replace("}}", "").replace("_", " ").strip()
+    clean += 's' if not clean.endswith('s') else clean
+
+    try:
+        arw = arrow.utcnow()
+        parsed_date = arw.dehumanize(clean).utcnow().format('YYYY-MM-DD')
+        return parsed_date
+
+    except arrow.parser.ParserError as e:
+        logger.error(f"[parse_date_value] Failed to parse date: '{clean}'\nParserError: {str(e)}", exc_info=True)
+        return default_date_value
+
+    except ValueError as e:
+        logger.error(f"[parse_date_value] Invalid date value: '{clean}'\nValueError: {str(e)}", exc_info=True)
+        return default_date_value
+
+    except Exception as e:
+        logger.error(f"[parse_date_value] Unexpected error.\nException: {str(e)}", exc_info=True)
+        return default_date_value
+
+
+@MonitoringAspect.monitor(
+    name="average_calc",
+    category=MetricType.SCORING,
+    cached=True,
+    maxsize=1000
+)
+def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Dict[str, float]:
+    """
+    Helper function that calculates the average scores for a dictionary of score lists.
+
+    Args:
+        scores (Dict[str, List[float]]): A dictionary where keys are identifiers and values are lists of scores.
+
+    Returns:
+        Dict[str, float]: A dictionary with average scores rounded to three decimal places.
+    """
+    result: Dict[str, float] = {}
+    for field, value in scores.items():
+        if isinstance(value, (int, float)):
+            result[field] = value
+        elif isinstance(value, list):
+            result[field] = round((sum(value) / len(value)), 3) if value else 0.0
+        else:
+            raise TypeError(f"[calculate_average_scores] Unexpected type '{type(value)}' for field '{field}")
+
+    return result
+
+
+@MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
+def summarize_verdicts(verdicts: List[str], judge: str, max_bullets: int = 5) -> List[str]:
+    """
+    Summarize the justifications for each judge.
+
+    Args:
+        verdicts (List[str]): A list of justifications.
+        judge (str): The judge or evaluator (provider) name for context.
+        max_bullets (int): The maximum number of bullets allowed per judge.
+
+    Returns:
+        List[str]: The summarized justifications.
+    """
+    if not verdicts:
+        return []
+
+    prompt = f"""
+    You are reviewing evaluation justifications from LL judges about replies generated by a virtual leasing agent.\n
+    Each justification contains the judge's assessment of how well the agent's response matched the expected reply.\n
+    Your task is to identify and summarize only the **negative points**, such as errors, misunderstandings,
+    missing information, or failure to meet expectations.\n
+    Return up to {max_bullets} bullet points. Be concise and start each point with '- '\n\n
+    ---
+    - Judge: {judge}
+    - Justifications:\n{chr(10).join(verdicts)}\n
+    """
+
+    client = OpenAI()
+
+    try:
+        result = client.chat.completions.create(
+            model="gpt-4o-mini",
+            temperature=0,
+            messages=[{"role": "user", "content": prompt}]
+        ).choices[0].message.content
+
+        bullet_points = [point.strip() for point in result.split('- ') if point.strip()]
+
+        return bullet_points
+
+    except Exception as e:
+        logger.error(f"[summarize_justifications] Error during summarization: {str(e)}", exc_info=True)
+        return []
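
As a usage note on the averaging helper above (an illustrative sketch, not part of the wheel, and assuming the MonitoringAspect decorator defined in levelapp/aspects/monitor.py is transparent to a direct call): calculate_average_scores passes scalar values through unchanged and reduces list values to their mean rounded to three decimals.

from levelapp.simulator.utils import calculate_average_scores

scores = {
    "openai": [0.8, 0.9, 1.0],   # list -> mean rounded to 3 decimals: 0.9
    "guardrail": [1, 1, 0],      # list -> 0.667
    "processing_time": 2.5,      # scalar -> passed through unchanged
}
averaged = calculate_average_scores(scores=scores)
# expected shape: {"openai": 0.9, "guardrail": 0.667, "processing_time": 2.5}

This is the same reduction simulator.py applies per attempt (collected_scores), per script (all_attempts_scores), and across the whole batch (aggregate_scores).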