levelapp 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +617 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +122 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/gemini.py +130 -0
- levelapp/clients/groq.py +101 -0
- levelapp/clients/huggingface.py +162 -0
- levelapp/clients/ionos.py +126 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +116 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +269 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +199 -0
- levelapp/config/prompts.py +57 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +336 -0
- levelapp/endpoint/__init__.py +0 -0
- levelapp/endpoint/client.py +188 -0
- levelapp/endpoint/client_test.py +41 -0
- levelapp/endpoint/manager.py +114 -0
- levelapp/endpoint/parsers.py +119 -0
- levelapp/endpoint/schemas.py +38 -0
- levelapp/endpoint/tester.py +52 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +307 -0
- levelapp/metrics/__init__.py +63 -0
- levelapp/metrics/embedding.py +56 -0
- levelapp/metrics/embeddings/__init__.py +0 -0
- levelapp/metrics/embeddings/sentence_transformer.py +30 -0
- levelapp/metrics/embeddings/torch_based.py +56 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/filesystem.py +203 -0
- levelapp/repository/firestore.py +291 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +116 -0
- levelapp/simulator/simulator.py +531 -0
- levelapp/simulator/utils.py +134 -0
- levelapp/visualization/__init__.py +7 -0
- levelapp/visualization/charts.py +358 -0
- levelapp/visualization/dashboard.py +240 -0
- levelapp/visualization/exporter.py +167 -0
- levelapp/visualization/templates/base.html +158 -0
- levelapp/visualization/templates/comparator_dashboard.html +57 -0
- levelapp/visualization/templates/simulator_dashboard.html +111 -0
- levelapp/workflow/__init__.py +6 -0
- levelapp/workflow/base.py +192 -0
- levelapp/workflow/config.py +96 -0
- levelapp/workflow/context.py +64 -0
- levelapp/workflow/factory.py +42 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/runtime.py +19 -0
- levelapp-0.1.15.dist-info/METADATA +571 -0
- levelapp-0.1.15.dist-info/RECORD +70 -0
- levelapp-0.1.15.dist-info/WHEEL +4 -0
- levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
levelapp/simulator/simulator.py
@@ -0,0 +1,531 @@
+"""
+'simulators/service.py': Service layer to manage conversation simulation and evaluation.
+"""
+import time
+import asyncio
+
+from datetime import datetime
+from collections import defaultdict
+from typing import Dict, Any, List
+
+
+from levelapp.core.base import BaseProcess, BaseEvaluator
+from levelapp.endpoint.client import EndpointConfig
+from levelapp.endpoint.manager import EndpointConfigManager
+
+from levelapp.core.schemas import EvaluatorType
+from levelapp.simulator.schemas import (
+    InteractionEvaluationResults,
+    ScriptsBatch,
+    ConversationScript,
+    SimulationResults, SingleInteractionResults, SingleAttemptResults, AllAttemptsResults
+)
+from levelapp.simulator.utils import (
+    calculate_average_scores,
+    summarize_verdicts,
+)
+from levelapp.aspects import logger
+
+
+class ConversationSimulator(BaseProcess):
+    """Conversation simulator component."""
+
+    def __init__(
+        self,
+        endpoint_config: EndpointConfig | None = None,
+        evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+        providers: List[str] | None = None,
+    ):
+        """
+        Initialize the ConversationSimulator.
+
+        Args:
+            endpoint_config (EndpointConfig): Configuration object for the user endpoint API.
+            evaluators (Dict[EvaluatorType, BaseEvaluator]): Evaluators used to score interactions.
+            providers (List[str]): List of LLM judge provider names.
+        """
+        self._CLASS_NAME = self.__class__.__name__
+
+        self.endpoint_config = endpoint_config
+        self.evaluators = evaluators
+        self.providers = providers
+
+        self.endpoint_cm = EndpointConfigManager()
+
+        self.test_batch: ScriptsBatch | None = None
+
+    def setup(
+        self,
+        endpoint_config: EndpointConfig,
+        evaluators: Dict[EvaluatorType, BaseEvaluator],
+        providers: List[str],
+    ) -> None:
+        """
+        Set up the ConversationSimulator.
+
+        Args:
+            endpoint_config (EndpointConfig): Configuration object for the user endpoint API.
+            evaluators (Dict[EvaluatorType, BaseEvaluator]): Mapping of evaluator types to evaluator objects.
+            providers (List[str]): List of LLM provider names.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
+        logger.info(f"{_LOG} Setting up the Conversation Simulator..")
+
+        if not self.endpoint_cm:
+            self.endpoint_cm = EndpointConfigManager()
+
+        self.endpoint_config = endpoint_config
+        self.endpoint_cm.set_endpoints(endpoints_config=[endpoint_config])
+
+        self.evaluators = evaluators
+        self.providers = providers
+
+        if not self.providers:
+            logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
+    def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+        """
+        Retrieve an evaluator by name.
+
+        Args:
+            name (EvaluatorType): Name of the evaluator.
+
+        Returns:
+            An evaluator object.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
+
+        if name not in self.evaluators:
+            raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
+        return self.evaluators[name]
+
+    async def run(
+        self,
+        test_batch: ScriptsBatch,
+        attempts: int = 1,
+        batch_size: int = 4
+    ) -> Any:
+        """
+        Run a batch test for the given scripts batch.
+
+        Args:
+            test_batch (ScriptsBatch): Scenario batch object.
+            attempts (int): Number of attempts to run the simulation.
+            batch_size (int): Maximum number of scripts simulated concurrently.
+
+        Returns:
+            str: The batch test results serialized as a JSON string.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.run.__name__}]"
+        logger.info(f"{_LOG} Starting batch test [attempts:{attempts}][batch-size:{batch_size}].")
+
+        started_at = datetime.now()
+
+        self.test_batch = test_batch
+        conversation_results = await self.simulate_conversation(attempts=attempts, max_concurrency=batch_size)
+
+        finished_at = datetime.now()
+
+        script_results: List[AllAttemptsResults] = conversation_results.get("script_results", [])
+
+        batch_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+        for script in script_results:
+            for attempt in script.attempts:
+                for judge, verdicts in attempt.evaluation_verdicts.items():
+                    batch_verdicts[judge].extend(verdicts)
+
+        verdict_summaries: Dict[str, List[str]] = {
+            judge: summarize_verdicts(
+                verdicts=verdicts,
+                judge=judge,
+            )
+            for judge, verdicts in batch_verdicts.items()
+        }
+
+        results = SimulationResults(
+            started_at=started_at,
+            finished_at=finished_at,
+            evaluation_summary=verdict_summaries,
+            average_scores=conversation_results.get("average_scores", {}),
+            script_results=script_results
+        )
+
+        return results.model_dump_json(indent=2)
+
+    async def simulate_conversation(
+        self,
+        attempts: int = 1,
+        max_concurrency: int = 4,
+    ) -> Dict[str, Any]:
+        """
+        Simulate conversations for all scenarios in the batch.
+
+        Args:
+            attempts (int): Number of attempts to run the simulation.
+            max_concurrency (int): Maximum number of concurrent conversations.
+
+        Returns:
+            Dict[str, Any]: The results of the conversation simulation.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_conversation.__name__}]"
+        logger.info(f"{_LOG} Starting conversation simulation..")
+
+        semaphore = asyncio.Semaphore(value=max_concurrency)
+
+        async def run_script(script: ConversationScript) -> AllAttemptsResults:
+            async with semaphore:
+                return await self.simulate_single_scenario(script=script, attempts=attempts)
+
+        scripts_tasks = [run_script(script=script) for script in self.test_batch.scripts]
+        script_results: List[AllAttemptsResults] = await asyncio.gather(*scripts_tasks)
+
+        aggregate_scores: Dict[str, List[float]] = defaultdict(list)
+
+        for result in script_results:
+            for metric, value in result.average_scores.items():
+                if isinstance(value, (int, float)):
+                    aggregate_scores[metric].append(value)
+
+        overall_average_scores = calculate_average_scores(aggregate_scores)
+
+        return {"script_results": script_results, "average_scores": overall_average_scores}
+
+    async def simulate_single_scenario(
+        self,
+        script: ConversationScript,
+        attempts: int = 1
+    ) -> AllAttemptsResults:
+        """
+        Simulate a single scenario with the given number of attempts, concurrently.
+
+        Args:
+            script (ConversationScript): The scenario to simulate.
+            attempts (int): Number of attempts to run the simulation.
+
+        Returns:
+            AllAttemptsResults: The results of the scenario simulation attempts.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_single_scenario.__name__}]"
+
+        logger.info(f"{_LOG} Starting simulation for script: {script.id}")
+
+        async def simulate_attempt(attempt_number: int) -> SingleAttemptResults:
+            from uuid import uuid4
+            attempt_id: str = str(uuid4())
+
+            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}\n---")
+            start_time = time.time()
+
+            interaction_results = await self.simulate_interactions(
+                script=script,
+                attempt_id=attempt_id,
+            )
+
+            collected_scores: Dict[str, List[Any]] = defaultdict(list)
+            collected_verdicts: Dict[str, List[Any]] = defaultdict(list)
+
+            for interaction in interaction_results:
+                if not interaction.evaluation_results:
+                    continue
+
+                eval_results = interaction.evaluation_results
+
+                # Judge scores & verdicts
+                for provider, judge_result in eval_results.judge_evaluations.items():
+                    collected_scores[provider].append(judge_result.score)
+                    collected_verdicts[provider].append(judge_result.justification)
+
+                # Metadata scores
+                if eval_results.metadata_evaluation:
+                    for _, score in eval_results.metadata_evaluation.items():
+                        collected_scores["metadata"].append(score)
+
+                # Guardrail
+                if eval_results.guardrail_flag is not None:
+                    collected_scores["guardrail"].append(eval_results.guardrail_flag)
+
+            elapsed_time = time.time() - start_time
+            collected_scores["processing_time"].append(elapsed_time)
+
+            average_scores = calculate_average_scores(collected_scores)
+
+            logger.info(f"{_LOG} Attempt {attempt_number + 1} completed in {elapsed_time:.2f}s\n---")
+
+            return SingleAttemptResults(
+                attempt_nbr=attempt_number + 1,
+                attempt_id=attempt_id,
+                script_id=str(script.id),
+                total_duration=elapsed_time,
+                interaction_results=interaction_results,
+                evaluation_verdicts=collected_verdicts,
+                average_scores=average_scores,
+            )
+
+        attempt_tasks = [simulate_attempt(i) for i in range(attempts)]
+        all_attempts: List[SingleAttemptResults] = await asyncio.gather(*attempt_tasks, return_exceptions=False)
+
+        scenario_scores: Dict[str, List[float]] = defaultdict(list)
+
+        for attempt in all_attempts:
+            for metric, value in attempt.average_scores.items():
+                if isinstance(value, (int, float)):
+                    scenario_scores[metric].append(value)
+
+        scenario_average_scores = calculate_average_scores(scenario_scores)
+
+        return AllAttemptsResults(
+            script_id=str(script.id),
+            attempts=all_attempts,
+            average_scores=scenario_average_scores,
+        )
+
+    async def simulate_interactions(
+        self,
+        script: ConversationScript,
+        attempt_id: str,
+    ) -> List[SingleInteractionResults]:
+        """
+        Simulate inbound interactions for a scenario.
+
+        Args:
+            script (ConversationScript): The script to simulate.
+            attempt_id (str): The id of the attempt.
+
+        Returns:
+            List[SingleInteractionResults]: The results of the inbound interactions simulation.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_interactions.__name__}]"
+
+        logger.info(f"{_LOG} Starting interactions simulation [ConvId:{attempt_id}]..")
+        start_time = time.time()
+
+        results = []
+        contextual_mode: bool = script.variable_request_schema
+        logger.info(f"{_LOG} Contextual Mode ON: {contextual_mode}")
+        interactions = script.interactions
+
+        for idx, interaction in enumerate(interactions):
+            request_payload = interaction.request_payload.copy()
+            if contextual_mode:
+                from levelapp.simulator.utils import set_by_path
+
+                if script.uuid_field:
+                    request_payload[script.uuid_field] = attempt_id
+
+                user_message = interaction.user_message
+                set_by_path(
+                    obj=request_payload,
+                    path=interaction.user_message_path,
+                    value=user_message,
+                )
+                logger.info(f"{_LOG} Request payload (Preloaded Request Schema):\n{request_payload}\n---")
+
+            else:
+                user_message = interaction.user_message
+                request_payload.update({"user_message": user_message})
+                logger.info(f"{_LOG} Request payload (Configured Request Schema):\n{request_payload}\n---")
+
+            mappings = self.endpoint_config.response_mapping
+
+            client_response = await self.endpoint_cm.send_request(
+                endpoint_config=self.endpoint_config,
+                context=request_payload,
+                contextual_mode=contextual_mode
+            )
+
+            reference_reply = interaction.reference_reply
+            reference_metadata = interaction.reference_metadata
+            reference_guardrail_flag: bool = interaction.guardrail_flag
+
+            if not client_response.response or client_response.response.status_code != 200:
+                logger.error(
+                    f"{_LOG} Interaction request failed [{client_response.error}]:\n{client_response.response}\n---"
+                )
+                output: SingleInteractionResults = SingleInteractionResults(
+                    conversation_id=attempt_id,
+                    user_message=user_message,
+                    reference_reply=reference_reply,
+                    reference_metadata=reference_metadata,
+                    errors={"error": str(client_response.error), "context": str(client_response.response)}
+                )
+                results.append(output)
+                continue
+
+            logger.info(
+                f"{_LOG} Response [{client_response.response.status_code}]:\n{client_response.response.text}\n---"
+            )
+
+            interaction_details = self.endpoint_cm.extract_response_data(
+                response=client_response.response,
+                mappings=mappings,
+            )
+
+            logger.info(f"{_LOG} Interaction details <ConvID:{attempt_id}>:\n{interaction_details}\n---")
+
+            generated_reply = interaction_details.get("agent_reply", "")
+            generated_metadata = interaction_details.get("metadata", {})
+            extracted_guardrail_flag = interaction_details.get("guardrail_flag", False)
+
+            logger.info(f"{_LOG} Generated reply <ConvID:{attempt_id}>:\n{generated_reply}\n---")
+
+            evaluation_results = await self.evaluate_interaction(
+                user_input=user_message,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                generated_guardrail=extracted_guardrail_flag,
+                reference_guardrail=reference_guardrail_flag,
+            )
+
+            elapsed_time = time.time() - start_time
+            logger.info(f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---")
+
+            output: SingleInteractionResults = SingleInteractionResults(
+                conversation_id=attempt_id,
+                user_message=user_message,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                guardrail_details=extracted_guardrail_flag,
+                evaluation_results=evaluation_results,
+                response_content=client_response.response.json(),
+            )
+
+            results.append(output)
+
+        return results
+
+    async def evaluate_interaction(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        generated_guardrail: bool,
+        reference_guardrail: bool,
+    ) -> InteractionEvaluationResults:
+        """
+        Evaluate an interaction using the configured judge and metadata evaluators.
+
+        Args:
+            user_input (str): The user input to evaluate.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            generated_guardrail (bool): Generated handoff/guardrail flag.
+            reference_guardrail (bool): Reference handoff/guardrail flag.
+
+        Returns:
+            InteractionEvaluationResults: The evaluation results.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
+
+        judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+        metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)
+
+        evaluation_results = InteractionEvaluationResults()
+
+        if judge_evaluator and self.providers:
+            await self._judge_evaluation(
+                user_input=user_input,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                providers=self.providers,
+                judge_evaluator=judge_evaluator,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"{_LOG} Judge evaluation skipped (no evaluator or no providers).")
+
+        if metadata_evaluator and reference_metadata:
+            self._metadata_evaluation(
+                metadata_evaluator=metadata_evaluator,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"{_LOG} Metadata evaluation skipped (no evaluator or no reference metadata).")
+
+        evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+
+        return evaluation_results
+
+    async def _judge_evaluation(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        providers: List[str],
+        judge_evaluator: BaseEvaluator,
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run LLM-as-a-judge evaluation using multiple providers (async).
+
+        Args:
+            user_input (str): The user input message.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            providers (List[str]): List of judge provider names.
+            judge_evaluator (BaseEvaluator): Evaluator instance.
+            evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+        Returns:
+            None
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+        tasks = {
+            provider: judge_evaluator.async_evaluate(
+                generated_data=generated_reply,
+                reference_data=reference_reply,
+                user_input=user_input,
+                provider=provider,
+            )
+            for provider in providers
+        }
+
+        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
+
+        for provider, result in zip(tasks.keys(), results):
+            if isinstance(result, Exception):
+                logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                evaluation_results.errors = {"provider": provider, "content": str(result)}
+            else:
+                evaluation_results.judge_evaluations[provider] = result
+
+    def _metadata_evaluation(
+        self,
+        metadata_evaluator: BaseEvaluator,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run metadata evaluation using the provided evaluator.
+
+        Args:
+            metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            evaluation_results (InteractionEvaluationResults): Results container.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+        try:
+            evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
+            )
+        except Exception as e:
+            logger.error(f"{_LOG} Metadata evaluation failed:\n{e}", exc_info=e)
+            evaluation_results.errors = {"errors": e}
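A minimal usage sketch for the simulator above (hypothetical wiring: it assumes an EndpointConfig, an evaluator mapping, and a ScriptsBatch have already been constructed elsewhere, and the run_batch helper and provider names are illustrative, not part of the package):

from levelapp.core.base import BaseEvaluator
from levelapp.core.schemas import EvaluatorType
from levelapp.endpoint.client import EndpointConfig
from levelapp.simulator.schemas import ScriptsBatch
from levelapp.simulator.simulator import ConversationSimulator


async def run_batch(
    endpoint_config: EndpointConfig,
    evaluators: dict[EvaluatorType, BaseEvaluator],
    batch: ScriptsBatch,
) -> str:
    # Wire the simulator to the endpoint under test, the evaluators,
    # and the LLM judge providers (provider names are illustrative).
    simulator = ConversationSimulator()
    simulator.setup(
        endpoint_config=endpoint_config,
        evaluators=evaluators,
        providers=["openai", "mistral"],
    )

    # Run every script in the batch twice, at most 4 scripts concurrently;
    # run() returns the SimulationResults model serialized as a JSON string.
    return await simulator.run(test_batch=batch, attempts=2, batch_size=4)


# results_json = asyncio.run(run_batch(endpoint_config, evaluators, batch))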
levelapp/simulator/utils.py
@@ -0,0 +1,134 @@
+"""
+'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
+"""
+import httpx
+
+from typing import Any, Dict, List, Union
+
+
+from levelapp.clients import ClientRegistry
+from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
+from levelapp.aspects import MonitoringAspect, MetricType, logger
+
+
+def set_by_path(obj: Dict, path: str, value: Any) -> None:
+    """
+    Sets a value in a nested dictionary using JSON path-like notation.
+
+    Args:
+        obj (dict): Dictionary to modify.
+        path (str): Path (e.g., "a.b[0].c") indicating where to set the value.
+        value (Any): Value to assign at the specified path.
+
+    Returns:
+        None
+    """
+    parts = path.split(".")
+    current = obj
+
+    for i, part in enumerate(parts):
+        is_last = i == len(parts) - 1
+
+        try:
+            # Handle list index access, e.g., key[0] or [1]
+            if '[' in part and ']' in part:
+                key, idx = part.split('[')
+                idx = int(idx.rstrip(']'))
+
+                # If we have a key before the list
+                if key:
+                    if key not in current or not isinstance(current[key], list):
+                        current[key] = []
+                    while len(current[key]) <= idx:
+                        current[key].append({})
+                    target = current[key]
+                else:
+                    if not isinstance(current, list):
+                        print("[set_by_path][WARNING] Expected a list at this level.")
+                        return
+                    while len(current) <= idx:
+                        current.append({})
+                    target = current
+
+                if is_last:
+                    target[idx] = value
+                else:
+                    if not isinstance(target[idx], dict):
+                        target[idx] = {}
+                    current = target[idx]
+
+            else:
+                # Regular dictionary key
+                if is_last:
+                    current[part] = value
+                else:
+                    if part not in current or not isinstance(current[part], dict):
+                        current[part] = {}
+                    current = current[part]
+
+        except (KeyError, IndexError, TypeError, AttributeError) as e:
+            print(f"[set_by_path][ERROR] Error type <{e.__class__.__name__}> : {e.args[0]}")
+            return
+
+
+@MonitoringAspect.monitor(
+    name="average_calc",
+    category=MetricType.SCORING,
+    cached=True,
+    maxsize=1000
+)
+def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Dict[str, float]:
+    """
+    Helper function that calculates the average scores for a dictionary of score lists.
+
+    Args:
+        scores (Dict[str, Union[List[float], float]]): A dictionary mapping identifiers to a score or a list of scores.
+
+    Returns:
+        Dict[str, float]: A dictionary with average scores rounded to three decimal places.
+    """
+    result: Dict[str, float] = {}
+    for field, value in scores.items():
+        if isinstance(value, (int, float)):
+            result[field] = value
+        elif isinstance(value, list):
+            result[field] = round((sum(value) / len(value)), 3) if value else 0.0
+        else:
+            raise TypeError(f"[calculate_average_scores] Unexpected type '{type(value)}' for field '{field}'")
+
+    return result
+
+
+@MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
+def summarize_verdicts(
+    verdicts: List[str],
+    judge: str,
+    max_bullets: int = 5
+) -> List[str]:
+    """Summarize a judge's verdicts into at most `max_bullets` bullet points using the judge's own client."""
+    client_registry = ClientRegistry()
+    client = client_registry.get(provider=judge)
+
+    try:
+        verdicts_text = "\n".join(verdicts)
+        prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=verdicts_text)
+        response = client.call(message=prompt)
+        parsed = client.parse_response(response=response)
+        stripped = parsed.get("output", "").strip()
+        bullet_points = [point.strip() for point in stripped.split("- ") if point.strip()]
+
+        return bullet_points[:max_bullets]
+
+    except Exception as e:
+        logger.error(f"[summarize_verdicts] Error during summarization: {str(e)}", exc_info=True)
+        return []
+
+
+# if __name__ == '__main__':
+#     template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
+#     response_dict = {
+#         'agent_reply': "I'd be happy to help you book something for 10 AM.",
+#         'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
+#     }
+#
+#     result = extract_interaction_details(response_dict, template)
+#     print(f"result: {result.model_dump()}")
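A short worked example of set_by_path from the utilities above, matching the "a.b[0].c" path notation described in its docstring (the payload shape and paths are illustrative):

from levelapp.simulator.utils import set_by_path

payload = {"session": {"id": "placeholder"}, "messages": []}

# Plain dotted path: overwrites an existing nested key.
set_by_path(payload, "session.id", "run-001")

# Indexed path: grows the "messages" list as needed, then sets the nested key.
set_by_path(payload, "messages[0].content", "Hello, I need an appointment.")

# payload is now:
# {
#     "session": {"id": "run-001"},
#     "messages": [{"content": "Hello, I need an appointment."}],
# }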
levelapp/visualization/__init__.py
@@ -0,0 +1,7 @@
+"""levelapp/visualization: Visualization module for evaluation results."""
+
+from .charts import ChartGenerator
+from .dashboard import DashboardGenerator
+from .exporter import ResultsExporter
+
+__all__ = ["ChartGenerator", "DashboardGenerator", "ResultsExporter"]