levelapp 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +617 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +122 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/gemini.py +130 -0
  10. levelapp/clients/groq.py +101 -0
  11. levelapp/clients/huggingface.py +162 -0
  12. levelapp/clients/ionos.py +126 -0
  13. levelapp/clients/mistral.py +106 -0
  14. levelapp/clients/openai.py +116 -0
  15. levelapp/comparator/__init__.py +5 -0
  16. levelapp/comparator/comparator.py +232 -0
  17. levelapp/comparator/extractor.py +108 -0
  18. levelapp/comparator/schemas.py +61 -0
  19. levelapp/comparator/scorer.py +269 -0
  20. levelapp/comparator/utils.py +136 -0
  21. levelapp/config/__init__.py +5 -0
  22. levelapp/config/endpoint.py +199 -0
  23. levelapp/config/prompts.py +57 -0
  24. levelapp/core/__init__.py +0 -0
  25. levelapp/core/base.py +386 -0
  26. levelapp/core/schemas.py +24 -0
  27. levelapp/core/session.py +336 -0
  28. levelapp/endpoint/__init__.py +0 -0
  29. levelapp/endpoint/client.py +188 -0
  30. levelapp/endpoint/client_test.py +41 -0
  31. levelapp/endpoint/manager.py +114 -0
  32. levelapp/endpoint/parsers.py +119 -0
  33. levelapp/endpoint/schemas.py +38 -0
  34. levelapp/endpoint/tester.py +52 -0
  35. levelapp/evaluator/__init__.py +3 -0
  36. levelapp/evaluator/evaluator.py +307 -0
  37. levelapp/metrics/__init__.py +63 -0
  38. levelapp/metrics/embedding.py +56 -0
  39. levelapp/metrics/embeddings/__init__.py +0 -0
  40. levelapp/metrics/embeddings/sentence_transformer.py +30 -0
  41. levelapp/metrics/embeddings/torch_based.py +56 -0
  42. levelapp/metrics/exact.py +182 -0
  43. levelapp/metrics/fuzzy.py +80 -0
  44. levelapp/metrics/token.py +103 -0
  45. levelapp/plugins/__init__.py +0 -0
  46. levelapp/repository/__init__.py +3 -0
  47. levelapp/repository/filesystem.py +203 -0
  48. levelapp/repository/firestore.py +291 -0
  49. levelapp/simulator/__init__.py +3 -0
  50. levelapp/simulator/schemas.py +116 -0
  51. levelapp/simulator/simulator.py +531 -0
  52. levelapp/simulator/utils.py +134 -0
  53. levelapp/visualization/__init__.py +7 -0
  54. levelapp/visualization/charts.py +358 -0
  55. levelapp/visualization/dashboard.py +240 -0
  56. levelapp/visualization/exporter.py +167 -0
  57. levelapp/visualization/templates/base.html +158 -0
  58. levelapp/visualization/templates/comparator_dashboard.html +57 -0
  59. levelapp/visualization/templates/simulator_dashboard.html +111 -0
  60. levelapp/workflow/__init__.py +6 -0
  61. levelapp/workflow/base.py +192 -0
  62. levelapp/workflow/config.py +96 -0
  63. levelapp/workflow/context.py +64 -0
  64. levelapp/workflow/factory.py +42 -0
  65. levelapp/workflow/registration.py +6 -0
  66. levelapp/workflow/runtime.py +19 -0
  67. levelapp-0.1.15.dist-info/METADATA +571 -0
  68. levelapp-0.1.15.dist-info/RECORD +70 -0
  69. levelapp-0.1.15.dist-info/WHEEL +4 -0
  70. levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
levelapp/simulator/simulator.py
@@ -0,0 +1,531 @@
+ """
+ 'simulator/simulator.py': Service layer to manage conversation simulation and evaluation.
+ """
+ import time
+ import asyncio
+
+ from datetime import datetime
+ from collections import defaultdict
+ from typing import Dict, Any, List
+
+
+ from levelapp.core.base import BaseProcess, BaseEvaluator
+ from levelapp.endpoint.client import EndpointConfig
+ from levelapp.endpoint.manager import EndpointConfigManager
+
+ from levelapp.core.schemas import EvaluatorType
+ from levelapp.simulator.schemas import (
+     InteractionEvaluationResults,
+     ScriptsBatch,
+     ConversationScript,
+     SimulationResults, SingleInteractionResults, SingleAttemptResults, AllAttemptsResults
+ )
+ from levelapp.simulator.utils import (
+     calculate_average_scores,
+     summarize_verdicts,
+ )
+ from levelapp.aspects import logger
+
+
+ class ConversationSimulator(BaseProcess):
+     """Conversation simulator component."""
+
+     def __init__(
+         self,
+         endpoint_config: EndpointConfig | None = None,
+         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+         providers: List[str] | None = None,
+
+     ):
+         """
+         Initialize the ConversationSimulator.
+
+         Args:
+             endpoint_config (EndpointConfig): Configuration object for the user endpoint API.
+             evaluators (Dict[EvaluatorType, BaseEvaluator]): Evaluators used to assess interactions.
+             providers (List[str]): Names of the LLM providers used for judge evaluation.
+         """
+         self._CLASS_NAME = self.__class__.__name__
+
+         self.endpoint_config = endpoint_config
+         self.evaluators = evaluators
+         self.providers = providers
+
+         self.endpoint_cm = EndpointConfigManager()
+
+         self.test_batch: ScriptsBatch | None = None
+
+     def setup(
+         self,
+         endpoint_config: EndpointConfig,
+         evaluators: Dict[EvaluatorType, BaseEvaluator],
+         providers: List[str],
+     ) -> None:
+         """
+         Configure the simulator with an endpoint, evaluators, and judge providers.
+
+         Args:
+             endpoint_config (EndpointConfig): Configuration object for the user endpoint API.
+             evaluators (Dict[EvaluatorType, BaseEvaluator]): Evaluators used to assess interactions.
+             providers (List[str]): List of LLM provider names.
+
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
+         logger.info(f"{_LOG} Setting up the Conversation Simulator..")
+
+         if not self.endpoint_cm:
+             self.endpoint_cm = EndpointConfigManager()
+
+         self.endpoint_config = endpoint_config
+         self.endpoint_cm.set_endpoints(endpoints_config=[endpoint_config])
+
+         self.evaluators = evaluators
+         self.providers = providers
+
+         if not self.providers:
+             logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
+     def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+         """
+         Retrieve an evaluator by name.
+
+         Args:
+             name (EvaluatorType): Name of the evaluator.
+
+         Returns:
+             BaseEvaluator: The registered evaluator object.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
+
+         if name not in self.evaluators:
+             raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
+         return self.evaluators[name]
+
+     async def run(
+         self,
+         test_batch: ScriptsBatch,
+         attempts: int = 1,
+         batch_size: int = 4
+     ) -> Any:
+         """
+         Run a batch test over all scripts in the given batch.
+
+         Args:
+             test_batch (ScriptsBatch): Scenario batch object.
+             attempts (int): Number of attempts to run the simulation.
+             batch_size (int): Maximum number of scripts simulated concurrently.
+
+         Returns:
+             str: The JSON-serialized results of the batch test.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.run.__name__}]"
+         logger.info(f"{_LOG} Starting batch test [attempts:{attempts}][batch-size:{batch_size}].")
+
+         started_at = datetime.now()
+
+         self.test_batch = test_batch
+         conversation_results = await self.simulate_conversation(attempts=attempts, max_concurrency=batch_size)
+
+         finished_at = datetime.now()
+
+         script_results: List[AllAttemptsResults] = conversation_results.get("script_results", [])
+
+         batch_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+         for script in script_results:
+             for attempt in script.attempts:
+                 for judge, verdicts in attempt.evaluation_verdicts.items():
+                     batch_verdicts[judge].extend(verdicts)
+
+         verdict_summaries: Dict[str, List[str]] = {
+             judge: summarize_verdicts(
+                 verdicts=verdicts,
+                 judge=judge,
+             )
+             for judge, verdicts in batch_verdicts.items()
+         }
+
+         results = SimulationResults(
+             started_at=started_at,
+             finished_at=finished_at,
+             evaluation_summary=verdict_summaries,
+             average_scores=conversation_results.get("average_scores", {}),
+             script_results=script_results
+         )
+
+         return results.model_dump_json(indent=2)
+
+     async def simulate_conversation(
+         self,
+         attempts: int = 1,
+         max_concurrency: int = 4,
+     ) -> Dict[str, Any]:
+         """
+         Simulate conversations for all scenarios in the batch.
+
+         Args:
+             attempts (int): Number of attempts to run the simulation.
+             max_concurrency (int): Maximum number of concurrent conversations.
+
+         Returns:
+             Dict[str, Any]: The results of the conversation simulation.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_conversation.__name__}]"
+         logger.info(f"{_LOG} Starting conversation simulation..")
+
+         semaphore = asyncio.Semaphore(value=max_concurrency)
+
+         async def run_script(script: ConversationScript) -> AllAttemptsResults:
+             async with semaphore:
+                 return await self.simulate_single_scenario(script=script, attempts=attempts)
+
+         scripts_tasks = [run_script(script=script) for script in self.test_batch.scripts]
+         script_results: List[AllAttemptsResults] = await asyncio.gather(*scripts_tasks)
+
+         aggregate_scores: Dict[str, List[float]] = defaultdict(list)
+
+         for result in script_results:
+             for metric, value in result.average_scores.items():
+                 if isinstance(value, (int, float)):
+                     aggregate_scores[metric].append(value)
+
+         overall_average_scores = calculate_average_scores(aggregate_scores)
+
+         return {"script_results": script_results, "average_scores": overall_average_scores}
+
+     async def simulate_single_scenario(
+         self,
+         script: ConversationScript,
+         attempts: int = 1
+     ) -> AllAttemptsResults:
+         """
+         Simulate a single scenario with the given number of attempts, concurrently.
+
+         Args:
+             script (ConversationScript): The conversation script to simulate.
+             attempts (int): Number of attempts to run the simulation.
+
+         Returns:
+             AllAttemptsResults: The results of the scenario simulation attempts.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_single_scenario.__name__}]"
+
+         logger.info(f"{_LOG} Starting simulation for script: {script.id}")
+
+         async def simulate_attempt(attempt_number: int) -> SingleAttemptResults:
+             from uuid import uuid4
+             attempt_id: str = str(uuid4())
+
+             logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}\n---")
+             start_time = time.time()
+
+             interaction_results = await self.simulate_interactions(
+                 script=script,
+                 attempt_id=attempt_id,
+             )
+
+             collected_scores: Dict[str, List[Any]] = defaultdict(list)
+             collected_verdicts: Dict[str, List[Any]] = defaultdict(list)
+
+             for interaction in interaction_results:
+                 if not interaction.evaluation_results:
+                     continue
+
+                 eval_results = interaction.evaluation_results
+
+                 # Judge scores & verdicts
+                 for provider, judge_result in eval_results.judge_evaluations.items():
+                     collected_scores[provider].append(judge_result.score)
+                     collected_verdicts[provider].append(judge_result.justification)
+
+                 # Metadata scores
+                 if eval_results.metadata_evaluation:
+                     for _, score in eval_results.metadata_evaluation.items():
+                         collected_scores["metadata"].append(score)
+
+                 # Guardrail
+                 if eval_results.guardrail_flag is not None:
+                     collected_scores["guardrail"].append(eval_results.guardrail_flag)
+
+             elapsed_time = time.time() - start_time
+             collected_scores["processing_time"].append(elapsed_time)
+
+             average_scores = calculate_average_scores(collected_scores)
+
+             logger.info(f"{_LOG} Attempt {attempt_number + 1} completed in {elapsed_time:.2f}s\n---")
+
+             return SingleAttemptResults(
+                 attempt_nbr=attempt_number + 1,
+                 attempt_id=attempt_id,
+                 script_id=str(script.id),
+                 total_duration=elapsed_time,
+                 interaction_results=interaction_results,
+                 evaluation_verdicts=collected_verdicts,
+                 average_scores=average_scores,
+             )
+
+         attempt_tasks = [simulate_attempt(i) for i in range(attempts)]
+         all_attempts: List[SingleAttemptResults] = await asyncio.gather(*attempt_tasks, return_exceptions=False)
+
+         scenario_scores: Dict[str, List[float]] = defaultdict(list)
+
+         for attempt in all_attempts:
+             for metric, value in attempt.average_scores.items():
+                 if isinstance(value, (int, float)):
+                     scenario_scores[metric].append(value)
+
+         scenario_average_scores = calculate_average_scores(scenario_scores)
+
+         return AllAttemptsResults(
+             script_id=str(script.id),
+             attempts=all_attempts,
+             average_scores=scenario_average_scores,
+         )
+
+     async def simulate_interactions(
+         self,
+         script: ConversationScript,
+         attempt_id: str,
+     ) -> List[SingleInteractionResults]:
+         """
+         Simulate inbound interactions for a scenario.
+
+         Args:
+             script (ConversationScript): The script to simulate.
+             attempt_id (str): The id of the attempt.
+
+         Returns:
+             List[SingleInteractionResults]: The results of the inbound interactions simulation.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_interactions.__name__}]"
+
+         logger.info(f"{_LOG} Starting interactions simulation [ConvId:{attempt_id}]..")
+         start_time = time.time()
+
+         results = []
+         contextual_mode: bool = script.variable_request_schema
+         logger.info(f"{_LOG} Contextual mode: {contextual_mode}")
+         interactions = script.interactions
+
+         for idx, interaction in enumerate(interactions):
+             request_payload = interaction.request_payload.copy()
+             if contextual_mode:
+                 from levelapp.simulator.utils import set_by_path
+
+                 if script.uuid_field:
+                     request_payload[script.uuid_field] = attempt_id
+
+                 user_message = interaction.user_message
+                 set_by_path(
+                     obj=request_payload,
+                     path=interaction.user_message_path,
+                     value=user_message,
+                 )
+                 logger.info(f"{_LOG} Request payload (Preloaded Request Schema):\n{request_payload}\n---")
+
+             else:
+                 user_message = interaction.user_message
+                 request_payload.update({"user_message": user_message})
+                 logger.info(f"{_LOG} Request payload (Configured Request Schema):\n{request_payload}\n---")
+
+             mappings = self.endpoint_config.response_mapping
+
+             client_response = await self.endpoint_cm.send_request(
+                 endpoint_config=self.endpoint_config,
+                 context=request_payload,
+                 contextual_mode=contextual_mode
+             )
+
+             reference_reply = interaction.reference_reply
+             reference_metadata = interaction.reference_metadata
+             reference_guardrail_flag: bool = interaction.guardrail_flag
+
+             if not client_response.response or client_response.response.status_code != 200:
+                 logger.error(
+                     f"{_LOG} Interaction request failed [{client_response.error}]:\n{client_response.response}\n---"
+                 )
+                 output: SingleInteractionResults = SingleInteractionResults(
+                     conversation_id=attempt_id,
+                     user_message=user_message,
+                     reference_reply=reference_reply,
+                     reference_metadata=reference_metadata,
+                     errors={"error": str(client_response.error), "context": str(client_response.response)}
+                 )
+                 results.append(output)
+                 continue
+
+             logger.info(
+                 f"{_LOG} Response [{client_response.response.status_code}]:\n{client_response.response.text}\n---"
+             )
+
+             interaction_details = self.endpoint_cm.extract_response_data(
+                 response=client_response.response,
+                 mappings=mappings,
+             )
+
+             logger.info(f"{_LOG} Interaction details <ConvID:{attempt_id}>:\n{interaction_details}\n---")
+
+             generated_reply = interaction_details.get("agent_reply", "")
+             generated_metadata = interaction_details.get("metadata", {})
+             extracted_guardrail_flag = interaction_details.get("guardrail_flag", False)
+
+             logger.info(f"{_LOG} Generated reply <ConvID:{attempt_id}>:\n{generated_reply}\n---")
+
+             evaluation_results = await self.evaluate_interaction(
+                 user_input=user_message,
+                 generated_reply=generated_reply,
+                 reference_reply=reference_reply,
+                 generated_metadata=generated_metadata,
+                 reference_metadata=reference_metadata,
+                 generated_guardrail=extracted_guardrail_flag,
+                 reference_guardrail=reference_guardrail_flag,
+             )
+
+             elapsed_time = time.time() - start_time
+             logger.info(f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---")
+
+             output: SingleInteractionResults = SingleInteractionResults(
+                 conversation_id=attempt_id,
+                 user_message=user_message,
+                 generated_reply=generated_reply,
+                 reference_reply=reference_reply,
+                 generated_metadata=generated_metadata,
+                 reference_metadata=reference_metadata,
+                 guardrail_details=extracted_guardrail_flag,
+                 evaluation_results=evaluation_results,
+                 response_content=client_response.response.json(),
+             )
+
+             results.append(output)
+
+         return results
+
+     async def evaluate_interaction(
+         self,
+         user_input: str,
+         generated_reply: str,
+         reference_reply: str,
+         generated_metadata: Dict[str, Any],
+         reference_metadata: Dict[str, Any],
+         generated_guardrail: bool,
+         reference_guardrail: bool,
+     ) -> InteractionEvaluationResults:
+         """
+         Evaluate an interaction using the configured judge and metadata evaluators.
+
+         Args:
+             user_input (str): The user input to evaluate.
+             generated_reply (str): The generated agent reply.
+             reference_reply (str): The reference agent reply.
+             generated_metadata (Dict[str, Any]): The generated metadata.
+             reference_metadata (Dict[str, Any]): The reference metadata.
+             generated_guardrail (bool): The generated handoff/guardrail flag.
+             reference_guardrail (bool): The reference handoff/guardrail flag.
+
+         Returns:
+             InteractionEvaluationResults: The evaluation results.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
+
+         judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+         metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)
+
+         evaluation_results = InteractionEvaluationResults()
+
+         if judge_evaluator and self.providers:
+             await self._judge_evaluation(
+                 user_input=user_input,
+                 generated_reply=generated_reply,
+                 reference_reply=reference_reply,
+                 providers=self.providers,
+                 judge_evaluator=judge_evaluator,
+                 evaluation_results=evaluation_results,
+             )
+         else:
+             logger.info(f"{_LOG} Judge evaluation skipped (no evaluator or no providers).")
+
+         if metadata_evaluator and reference_metadata:
+             self._metadata_evaluation(
+                 metadata_evaluator=metadata_evaluator,
+                 generated_metadata=generated_metadata,
+                 reference_metadata=reference_metadata,
+                 evaluation_results=evaluation_results,
+             )
+         else:
+             logger.info(f"{_LOG} Metadata evaluation skipped (no evaluator or no reference metadata).")
+
+         evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+
+         return evaluation_results
+
+     async def _judge_evaluation(
+         self,
+         user_input: str,
+         generated_reply: str,
+         reference_reply: str,
+         providers: List[str],
+         judge_evaluator: BaseEvaluator,
+         evaluation_results: InteractionEvaluationResults,
+     ) -> None:
+         """
+         Run LLM-as-a-judge evaluation using multiple providers (async).
+
+         Args:
+             user_input (str): The user input message.
+             generated_reply (str): The generated agent reply.
+             reference_reply (str): The reference agent reply.
+             providers (List[str]): List of judge provider names.
+             judge_evaluator (BaseEvaluator): Evaluator instance.
+             evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+         Returns:
+             None
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+         tasks = {
+             provider: judge_evaluator.async_evaluate(
+                 generated_data=generated_reply,
+                 reference_data=reference_reply,
+                 user_input=user_input,
+                 provider=provider,
+             )
+             for provider in providers
+         }
+
+         results = await asyncio.gather(*tasks.values(), return_exceptions=True)
+
+         for provider, result in zip(tasks.keys(), results):
+             if isinstance(result, Exception):
+                 logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                 evaluation_results.errors = {"provider": provider, "content": str(result)}
+             else:
+                 evaluation_results.judge_evaluations[provider] = result
+
+     def _metadata_evaluation(
+         self,
+         metadata_evaluator: BaseEvaluator,
+         generated_metadata: Dict[str, Any],
+         reference_metadata: Dict[str, Any],
+         evaluation_results: InteractionEvaluationResults,
+     ) -> None:
+         """
+         Run metadata evaluation using the provided evaluator.
+
+         Args:
+             metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+             generated_metadata (Dict[str, Any]): The generated metadata.
+             reference_metadata (Dict[str, Any]): The reference metadata.
+             evaluation_results (InteractionEvaluationResults): Results container.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+         try:
+             evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                 generated_data=generated_metadata,
+                 reference_data=reference_metadata,
+             )
+         except Exception as e:
+             logger.error(f"{_LOG} Metadata evaluation failed:\n{e}", exc_info=e)
+             evaluation_results.errors = {"errors": e}
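For orientation, here is a minimal driver sketch for the simulator above. It assumes an already-constructed EndpointConfig, ScriptsBatch, and evaluator instances (their construction is not shown in this hunk), and the provider name "openai" is only an illustrative choice based on the client modules listed in this release, not something prescribed by the code above.

import asyncio
from typing import Dict

from levelapp.core.base import BaseEvaluator
from levelapp.core.schemas import EvaluatorType
from levelapp.endpoint.client import EndpointConfig
from levelapp.simulator.schemas import ScriptsBatch
from levelapp.simulator.simulator import ConversationSimulator


def run_batch(
    endpoint_config: EndpointConfig,
    evaluators: Dict[EvaluatorType, BaseEvaluator],
    batch: ScriptsBatch,
) -> str:
    """Drive one simulation batch and return the JSON-serialized results."""
    simulator = ConversationSimulator()
    # setup() registers the endpoint, the evaluators, and the judge providers.
    simulator.setup(
        endpoint_config=endpoint_config,
        evaluators=evaluators,  # e.g. {EvaluatorType.JUDGE: ..., EvaluatorType.REFERENCE: ...}
        providers=["openai"],   # assumed provider name; any registered client would do
    )
    # run() is a coroutine; it returns SimulationResults serialized via model_dump_json().
    return asyncio.run(simulator.run(test_batch=batch, attempts=2, batch_size=4))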
levelapp/simulator/utils.py
@@ -0,0 +1,134 @@
+ """
+ 'simulator/utils.py': Utility functions for handling VLA interactions and requests.
+ """
+ import httpx
+
+ from typing import Any, Dict, List, Union
+
+
+ from levelapp.clients import ClientRegistry
+ from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
+ from levelapp.aspects import MonitoringAspect, MetricType, logger
+
+
+ def set_by_path(obj: Dict, path: str, value: Any) -> None:
+     """
+     Sets a value in a nested dictionary using JSON path-like notation.
+
+     Args:
+         obj (dict): Dictionary to modify.
+         path (str): Path (e.g., "a.b[0].c") indicating where to set the value.
+         value (Any): Value to assign at the specified path.
+
+     Returns:
+         None
+     """
+     parts = path.split(".")
+     current = obj
+
+     for i, part in enumerate(parts):
+         is_last = i == len(parts) - 1
+
+         try:
+             # Handle list index access, e.g., key[0] or [1]
+             if '[' in part and ']' in part:
+                 key, idx = part.split('[')
+                 idx = int(idx.rstrip(']'))
+
+                 # If we have a key before the list
+                 if key:
+                     if key not in current or not isinstance(current[key], list):
+                         current[key] = []
+                     while len(current[key]) <= idx:
+                         current[key].append({})
+                     target = current[key]
+                 else:
+                     if not isinstance(current, list):
+                         print("[set_by_path][WARNING] Expected a list at this level.")
+                         return
+                     while len(current) <= idx:
+                         current.append({})
+                     target = current
+
+                 if is_last:
+                     target[idx] = value
+                 else:
+                     if not isinstance(target[idx], dict):
+                         target[idx] = {}
+                     current = target[idx]
+
+             else:
+                 # Regular dictionary key
+                 if is_last:
+                     current[part] = value
+                 else:
+                     if part not in current or not isinstance(current[part], dict):
+                         current[part] = {}
+                     current = current[part]
+
+         except (KeyError, IndexError, TypeError, AttributeError) as e:
+             print(f"[set_by_path][ERROR] Error type <{e.__class__.__name__}> : {e.args[0]}")
+             return
+
+
+ @MonitoringAspect.monitor(
+     name="average_calc",
+     category=MetricType.SCORING,
+     cached=True,
+     maxsize=1000
+ )
+ def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Dict[str, float]:
+     """
+     Helper function that calculates the average scores for a dictionary of score lists.
+
+     Args:
+         scores (Dict[str, Union[List[float], float]]): A dictionary mapping identifiers to a score or a list of scores.
+
+     Returns:
+         Dict[str, float]: A dictionary with average scores rounded to three decimal places.
+     """
+     result: Dict[str, float] = {}
+     for field, value in scores.items():
+         if isinstance(value, (int, float)):
+             result[field] = value
+         elif isinstance(value, list):
+             result[field] = round((sum(value) / len(value)), 3) if value else 0.0
+         else:
+             raise TypeError(f"[calculate_average_scores] Unexpected type '{type(value)}' for field '{field}'")
+
+     return result
+
+
+ @MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
+ def summarize_verdicts(
+     verdicts: List[str],
+     judge: str,
+     max_bullets: int = 5
+ ) -> List[str]:
+     client_registry = ClientRegistry()
+     client = client_registry.get(provider=judge)
+
+     try:
+         joined_verdicts = "\n".join(verdicts)
+         prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=joined_verdicts)
+         response = client.call(message=prompt)
+         parsed = client.parse_response(response=response)
+         stripped = parsed.get("output", "").strip()
+         bullet_points = [point.strip() for point in stripped.split("- ") if point.strip()]
+
+         return bullet_points[:max_bullets]
+
+     except Exception as e:
+         logger.error(f"[summarize_verdicts] Error during summarization: {str(e)}", exc_info=True)
+         return []
+
+
+ # if __name__ == '__main__':
+ #     template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
+ #     response_dict = {
+ #         'agent_reply': "I'd be happy to help you book something for 10 AM.",
+ #         'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
+ #     }
+ #
+ #     result = extract_interaction_details(response_dict, template)
+ #     print(f"result: {result.model_dump()}")
levelapp/visualization/__init__.py
@@ -0,0 +1,7 @@
+ """levelapp/visualization: Visualization module for evaluation results."""
+
+ from .charts import ChartGenerator
+ from .dashboard import DashboardGenerator
+ from .exporter import ResultsExporter
+
+ __all__ = ["ChartGenerator", "DashboardGenerator", "ResultsExporter"]