levelapp 0.1.0 (levelapp-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of levelapp might be problematic.

Files changed (46)
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +614 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +119 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/ionos.py +116 -0
  10. levelapp/clients/mistral.py +106 -0
  11. levelapp/clients/openai.py +102 -0
  12. levelapp/comparator/__init__.py +5 -0
  13. levelapp/comparator/comparator.py +232 -0
  14. levelapp/comparator/extractor.py +108 -0
  15. levelapp/comparator/schemas.py +61 -0
  16. levelapp/comparator/scorer.py +271 -0
  17. levelapp/comparator/utils.py +136 -0
  18. levelapp/config/__init__.py +5 -0
  19. levelapp/config/endpoint.py +190 -0
  20. levelapp/config/prompts.py +35 -0
  21. levelapp/core/__init__.py +0 -0
  22. levelapp/core/base.py +386 -0
  23. levelapp/core/session.py +214 -0
  24. levelapp/evaluator/__init__.py +3 -0
  25. levelapp/evaluator/evaluator.py +265 -0
  26. levelapp/metrics/__init__.py +67 -0
  27. levelapp/metrics/embedding.py +2 -0
  28. levelapp/metrics/exact.py +182 -0
  29. levelapp/metrics/fuzzy.py +80 -0
  30. levelapp/metrics/token.py +103 -0
  31. levelapp/plugins/__init__.py +0 -0
  32. levelapp/repository/__init__.py +3 -0
  33. levelapp/repository/firestore.py +282 -0
  34. levelapp/simulator/__init__.py +3 -0
  35. levelapp/simulator/schemas.py +89 -0
  36. levelapp/simulator/simulator.py +441 -0
  37. levelapp/simulator/utils.py +201 -0
  38. levelapp/workflow/__init__.py +5 -0
  39. levelapp/workflow/base.py +113 -0
  40. levelapp/workflow/factory.py +51 -0
  41. levelapp/workflow/registration.py +6 -0
  42. levelapp/workflow/schemas.py +121 -0
  43. levelapp-0.1.0.dist-info/METADATA +254 -0
  44. levelapp-0.1.0.dist-info/RECORD +46 -0
  45. levelapp-0.1.0.dist-info/WHEEL +4 -0
  46. levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
levelapp/simulator/simulator.py
@@ -0,0 +1,441 @@
+ """
+ 'simulator/simulator.py': Service layer to manage conversation simulation and evaluation.
+ """
+ import time
+ import asyncio
+
+ from datetime import datetime
+ from collections import defaultdict
+ from typing import Dict, Any, List
+
+ from levelapp.core.base import BaseRepository, BaseProcess, BaseEvaluator
+ from levelapp.config.endpoint import EndpointConfig
+ from levelapp.simulator.schemas import (
+     InteractionEvaluationResults,
+     ScriptsBatch,
+     ConversationScript,
+     SimulationResults,
+ )
+ from levelapp.simulator.utils import (
+     extract_interaction_details,
+     async_interaction_request,
+     calculate_average_scores,
+     summarize_verdicts,
+ )
+ from levelapp.aspects import logger
+ from levelapp.workflow.schemas import EvaluatorType
+
+
+ class ConversationSimulator(BaseProcess):
+     """Conversation simulator component."""
+
+     def __init__(
+         self,
+         repository: BaseRepository | None = None,
+         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+         endpoint_config: EndpointConfig | None = None,
+     ):
+         """
+         Initialize the ConversationSimulator.
+
+         Args:
+             repository (BaseRepository): Repository for saving simulation results.
+             evaluators (Dict[EvaluatorType, BaseEvaluator]): Evaluators for scoring interactions, keyed by type.
+             endpoint_config (EndpointConfig): Configuration object for the VLA endpoint.
+         """
+         self._CLASS_NAME = self.__class__.__name__
+
+         self.repository = repository
+         self.evaluators = evaluators
+         self.endpoint_config = endpoint_config
+
+         self._url: str | None = None
+         self._credentials: str | None = None
+         self._headers: Dict[str, Any] | None = None
+
+         self.test_batch: ScriptsBatch | None = None
+         self.evaluation_verdicts: Dict[str, List[str]] = defaultdict(list)
+         self.verdict_summaries: Dict[str, List[str]] = defaultdict(list)
+
+     def setup(
+         self,
+         repository: BaseRepository,
+         evaluators: Dict[EvaluatorType, BaseEvaluator],
+         endpoint_config: EndpointConfig,
+     ) -> None:
+         """
+         Configure the ConversationSimulator.
+
+         Args:
+             repository (BaseRepository): Repository object for storing simulation results.
+             evaluators (Dict[EvaluatorType, BaseEvaluator]): Mapping of evaluator objects, keyed by evaluator type.
+             endpoint_config (EndpointConfig): Configuration object for the VLA endpoint.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
+         logger.info(f"{_LOG} Setting up the Conversation Simulator..")
+
+         self.repository = repository
+         self.evaluators = evaluators
+         self.endpoint_config = endpoint_config
+
+         self._url = endpoint_config.full_url
+         self._credentials = endpoint_config.api_key.get_secret_value()
+         self._headers = endpoint_config.headers
+
+     def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+         _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
+
+         if name not in self.evaluators:
+             raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+         return self.evaluators[name]
+
+     async def run(
+         self,
+         test_batch: ScriptsBatch,
+         attempts: int = 1,
+     ) -> Dict[str, Any]:
+         """
+         Run a batch test for the given scripts batch.
+
+         Args:
+             test_batch (ScriptsBatch): Scripts batch object.
+             attempts (int): Number of attempts to run the simulation.
+
+         Returns:
+             Dict[str, Any]: The results of the batch test.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.run.__name__}]"
+         logger.info(f"{_LOG} Starting batch test (attempts: {attempts}).")
+
+         started_at = datetime.now()
+
+         self.test_batch = test_batch
+         simulation_output = await self.simulate_conversation(attempts=attempts)
+
+         finished_at = datetime.now()
+
+         results = SimulationResults(
+             started_at=started_at,
+             finished_at=finished_at,
+             evaluation_summary=self.verdict_summaries,
+             average_scores=simulation_output.get("average_scores", {}),
+         )
+
+         return {"results": results, "status": "COMPLETE"}
+
+     async def simulate_conversation(self, attempts: int = 1) -> Dict[str, Any]:
+         """
+         Simulate conversations for all scripts in the batch.
+
+         Args:
+             attempts (int): Number of attempts to run the simulation.
+
+         Returns:
+             Dict[str, Any]: The results of the conversation simulation.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_conversation.__name__}]"
+         logger.info(f"{_LOG} Starting conversation simulation..")
+
+         semaphore = asyncio.Semaphore(value=len(self.test_batch.scripts))
+
+         async def run_with_semaphore(script: ConversationScript) -> Dict[str, Any]:
+             async with semaphore:
+                 return await self.simulate_single_scenario(
+                     script=script, attempts=attempts
+                 )
+
+         results = await asyncio.gather(
+             *(run_with_semaphore(s) for s in self.test_batch.scripts)
+         )
+
+         aggregate_scores: Dict[str, List[float]] = defaultdict(list)
+         for result in results:
+             for key, value in result.get("average_scores", {}).items():
+                 if isinstance(value, (int, float)):
+                     aggregate_scores[key].append(value)
+
+         overall_average_scores = calculate_average_scores(aggregate_scores)
+
+         for judge, verdicts in self.evaluation_verdicts.items():
+             self.verdict_summaries[judge] = summarize_verdicts(
+                 verdicts=verdicts, judge=judge
+             )
+
+         return {"scripts": results, "average_scores": overall_average_scores}
+
+     async def simulate_single_scenario(
+         self, script: ConversationScript, attempts: int = 1
+     ) -> Dict[str, Any]:
+         """
+         Simulate a single scenario, running the given number of attempts concurrently.
+
+         Args:
+             script (ConversationScript): The script to simulate.
+             attempts (int): Number of attempts to run the simulation.
+
+         Returns:
+             Dict[str, Any]: The results of the scenario simulation.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_single_scenario.__name__}]"
+
+         logger.info(f"{_LOG} Starting simulation for script: {script.id}")
+         all_attempts_scores: Dict[str, List[float]] = defaultdict(list)
+         all_attempts_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+         async def simulate_attempt(attempt_number: int) -> Dict[str, Any]:
+             logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}")
+             start_time = time.time()
+
+             collected_scores: Dict[str, List[Any]] = defaultdict(list)
+             collected_verdicts: Dict[str, List[str]] = defaultdict(list)
+
+             initial_interaction_results = await self.simulate_interactions(
+                 script=script,
+                 evaluation_verdicts=collected_verdicts,
+                 collected_scores=collected_scores,
+             )
+
+             logger.info(f"{_LOG} collected_scores: {collected_scores}\n---")
+             single_attempt_scores = calculate_average_scores(collected_scores)
+
+             for target, score in single_attempt_scores.items():
+                 all_attempts_scores[target].append(score)
+
+             for judge, verdicts in collected_verdicts.items():
+                 all_attempts_verdicts[judge].extend(verdicts)
+
+             elapsed_time = time.time() - start_time
+             all_attempts_scores["processing_time"].append(elapsed_time)
+
+             logger.info(
+                 f"{_LOG} Attempt {attempt_number + 1} completed in {elapsed_time:.2f}s\n---"
+             )
+
+             return {
+                 "attempt": attempt_number + 1,
+                 "script_id": script.id,
+                 "total_duration": elapsed_time,
+                 "interaction_results": initial_interaction_results,
+                 "evaluation_verdicts": collected_verdicts,
+                 "average_scores": single_attempt_scores,
+             }
+
+         attempt_tasks = [simulate_attempt(i) for i in range(attempts)]
+         attempt_results = await asyncio.gather(*attempt_tasks, return_exceptions=False)
+
+         average_scores = calculate_average_scores(all_attempts_scores)
+
+         for judge_, verdicts_ in all_attempts_verdicts.items():
+             self.evaluation_verdicts[judge_].extend(verdicts_)
+
+         logger.info(
+             f"{_LOG} average scores:\n{average_scores}\n---"
+         )
+
+         return {
+             "script_id": script.id,
+             "attempts": attempt_results,
+             "average_scores": average_scores,
+         }
+
+     async def simulate_interactions(
+         self,
+         script: ConversationScript,
+         evaluation_verdicts: Dict[str, List[str]],
+         collected_scores: Dict[str, List[Any]],
+     ) -> List[Dict[str, Any]]:
+         """
+         Simulate inbound interactions for a scenario.
+
+         Args:
+             script (ConversationScript): The script to simulate.
+             evaluation_verdicts (Dict[str, List[str]]): Evaluation verdicts collected per evaluator.
+             collected_scores (Dict[str, List[Any]]): Scores collected per target.
+
+         Returns:
+             List[Dict[str, Any]]: The results of the inbound interactions simulation.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.simulate_interactions.__name__}]"
+
+         logger.info(f"{_LOG} Starting interactions simulation..")
+         start_time = time.time()
+
+         results = []
+         interactions = script.interactions
+
+         for interaction in interactions:
+             user_message = interaction.user_message
+             request_payload = interaction.request_payload
+             self.endpoint_config.variables = {
+                 "user_message": user_message,
+                 "request_payload": request_payload,
+             }
+
+             response = await async_interaction_request(
+                 url=self.endpoint_config.full_url,
+                 headers=self.endpoint_config.headers,
+                 payload=self.endpoint_config.request_payload,
+             )
+
+             reference_reply = interaction.reference_reply
+             reference_metadata = interaction.reference_metadata
+             reference_guardrail_flag: bool = interaction.guardrail_flag
+
+             if not response or response.status_code != 200:
+                 logger.error(f"{_LOG} Interaction request failed.")
+                 result = {
+                     "user_message": user_message,
+                     "generated_reply": "Interaction Request failed",
+                     "reference_reply": reference_reply,
+                     "generated_metadata": {},
+                     "reference_metadata": reference_metadata,
+                     "guardrail_details": None,
+                     "evaluation_results": {},
+                 }
+                 results.append(result)
+                 continue
+
+             interaction_details = extract_interaction_details(
+                 response=response.text,
+                 template=self.endpoint_config.response_payload,
+             )
+
+             generated_reply = interaction_details.generated_reply
+             generated_metadata = interaction_details.generated_metadata
+             extracted_guardrail_flag: bool = interaction_details.guardrail_flag
+
+             evaluation_results = await self.evaluate_interaction(
+                 user_input=user_message,
+                 generated_reply=generated_reply,
+                 reference_reply=reference_reply,
+                 generated_metadata=generated_metadata,
+                 reference_metadata=reference_metadata,
+                 generated_guardrail=extracted_guardrail_flag,
+                 reference_guardrail=reference_guardrail_flag,
+             )
+
+             logger.info(f"{_LOG} Evaluation results:\n{evaluation_results.model_dump()}\n")
+
+             self.store_evaluation_results(
+                 results=evaluation_results,
+                 evaluation_verdicts=evaluation_verdicts,
+                 collected_scores=collected_scores,
+             )
+
+             elapsed_time = time.time() - start_time
+             logger.info(
+                 f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---"
+             )
+
+             result = {
+                 "user_message": user_message,
+                 "generated_reply": generated_reply,
+                 "reference_reply": reference_reply,
+                 "generated_metadata": generated_metadata,
+                 "reference_metadata": reference_metadata,
+                 "guardrail_details": interaction_details.guardrail_flag,
+                 "evaluation_results": evaluation_results.model_dump(),
+             }
+
+             results.append(result)
+
+         return results
+
+     async def evaluate_interaction(
+         self,
+         user_input: str,
+         generated_reply: str,
+         reference_reply: str,
+         generated_metadata: Dict[str, Any],
+         reference_metadata: Dict[str, Any],
+         generated_guardrail: bool,
+         reference_guardrail: bool,
+     ) -> InteractionEvaluationResults:
+         """
+         Evaluate an interaction using the OpenAI and Ionos evaluation services.
+
+         Args:
+             user_input (str): The user input to evaluate.
+             generated_reply (str): The generated agent reply.
+             reference_reply (str): The reference agent reply.
+             generated_metadata (Dict[str, Any]): The generated metadata.
+             reference_metadata (Dict[str, Any]): The reference metadata.
+             generated_guardrail (bool): Generated handoff/guardrail flag.
+             reference_guardrail (bool): Reference handoff/guardrail flag.
+
+         Returns:
+             InteractionEvaluationResults: The evaluation results.
+         """
+         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
+
+         judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
+         metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+
+         if not judge_evaluator:
+             raise ValueError(f"{_LOG} No Judge Evaluator found.")
+
+         openai_eval_task = judge_evaluator.async_evaluate(
+             generated_data=generated_reply,
+             reference_data=reference_reply,
+             user_input=user_input,
+             provider="openai",
+         )
+
+         ionos_eval_task = judge_evaluator.async_evaluate(
+             provider="ionos",
+             user_input=user_input,
+             generated_data=generated_reply,
+             reference_data=reference_reply,
+         )
+
+         openai_judge_evaluation, ionos_judge_evaluation = await asyncio.gather(
+             openai_eval_task, ionos_eval_task
+         )
+
+         if not metadata_evaluator:
+             raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+
+         metadata_evaluation = {}
+         if reference_metadata:
+             metadata_evaluation = metadata_evaluator.evaluate(
+                 generated_data=generated_metadata,
+                 reference_data=reference_metadata,
+             )
+
+         guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+
+         return InteractionEvaluationResults(
+             judge_evaluations={
+                 openai_judge_evaluation.provider: openai_judge_evaluation,
+                 ionos_judge_evaluation.provider: ionos_judge_evaluation,
+             },
+             metadata_evaluation=metadata_evaluation,
+             guardrail_flag=guardrail_flag,
+         )
+
+     @staticmethod
+     def store_evaluation_results(
+         results: InteractionEvaluationResults,
+         evaluation_verdicts: Dict[str, List[str]],
+         collected_scores: Dict[str, List[Any]],
+     ) -> None:
+         """
+         Store the evaluation results in the evaluation summary.
+
+         Args:
+             results (InteractionEvaluationResults): The evaluation results to store.
+             evaluation_verdicts (Dict[str, List[str]]): The evaluation summary.
+             collected_scores (Dict[str, List[Any]]): The collected scores.
+         """
+         for provider, evaluation in results.judge_evaluations.items():
+             evaluation_verdicts[f"{provider}_verdicts_summary"].append(evaluation.justification)
+             collected_scores[provider].append(evaluation.score)
+
+         average_metadata_score = calculate_average_scores(scores=results.metadata_evaluation)
+         for score in average_metadata_score.values():
+             collected_scores["metadata"].append(score)
+
+         collected_scores["guardrail"].append(results.guardrail_flag)
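
For orientation, here is a minimal usage sketch of the ConversationSimulator added above, based only on the setup() and run() signatures shown in this diff. The repository, evaluator, and endpoint objects are hypothetical stand-ins, and the package's own wiring (for example through MainFactory) may differ.

import asyncio

from levelapp.simulator.simulator import ConversationSimulator
from levelapp.simulator.schemas import ScriptsBatch
from levelapp.workflow.schemas import EvaluatorType


async def run_batch(repository, judge_evaluator, reference_evaluator, endpoint_config, batch: ScriptsBatch):
    # Hypothetical wiring: only the methods shown in the diff above are assumed.
    simulator = ConversationSimulator()
    simulator.setup(
        repository=repository,                             # a BaseRepository implementation
        evaluators={
            EvaluatorType.JUDGE: judge_evaluator,          # LLM-judge scoring of replies
            EvaluatorType.REFERENCE: reference_evaluator,  # metadata comparison
        },
        endpoint_config=endpoint_config,                   # EndpointConfig for the agent under test
    )
    # run() simulates every script in the batch (two attempts each here) and
    # returns {"results": SimulationResults, "status": "COMPLETE"}.
    return await simulator.run(test_batch=batch, attempts=2)

# In real use: asyncio.run(run_batch(...)) with concrete objects.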
levelapp/simulator/utils.py
@@ -0,0 +1,201 @@
+ """
+ 'simulator/utils.py': Utility functions for handling VLA interactions and requests.
+ """
+ import json
+
+ import httpx
+ import arrow
+
+ from uuid import UUID
+ from typing import Dict, Any, Optional, List, Union
+
+ from openai import OpenAI
+ from pydantic import ValidationError
+
+ from levelapp.simulator.schemas import InteractionResults
+ from levelapp.aspects import MonitoringAspect, MetricType, logger
+
+
+ class UUIDEncoder(json.JSONEncoder):
+     def default(self, obj):
+         if isinstance(obj, UUID):
+             return str(obj)
+         return json.JSONEncoder.default(self, obj)
+
+
+ def extract_interaction_details(
+     response: str | Dict[str, Any],
+     template: Dict[str, Any],
+ ) -> InteractionResults:
+     """
+     Extract interaction details from a VLA response.
+
+     Args:
+         response (str | Dict[str, Any]): The response text (or parsed body) from the VLA.
+         template (Dict[str, Any]): The response schema/template.
+
+     Returns:
+         InteractionResults: The extracted interaction details.
+     """
+     try:
+         response_dict = response if isinstance(response, dict) else json.loads(response)
+
+         if not isinstance(response_dict, dict):
+             raise ValueError("Response is not a valid dictionary")
+
+         required_keys = {value.strip("${}") for value in template.values()}
+         if not required_keys.issubset(response_dict.keys()):
+             missing_keys = required_keys - response_dict.keys()
+             logger.warning(f"[extract_interaction_details] Missing data: {missing_keys}")
+
+         return InteractionResults.model_validate(response_dict)
+
+     except json.JSONDecodeError as e:
+         logger.error(f"[extract_interaction_details] Failed to extract details:\n{e}")
+         return InteractionResults()
+
+     except ValidationError as e:
+         logger.exception(f"[extract_interaction_details] Failed to create an InteractionResults instance:\n{e}")
+         return InteractionResults()
+
+
+ @MonitoringAspect.monitor(name="interaction_request", category=MetricType.API_CALL)
+ async def async_interaction_request(
+     url: str,
+     headers: Dict[str, str],
+     payload: Dict[str, Any],
+ ) -> Optional[httpx.Response]:
+     """
+     Perform an asynchronous interaction request.
+
+     Args:
+         url (str): The URL to send the request to.
+         headers (Dict[str, str]): The headers to include in the request.
+         payload (Dict[str, Any]): The payload to send in the request.
+
+     Returns:
+         Optional[httpx.Response]: The response from the interaction request, or None if an error occurred.
+     """
+     try:
+         async with httpx.AsyncClient(timeout=180) as client:
+             response = await client.post(url=url, headers=headers, json=payload)
+             response.raise_for_status()
+
+             return response
+
+     except httpx.HTTPStatusError as http_err:
+         logger.error(f"[async_interaction_request] HTTP error: {http_err.response.text}", exc_info=True)
+
+     except httpx.RequestError as req_err:
+         logger.error(f"[async_interaction_request] Request error: {str(req_err)}", exc_info=True)
+
+     return None
+
+
+ def parse_date_value(raw_date_value: Optional[str], default_date_value: Optional[str] = "") -> str:
+     """
+     Cleans and parses a dehumanized relative date string to ISO format.
+
+     Args:
+         raw_date_value (Optional[str]): The raw date value to parse.
+         default_date_value (Optional[str]): The default value to return if parsing fails. Defaults to an empty string.
+
+     Returns:
+         str: The parsed date in ISO format, or the default value if parsing fails.
+     """
+     if not raw_date_value:
+         logger.info(f"[parse_date_value] No raw value provided. Returning default: '{default_date_value}'")
+         return default_date_value
+
+     clean = raw_date_value.replace("{{", "").replace("}}", "").replace("_", " ").strip()
+     if not clean.endswith('s'):  # pluralize the time unit so arrow can dehumanize it
+         clean += 's'
+
+     try:
+         parsed_date = arrow.utcnow().dehumanize(clean).format('YYYY-MM-DD')
+         return parsed_date
+
+     except arrow.parser.ParserError as e:
+         logger.error(f"[parse_date_value] Failed to parse date: '{clean}'\nParserError: {str(e)}", exc_info=True)
+         return default_date_value
+
+     except ValueError as e:
+         logger.error(f"[parse_date_value] Invalid date value: '{clean}'\nValueError: {str(e)}", exc_info=True)
+         return default_date_value
+
+     except Exception as e:
+         logger.error(f"[parse_date_value] Unexpected error.\nException: {str(e)}", exc_info=True)
+         return default_date_value
+
+
+ @MonitoringAspect.monitor(
+     name="average_calc",
+     category=MetricType.SCORING,
+     cached=True,
+     maxsize=1000
+ )
+ def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Dict[str, float]:
+     """
+     Helper function that calculates the average scores for a dictionary of score lists.
+
+     Args:
+         scores (Dict[str, Union[List[float], float]]): A dictionary where keys are identifiers and values are lists of scores (or single scores).
+
+     Returns:
+         Dict[str, float]: A dictionary with average scores rounded to three decimal places.
+     """
+     result: Dict[str, float] = {}
+     for field, value in scores.items():
+         if isinstance(value, (int, float)):
+             result[field] = value
+         elif isinstance(value, list):
+             result[field] = round((sum(value) / len(value)), 3) if value else 0.0
+         else:
+             raise TypeError(f"[calculate_average_scores] Unexpected type '{type(value)}' for field '{field}'")
+
+     return result
+
+
+ @MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
+ def summarize_verdicts(verdicts: List[str], judge: str, max_bullets: int = 5) -> List[str]:
+     """
+     Summarize the justifications for each judge.
+
+     Args:
+         verdicts (List[str]): A list of justifications.
+         judge (str): The judge or evaluator (provider) name for context.
+         max_bullets (int): The maximum number of bullets allowed per judge.
+
+     Returns:
+         List[str]: The summarized justifications.
+     """
+     if not verdicts:
+         return []
+
+     prompt = f"""
+     You are reviewing evaluation justifications from LLM judges about replies generated by a virtual leasing agent.\n
+     Each justification contains the judge's assessment of how well the agent's response matched the expected reply.\n
+     Your task is to identify and summarize only the **negative points**, such as errors, misunderstandings,
+     missing information, or failure to meet expectations.\n
+     Return up to {max_bullets} bullet points. Be concise and start each point with '- '\n\n
+     ---
+     - Judge: {judge}
+     - Justifications:\n{chr(10).join(verdicts)}\n
+     """
+
+     client = OpenAI()
+
+     try:
+         result = client.chat.completions.create(
+             model="gpt-4o-mini",
+             temperature=0,
+             messages=[{"role": "user", "content": prompt}]
+         ).choices[0].message.content
+
+         bullet_points = [point.strip() for point in result.split('- ') if point.strip()]
+
+         return bullet_points
+
+     except Exception as e:
+         logger.error(f"[summarize_verdicts] Error during summarization: {str(e)}", exc_info=True)
+         return []
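
A small, self-contained illustration of two helpers defined above, UUIDEncoder and calculate_average_scores, using made-up values; it assumes the MonitoringAspect.monitor decorator (defined in levelapp/aspects/monitor.py, not shown here) does not change normal call semantics.

import json
from uuid import uuid4

from levelapp.simulator.utils import UUIDEncoder, calculate_average_scores

# UUIDs (e.g. script ids) are serialized as plain strings.
payload = {"script_id": uuid4(), "status": "COMPLETE"}
print(json.dumps(payload, cls=UUIDEncoder))

# Lists are averaged and rounded to three decimals; bare numbers pass through unchanged.
scores = {"openai": [0.8, 0.9, 1.0], "ionos": [0.7, 0.75], "guardrail": 1.0}
print(calculate_average_scores(scores))
# Expected, given the rounding rules above: {'openai': 0.9, 'ionos': 0.725, 'guardrail': 1.0}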
levelapp/workflow/__init__.py
@@ -0,0 +1,5 @@
+ from . import registration
+ from .schemas import WorkflowType
+ from .factory import MainFactory
+
+ __all__ = ["WorkflowType", "MainFactory"]