levelapp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of levelapp might be problematic.

@@ -1,7 +1,5 @@
 """levelapp/repository/firestore.py"""
-import google.auth
-
-from typing import List, Dict, Any, Type
+from typing import List, Dict, Any, Type, TYPE_CHECKING
 from pydantic import ValidationError
 
 from google.cloud import firestore_v1
@@ -13,15 +11,25 @@ from levelapp.core.base import BaseRepository, Model
 from levelapp.aspects import logger
 
 
+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+
+
 class FirestoreRepository(BaseRepository):
     """
     Firestore implementation of BaseRepository.
     (Uses hierarchical path: {user_id}/{collection_id}/{document_id}
     """
 
-    def __init__(self, project_id: str | Any = None, database_name: str | Any = '(default)'):
-        self.project_id = project_id
-        self.database_name = database_name
+    def __init__(self, config: "WorkflowConfig | None"):
+        if config:
+            self.config = config
+            self.project_id: str | Any = config.repository.project_id
+            self.database_name: str | Any = config.repository.database_name
+        else:
+            self.project_id: str | Any = None
+            self.database_name: str | Any = '(default)'
+
         self.client: firestore_v1.Client | None = None
 
     def connect(self) -> None:
@@ -29,6 +37,7 @@ class FirestoreRepository(BaseRepository):
         Connects to Firestore, prioritizing the project ID passed to the constructor.
         """
         try:
+            import google.auth
            credentials, default_project_id = google.auth.default()
 
            if not credentials:
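
Note: with this change FirestoreRepository is configured from a WorkflowConfig instead of explicit project/database arguments, and google.auth is only imported when connect() is called. A minimal usage sketch (not part of the diff; WorkflowConfig construction is omitted and the attribute layout config.repository.project_id / .database_name is taken from the diff above):

# Hedged sketch of the new constructor contract.
from levelapp.repository.firestore import FirestoreRepository

repo = FirestoreRepository(config=None)   # no config: project_id=None, database '(default)'
repo.connect()                            # google.auth is now imported lazily inside connect()
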
@@ -8,8 +8,8 @@ from enum import Enum
 from uuid import UUID, uuid4
 from datetime import datetime
 
-from typing import Optional, Dict, Any, List
-from pydantic import BaseModel, Field, computed_field
+from typing import Dict, Any, List
+from pydantic import BaseModel, Field, computed_field, field_validator
 
 from levelapp.evaluator.evaluator import JudgeEvaluationResults
 
@@ -25,11 +25,11 @@ class Interaction(BaseModel):
     """Represents a single interaction within a conversation."""
     id: UUID = Field(default_factory=uuid4, description="Interaction identifier")
     user_message: str = Field(..., description="The user's query message")
-    generated_reply: str = Field(..., description="The agent's reply message")
+    # generated_reply: str = Field(..., description="The agent's reply message")
     reference_reply: str = Field(..., description="The preset reference message")
-    interaction_type: InteractionLevel = Field(..., description="Type of interaction")
-    reference_metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Expected metadata")
-    generated_metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Extracted metadata")
+    interaction_type: InteractionLevel = Field(default=InteractionLevel.INITIAL, description="Type of interaction")
+    reference_metadata: Dict[str, Any] = Field(default_factory=dict, description="Expected metadata")
+    # generated_metadata: Dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
     guardrail_flag: bool = Field(default=False, description="Flag for guardrail signaling")
     request_payload: Dict[str, Any] = Field(default_factory=dict, description="Additional request payload")
 
@@ -38,7 +38,7 @@ class ConversationScript(BaseModel):
     """Represents a basic conversation with multiple interactions."""
     id: UUID = Field(default_factory=uuid4, description="Conversation identifier")
     interactions: List[Interaction] = Field(default_factory=list, description="List of interactions")
-    description: str = Field(..., description="A short description of the conversation")
+    description: str = Field(default="no-description", description="A short description of the conversation")
     details: Dict[str, str] = Field(default_factory=dict, description="Conversation details")
 
 
@@ -58,32 +58,26 @@ class InteractionResults(BaseModel):
 
 class InteractionEvaluationResults(BaseModel):
     """Model representing the evaluation result of an interaction."""
-    judge_evaluations: Dict[str, JudgeEvaluationResults] = Field(default_factory=dict)
-    metadata_evaluation: Dict[str, float] = Field(default_factory=dict)
+    judge_evaluations: Dict[str, JudgeEvaluationResults] | None = Field(default_factory=dict)
+    metadata_evaluation: Dict[str, float] | None = Field(default_factory=dict)
     guardrail_flag: int = Field(default=0)
 
 
 class SimulationResults(BaseModel):
-    # Initial data
-    project_id: str = Field(default_factory=uuid4, description="Project identifier")
-    user_id: str = Field(default_factory=uuid4, description="User identifier")
-    batch_id: str = Field(default_factory=uuid4, description="Batch identifier")
     # Collected data
     started_at: datetime = datetime.now()
     finished_at: datetime
     # Collected Results
     evaluation_summary: Dict[str, Any] | None = Field(default_factory=dict, description="Evaluation result")
     average_scores: Dict[str, Any] | None = Field(default_factory=dict, description="Average scores")
+    interaction_results: List[Dict[str, Any]] | None = Field(default_factory=list, description="detailed results")
+
+    @computed_field
+    @property
+    def batch_id(self) -> str:
+        return str(uuid4())
 
     @computed_field
     @property
     def elapsed_time(self) -> float:
         return (self.finished_at - self.started_at).total_seconds()
-
-
-class TestResults(BaseModel):
-    api_host: str = Field(..., alias="apiHost")
-    ionos_model_name: str = Field(..., alias="ionosModelName")
-    test_name: str = Field(..., alias="testName")
-    test_type: str = Field(..., alias="testType")
-    batch_details: Optional[SimulationResults] = Field(..., alias="results")
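
Note: the schema changes relax several previously required fields (interaction_type and description now have defaults, generated_reply and generated_metadata are commented out), drop TestResults, and turn batch_id into a computed field on SimulationResults. A minimal sketch of what now validates (not part of the diff; the module path follows the levelapp.simulator.schemas import used elsewhere in this diff):

# Illustrative only.
from levelapp.simulator.schemas import Interaction, ConversationScript

interaction = Interaction(
    user_message="Can I book a viewing tomorrow?",
    reference_reply="Sure, what time works for you?",
    # interaction_type now defaults to InteractionLevel.INITIAL; reference_metadata defaults to {}
)
script = ConversationScript(interactions=[interaction])  # description defaults to "no-description"
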
@@ -1,6 +1,7 @@
 """
 'simulators/service.py': Service layer to manage conversation simulation and evaluation.
 """
+import json
 import time
 import asyncio
 
@@ -23,7 +24,7 @@ from levelapp.simulator.utils import (
     summarize_verdicts,
 )
 from levelapp.aspects import logger
-from levelapp.workflow.schemas import EvaluatorType
+from levelapp.core.schemas import EvaluatorType
 
 
 class ConversationSimulator(BaseProcess):
@@ -33,6 +34,7 @@ class ConversationSimulator(BaseProcess):
         self,
         repository: BaseRepository | None = None,
         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+        providers: List[str] | None = None,
         endpoint_config: EndpointConfig | None = None,
     ):
         """
@@ -47,6 +49,7 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
         self.endpoint_config = endpoint_config
 
         self._url: str | None = None
@@ -60,7 +63,8 @@ class ConversationSimulator(BaseProcess):
     def setup(
         self,
         repository: BaseRepository,
-        evaluators: Dict[str, BaseEvaluator],
+        evaluators: Dict[EvaluatorType, BaseEvaluator],
+        providers: List[str],
         endpoint_config: EndpointConfig,
     ) -> None:
         """
@@ -69,6 +73,7 @@ class ConversationSimulator(BaseProcess):
         Args:
             repository (BaseRepository): Repository object for storing simulation results.
             evaluators (Dict[str, BaseEvaluator]): List of evaluator objects for evaluating interactions.
+            providers (List[str]): List of LLM provider names.
             endpoint_config (EndpointConfig): Configuration object for VLA.
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
@@ -76,6 +81,11 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
+
+        if not self.providers:
+            logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
         self.endpoint_config = endpoint_config
 
         self._url = endpoint_config.full_url
@@ -83,17 +93,27 @@ class ConversationSimulator(BaseProcess):
         self._headers = endpoint_config.headers
 
     def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+        """
+        Retrieve an evaluator by name.
+
+        Args:
+            name (EvaluatorType): Name of evaluator.
+
+        Returns:
+            An evaluator object.
+        """
         _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
 
         if name not in self.evaluators:
             raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
         return self.evaluators[name]
 
     async def run(
         self,
         test_batch: ScriptsBatch,
         attempts: int = 1,
-    ) -> Dict[str, Any]:
+    ) -> Any:
         """
         Run a batch test for the given batch name and details.
 
@@ -119,9 +139,10 @@ class ConversationSimulator(BaseProcess):
             finished_at=finished_at,
             evaluation_summary=self.verdict_summaries,
             average_scores=results.get("average_scores", {}),
+            interaction_results=results.get("results")
         )
 
-        return {"results": results, "status": "COMPLETE"}
+        return results.model_dump_json(indent=2)
 
     async def simulate_conversation(self, attempts: int = 1) -> Dict[str, Any]:
         """
@@ -161,10 +182,11 @@ class ConversationSimulator(BaseProcess):
                 verdicts=verdicts, judge=judge
             )
 
-        return {"scripts": results, "average_scores": overall_average_scores}
+        return {"results": results, "average_scores": overall_average_scores}
 
     async def simulate_single_scenario(
-        self, script: ConversationScript, attempts: int = 1
+        self, script: ConversationScript,
+        attempts: int = 1
     ) -> Dict[str, Any]:
         """
         Simulate a single scenario with the given number of attempts, concurrently.
@@ -183,19 +205,18 @@ class ConversationSimulator(BaseProcess):
         all_attempts_verdicts: Dict[str, List[str]] = defaultdict(list)
 
         async def simulate_attempt(attempt_number: int) -> Dict[str, Any]:
-            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}")
+            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}\n---")
             start_time = time.time()
 
             collected_scores: Dict[str, List[Any]] = defaultdict(list)
             collected_verdicts: Dict[str, List[str]] = defaultdict(list)
 
-            initial_interaction_results = await self.simulate_interactions(
+            interaction_results = await self.simulate_interactions(
                 script=script,
                 evaluation_verdicts=collected_verdicts,
                 collected_scores=collected_scores,
             )
 
-            logger.info(f"{_LOG} collected_scores: {collected_scores}\n---")
             single_attempt_scores = calculate_average_scores(collected_scores)
 
             for target, scores in single_attempt_scores.items():
@@ -215,7 +236,7 @@ class ConversationSimulator(BaseProcess):
                 "attempt": attempt_number + 1,
                 "script_id": script.id,
                 "total_duration": elapsed_time,
-                "interaction_results": initial_interaction_results,
+                "interaction_results": interaction_results,
                 "evaluation_verdicts": collected_verdicts,
                 "average_scores": single_attempt_scores,
             }
@@ -228,10 +249,6 @@ class ConversationSimulator(BaseProcess):
         for judge_, verdicts_ in all_attempts_verdicts.items():
             self.evaluation_verdicts[judge_].extend(verdicts_)
 
-        logger.info(
-            f"{_LOG} average scores:\n{average_scores}\n---"
-        )
-
         return {
             "script_id": script.id,
             "attempts": attempt_results,
@@ -314,8 +331,6 @@ class ConversationSimulator(BaseProcess):
             reference_guardrail=reference_guardrail_flag,
         )
 
-        logger.info(f"{_LOG} Evaluation results:\n{evaluation_results.model_dump()}\n")
-
         self.store_evaluation_results(
             results=evaluation_results,
             evaluation_verdicts=evaluation_verdicts,
@@ -323,9 +338,7 @@ class ConversationSimulator(BaseProcess):
         )
 
         elapsed_time = time.time() - start_time
-        logger.info(
-            f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---"
-        )
+        logger.info(f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---")
 
         result = {
             "user_message": user_message,
@@ -368,50 +381,106 @@ class ConversationSimulator(BaseProcess):
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
 
-        judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
-        metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+        judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+        metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)
 
-        if not judge_evaluator:
-            raise ValueError(f"{_LOG} No Judge Evaluator found.")
+        evaluation_results = InteractionEvaluationResults()
 
-        openai_eval_task = judge_evaluator.async_evaluate(
-            generated_data=generated_reply,
-            reference_data=reference_reply,
-            user_input=user_input,
-            provider="openai"
-        )
+        if judge_evaluator and self.providers:
+            await self._judge_evaluation(
+                user_input=user_input,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                providers=self.providers,
+                judge_evaluator=judge_evaluator,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Judge evaluation skipped (no evaluator or no providers).")
 
-        ionos_eval_task = judge_evaluator.async_evaluate(
-            provider="ionos",
-            user_input=user_input,
-            generated_data=generated_reply,
-            reference_data=reference_reply,
-        )
+        if metadata_evaluator and reference_metadata:
+            self._metadata_evaluation(
+                metadata_evaluator=metadata_evaluator,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Metadata evaluation skipped (no evaluator or no reference metadata).")
 
-        openai_judge_evaluation, ionos_judge_evaluation = await asyncio.gather(
-            openai_eval_task, ionos_eval_task
-        )
+        evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
 
-        if not metadata_evaluator:
-            raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+        return evaluation_results
 
-        metadata_evaluation = {}
-        if reference_metadata:
-            metadata_evaluation = metadata_evaluator.evaluate(
-                generated_data=generated_metadata,
-                reference_data=reference_metadata,
+    async def _judge_evaluation(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        providers: List[str],
+        judge_evaluator: BaseEvaluator,
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run LLM-as-a-judge evaluation using multiple providers (async).
+
+        Args:
+            user_input (str): The user input message.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            providers (List[str]): List of judge provider names.
+            judge_evaluator (BaseEvaluator): Evaluator instance.
+            evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+        Returns:
+            None
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+        tasks = {
+            provider: judge_evaluator.async_evaluate(
+                generated_data=generated_reply,
+                reference_data=reference_reply,
+                user_input=user_input,
+                provider=provider,
             )
+            for provider in providers
+        }
 
-        guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
 
-        return InteractionEvaluationResults(
-            judge_evaluations={
-                openai_judge_evaluation.provider: openai_judge_evaluation,
-                ionos_judge_evaluation.provider: ionos_judge_evaluation
-            },
-            metadata_evaluation=metadata_evaluation,
-            guardrail_flag=guardrail_flag,
-        )
+        for provider, result in zip(tasks.keys(), results):
+            if isinstance(result, Exception):
+                logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                continue
+
+            evaluation_results.judge_evaluations[provider] = result
+
+    def _metadata_evaluation(
+        self,
+        metadata_evaluator: BaseEvaluator,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run metadata evaluation using the provided evaluator.
+
+        Args:
+            metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            evaluation_results (InteractionEvaluationResults): Results container.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+        try:
+            evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
            )
+        except Exception as e:
+            logger.error(f"[{_LOG}] Metadata evaluation failed:\n{e}", exc_info=e)
 
     @staticmethod
     def store_evaluation_results(
@@ -428,7 +497,7 @@ class ConversationSimulator(BaseProcess):
             collected_scores (Dict[str, List[Any]]): The collected scores.
         """
         for provider in results.judge_evaluations.keys():
-            evaluation_verdicts[f"{provider}_verdicts_summary"].append(
+            evaluation_verdicts[f"{provider}"].append(
                 results.judge_evaluations.get(provider, "").justification
            )
 
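
Note: ConversationSimulator now takes an explicit providers list, keys evaluators by EvaluatorType, fans judge calls out over the providers with asyncio.gather(return_exceptions=True), and run() returns the serialized SimulationResults instead of a dict. A hedged wiring sketch (not part of the diff; the repository/evaluator/endpoint objects are placeholders, and the module path for ConversationSimulator is assumed from the file docstring):

# Hedged sketch of the new setup contract.
from levelapp.core.schemas import EvaluatorType
from levelapp.simulator.service import ConversationSimulator

async def run_batch(repository, judge_evaluator, reference_evaluator, endpoint_config, batch):
    simulator = ConversationSimulator()
    simulator.setup(
        repository=repository,
        evaluators={EvaluatorType.JUDGE: judge_evaluator, EvaluatorType.REFERENCE: reference_evaluator},
        providers=["openai", "ionos"],  # an empty list only logs a warning and skips judge evaluation
        endpoint_config=endpoint_config,
    )
    return await simulator.run(test_batch=batch, attempts=1)  # now a JSON string, not a dict
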
@@ -1,17 +1,18 @@
 """
 'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
 """
+import ast
 import json
-
 import httpx
-import arrow
 
 from uuid import UUID
-from typing import Dict, Any, Optional, List, Union
+from string import Template
+from typing import Any, Dict, List, Union
 
-from openai import OpenAI
 from pydantic import ValidationError
 
+from levelapp.clients import ClientRegistry
+from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
 from levelapp.simulator.schemas import InteractionResults
 from levelapp.aspects import MonitoringAspect, MetricType, logger
 
@@ -48,7 +49,14 @@ def extract_interaction_details(
         missing_keys = required_keys - response_dict.keys()
         logger.warning(f"[extract_interaction_details] Missing data: {missing_keys}]")
 
-        return InteractionResults.model_validate(response_dict)
+        output = {}
+        for k, v in template.items():
+            output[k] = Template(v).safe_substitute(response_dict)
+
+        raw_value = output.get("generated_metadata", {})
+        output["generated_metadata"] = ast.literal_eval(raw_value) if isinstance(raw_value, str) else raw_value
+
+        return InteractionResults.model_validate(output)
 
     except json.JSONDecodeError as e:
         logger.error(f"[extract_interaction_details] Failed to extract details:\n{e}")
@@ -64,7 +72,7 @@ async def async_interaction_request(
     url: str,
     headers: Dict[str, str],
     payload: Dict[str, Any],
-) -> Optional[httpx.Response]:
+) -> httpx.Response | None:
     """
     Perform an asynchronous interaction request.
 
@@ -74,7 +82,7 @@ async def async_interaction_request(
         payload (Dict[str, Any]): The payload to send in the request.
 
     Returns:
-        Optional[httpx.Response]: The response from the interaction request, or None if an error occurred.
+        httpx.Response: The response from the interaction request, or None if an error occurred.
     """
     try:
         async with httpx.AsyncClient(timeout=180) as client:
@@ -92,42 +100,6 @@ async def async_interaction_request(
         return None
 
 
-def parse_date_value(raw_date_value: Optional[str], default_date_value: Optional[str] = "") -> str:
-    """
-    Cleans and parses a dehumanized relative date string to ISO format.
-
-    Args:
-        raw_date_value (Optional[str]): The raw date value to parse.
-        default_date_value (Optional[str]): The default value to return if parsing fails. Defaults to an empty string.
-
-    Returns:
-        str: The parsed date in ISO format, or the default value if parsing fails.
-    """
-    if not raw_date_value:
-        logger.info(f"[parse_date_value] No raw value provided. returning default: '{default_date_value}'")
-        return default_date_value
-
-    clean = raw_date_value.replace("{{", "").replace("}}", "").replace("_", " ").strip()
-    clean += 's' if not clean.endswith('s') else clean
-
-    try:
-        arw = arrow.utcnow()
-        parsed_date = arw.dehumanize(clean).utcnow().format('YYYY-MM-DD')
-        return parsed_date
-
-    except arrow.parser.ParserError as e:
-        logger.error(f"[parse_date_value] Failed to parse date: '{clean}'\nParserError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except ValueError as e:
-        logger.error(f"[parse_date_value] Invalid date value: '{clean}'\nValueError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except Exception as e:
-        logger.error(f"[parse_date_value] Unexpected error.\nException: {str(e)}", exc_info=True)
-        return default_date_value
-
-
 @MonitoringAspect.monitor(
     name="average_calc",
     category=MetricType.SCORING,
@@ -157,45 +129,35 @@ def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Di
 
 
 @MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
-def summarize_verdicts(verdicts: List[str], judge: str, max_bullets: int = 5) -> List[str]:
-    """
-    Summarize the justifications for each judge.
-
-    Args:
-        verdicts (List[str]): A list of justifications.
-        judge (str): The judge or evaluator (provider) name for context.
-        max_bullets (int): The maximum number of bullets allowed per judge.
-
-    Returns:
-        List[str]: The summarized justifications.
-    """
-    if not verdicts:
-        return []
-
-    prompt = f"""
-    You are reviewing evaluation justifications from LL judges about replies generated by a virtual leasing agent.\n
-    Each justification contains the judge's assessment of how well the agent's response matched the expected reply.\n
-    Your task is to identify and summarize only the **negative points**, such as errors, misunderstandings,
-    missing information, or failure to meet expectations.\n
-    Return up to {max_bullets} bullet points. Be concise and start each point with '- '\n\n
-    ---
-    - Judge: {judge}
-    - Justifications:\n{chr(10).join(verdicts)}\n
-    """
-
-    client = OpenAI()
+def summarize_verdicts(
+        verdicts: List[str],
+        judge: str,
+        max_bullets: int = 5
+) -> List[str]:
+    client_registry = ClientRegistry()
+    client = client_registry.get(provider=judge)
 
     try:
-        result = client.chat.completions.create(
-            model="gpt-4o-mini",
-            temperature=0,
-            messages=[{"role": "user", "content": prompt}]
-        ).choices[0].message.content
+        verdicts = chr(10).join(verdicts)
+        prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=verdicts)
+        response = client.call(message=prompt)
+        parsed = client.parse_response(response=response)
+        striped = parsed.get("output", "").strip("")
+        bullet_points = [point.strip() for point in striped.split("- ") if point.strip()]
 
-        bullet_points = [point.strip() for point in result.split('- ') if point.strip()]
-
-        return bullet_points
+        return bullet_points[:max_bullets]
 
     except Exception as e:
         logger.error(f"[summarize_justifications] Error during summarization: {str(e)}", exc_info=True)
         return []
+
+
+# if __name__ == '__main__':
+#     template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
+#     response_dict = {
+#         'agent_reply': "I'd be happy to help you book something for 10 AM.",
+#         'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
#     }
#
#     result = extract_interaction_details(response_dict, template)
#     print(f"result: {result.model_dump()}")
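
Note: extract_interaction_details now maps the raw response through a ${placeholder} template with string.Template.safe_substitute and parses generated_metadata with ast.literal_eval, replacing the old direct model_validate call and the removed arrow-based date parsing. A standalone, standard-library-only illustration of that substitution step (not part of the diff):

import ast
from string import Template

template = {"generated_reply": "${agent_reply}", "generated_metadata": "${generated_metadata}"}
response_dict = {"agent_reply": "Sure, 10 AM works.", "generated_metadata": "{'time': '10 AM'}"}

# Substitute each ${placeholder} from the response payload.
output = {k: Template(v).safe_substitute(response_dict) for k, v in template.items()}

# generated_metadata arrives as a string repr of a dict; turn it back into a dict.
raw = output.get("generated_metadata", {})
output["generated_metadata"] = ast.literal_eval(raw) if isinstance(raw, str) else raw

print(output)  # {'generated_reply': 'Sure, 10 AM works.', 'generated_metadata': {'time': '10 AM'}}
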
@@ -1,5 +1,6 @@
 from . import registration
-from .schemas import WorkflowType
+from .base import BaseWorkflow
+from .config import WorkflowConfig
 from .factory import MainFactory
 
-__all__ = ["WorkflowType", "MainFactory"]
+__all__ = ["BaseWorkflow", "WorkflowConfig", "MainFactory"]