levelapp 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,26 @@
+from enum import Enum
+
+
+class ExtendedEnum(Enum):
+    @classmethod
+    def list(cls):
+        return [e.value for e in cls]
+
+
+class WorkflowType(ExtendedEnum):
+    SIMULATOR = "SIMULATOR"
+    COMPARATOR = "COMPARATOR"
+    ASSESSOR = "ASSESSOR"
+
+
+class RepositoryType(ExtendedEnum):
+    FIRESTORE = "FIRESTORE"
+    FILESYSTEM = "FILESYSTEM"
+
+
+class EvaluatorType(ExtendedEnum):
+    JUDGE = "JUDGE"
+    REFERENCE = "REFERENCE"
+    RAG = "RAG"
+
+
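The new module adds a small ExtendedEnum base whose list() helper is shared by the workflow, repository, and evaluator enums. A minimal sketch (illustration only, not part of the package) of how that helper behaves:

# Illustration only: the list() classmethod returns every member's value.
from enum import Enum


class ExtendedEnum(Enum):
    @classmethod
    def list(cls):
        return [e.value for e in cls]


class WorkflowType(ExtendedEnum):
    SIMULATOR = "SIMULATOR"
    COMPARATOR = "COMPARATOR"
    ASSESSOR = "ASSESSOR"


print(WorkflowType.list())  # ['SIMULATOR', 'COMPARATOR', 'ASSESSOR']

The same list() classmethod works for RepositoryType and EvaluatorType, since they inherit it from ExtendedEnum.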
levelapp/core/session.py CHANGED
@@ -1,27 +1,23 @@
 """levelapp/core/session.py"""
 import threading
 
+from abc import ABC
+
 from dataclasses import dataclass, field
 from typing import Dict, List, Any
 
 from datetime import datetime
 from humanize import precisedelta
 
-from levelapp.workflow import MainFactory
+from levelapp.workflow import MainFactory, WorkflowConfig
 from levelapp.workflow.base import BaseWorkflow
-from levelapp.workflow.schemas import WorkflowConfig, WorkflowContext
-from levelapp.aspects import FunctionMonitor, MetricType, ExecutionMetrics, MonitoringAspect, logger
+from levelapp.aspects import MetricType, ExecutionMetrics, MonitoringAspect, logger
+from levelapp.workflow.context import WorkflowContextBuilder
 
 
-@dataclass
-class SessionMetadata:
-    """Metadata for an evaluation session."""
-    session_name: str
-    started_at: datetime | None = None
-    ended_at: datetime | None = None
-    total_executions: int = 0
-    total_duration: float = 0.0
-    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+class TemporalStatusMixin(ABC):
+    started_at: datetime | None
+    ended_at: datetime | None
 
     @property
     def is_active(self) -> bool:
@@ -37,7 +33,18 @@ class SessionMetadata:
 
 
 @dataclass
-class StepMetadata:
+class SessionMetadata(TemporalStatusMixin):
+    """Metadata for an evaluation session."""
+    session_name: str
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+    total_executions: int = 0
+    total_duration: float = 0.0
+    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+
+
+@dataclass
+class StepMetadata(TemporalStatusMixin):
     """Metadata for a specific step within an evaluation session."""
     step_name: str
     session_name: str
@@ -47,27 +54,21 @@ class StepMetadata:
     error_count: int = 0
     procedures_stats: List[ExecutionMetrics] | None = None
 
-    @property
-    def is_active(self) -> bool:
-        """Check if the step is currently active."""
-        return self.ended_at is None
-
-    @property
-    def duration(self) -> float | None:
-        """Calculate the duration of the step in seconds."""
-        if not self.is_active:
-            return (self.ended_at - self.started_at).total_seconds()
-        return None
-
 
 class StepContext:
     """Context manager for an evaluation step within an EvaluationSession."""
-    def __init__(self, session: "EvaluationSession", step_name: str, category: MetricType):
+    def __init__(
+        self,
+        session: "EvaluationSession",
+        step_name: str,
+        category: MetricType,
+    ):
         self.session = session
         self.step_name = step_name
         self.category = category
+
         self.step_meta: StepMetadata | None = None
-        self.full_step_name = f"{session.session_name}.{step_name}"
+        self.full_step_name = f"<{session.session_name}:{step_name}>"
         self._monitored_func = None
         self._func_gen = None
 
@@ -80,35 +81,49 @@ class StepContext:
         )
         self.session.session_metadata.steps[self.step_name] = self.step_meta
 
-        # Wrap with FunctionMonitor
-        self._monitored_func = self.session.monitor.monitor(
-            name=self.full_step_name,
-            category=self.category,
-            enable_timing=True,
-            track_memory=True,
-        )(self._step_wrapper)
-
-        # Start monitoring
-        self._func_gen = self._monitored_func()
-        next(self._func_gen)  # Enter monitoring
+        if self.session.enable_monitoring:
+            # Wrap with FunctionMonitor
+            self._monitored_func = self.session.monitor.monitor(
+                name=self.full_step_name,
+                category=self.category,
+                enable_timing=True,
+                track_memory=True,
+            )(self._step_wrapper)
+
+            # Start monitoring
+            try:
+                self._func_gen = self._monitored_func()
+                next(self._func_gen)  # Enter monitoring
+            except Exception as e:
+                logger.error(f"[StepContext] Failed to initialize monitoring for {self.full_step_name}:\n{e}")
+                raise
+
         return self  # returning self allows nested instrumentation
 
+    # noinspection PyMethodMayBeStatic
     def _step_wrapper(self):
         yield  # Actual user step execution happens here
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        try:
-            next(self._func_gen)  # Exit monitoring
-        except StopIteration:
-            pass
+        if self.session.enable_monitoring:
+            try:
+                next(self._func_gen)  # Exit monitoring
+            except StopIteration:
+                pass
 
         with self.session.lock:
            self.step_meta.ended_at = datetime.now()
+
            if exc_type:
                self.step_meta.error_count += 1
+
            self.session.session_metadata.total_executions += 1
+
            if self.step_meta.duration:
-                self.session.monitor.update_procedure_duration(name=self.full_step_name, value=self.step_meta.duration)
+                self.session.monitor.update_procedure_duration(
+                    name=self.full_step_name,
+                    value=self.step_meta.duration
+                )
                self.session.session_metadata.total_duration += self.step_meta.duration
 
        return False
@@ -119,29 +134,30 @@ class EvaluationSession:
     def __init__(
         self,
         session_name: str = "test-session",
-        monitor: FunctionMonitor | None = None,
-        workflow_config: WorkflowConfig | None = None
+        workflow_config: WorkflowConfig | None = None,
+        enable_monitoring: bool = True,
     ):
         """
         Initialize Evaluation Session.
 
         Args:
             session_name (str): Name of the session
-            monitor (FunctionMonitor): Function monitoring aspect
             workflow_config (WorkflowConfig): Workflow configuration.
         """
         self._NAME = self.__class__.__name__
 
         self.session_name = session_name
-        self.monitor = monitor or MonitoringAspect
         self.workflow_config = workflow_config
-        self.workflow_type = workflow_config.workflow
+        self.enable_monitoring = enable_monitoring
 
         self.workflow: BaseWorkflow | None = None
 
         self.session_metadata = SessionMetadata(session_name=session_name)
+        self.monitor = MonitoringAspect if enable_monitoring else None
         self._lock = threading.RLock()
 
+        logger.info("[EvaluationSession] Evaluation session initialized.")
+
     @property
     def lock(self):
         return self._lock
@@ -154,14 +170,10 @@ class EvaluationSession:
         if not self.workflow_config:
             raise ValueError(f"{self._NAME}: Workflow configuration must be provided")
 
-        context = WorkflowContext(
-            config=self.workflow_config,
-            repository=MainFactory.create_repository(self.workflow_config),
-            evaluators=MainFactory.create_evaluator(self.workflow_config),
-            endpoint_config=self.workflow_config.endpoint_config,
-            inputs=self.workflow_config.inputs
-        )
-        self.workflow = MainFactory.create_workflow(self.workflow_type, context)
+        context_builder = WorkflowContextBuilder(self.workflow_config)
+        context = context_builder.build()
+
+        self.workflow = MainFactory.create_workflow(context=context)
 
         logger.info(
             f"[{self._NAME}] Starting evaluation session: {self.session_name}, "
levelapp/core/evaluator.py CHANGED
@@ -1,7 +1,6 @@
 """levelapp/core/evaluator.py"""
 from functools import lru_cache
-from typing import List, Dict, Any
-from collections import defaultdict
+from typing import List, Dict, Any, TYPE_CHECKING
 from pydantic import BaseModel, Field
 
 from tenacity import (
@@ -19,6 +18,9 @@ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
 from levelapp.core.base import BaseEvaluator, BaseChatClient
 from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader
 
+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+
 
 class Evidence(BaseModel):
     """Evidence details for evaluation."""
@@ -69,19 +71,29 @@ class JudgeEvaluationResults(BaseModel):
 
 
 class JudgeEvaluator(BaseEvaluator):
-    def __init__(self):
+    def __init__(self, config: "WorkflowConfig | None" = None):
+        if config:
+            self.config = config
+            self.providers = config.evaluation.providers
+
         self.prompt_template = EVAL_PROMPT_TEMPLATE
-        self.clients = defaultdict(BaseChatClient)
+        self.client_registry = ClientRegistry
 
-    def register_client(self, provider: str, client: BaseChatClient) -> None:
+    def select_client(self, provider: str) -> BaseChatClient:
         """
-        Register LLM clients used for the evaluation.
+        Select an LLM client to use for the evaluation.
 
         Args:
             provider (str): The provider name.
-            client (BaseChatClient): The LLM client to register.
+
+        Returns:
+            client (BaseChatClient): The LLM client to use for the evaluation.
         """
-        self.clients[provider] = client
+        if provider not in self.client_registry.list_providers():
+            logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
+            return self.client_registry.get(provider="openai")
+
+        return self.client_registry.get(provider=provider)
 
     @lru_cache(maxsize=1024)
     def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
@@ -135,7 +147,7 @@ class JudgeEvaluator(BaseEvaluator):
             generated_text=generated_data,
             reference_text=reference_data
         )
-        client = ClientRegistry.get(provider=provider)
+        client = self.select_client(provider=provider)
 
         try:
             response = client.call(message=prompt)
@@ -183,7 +195,7 @@
             generated_text=generated_data,
             reference_text=reference_data
         )
-        client = ClientRegistry.get(provider=provider)
+        client = self.select_client(provider=provider)
 
         try:
             async for attempt in AsyncRetrying(
@@ -212,7 +224,11 @@
 
 
 class MetadataEvaluator(BaseEvaluator):
-    def __init__(self):
+    def __init__(self, config: "WorkflowConfig | None"= None):
+        if config:
+            self.config = config
+            self.metics_map = config.evaluation.metrics_map
+
         self.data_loader = DataLoader()
         self.comparator = MetadataComparator()
         self.metrics_manager = MetricsManager()
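JudgeEvaluator now resolves clients through the shared ClientRegistry instead of a per-instance dict, and select_client() falls back to "openai" for unknown providers. The registry itself is not part of this diff, so the sketch below uses a hypothetical stand-in purely to illustrate the fallback logic:

# Hypothetical stand-in for ClientRegistry (not the package's actual class),
# showing the select_client() fallback behaviour added above.
class FakeClientRegistry:
    _clients = {"openai": "openai-client", "ionos": "ionos-client"}

    @classmethod
    def list_providers(cls):
        return list(cls._clients)

    @classmethod
    def get(cls, provider):
        return cls._clients[provider]


def select_client(provider: str):
    # Mirrors JudgeEvaluator.select_client: unregistered providers fall back to "openai".
    if provider not in FakeClientRegistry.list_providers():
        print(f"{provider} is not registered, defaulting to 'openai'")
        return FakeClientRegistry.get(provider="openai")
    return FakeClientRegistry.get(provider=provider)


print(select_client("ionos"))      # ionos-client
print(select_client("anthropic"))  # warns, then returns openai-client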
levelapp/repository/firestore.py CHANGED
@@ -1,7 +1,5 @@
 """levelapp/repository/firestore.py"""
-import google.auth
-
-from typing import List, Dict, Any, Type
+from typing import List, Dict, Any, Type, TYPE_CHECKING
 from pydantic import ValidationError
 
 from google.cloud import firestore_v1
@@ -13,15 +11,25 @@ from levelapp.core.base import BaseRepository, Model
 from levelapp.aspects import logger
 
 
+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+
+
 class FirestoreRepository(BaseRepository):
     """
     Firestore implementation of BaseRepository.
     (Uses hierarchical path: {user_id}/{collection_id}/{document_id}
     """
 
-    def __init__(self, project_id: str | Any = None, database_name: str | Any = '(default)'):
-        self.project_id = project_id
-        self.database_name = database_name
+    def __init__(self, config: "WorkflowConfig | None"):
+        if config:
+            self.config = config
+            self.project_id: str | Any = config.repository.project_id
+            self.database_name: str | Any = config.repository.database_name
+        else:
+            self.project_id: str | Any = None
+            self.database_name: str | Any = '(default)'
+
         self.client: firestore_v1.Client | None = None
 
     def connect(self) -> None:
@@ -29,6 +37,7 @@ class FirestoreRepository(BaseRepository):
         Connects to Firestore, prioritizing the project ID passed to the constructor.
         """
         try:
+            import google.auth
             credentials, default_project_id = google.auth.default()
 
             if not credentials:
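FirestoreRepository now reads its connection settings from the workflow configuration, and the google.auth import moves inside connect(), so the credentials machinery is only needed when a connection is actually made. A rough sketch of the settings resolution, using hypothetical config classes since WorkflowConfig's repository section is not shown in this diff:

# Hypothetical config shapes, for illustration only.
from dataclasses import dataclass


@dataclass
class RepositorySettings:
    project_id: str | None = None
    database_name: str = "(default)"


@dataclass
class FakeWorkflowConfig:
    repository: RepositorySettings


def resolve_firestore_settings(config: "FakeWorkflowConfig | None"):
    # Mirrors FirestoreRepository.__init__: config wins, otherwise fall back
    # to no project and the '(default)' database.
    if config:
        return config.repository.project_id, config.repository.database_name
    return None, "(default)"


print(resolve_firestore_settings(None))
print(resolve_firestore_settings(FakeWorkflowConfig(RepositorySettings(project_id="my-gcp-project"))))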
@@ -8,7 +8,7 @@ from enum import Enum
 from uuid import UUID, uuid4
 from datetime import datetime
 
-from typing import Optional, Dict, Any, List
+from typing import Dict, Any, List
 from pydantic import BaseModel, Field, computed_field
 
 from levelapp.evaluator.evaluator import JudgeEvaluationResults
@@ -25,11 +25,11 @@ class Interaction(BaseModel):
     """Represents a single interaction within a conversation."""
     id: UUID = Field(default_factory=uuid4, description="Interaction identifier")
     user_message: str = Field(..., description="The user's query message")
-    generated_reply: str = Field(..., description="The agent's reply message")
+    # generated_reply: str = Field(..., description="The agent's reply message")
     reference_reply: str = Field(..., description="The preset reference message")
-    interaction_type: InteractionLevel = Field(..., description="Type of interaction")
-    reference_metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Expected metadata")
-    generated_metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Extracted metadata")
+    interaction_type: InteractionLevel = Field(default=InteractionLevel.INITIAL, description="Type of interaction")
+    reference_metadata: Dict[str, Any] = Field(default_factory=dict, description="Expected metadata")
+    # generated_metadata: Dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
     guardrail_flag: bool = Field(default=False, description="Flag for guardrail signaling")
     request_payload: Dict[str, Any] = Field(default_factory=dict, description="Additional request payload")
 
@@ -38,7 +38,7 @@ class ConversationScript(BaseModel):
     """Represents a basic conversation with multiple interactions."""
     id: UUID = Field(default_factory=uuid4, description="Conversation identifier")
     interactions: List[Interaction] = Field(default_factory=list, description="List of interactions")
-    description: str = Field(..., description="A short description of the conversation")
+    description: str = Field(default="no-description", description="A short description of the conversation")
     details: Dict[str, str] = Field(default_factory=dict, description="Conversation details")
 
 
@@ -58,8 +58,8 @@ class InteractionResults(BaseModel):
 
 class InteractionEvaluationResults(BaseModel):
     """Model representing the evaluation result of an interaction."""
-    judge_evaluations: Dict[str, JudgeEvaluationResults] = Field(default_factory=dict)
-    metadata_evaluation: Dict[str, float] = Field(default_factory=dict)
+    judge_evaluations: Dict[str, JudgeEvaluationResults] | None = Field(default_factory=dict)
+    metadata_evaluation: Dict[str, float] | None = Field(default_factory=dict)
     guardrail_flag: int = Field(default=0)
 
 
@@ -86,4 +86,4 @@ class TestResults(BaseModel):
     ionos_model_name: str = Field(..., alias="ionosModelName")
     test_name: str = Field(..., alias="testName")
     test_type: str = Field(..., alias="testType")
-    batch_details: Optional[SimulationResults] = Field(..., alias="results")
+    batch_details: SimulationResults | None = Field(..., alias="results")
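The loosened defaults matter because the reworked simulator (the following diff) now constructs an empty InteractionEvaluationResults and fills it field by field, so every field needs a usable default. A simplified stand-in, with the judge results reduced to plain strings for illustration only:

# Simplified stand-in for the pydantic model above, illustration only.
from typing import Dict
from pydantic import BaseModel, Field


class InteractionEvaluationResults(BaseModel):
    judge_evaluations: Dict[str, str] | None = Field(default_factory=dict)
    metadata_evaluation: Dict[str, float] | None = Field(default_factory=dict)
    guardrail_flag: int = Field(default=0)


results = InteractionEvaluationResults()          # valid with no arguments now
results.judge_evaluations["openai"] = "verdict"   # filled in later, per provider
results.guardrail_flag = 1
print(results.model_dump())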
@@ -23,7 +23,7 @@ from levelapp.simulator.utils import (
     summarize_verdicts,
 )
 from levelapp.aspects import logger
-from levelapp.workflow.schemas import EvaluatorType
+from levelapp.core.schemas import EvaluatorType
 
 
 class ConversationSimulator(BaseProcess):
@@ -33,6 +33,7 @@ class ConversationSimulator(BaseProcess):
         self,
         repository: BaseRepository | None = None,
         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+        providers: List[str] | None = None,
         endpoint_config: EndpointConfig | None = None,
     ):
         """
@@ -47,6 +48,7 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
         self.endpoint_config = endpoint_config
 
         self._url: str | None = None
@@ -60,7 +62,8 @@ class ConversationSimulator(BaseProcess):
     def setup(
         self,
         repository: BaseRepository,
-        evaluators: Dict[str, BaseEvaluator],
+        evaluators: Dict[EvaluatorType, BaseEvaluator],
+        providers: List[str],
         endpoint_config: EndpointConfig,
     ) -> None:
         """
@@ -69,6 +72,7 @@ class ConversationSimulator(BaseProcess):
         Args:
             repository (BaseRepository): Repository object for storing simulation results.
             evaluators (Dict[str, BaseEvaluator]): List of evaluator objects for evaluating interactions.
+            providers (List[str]): List of LLM provider names.
             endpoint_config (EndpointConfig): Configuration object for VLA.
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
@@ -76,6 +80,11 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
+
+        if not self.providers:
+            logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
         self.endpoint_config = endpoint_config
 
         self._url = endpoint_config.full_url
@@ -87,6 +96,7 @@ class ConversationSimulator(BaseProcess):
 
         if name not in self.evaluators:
             raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
         return self.evaluators[name]
 
     async def run(
@@ -368,50 +378,106 @@ class ConversationSimulator(BaseProcess):
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
 
-        judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
-        metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+        judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+        metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)
 
-        if not judge_evaluator:
-            raise ValueError(f"{_LOG} No Judge Evaluator found.")
+        evaluation_results = InteractionEvaluationResults()
 
-        openai_eval_task = judge_evaluator.async_evaluate(
-            generated_data=generated_reply,
-            reference_data=reference_reply,
-            user_input=user_input,
-            provider="openai"
-        )
+        if judge_evaluator and self.providers:
+            await self._judge_evaluation(
+                user_input=user_input,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                providers=self.providers,
+                judge_evaluator=judge_evaluator,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Judge evaluation skipped (no evaluator or no providers).")
 
-        ionos_eval_task = judge_evaluator.async_evaluate(
-            provider="ionos",
-            user_input=user_input,
-            generated_data=generated_reply,
-            reference_data=reference_reply,
-        )
+        if metadata_evaluator and reference_metadata:
+            self._metadata_evaluation(
+                metadata_evaluator=metadata_evaluator,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Metadata evaluation skipped (no evaluator or no reference metadata).")
 
-        openai_judge_evaluation, ionos_judge_evaluation = await asyncio.gather(
-            openai_eval_task, ionos_eval_task
-        )
+        evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
 
-        if not metadata_evaluator:
-            raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+        return evaluation_results
 
-        metadata_evaluation = {}
-        if reference_metadata:
-            metadata_evaluation = metadata_evaluator.evaluate(
-                generated_data=generated_metadata,
-                reference_data=reference_metadata,
+    async def _judge_evaluation(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        providers: List[str],
+        judge_evaluator: BaseEvaluator,
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run LLM-as-a-judge evaluation using multiple providers (async).
+
+        Args:
+            user_input (str): The user input message.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            providers (List[str]): List of judge provider names.
+            judge_evaluator (BaseEvaluator): Evaluator instance.
+            evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+        Returns:
+            None
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+        tasks = {
+            provider: judge_evaluator.async_evaluate(
+                generated_data=generated_reply,
+                reference_data=reference_reply,
+                user_input=user_input,
+                provider=provider,
             )
+            for provider in providers
+        }
 
-        guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
+        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
 
-        return InteractionEvaluationResults(
-            judge_evaluations={
-                openai_judge_evaluation.provider: openai_judge_evaluation,
-                ionos_judge_evaluation.provider: ionos_judge_evaluation
-            },
-            metadata_evaluation=metadata_evaluation,
-            guardrail_flag=guardrail_flag,
-        )
+        for provider, result in zip(tasks.keys(), results):
+            if isinstance(result, Exception):
+                logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                continue
+
+            evaluation_results.judge_evaluations[provider] = result
+
+    def _metadata_evaluation(
+        self,
+        metadata_evaluator: BaseEvaluator,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run metadata evaluation using the provided evaluator.
+
+        Args:
+            metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            evaluation_results (InteractionEvaluationResults): Results container.
        """
+        _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+        try:
+            evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
+            )
+        except Exception as e:
+            logger.error(f"[{_LOG}] Metadata evaluation failed:\n{e}", exc_info=e)
 
     @staticmethod
     def store_evaluation_results(
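The heart of the simulator rework is the provider-keyed fan-out in _judge_evaluation: one task per configured provider, gathered with return_exceptions=True so a single failing provider no longer aborts the whole interaction evaluation (previously "openai" and "ionos" were hard-coded). A self-contained sketch of the same pattern with stub evaluators, for illustration only:

# Stub illustration of the fan-out used by _judge_evaluation above.
import asyncio


async def fake_judge(provider: str) -> str:
    if provider == "broken":
        raise RuntimeError("provider unavailable")
    return f"verdict from {provider}"


async def main() -> None:
    providers = ["openai", "ionos", "broken"]
    # One coroutine per provider, keyed so results can be matched back.
    tasks = {p: fake_judge(p) for p in providers}
    results = await asyncio.gather(*tasks.values(), return_exceptions=True)

    judge_evaluations = {}
    for provider, result in zip(tasks.keys(), results):
        if isinstance(result, Exception):
            print(f"{provider} failed: {result}")  # logged, not fatal
            continue
        judge_evaluations[provider] = result

    print(judge_evaluations)


asyncio.run(main())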
@@ -1,5 +1,6 @@
 from . import registration
-from .schemas import WorkflowType
+from .base import BaseWorkflow
+from .config import WorkflowConfig
 from .factory import MainFactory
 
-__all__ = ["WorkflowType", "MainFactory"]
+__all__ = ["BaseWorkflow", "WorkflowConfig", "MainFactory"]