levelapp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of levelapp might be problematic.

@@ -343,6 +343,7 @@ class FunctionMonitor:
  category: MetricType,
  enable_timing: bool,
  track_memory: bool,
+ verbose=False
  ) -> Callable[P, T]:
  """
  Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
  name: Unique identifier for the function
  enable_timing: Enable execution time logging
  track_memory: Enable memory tracking
+ verbose: Enable verbose logging

  Returns:
  Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:

  self._aggregated_stats[name].update(metrics=metrics)

- if enable_timing and metrics.duration is not None:
+ if verbose and enable_timing and metrics.duration is not None:
  log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
  if metrics.cache_hit:
  log_message += " (cache hit)"
@@ -44,7 +44,6 @@ class ClientRegistry:

  cls._wrap_client_methods(client_class)
  cls._clients[provider] = client_class
- logger.info(f"[ClientRegistry] Registered client for provider: {provider}")

  @classmethod
  def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None:
@@ -78,7 +78,6 @@ class MetricsManager:
  ValueError: if the scorer is not a callable.
  """
  self._scorers[name] = scorer
- logger.info(f"[MetricsManager] Registered scorer: {name}")

  def get_scorer(self, name: str) -> Callable:
  """
@@ -95,7 +94,6 @@ class MetricsManager:
  """
  try:
  scorer = self._scorers.get(name)
- logger.info(f"[get_scorer] Retrieved scorer: {name}")
  return scorer

  except KeyError:
@@ -29,7 +29,6 @@ class EndpointConfig(BaseModel):
  bearer_token (SecretStr): The Bearer token to use.
  model_id (str): The model to use (if applicable).
  default_request_payload_template (Dict[str, Any]): The payload template to use.
- generated_request_payload_template (Dict[str, Any]): The generated payload template from a provided file.
  variables (Dict[str, Any]): The variables to populate the payload template.

  Note:
@@ -40,11 +39,10 @@ class EndpointConfig(BaseModel):
  - bearer_token (SecretStr): The Bearer token to use.
  - model_id (str): The model to use (if applicable).
  - default_payload_template (Dict[str, Any]): The payload template to use.
- - generated_payload_template (Dict[str, Any]): The generated payload template from a provided file.
  - variables (Dict[str, Any]): The variables to populate the payload template.

  Or manually configure the model instance by assigning the proper values to the model fields.\n
- You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH)
+ You can also provide the path in the .env file for the payload template (ENDPOINT_PAYLOAD_PATH/)
  and the response template (ENDPOINT_RESPONSE_PATH) separately. The files can be either YAML or JSON only.
  """
  load_dotenv()
@@ -61,9 +59,7 @@ class EndpointConfig(BaseModel):

  # Data
  default_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
- generated_request_payload_template: Dict[str, Any] = Field(default_factory=dict)
  default_response_payload_template: Dict[str, Any] = Field(default_factory=dict)
- generated_response_payload_template: Dict[str, Any] = Field(default_factory=dict)

  # Variables
  variables: Dict[str, Any] = Field(default_factory=dict)
@@ -88,14 +84,18 @@ class EndpointConfig(BaseModel):
  @computed_field
  @property
  def request_payload(self) -> Dict[str, Any]:
- """Return fully prepared payload depending on template or full payload."""
- # First, load the request payload template (either from YAML config file or from specific template)
+ """
+ Return fully prepared payload depending on template or full payload.
+
+ Returns:
+ request payload (Dict[str, Any]): Populated request payload template.
+ """
+ # First, we check if we have variables to populate the template with. If not, we return the template as is.
  if not self.variables:
  return self.default_request_payload_template

  if not self.default_request_payload_template:
- self.load_template(template_type=TemplateType.REQUEST)
- base_template = self.generated_request_payload_template
+ base_template = self.load_template(template_type=TemplateType.REQUEST)
  else:
  base_template = self.default_request_payload_template

@@ -118,8 +118,7 @@ class EndpointConfig(BaseModel):
  return self.default_response_payload_template

  if not self.default_response_payload_template:
- self.load_template(template_type=TemplateType.RESPONSE)
- base_template = self.generated_response_payload_template
+ base_template = self.load_template(template_type=TemplateType.RESPONSE)
  else:
  base_template = self.default_response_payload_template

@@ -148,12 +147,23 @@ class EndpointConfig(BaseModel):

  return _replace(obj)

+ @staticmethod
  def load_template(
- self,
  template_type: TemplateType = TemplateType.REQUEST,
  path: str | None = None
  ) -> Dict[str, Any]:
+ """
+ Load request/response payload template from JSON/YAML file.
+
+ Args:
+ template_type (TemplateType): The type of template to load (REQUEST or RESPONSE).
+ path (str): The path of the payload template file to load.
+
+ Returns:
+ Payload template (Dict[str, Any]): Payload template.
+ """
  try:
+ # If no path was provided, we check the env. variables.
  if not path:
  env_var = "ENDPOINT_PAYLOAD_PATH" if template_type == TemplateType.REQUEST else "ENDPOINT_RESPONSE_PATH"
  path = os.getenv(env_var, '')
@@ -171,7 +181,6 @@ class EndpointConfig(BaseModel):
  else:
  raise ValueError("[EndpointConfig] Unsupported file format.")

- self.generated_request_payload_template = data
  return data

  except FileNotFoundError as e:
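With the generated_*_payload_template fields gone, load_template is now a @staticmethod that simply returns the parsed dict, and the request_payload / response_payload properties consume that return value directly instead of reading a field mutated as a side effect. A rough sketch of the new flow; the variables value and the template files are assumptions:

    # Assumes ENDPOINT_PAYLOAD_PATH / ENDPOINT_RESPONSE_PATH point at JSON or YAML files.
    template = EndpointConfig.load_template(template_type=TemplateType.REQUEST)  # plain dict, no state stored on the model
    config = EndpointConfig(variables={"user_id": "123"})
    payload = config.request_payload  # loads the template itself when no default template is set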
@@ -0,0 +1,62 @@
+ from abc import ABC
+ from enum import Enum
+ from typing import Any, List
+
+ from pydantic import BaseModel, Field
+
+
+ class HttpMethod(str, Enum):
+ GET = "GET"
+ POST = "POST"
+ PUT = "PUT"
+ Patch = "PATCH"
+ DELETE = "DELETE"
+
+
+ class HeaderConfig(BaseModel):
+ """Secure header configuration with environment variables support."""
+ name: str
+ value: str
+ secure: bool = False
+
+ class Config:
+ frozen = True
+
+
+ class RequestSchemaConfig(BaseModel):
+ """Schema definition for request payload population."""
+ field_path: str # JSON path-like: "data.user.id"
+ value: Any
+ value_type: str = "static" # static, env, dynamic
+ required: bool = True
+
+
+ class ResponseMappingConfig(BaseModel):
+ """Response data extraction mapping."""
+ field_path: str # JSON path-like: "data.results[0].id"
+ extract_as: str # Name to extract as
+ default: Any = None
+
+
+ class EndpointConfig(BaseModel):
+ """Complete endpoint configuration."""
+ name: str
+ base_url: str
+ path: str
+ method: HttpMethod
+ headers: List[HeaderConfig] = Field(default_factory=list)
+ request_schema: List[RequestSchemaConfig] = Field(default_factory=list)
+ response_mapping: List[ResponseMappingConfig] = Field(default_factory=list)
+ timeout: int = 30
+ retry_count: int = 3
+ retry_backoff: float = 1.0
+
+ @classmethod
+ def validate_path(cls, v: str) -> str:
+ if not v.startswith('/'):
+ return f'/{v}'
+ return v
+
+
+ class PayloadBuilder(ABC):
+ """Abstract base for payload construction strategies."""
@@ -33,3 +33,25 @@ Return ONLY a single JSON object on one line with exactly these keys:

  Do NOT include any additional text, explanations, or formatting (e.g., "JSON object:", ```json or ```, or markdown).
  """
+
+
+ SUMMARIZATION_PROMPT_TEMPLATE = """
+ You are reviewing evaluation justifications from LLM judges about replies generated by a virtual assistant.
+ Interpret the context from the verdicts: (e.g., real-estate leasing, medical appointment scheduling, etc.).
+
+ Each justification contains the judge's assessment of how well the assistant's response matched the expected reply.
+ Your task is to **identify and summarize only the negative points**, such as:
+ - Errors or inaccuracies
+ - Misunderstandings or misinterpretations
+ - Missing or incomplete information
+ - Failure to meet expectations or requirements
+
+ **Instructions:**
+ - Return up to {max_bullets} concise bullet points.
+ - Start each point with "- " and focus on clarity and relevance.
+ - Avoid redundancy and prioritize actionable feedback.
+
+ ---
+ - Judge: {judge}
+ - Verdicts: {verdicts}
+ """
@@ -0,0 +1,24 @@
+ from enum import Enum
+
+
+ class ExtendedEnum(Enum):
+ @classmethod
+ def list(cls):
+ return [e.value for e in cls]
+
+
+ class WorkflowType(ExtendedEnum):
+ SIMULATOR = "SIMULATOR"
+ COMPARATOR = "COMPARATOR"
+ ASSESSOR = "ASSESSOR"
+
+
+ class RepositoryType(ExtendedEnum):
+ FIRESTORE = "FIRESTORE"
+ FILESYSTEM = "FILESYSTEM"
+
+
+ class EvaluatorType(ExtendedEnum):
+ JUDGE = "JUDGE"
+ REFERENCE = "REFERENCE"
+ RAG = "RAG"
levelapp/core/session.py CHANGED
@@ -1,27 +1,23 @@
  """levelapp/core/session.py"""
  import threading

+ from abc import ABC
+
  from dataclasses import dataclass, field
  from typing import Dict, List, Any

  from datetime import datetime
  from humanize import precisedelta

- from levelapp.workflow import MainFactory
+ from levelapp.workflow import MainFactory, WorkflowConfig
  from levelapp.workflow.base import BaseWorkflow
- from levelapp.workflow.schemas import WorkflowConfig, WorkflowContext
- from levelapp.aspects import FunctionMonitor, MetricType, ExecutionMetrics, MonitoringAspect, logger
+ from levelapp.aspects import MetricType, ExecutionMetrics, MonitoringAspect, logger
+ from levelapp.workflow.context import WorkflowContextBuilder


- @dataclass
- class SessionMetadata:
- """Metadata for an evaluation session."""
- session_name: str
- started_at: datetime | None = None
- ended_at: datetime | None = None
- total_executions: int = 0
- total_duration: float = 0.0
- steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+ class TemporalStatusMixin(ABC):
+ started_at: datetime | None
+ ended_at: datetime | None

  @property
  def is_active(self) -> bool:
@@ -37,7 +33,18 @@ class SessionMetadata:


  @dataclass
- class StepMetadata:
+ class SessionMetadata(TemporalStatusMixin):
+ """Metadata for an evaluation session."""
+ session_name: str
+ started_at: datetime | None = None
+ ended_at: datetime | None = None
+ total_executions: int = 0
+ total_duration: float = 0.0
+ steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+
+
+ @dataclass
+ class StepMetadata(TemporalStatusMixin):
  """Metadata for a specific step within an evaluation session."""
  step_name: str
  session_name: str
@@ -47,27 +54,29 @@ class StepMetadata:
  error_count: int = 0
  procedures_stats: List[ExecutionMetrics] | None = None

- @property
- def is_active(self) -> bool:
- """Check if the step is currently active."""
- return self.ended_at is None
-
- @property
- def duration(self) -> float | None:
- """Calculate the duration of the step in seconds."""
- if not self.is_active:
- return (self.ended_at - self.started_at).total_seconds()
- return None
-

  class StepContext:
  """Context manager for an evaluation step within an EvaluationSession."""
- def __init__(self, session: "EvaluationSession", step_name: str, category: MetricType):
+ def __init__(
+ self,
+ session: "EvaluationSession",
+ step_name: str,
+ category: MetricType,
+ ):
+ """
+ Initialize StepContext.
+
+ Args:
+ session (EvaluationSession): Evaluation session.
+ step_name (str): Step name.
+ category (MetricType): Metric type.
+ """
  self.session = session
  self.step_name = step_name
  self.category = category
+
  self.step_meta: StepMetadata | None = None
- self.full_step_name = f"{session.session_name}.{step_name}"
+ self.full_step_name = f"<{session.session_name}:{step_name}>"
  self._monitored_func = None
  self._func_gen = None

@@ -80,35 +89,50 @@ class StepContext:
  )
  self.session.session_metadata.steps[self.step_name] = self.step_meta

- # Wrap with FunctionMonitor
- self._monitored_func = self.session.monitor.monitor(
- name=self.full_step_name,
- category=self.category,
- enable_timing=True,
- track_memory=True,
- )(self._step_wrapper)
-
- # Start monitoring
- self._func_gen = self._monitored_func()
- next(self._func_gen) # Enter monitoring
+ if self.session.enable_monitoring:
+ # Wrap with FunctionMonitor
+ self._monitored_func = self.session.monitor.monitor(
+ name=self.full_step_name,
+ category=self.category,
+ enable_timing=True,
+ track_memory=True,
+ verbose=self.session.verbose,
+ )(self._step_wrapper)
+
+ # Start monitoring
+ try:
+ self._func_gen = self._monitored_func()
+ next(self._func_gen) # Enter monitoring
+ except Exception as e:
+ logger.error(f"[StepContext] Failed to initialize monitoring for {self.full_step_name}:\n{e}")
+ raise
+
  return self # returning self allows nested instrumentation

+ # noinspection PyMethodMayBeStatic
  def _step_wrapper(self):
  yield # Actual user step execution happens here

  def __exit__(self, exc_type, exc_val, exc_tb):
- try:
- next(self._func_gen) # Exit monitoring
- except StopIteration:
- pass
+ if self.session.enable_monitoring:
+ try:
+ next(self._func_gen) # Exit monitoring
+ except StopIteration:
+ pass

  with self.session.lock:
  self.step_meta.ended_at = datetime.now()
+
  if exc_type:
  self.step_meta.error_count += 1
+
  self.session.session_metadata.total_executions += 1
- if self.step_meta.duration:
- self.session.monitor.update_procedure_duration(name=self.full_step_name, value=self.step_meta.duration)
+
+ if self.session.enable_monitoring and self.step_meta.duration:
+ self.session.monitor.update_procedure_duration(
+ name=self.full_step_name,
+ value=self.step_meta.duration
+ )
  self.session.session_metadata.total_duration += self.step_meta.duration

  return False
@@ -119,29 +143,34 @@ class EvaluationSession:
  def __init__(
  self,
  session_name: str = "test-session",
- monitor: FunctionMonitor | None = None,
- workflow_config: WorkflowConfig | None = None
+ workflow_config: WorkflowConfig | None = None,
+ enable_monitoring: bool = True,
+ verbose: bool = False
  ):
  """
  Initialize Evaluation Session.

  Args:
  session_name (str): Name of the session
- monitor (FunctionMonitor): Function monitoring aspect
  workflow_config (WorkflowConfig): Workflow configuration.
+ enable_monitoring (bool): Switch monitoring on. Defaults to True.
+ verbose (bool): Verbose mode. Defaults to False.
  """
  self._NAME = self.__class__.__name__

  self.session_name = session_name
- self.monitor = monitor or MonitoringAspect
  self.workflow_config = workflow_config
- self.workflow_type = workflow_config.workflow
+ self.enable_monitoring = enable_monitoring
+ self.verbose = verbose

  self.workflow: BaseWorkflow | None = None

  self.session_metadata = SessionMetadata(session_name=session_name)
+ self.monitor = MonitoringAspect if enable_monitoring else None
  self._lock = threading.RLock()

+ logger.info("[EvaluationSession] Evaluation session initialized.")
+
  @property
  def lock(self):
  return self._lock
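The constructor no longer accepts a FunctionMonitor; the session now picks up MonitoringAspect itself when enable_monitoring is True and threads verbose down to each monitored step. A usage sketch of the new signature; the WorkflowConfig contents and the step body are assumptions:

    from levelapp.workflow import WorkflowConfig
    from levelapp.aspects import MetricType
    from levelapp.core.session import EvaluationSession

    config = WorkflowConfig(...)  # placeholder: fields depend on the workflow being run
    with EvaluationSession(
        session_name="nightly-eval",
        workflow_config=config,
        enable_monitoring=True,  # False skips FunctionMonitor wrapping entirely
        verbose=False,           # True additionally logs per-step timings
    ) as session:
        with session.step("simulate", category=MetricType.CUSTOM):
            ...  # the actual evaluation work for this step goes here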
@@ -154,17 +183,13 @@ class EvaluationSession:
  if not self.workflow_config:
  raise ValueError(f"{self._NAME}: Workflow configuration must be provided")

- context = WorkflowContext(
- config=self.workflow_config,
- repository=MainFactory.create_repository(self.workflow_config),
- evaluators=MainFactory.create_evaluator(self.workflow_config),
- endpoint_config=self.workflow_config.endpoint_config,
- inputs=self.workflow_config.inputs
- )
- self.workflow = MainFactory.create_workflow(self.workflow_type, context)
+ context_builder = WorkflowContextBuilder(self.workflow_config)
+ context = context_builder.build()
+
+ self.workflow = MainFactory.create_workflow(context=context)

  logger.info(
- f"[{self._NAME}] Starting evaluation session: {self.session_name}, "
+ f"[{self._NAME}] Starting evaluation session: {self.session_name} - "
  f"Workflow: '{self.workflow.name}'"
  )
  return self
@@ -178,6 +203,7 @@ class EvaluationSession:

  if exc_type:
  logger.error(f"[{self._NAME}] Session ended with error: {exc_val}", exc_info=True)
+
  return False

  def step(self, step_name: str, category: MetricType = MetricType.CUSTOM) -> StepContext:
@@ -201,6 +227,19 @@ class EvaluationSession:
  self.workflow.collect_results()

  def get_stats(self) -> Dict[str, Any]:
+ if self.enable_monitoring:
+ return {
+ "session": {
+ "name": self.session_name,
+ "duration": precisedelta(self.session_metadata.duration, suppress=['minutes']),
+ "start_time": self.session_metadata.started_at.isoformat(),
+ "end_time": self.session_metadata.ended_at.isoformat(),
+ "steps": len(self.session_metadata.steps),
+ "errors": sum(s.error_count for s in self.session_metadata.steps.values())
+ },
+ "stats": self.monitor.get_all_stats()
+ }
+
  return {
  "session": {
  "name": self.session_name,
@@ -210,5 +249,4 @@ class EvaluationSession:
  "steps": len(self.session_metadata.steps),
  "errors": sum(s.error_count for s in self.session_metadata.steps.values())
  },
- "stats": self.monitor.get_all_stats()
  }
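get_stats now includes the aggregated monitor statistics only when monitoring is enabled; with enable_monitoring=False the "stats" key is simply omitted. Roughly:

    session.get_stats()
    # enable_monitoring=True  -> {"session": {...}, "stats": MonitoringAspect.get_all_stats()}
    # enable_monitoring=False -> {"session": {...}}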
@@ -1,7 +1,6 @@
  """levelapp/core/evaluator.py"""
  from functools import lru_cache
- from typing import List, Dict, Any
- from collections import defaultdict
+ from typing import List, Dict, Any, TYPE_CHECKING
  from pydantic import BaseModel, Field

  from tenacity import (
@@ -19,6 +18,9 @@ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
  from levelapp.core.base import BaseEvaluator, BaseChatClient
  from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader

+ if TYPE_CHECKING:
+ from levelapp.workflow.config import WorkflowConfig
+

  class Evidence(BaseModel):
  """Evidence details for evaluation."""
@@ -39,7 +41,7 @@ class JudgeEvaluationResults(BaseModel):
  label: str = Field(..., description="The label of the evaluation result")
  justification: str = Field(..., description="Short explanation of the evaluation result")
  evidence: Evidence = Field(default_factory=Evidence, description="Detailed evidence for the evaluation")
- raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response")
+ raw_response: Dict[str, Any] = Field(..., description="Full unprocessed API response", exclude=True)
  metadata: Dict[str, Any] = Field(..., description="Metadata about the evaluation result")

  @classmethod
@@ -69,19 +71,36 @@ class JudgeEvaluationResults(BaseModel):


  class JudgeEvaluator(BaseEvaluator):
- def __init__(self):
+ """LLM-as-a-judge evaluator class"""
+ def __init__(self, config: "WorkflowConfig | None" = None):
+ """
+ Initialize the JudgeEvaluator.
+
+ Args:
+ config (WorkflowConfig | None): The configuration of the workflow.
+ """
+ if config:
+ self.config = config
+ self.providers = config.evaluation.providers
+
  self.prompt_template = EVAL_PROMPT_TEMPLATE
- self.clients = defaultdict(BaseChatClient)
+ self.client_registry = ClientRegistry

- def register_client(self, provider: str, client: BaseChatClient) -> None:
+ def select_client(self, provider: str) -> BaseChatClient:
  """
- Register LLM clients used for the evaluation.
+ Select an LLM client to use for the evaluation.

  Args:
  provider (str): The provider name.
- client (BaseChatClient): The LLM client to register.
+
+ Returns:
+ client (BaseChatClient): The LLM client to use for the evaluation.
  """
- self.clients[provider] = client
+ if provider not in self.client_registry.list_providers():
+ logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
+ return self.client_registry.get(provider="openai")
+
+ return self.client_registry.get(provider=provider)

  @lru_cache(maxsize=1024)
  def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
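select_client replaces the old register_client: rather than keeping its own client map, JudgeEvaluator now resolves providers through the shared ClientRegistry and falls back to the registered "openai" client when a provider is unknown. A hedged sketch; the provider name and workflow_config are hypothetical:

    evaluator = JudgeEvaluator(config=workflow_config)  # workflow_config assumed to expose evaluation.providers
    client = evaluator.select_client(provider="some-unregistered-provider")
    # logs a warning and returns ClientRegistry.get(provider="openai") instead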
@@ -135,7 +154,7 @@ class JudgeEvaluator(BaseEvaluator):
  generated_text=generated_data,
  reference_text=reference_data
  )
- client = ClientRegistry.get(provider=provider)
+ client = self.select_client(provider=provider)

  try:
  response = client.call(message=prompt)
@@ -183,7 +202,7 @@ class JudgeEvaluator(BaseEvaluator):
  generated_text=generated_data,
  reference_text=reference_data
  )
- client = ClientRegistry.get(provider=provider)
+ client = self.select_client(provider=provider)

  try:
  async for attempt in AsyncRetrying(
@@ -194,7 +213,6 @@ class JudgeEvaluator(BaseEvaluator):
  ):
  with attempt:
  response = await client.acall(message=prompt)
- logger.info(f"[{provider}] Async evaluation:\n{response}\n{'---' * 10}")
  parsed = client.parse_response(response=response)
  return JudgeEvaluationResults.from_parsed(provider=provider, parsed=parsed, raw=response)

@@ -212,7 +230,18 @@ class JudgeEvaluator(BaseEvaluator):


  class MetadataEvaluator(BaseEvaluator):
- def __init__(self):
+ """Metadata evaluator class."""
+ def __init__(self, config: "WorkflowConfig | None" = None):
+ """
+ Initialize the MetadataEvaluator.
+
+ Args:
+ config (WorkflowConfig | None): The workflow configuration.
+ """
+ if config:
+ self.config = config
+ self.metics_map = config.evaluation.metrics_map
+
  self.data_loader = DataLoader()
  self.comparator = MetadataComparator()
  self.metrics_manager = MetricsManager()
@@ -245,7 +274,6 @@ class MetadataEvaluator(BaseEvaluator):
  self.comparator.reference_data = ref_data

  output = self.comparator.run(indexed_mode=False)
- logger.info(f"Comparison results:\n{output}\n---")
  results: Dict[str, float] = {}

  for k, v in output.items():
@@ -1,14 +1,11 @@
  """levelapp/metrics/__init__.py"""
- import logging
-
  from typing import List, Dict, Type, Any

+ from levelapp.aspects import logger
  from levelapp.core.base import BaseMetric
  from levelapp.metrics.exact import EXACT_METRICS
  from levelapp.metrics.fuzzy import FUZZY_METRICS

- logger = logging.getLogger(__name__)
-

  class MetricRegistry:
  """Registry for metric classes."""
@@ -27,7 +24,6 @@ class MetricRegistry:
  raise KeyError(f"Metric '{name}' is already registered")

  cls._metrics[name] = metric_class
- logger.info(f"Metric '{name}' registered successfully.")

  @classmethod
  def get(cls, name: str, **kwargs: Any) -> BaseMetric: