levelapp 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of levelapp has been flagged as possibly problematic.
- levelapp/core/schemas.py +26 -0
- levelapp/core/session.py +68 -56
- levelapp/evaluator/evaluator.py +27 -11
- levelapp/repository/firestore.py +15 -6
- levelapp/simulator/schemas.py +9 -9
- levelapp/simulator/simulator.py +103 -37
- levelapp/workflow/__init__.py +3 -2
- levelapp/workflow/base.py +26 -14
- levelapp/workflow/config.py +65 -0
- levelapp/workflow/context.py +63 -0
- levelapp/workflow/factory.py +18 -40
- levelapp/workflow/registration.py +1 -1
- levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/METADATA +101 -35
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/RECORD +17 -14
- levelapp/workflow/schemas.py +0 -121
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/WHEEL +0 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/licenses/LICENSE +0 -0
levelapp/core/schemas.py
ADDED
@@ -0,0 +1,26 @@
+from enum import Enum
+
+
+class ExtendedEnum(Enum):
+    @classmethod
+    def list(cls):
+        return [e.value for e in cls]
+
+
+class WorkflowType(ExtendedEnum):
+    SIMULATOR = "SIMULATOR"
+    COMPARATOR = "COMPARATOR"
+    ASSESSOR = "ASSESSOR"
+
+
+class RepositoryType(ExtendedEnum):
+    FIRESTORE = "FIRESTORE"
+    FILESYSTEM = "FILESYSTEM"
+
+
+class EvaluatorType(ExtendedEnum):
+    JUDGE = "JUDGE"
+    REFERENCE = "REFERENCE"
+    RAG = "RAG"
+
+
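
A short usage sketch (not part of the package) of the new enum helpers: ExtendedEnum.list() returns the raw string values, which makes membership checks against plain configuration strings straightforward.

    # Hypothetical usage of the new levelapp.core.schemas enums.
    from levelapp.core.schemas import WorkflowType, EvaluatorType

    print(WorkflowType.list())        # ['SIMULATOR', 'COMPARATOR', 'ASSESSOR']
    print(EvaluatorType.JUDGE.value)  # 'JUDGE'
    assert "SIMULATOR" in WorkflowType.list()
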
levelapp/core/session.py
CHANGED
@@ -1,27 +1,23 @@
 """levelapp/core/session.py"""
 import threading

+from abc import ABC
+
 from dataclasses import dataclass, field
 from typing import Dict, List, Any

 from datetime import datetime
 from humanize import precisedelta

-from levelapp.workflow import MainFactory
+from levelapp.workflow import MainFactory, WorkflowConfig
 from levelapp.workflow.base import BaseWorkflow
-from levelapp.
-from levelapp.
+from levelapp.aspects import MetricType, ExecutionMetrics, MonitoringAspect, logger
+from levelapp.workflow.context import WorkflowContextBuilder


-
-
-
-    session_name: str
-    started_at: datetime | None = None
-    ended_at: datetime | None = None
-    total_executions: int = 0
-    total_duration: float = 0.0
-    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+class TemporalStatusMixin(ABC):
+    started_at: datetime | None
+    ended_at: datetime | None

     @property
     def is_active(self) -> bool:
@@ -37,7 +33,18 @@ class SessionMetadata:


 @dataclass
-class
+class SessionMetadata(TemporalStatusMixin):
+    """Metadata for an evaluation session."""
+    session_name: str
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+    total_executions: int = 0
+    total_duration: float = 0.0
+    steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
+
+
+@dataclass
+class StepMetadata(TemporalStatusMixin):
     """Metadata for a specific step within an evaluation session."""
     step_name: str
     session_name: str
@@ -47,27 +54,21 @@
     error_count: int = 0
     procedures_stats: List[ExecutionMetrics] | None = None

-    @property
-    def is_active(self) -> bool:
-        """Check if the step is currently active."""
-        return self.ended_at is None
-
-    @property
-    def duration(self) -> float | None:
-        """Calculate the duration of the step in seconds."""
-        if not self.is_active:
-            return (self.ended_at - self.started_at).total_seconds()
-        return None
-

 class StepContext:
     """Context manager for an evaluation step within an EvaluationSession."""
-    def __init__(
+    def __init__(
+        self,
+        session: "EvaluationSession",
+        step_name: str,
+        category: MetricType,
+    ):
         self.session = session
         self.step_name = step_name
         self.category = category
+
         self.step_meta: StepMetadata | None = None
-        self.full_step_name = f"{session.session_name}
+        self.full_step_name = f"<{session.session_name}:{step_name}>"
         self._monitored_func = None
         self._func_gen = None

@@ -80,35 +81,49 @@ class StepContext:
         )
         self.session.session_metadata.steps[self.step_name] = self.step_meta

-
-
-
-
-
-
-
-
-
-
-
+        if self.session.enable_monitoring:
+            # Wrap with FunctionMonitor
+            self._monitored_func = self.session.monitor.monitor(
+                name=self.full_step_name,
+                category=self.category,
+                enable_timing=True,
+                track_memory=True,
+            )(self._step_wrapper)
+
+            # Start monitoring
+            try:
+                self._func_gen = self._monitored_func()
+                next(self._func_gen)  # Enter monitoring
+            except Exception as e:
+                logger.error(f"[StepContext] Failed to initialize monitoring for {self.full_step_name}:\n{e}")
+                raise
+
         return self  # returning self allows nested instrumentation

+    # noinspection PyMethodMayBeStatic
     def _step_wrapper(self):
         yield  # Actual user step execution happens here

     def __exit__(self, exc_type, exc_val, exc_tb):
-
-
-
-
+        if self.session.enable_monitoring:
+            try:
+                next(self._func_gen)  # Exit monitoring
+            except StopIteration:
+                pass

         with self.session.lock:
             self.step_meta.ended_at = datetime.now()
+
             if exc_type:
                 self.step_meta.error_count += 1
+
             self.session.session_metadata.total_executions += 1
+
             if self.step_meta.duration:
-                self.session.monitor.update_procedure_duration(
+                self.session.monitor.update_procedure_duration(
+                    name=self.full_step_name,
+                    value=self.step_meta.duration
+                )
                 self.session.session_metadata.total_duration += self.step_meta.duration

         return False
@@ -119,29 +134,30 @@ class EvaluationSession:
     def __init__(
         self,
         session_name: str = "test-session",
-
-
+        workflow_config: WorkflowConfig | None = None,
+        enable_monitoring: bool = True,
     ):
         """
         Initialize Evaluation Session.

         Args:
             session_name (str): Name of the session
-            monitor (FunctionMonitor): Function monitoring aspect
             workflow_config (WorkflowConfig): Workflow configuration.
         """
         self._NAME = self.__class__.__name__

         self.session_name = session_name
-        self.monitor = monitor or MonitoringAspect
         self.workflow_config = workflow_config
-        self.
+        self.enable_monitoring = enable_monitoring

         self.workflow: BaseWorkflow | None = None

         self.session_metadata = SessionMetadata(session_name=session_name)
+        self.monitor = MonitoringAspect if enable_monitoring else None
         self._lock = threading.RLock()

+        logger.info("[EvaluationSession] Evaluation session initialized.")
+
     @property
     def lock(self):
         return self._lock
@@ -154,14 +170,10 @@ class EvaluationSession:
         if not self.workflow_config:
             raise ValueError(f"{self._NAME}: Workflow configuration must be provided")

-
-
-
-
-            endpoint_config=self.workflow_config.endpoint_config,
-            inputs=self.workflow_config.inputs
-        )
-        self.workflow = MainFactory.create_workflow(self.workflow_type, context)
+        context_builder = WorkflowContextBuilder(self.workflow_config)
+        context = context_builder.build()
+
+        self.workflow = MainFactory.create_workflow(context=context)

         logger.info(
             f"[{self._NAME}] Starting evaluation session: {self.session_name}, "
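
The session constructor now takes a WorkflowConfig and an enable_monitoring flag instead of a monitor instance. A minimal wiring sketch, assuming a WorkflowConfig has already been built (its fields live in the new levelapp/workflow/config.py, which this diff only lists):

    # Sketch only: WorkflowConfig construction is an assumption; its exact fields
    # are defined in levelapp/workflow/config.py and are not shown in this diff.
    from levelapp.core.session import EvaluationSession
    from levelapp.workflow import WorkflowConfig

    config: WorkflowConfig = ...  # assumed to be loaded/validated elsewhere
    session = EvaluationSession(
        session_name="nightly-eval",  # hypothetical name
        workflow_config=config,
        enable_monitoring=True,       # False skips the MonitoringAspect wrapping
    )
    # Internally the session builds its workflow via
    # WorkflowContextBuilder(config).build() and MainFactory.create_workflow(context=...).
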
levelapp/evaluator/evaluator.py
CHANGED
@@ -1,7 +1,6 @@
 """levelapp/core/evaluator.py"""
 from functools import lru_cache
-from typing import List, Dict, Any
-from collections import defaultdict
+from typing import List, Dict, Any, TYPE_CHECKING
 from pydantic import BaseModel, Field

 from tenacity import (
@@ -19,6 +18,9 @@ from levelapp.config.prompts import EVAL_PROMPT_TEMPLATE
 from levelapp.core.base import BaseEvaluator, BaseChatClient
 from levelapp.aspects import MonitoringAspect, MetricType, logger, DataLoader

+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+

 class Evidence(BaseModel):
     """Evidence details for evaluation."""
@@ -69,19 +71,29 @@ class JudgeEvaluationResults(BaseModel):


 class JudgeEvaluator(BaseEvaluator):
-    def __init__(self):
+    def __init__(self, config: "WorkflowConfig | None" = None):
+        if config:
+            self.config = config
+            self.providers = config.evaluation.providers
+
         self.prompt_template = EVAL_PROMPT_TEMPLATE
-        self.
+        self.client_registry = ClientRegistry

-    def
+    def select_client(self, provider: str) -> BaseChatClient:
         """
-
+        Select an LLM client to use for the evaluation.

         Args:
             provider (str): The provider name.
-
+
+        Returns:
+            client (BaseChatClient): The LLM client to use for the evaluation.
         """
-
+        if provider not in self.client_registry.list_providers():
+            logger.warning(f"[JudgeEvaluator] {provider} is not registered. Defaulting to 'OpenAI'.")
+            return self.client_registry.get(provider="openai")
+
+        return self.client_registry.get(provider=provider)

     @lru_cache(maxsize=1024)
     def _build_prompt(self, user_input: str, generated_text: str, reference_text: str) -> str:
@@ -135,7 +147,7 @@ class JudgeEvaluator(BaseEvaluator):
             generated_text=generated_data,
             reference_text=reference_data
         )
-        client =
+        client = self.select_client(provider=provider)

         try:
             response = client.call(message=prompt)
@@ -183,7 +195,7 @@ class JudgeEvaluator(BaseEvaluator):
             generated_text=generated_data,
             reference_text=reference_data
         )
-        client =
+        client = self.select_client(provider=provider)

         try:
             async for attempt in AsyncRetrying(
@@ -212,7 +224,11 @@ class JudgeEvaluator(BaseEvaluator):


 class MetadataEvaluator(BaseEvaluator):
-    def __init__(self):
+    def __init__(self, config: "WorkflowConfig | None"= None):
+        if config:
+            self.config = config
+            self.metics_map = config.evaluation.metrics_map
+
         self.data_loader = DataLoader()
         self.comparator = MetadataComparator()
         self.metrics_manager = MetricsManager()
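
Provider selection is now explicit and falls back to the registered OpenAI client. A hedged sketch of the new call path ("mistral" is an arbitrary example key; what ClientRegistry actually contains depends on what the application has registered):

    # Sketch of the 0.1.1 JudgeEvaluator provider selection; "mistral" is a
    # hypothetical provider key, not something this diff registers.
    from levelapp.evaluator.evaluator import JudgeEvaluator

    evaluator = JudgeEvaluator()  # config is optional
    client = evaluator.select_client(provider="mistral")
    # Unregistered providers log a warning and fall back to the "openai" client.
    response = client.call(message="Score this reply against the reference ...")
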
levelapp/repository/firestore.py
CHANGED
@@ -1,7 +1,5 @@
 """levelapp/repository/firestore.py"""
-import
-
-from typing import List, Dict, Any, Type
+from typing import List, Dict, Any, Type, TYPE_CHECKING
 from pydantic import ValidationError

 from google.cloud import firestore_v1
@@ -13,15 +11,25 @@ from levelapp.core.base import BaseRepository, Model
 from levelapp.aspects import logger


+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+
+
 class FirestoreRepository(BaseRepository):
     """
     Firestore implementation of BaseRepository.
     (Uses hierarchical path: {user_id}/{collection_id}/{document_id}
     """

-    def __init__(self,
-
-
+    def __init__(self, config: "WorkflowConfig | None"):
+        if config:
+            self.config = config
+            self.project_id: str | Any = config.repository.project_id
+            self.database_name: str | Any = config.repository.database_name
+        else:
+            self.project_id: str | Any = None
+            self.database_name: str | Any = '(default)'
+
         self.client: firestore_v1.Client | None = None

     def connect(self) -> None:
@@ -29,6 +37,7 @@ class FirestoreRepository(BaseRepository):
         Connects to Firestore, prioritizing the project ID passed to the constructor.
         """
         try:
+            import google.auth
             credentials, default_project_id = google.auth.default()

             if not credentials:
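
The repository now reads its project and database from the workflow config, with a no-config fallback. A sketch of the fallback path, assuming ambient Google credentials are available in the environment:

    # Sketch only: behaviour inferred from this diff; google.auth.default() must
    # be able to resolve credentials for connect() to succeed.
    from levelapp.repository.firestore import FirestoreRepository

    repo = FirestoreRepository(config=None)  # project_id=None, database '(default)'
    repo.connect()                           # lazily imports google.auth and resolves credentials
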
levelapp/simulator/schemas.py
CHANGED
@@ -8,7 +8,7 @@ from enum import Enum
 from uuid import UUID, uuid4
 from datetime import datetime

-from typing import
+from typing import Dict, Any, List
 from pydantic import BaseModel, Field, computed_field

 from levelapp.evaluator.evaluator import JudgeEvaluationResults
@@ -25,11 +25,11 @@ class Interaction(BaseModel):
     """Represents a single interaction within a conversation."""
     id: UUID = Field(default_factory=uuid4, description="Interaction identifier")
     user_message: str = Field(..., description="The user's query message")
-    generated_reply: str = Field(..., description="The agent's reply message")
+    # generated_reply: str = Field(..., description="The agent's reply message")
     reference_reply: str = Field(..., description="The preset reference message")
-    interaction_type: InteractionLevel = Field(
-    reference_metadata:
-    generated_metadata:
+    interaction_type: InteractionLevel = Field(default=InteractionLevel.INITIAL, description="Type of interaction")
+    reference_metadata: Dict[str, Any] = Field(default_factory=dict, description="Expected metadata")
+    # generated_metadata: Dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
     guardrail_flag: bool = Field(default=False, description="Flag for guardrail signaling")
     request_payload: Dict[str, Any] = Field(default_factory=dict, description="Additional request payload")

@@ -38,7 +38,7 @@ class ConversationScript(BaseModel):
     """Represents a basic conversation with multiple interactions."""
     id: UUID = Field(default_factory=uuid4, description="Conversation identifier")
     interactions: List[Interaction] = Field(default_factory=list, description="List of interactions")
-    description: str = Field(
+    description: str = Field(default="no-description", description="A short description of the conversation")
     details: Dict[str, str] = Field(default_factory=dict, description="Conversation details")


@@ -58,8 +58,8 @@ class InteractionResults(BaseModel):

 class InteractionEvaluationResults(BaseModel):
     """Model representing the evaluation result of an interaction."""
-    judge_evaluations: Dict[str, JudgeEvaluationResults] = Field(default_factory=dict)
-    metadata_evaluation: Dict[str, float] = Field(default_factory=dict)
+    judge_evaluations: Dict[str, JudgeEvaluationResults] | None = Field(default_factory=dict)
+    metadata_evaluation: Dict[str, float] | None = Field(default_factory=dict)
     guardrail_flag: int = Field(default=0)


@@ -86,4 +86,4 @@ class TestResults(BaseModel):
     ionos_model_name: str = Field(..., alias="ionosModelName")
     test_name: str = Field(..., alias="testName")
     test_type: str = Field(..., alias="testType")
-    batch_details:
+    batch_details: SimulationResults | None = Field(..., alias="results")
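
With generated_reply and generated_metadata no longer declared as fields and the remaining fields defaulted, a script interaction can be built from just the user message and reference reply. A sketch (message contents are hypothetical):

    # Constructing an Interaction under the 0.1.1 schema.
    from levelapp.simulator.schemas import Interaction

    interaction = Interaction(
        user_message="What is the refund policy?",
        reference_reply="Refunds are accepted within 30 days.",
    )
    print(interaction.interaction_type)    # InteractionLevel.INITIAL
    print(interaction.reference_metadata)  # {}
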
levelapp/simulator/simulator.py
CHANGED
@@ -23,7 +23,7 @@ from levelapp.simulator.utils import (
     summarize_verdicts,
 )
 from levelapp.aspects import logger
-from levelapp.
+from levelapp.core.schemas import EvaluatorType


 class ConversationSimulator(BaseProcess):
@@ -33,6 +33,7 @@ class ConversationSimulator(BaseProcess):
         self,
         repository: BaseRepository | None = None,
         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+        providers: List[str] | None = None,
         endpoint_config: EndpointConfig | None = None,
     ):
         """
@@ -47,6 +48,7 @@ class ConversationSimulator(BaseProcess):

         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
         self.endpoint_config = endpoint_config

         self._url: str | None = None
@@ -60,7 +62,8 @@ class ConversationSimulator(BaseProcess):
     def setup(
         self,
         repository: BaseRepository,
-        evaluators: Dict[
+        evaluators: Dict[EvaluatorType, BaseEvaluator],
+        providers: List[str],
         endpoint_config: EndpointConfig,
     ) -> None:
         """
@@ -69,6 +72,7 @@ class ConversationSimulator(BaseProcess):
         Args:
             repository (BaseRepository): Repository object for storing simulation results.
             evaluators (Dict[str, BaseEvaluator]): List of evaluator objects for evaluating interactions.
+            providers (List[str]): List of LLM provider names.
             endpoint_config (EndpointConfig): Configuration object for VLA.
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
@@ -76,6 +80,11 @@ class ConversationSimulator(BaseProcess):

         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
+
+        if not self.providers:
+            logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
         self.endpoint_config = endpoint_config

         self._url = endpoint_config.full_url
@@ -87,6 +96,7 @@ class ConversationSimulator(BaseProcess):

         if name not in self.evaluators:
             raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
         return self.evaluators[name]

     async def run(
@@ -368,50 +378,106 @@
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"

-        judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
-        metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+        judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+        metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)

-
-            raise ValueError(f"{_LOG} No Judge Evaluator found.")
+        evaluation_results = InteractionEvaluationResults()

-
-
-
-
-
-
+        if judge_evaluator and self.providers:
+            await self._judge_evaluation(
+                user_input=user_input,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                providers=self.providers,
+                judge_evaluator=judge_evaluator,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Judge evaluation skipped (no evaluator or no providers).")

-
-
-
-
-
-
+        if metadata_evaluator and reference_metadata:
+            self._metadata_evaluation(
+                metadata_evaluator=metadata_evaluator,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Metadata evaluation skipped (no evaluator or no reference metadata).")

-
-            openai_eval_task, ionos_eval_task
-        )
+        evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0

-
-            raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+        return evaluation_results

-
-
-
-
-
+    async def _judge_evaluation(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        providers: List[str],
+        judge_evaluator: BaseEvaluator,
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run LLM-as-a-judge evaluation using multiple providers (async).
+
+        Args:
+            user_input (str): The user input message.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            providers (List[str]): List of judge provider names.
+            judge_evaluator (BaseEvaluator): Evaluator instance.
+            evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+        Returns:
+            None
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+        tasks = {
+            provider: judge_evaluator.async_evaluate(
+                generated_data=generated_reply,
+                reference_data=reference_reply,
+                user_input=user_input,
+                provider=provider,
             )
+            for provider in providers
+        }

-
+        results = await asyncio.gather(*tasks.values(), return_exceptions=True)

-
-
-
-
-
-
-
-
+        for provider, result in zip(tasks.keys(), results):
+            if isinstance(result, Exception):
+                logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                continue
+
+            evaluation_results.judge_evaluations[provider] = result
+
+    def _metadata_evaluation(
+        self,
+        metadata_evaluator: BaseEvaluator,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run metadata evaluation using the provided evaluator.
+
+        Args:
+            metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            evaluation_results (InteractionEvaluationResults): Results container.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+        try:
+            evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
+            )
+        except Exception as e:
+            logger.error(f"[{_LOG}] Metadata evaluation failed:\n{e}", exc_info=e)

     @staticmethod
     def store_evaluation_results(
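
The new _judge_evaluation helper fans one async evaluation out per provider and gathers with return_exceptions=True, so a single failing provider is logged and skipped instead of aborting the batch. A standalone sketch of that pattern (fake_judge is a stand-in, not a levelapp API):

    # Standalone illustration of the gather/return_exceptions fan-out used above.
    import asyncio

    async def fake_judge(provider: str) -> str:
        if provider == "broken":
            raise RuntimeError("provider unavailable")
        return f"verdict from {provider}"

    async def main() -> None:
        providers = ["openai", "ionos", "broken"]
        tasks = {p: fake_judge(p) for p in providers}
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        for provider, result in zip(tasks.keys(), results):
            if isinstance(result, Exception):
                print(f"{provider}: skipped ({result})")
                continue
            print(f"{provider}: {result}")

    asyncio.run(main())
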
levelapp/workflow/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 from . import registration
-from .
+from .base import BaseWorkflow
+from .config import WorkflowConfig
 from .factory import MainFactory

-__all__ = ["
+__all__ = ["BaseWorkflow", "WorkflowConfig", "MainFactory"]