levelapp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +614 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +119 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/ionos.py +116 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +102 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +271 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +190 -0
- levelapp/config/prompts.py +35 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/session.py +214 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +265 -0
- levelapp/metrics/__init__.py +67 -0
- levelapp/metrics/embedding.py +2 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/firestore.py +282 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +89 -0
- levelapp/simulator/simulator.py +441 -0
- levelapp/simulator/utils.py +201 -0
- levelapp/workflow/__init__.py +5 -0
- levelapp/workflow/base.py +113 -0
- levelapp/workflow/factory.py +51 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/schemas.py +121 -0
- levelapp-0.1.0.dist-info/METADATA +254 -0
- levelapp-0.1.0.dist-info/RECORD +46 -0
- levelapp-0.1.0.dist-info/WHEEL +4 -0
- levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
levelapp/core/base.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""levelapp/core/base.py"""
|
|
2
|
+
import datetime
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel
|
|
11
|
+
from typing import List, Dict, Any, Callable, TypeVar, Type
|
|
12
|
+
|
|
13
|
+
from levelapp.aspects import JSONSanitizer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Type variable restricted (via ``bound=``) to pydantic ``BaseModel`` subclasses.
Model = TypeVar("Model", bound=BaseModel)
|
|
17
|
+
# Unconstrained type variable.
# NOTE(review): no use of it appears in the visible part of this module - confirm it is still needed.
Context = TypeVar("Context")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BaseProcess(ABC):
    """Contract for runnable evaluation processes.

    Concrete subclasses must provide :meth:`run`; the base class cannot be
    instantiated directly.
    """

    @abstractmethod
    def run(self, **kwargs) -> Any:
        """Execute the process.

        Args:
            **kwargs: Implementation-defined options.

        Returns:
            Any: Implementation-defined result.

        Raises:
            NotImplementedError: If a subclass delegates to this base body.
        """
        raise NotImplementedError
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BaseEvaluator(ABC):
|
|
28
|
+
"""Abstract base class for evaluator components."""
|
|
29
|
+
@abstractmethod
|
|
30
|
+
def evaluate(
|
|
31
|
+
self,
|
|
32
|
+
generated_data: str | Dict[str, Any],
|
|
33
|
+
reference_data: str | Dict[str, Any],
|
|
34
|
+
**kwargs
|
|
35
|
+
):
|
|
36
|
+
"""Evaluate system output to reference output."""
|
|
37
|
+
raise NotImplementedError
|
|
38
|
+
|
|
39
|
+
@abstractmethod
|
|
40
|
+
async def async_evaluate(
|
|
41
|
+
self,
|
|
42
|
+
generated_data: str | Dict[str, Any],
|
|
43
|
+
reference_data: str | Dict[str, Any],
|
|
44
|
+
**kwargs
|
|
45
|
+
):
|
|
46
|
+
"""Asynchronous evaluation method."""
|
|
47
|
+
raise NotImplementedError
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class BaseChatClient(ABC):
    """
    Abstract base class for integrating different LLM provider clients.

    This class defines the common interface and request lifecycle for
    calling chat-based large language models (LLMs). It enforces
    provider-specific implementations for:
    - endpoint path resolution
    - request headers
    - request payload
    - response parsing

    Subclasses (e.g., `OpenAIClient`, `MistralClient`, `AnthropicClient`, `IonosClient`)
    must override the abstract members to handle provider-specific request/response formats.
    """

    # Seconds to wait for a provider response. Shared by `call` and `acall`
    # so the blocking path cannot hang forever on an unresponsive server.
    REQUEST_TIMEOUT_SECONDS = 300

    def __init__(self, **kwargs):
        """
        Initialize the base chat client.

        Args:
            **kwargs: Arbitrary keyword arguments. Expected keys include:
                - base_url (str): The base API URL for the LLM provider.
        """
        self.base_url = kwargs.get("base_url")
        self.sanitizer = JSONSanitizer()

    @property
    @abstractmethod
    def endpoint_path(self) -> str:
        """
        API path (relative to `base_url`) for the provider's chat endpoint.

        Example:
            - OpenAI: "/v1/chat/completions"
            - Mistral: "/chat/completions"
            - Anthropic: "/v1/messages"
            - IONOS: "/models/model-id/predictions"

        Returns:
            str: Provider-specific endpoint path.
        """
        raise NotImplementedError

    def _build_endpoint(self) -> str:
        """
        Construct the full request endpoint URL.

        Trailing slashes on `base_url` and leading slashes on
        `endpoint_path` are trimmed so exactly one "/" joins the two parts.

        Returns:
            str: Complete endpoint URL (base_url + endpoint_path).
        """
        base = str(self.base_url).rstrip("/")
        path = self.endpoint_path.lstrip("/")
        return f"{base}/{path}"

    @abstractmethod
    def _build_headers(self) -> Dict[str, str]:
        """
        Construct HTTP request headers for the provider.

        This typically includes authentication (e.g., API key or Bearer token),
        content type, and provider-specific headers.

        Returns:
            Dict[str, str]: HTTP headers.
        """
        raise NotImplementedError

    @abstractmethod
    def _build_payload(self, message: str) -> Dict[str, Any]:
        """
        Construct the request body payload for the provider.

        Args:
            message (str): User message to send to the LLM.

        Returns:
            Dict[str, Any]: JSON-serializable payload as required by the provider API.
        """
        raise NotImplementedError

    @abstractmethod
    def parse_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse the raw provider response into a normalized format.

        The normalized format should include:
            - "output": str or structured response content
            - "metadata": Dict containing tokens, cost, or other provider stats

        Args:
            response (Dict[str, Any]): Raw JSON response returned by the provider.

        Returns:
            Dict[str, Any]: Normalized output structure.
        """
        raise NotImplementedError

    def call(self, message: str) -> Dict[str, Any]:
        """
        Make a synchronous call to the provider API.

        Args:
            message (str): User input message to send.

        Returns:
            Dict[str, Any]: Provider's raw JSON response.

        Raises:
            requests.exceptions.RequestException: On any network or HTTP error
                (including a timeout after `REQUEST_TIMEOUT_SECONDS`).
        """
        url = self._build_endpoint()
        headers = self._build_headers()
        payload = self._build_payload(message)

        try:
            response = requests.post(
                url,
                headers=headers,
                data=json.dumps(payload),
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.json()

        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            raise
        except requests.exceptions.ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err}")
            raise
        except requests.exceptions.Timeout as timeout_err:
            print(f"Timeout error occurred: {timeout_err}")
            raise
        except requests.exceptions.RequestException as req_err:
            print(f"An unexpected error occurred: {req_err}")
            raise

    async def acall(self, message: str) -> Dict[str, Any]:
        """
        Make an asynchronous call to the provider API.

        Args:
            message (str): User input message to send.

        Returns:
            Dict[str, Any]: Provider's raw JSON response.

        Raises:
            httpx.RequestError, httpx.TimeoutException, httpx.HTTPStatusError:
                On any network or HTTP error.
        """
        url = self._build_endpoint()
        headers = self._build_headers()
        payload = self._build_payload(message)
        # Report the concrete client class rather than a hard-coded provider name.
        tag = f"[{type(self).__name__}.acall]"

        try:
            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT_SECONDS) as client:
                response = await client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                return response.json()

        except httpx.HTTPStatusError as http_err:
            print(f"{tag} HTTP error: {http_err}")
            raise
        # TimeoutException is a subclass of RequestError, so it must be
        # handled first or this branch can never run.
        except httpx.TimeoutException as timeout_err:
            print(f"{tag} Timeout: {timeout_err}")
            raise
        except httpx.RequestError as req_err:
            print(f"{tag} Request error: {req_err}")
            raise
        except Exception as e:
            print(f"{tag} Unexpected error: {e}")
            raise
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class BaseMetric(ABC):
|
|
220
|
+
"""Abstract base class for metrics collection."""
|
|
221
|
+
|
|
222
|
+
def __init__(self, processor: Callable | None = None, score_cutoff: float | None = None):
|
|
223
|
+
"""
|
|
224
|
+
Initialize the metric.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
processor (Optional[Callable]): Optional function to preprocess strings before comparison.
|
|
228
|
+
score_cutoff (Optional[float]): Minimum similarity score for an early match cutoff.
|
|
229
|
+
"""
|
|
230
|
+
self.processor = processor
|
|
231
|
+
self.score_cutoff = score_cutoff
|
|
232
|
+
|
|
233
|
+
@abstractmethod
|
|
234
|
+
def compute(self, generated: str, reference: str) -> Dict[str, Any]:
|
|
235
|
+
"""
|
|
236
|
+
Evaluate the generated text against the reference text.
|
|
237
|
+
|
|
238
|
+
Args:
|
|
239
|
+
generated (str): The generated text to evaluate.
|
|
240
|
+
reference (str): The reference text to compare against.
|
|
241
|
+
|
|
242
|
+
Returns:
|
|
243
|
+
Dict[str, Any]: Evaluation results including match level and justification.
|
|
244
|
+
"""
|
|
245
|
+
raise NotImplementedError
|
|
246
|
+
|
|
247
|
+
@property
|
|
248
|
+
def name(self) -> str:
|
|
249
|
+
"""
|
|
250
|
+
Get the name of the metric.
|
|
251
|
+
|
|
252
|
+
Returns:
|
|
253
|
+
str: Name of the metric.
|
|
254
|
+
"""
|
|
255
|
+
return self.__class__.__name__.lower()
|
|
256
|
+
|
|
257
|
+
# TODO-0: You know what..We can remove this at some point.
|
|
258
|
+
@staticmethod
|
|
259
|
+
def _validate_inputs(generated: str, reference: str) -> None:
|
|
260
|
+
"""Validate that both inputs are strings."""
|
|
261
|
+
if not (isinstance(generated, str) and isinstance(reference, str)):
|
|
262
|
+
raise TypeError("Both 'generated' and 'reference' must be strings.")
|
|
263
|
+
|
|
264
|
+
def _get_params(self) -> Dict[str, Any]:
|
|
265
|
+
"""Return a serializable dictionary of metric parameters."""
|
|
266
|
+
return {
|
|
267
|
+
"processor": repr(self.processor) if self.processor else None,
|
|
268
|
+
"score_cutoff": self.score_cutoff
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
def _build_metadata(self, **extra_inputs) -> Dict[str, Any]:
|
|
272
|
+
"""Construct a consistent metadata dictionary."""
|
|
273
|
+
return {
|
|
274
|
+
"type": self.__class__.__name__,
|
|
275
|
+
"params": self._get_params(),
|
|
276
|
+
"inputs": extra_inputs,
|
|
277
|
+
"timestamp": datetime.datetime.now()
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class BaseRepository(ABC):
    """
    Interface for pluggable NoSQL document stores.

    Documents are addressed by collection / section / sub-collection /
    document identifiers and exchanged as Pydantic model instances.
    """

    @abstractmethod
    def connect(self) -> None:
        """Open the connection or create the underlying client."""
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Release the connection or underlying client."""
        raise NotImplementedError

    @abstractmethod
    def retrieve_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str,
            model_type: Type[Model]
    ) -> Model | None:
        """
        Fetch one document and parse it into ``model_type``.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to retrieve.
            model_type (Type[Model]): Pydantic class to instantiate.

        Returns:
            Model | None: Parsed model instance, or None if the document was not found.
        """
        raise NotImplementedError

    @abstractmethod
    def store_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str,
            data: Model
    ) -> None:
        """
        Persist a Pydantic model instance as a document.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to store.
            data (Model): Pydantic model instance.
        """
        raise NotImplementedError

    @abstractmethod
    def query_collection(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            filters: Dict[str, Any],
            model_type: Type[Model]
    ) -> List[Model]:
        """
        Query the documents of a collection, optionally filtered.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            filters (Dict[str, Any]): Filters to apply to the query (implementation dependent).
            model_type (Type[Model]): Pydantic class to instantiate.

        Returns:
            List[Model]: Query results.
        """
        raise NotImplementedError

    @abstractmethod
    def delete_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str
    ) -> bool:
        """
        Remove a document.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to delete.

        Returns:
            bool: True if deleted, False if not.
        """
        raise NotImplementedError
|
levelapp/core/session.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""levelapp/core/session.py"""
|
|
2
|
+
import threading
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Dict, List, Any
|
|
6
|
+
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from humanize import precisedelta
|
|
9
|
+
|
|
10
|
+
from levelapp.workflow import MainFactory
|
|
11
|
+
from levelapp.workflow.base import BaseWorkflow
|
|
12
|
+
from levelapp.workflow.schemas import WorkflowConfig, WorkflowContext
|
|
13
|
+
from levelapp.aspects import FunctionMonitor, MetricType, ExecutionMetrics, MonitoringAspect, logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
|
|
17
|
+
class SessionMetadata:
|
|
18
|
+
"""Metadata for an evaluation session."""
|
|
19
|
+
session_name: str
|
|
20
|
+
started_at: datetime | None = None
|
|
21
|
+
ended_at: datetime | None = None
|
|
22
|
+
total_executions: int = 0
|
|
23
|
+
total_duration: float = 0.0
|
|
24
|
+
steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
@property
|
|
27
|
+
def is_active(self) -> bool:
|
|
28
|
+
"""Check if the session is currently active."""
|
|
29
|
+
return self.ended_at is None
|
|
30
|
+
|
|
31
|
+
@property
|
|
32
|
+
def duration(self) -> float | None:
|
|
33
|
+
"""Calculate the duration of the session in seconds."""
|
|
34
|
+
if not self.is_active:
|
|
35
|
+
return (self.ended_at - self.started_at).total_seconds()
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class StepMetadata:
    """Metadata for a specific step within an evaluation session."""
    step_name: str
    session_name: str
    # Wall-clock start; None until it is assigned.
    started_at: datetime | None = None
    # Wall-clock end; None means the step is still considered active.
    ended_at: datetime | None = None
    memory_peak_mb: float | None = None
    # Number of errors recorded against this step.
    error_count: int = 0
    procedures_stats: List[ExecutionMetrics] | None = None

    @property
    def is_active(self) -> bool:
        """Check if the step is currently active (no end time recorded)."""
        return self.ended_at is None

    @property
    def duration(self) -> float | None:
        """
        Calculate the duration of the step in seconds.

        Returns:
            float | None: Elapsed seconds between ``started_at`` and
            ``ended_at``, or None when either timestamp is missing (the
            step is still active, or it was never started).
        """
        # Guard both ends: subtracting None from a datetime raises TypeError.
        if self.started_at is None or self.ended_at is None:
            return None
        return (self.ended_at - self.started_at).total_seconds()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class StepContext:
    """Context manager for one named step of an ``EvaluationSession``.

    Entering registers a ``StepMetadata`` record on the session and primes a
    generator wrapped by the session's monitor. Exiting resumes that
    generator, stamps the end time and folds the step's figures into the
    session totals.
    """

    def __init__(self, session: "EvaluationSession", step_name: str, category: MetricType):
        self.session = session
        self.step_name = step_name
        self.category = category
        # Name under which the step is reported to the monitor.
        self.full_step_name = f"{session.session_name}.{step_name}"
        self.step_meta: StepMetadata | None = None
        self._monitored_func = None
        self._func_gen = None

    def __enter__(self):
        owner = self.session

        # Register the step's metadata while holding the session lock.
        with owner.lock:
            meta = StepMetadata(
                step_name=self.step_name,
                session_name=owner.session_name,
                started_at=datetime.now()
            )
            self.step_meta = meta
            owner.session_metadata.steps[self.step_name] = meta

        # Decorate the placeholder generator function with the session monitor.
        decorate = owner.monitor.monitor(
            name=self.full_step_name,
            category=self.category,
            enable_timing=True,
            track_memory=True,
        )
        self._monitored_func = decorate(self._step_wrapper)

        # Create the generator and advance it to its ``yield``.
        self._func_gen = self._monitored_func()
        next(self._func_gen)
        return self  # handing back self permits nested instrumentation

    def _step_wrapper(self):
        yield  # the caller's ``with`` body runs while the generator is suspended here

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Resume the generator; the default swallows the resulting StopIteration.
        next(self._func_gen, None)

        owner = self.session
        with owner.lock:
            meta = self.step_meta
            meta.ended_at = datetime.now()
            if exc_type:
                meta.error_count += 1

            totals = owner.session_metadata
            totals.total_executions += 1

            elapsed = meta.duration
            if elapsed:
                owner.monitor.update_procedure_duration(name=self.full_step_name, value=elapsed)
                totals.total_duration += elapsed

        # Never suppress an exception raised inside the ``with`` body.
        return False
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class EvaluationSession:
    """Context manager for LLM evaluation sessions with integrated monitoring."""

    def __init__(
            self,
            session_name: str = "test-session",
            monitor: FunctionMonitor | None = None,
            workflow_config: WorkflowConfig | None = None
    ):
        """
        Initialize Evaluation Session.

        Args:
            session_name (str): Name of the session
            monitor (FunctionMonitor): Function monitoring aspect; falls back
                to ``MonitoringAspect`` when omitted.
            workflow_config (WorkflowConfig): Workflow configuration. May be
                omitted here, but ``__enter__`` raises ``ValueError`` if it is
                still missing when the workflow has to be built.
        """
        self._NAME = self.__class__.__name__

        self.session_name = session_name
        self.monitor = monitor or MonitoringAspect
        self.workflow_config = workflow_config
        # The config is optional at construction time, so do not dereference
        # None here; the missing config is reported by __enter__ instead.
        self.workflow_type = workflow_config.workflow if workflow_config is not None else None

        self.workflow: BaseWorkflow | None = None

        self.session_metadata = SessionMetadata(session_name=session_name)
        self._lock = threading.RLock()

    @property
    def lock(self):
        """Re-entrant lock guarding updates to the session metadata."""
        return self._lock

    def __enter__(self):
        """
        Start the session and build the workflow if none is set yet.

        Raises:
            ValueError: If no workflow exists and no configuration was provided.
        """
        self.session_metadata.started_at = datetime.now()

        # Instantiate workflow if not already
        if not self.workflow:
            if not self.workflow_config:
                raise ValueError(f"{self._NAME}: Workflow configuration must be provided")

            context = WorkflowContext(
                config=self.workflow_config,
                repository=MainFactory.create_repository(self.workflow_config),
                evaluators=MainFactory.create_evaluator(self.workflow_config),
                endpoint_config=self.workflow_config.endpoint_config,
                inputs=self.workflow_config.inputs
            )
            self.workflow = MainFactory.create_workflow(self.workflow_type, context)

        logger.info(
            f"[{self._NAME}] Starting evaluation session: {self.session_name}, "
            f"Workflow: '{self.workflow.name}'"
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stamp the end time, log the outcome and let any exception propagate."""
        self.session_metadata.ended_at = datetime.now()
        logger.info(
            f"[{self._NAME}] Completed session '{self.session_name}' "
            f"in {self.session_metadata.duration:.2f}s"
        )

        if exc_type:
            logger.error(f"[{self._NAME}] Session ended with error: {exc_val}", exc_info=True)
        return False

    def step(self, step_name: str, category: MetricType = MetricType.CUSTOM) -> StepContext:
        """Create a monitored evaluation step."""
        return StepContext(self, step_name, category)

    def run(self):
        """
        Run the workflow phases in order, each inside its own monitored step:
        setup, load_data, execute, collect_results.

        Raises:
            RuntimeError: If the workflow has not been initialized.
        """
        if not self.workflow:
            raise RuntimeError(f"{self._NAME} Workflow not initialized")

        with self.step(step_name="setup", category=MetricType.SETUP):
            self.workflow.setup()

        with self.step(step_name="load_data", category=MetricType.DATA_LOADING):
            self.workflow.load_data()

        with self.step(step_name="execute", category=MetricType.EXECUTION):
            self.workflow.execute()

        # StepContext already prefixes the session name, so pass the bare
        # step name like the other phases do.
        with self.step(step_name="collect_results", category=MetricType.RESULTS_COLLECTION):
            self.workflow.collect_results()

    def get_stats(self) -> Dict[str, Any]:
        """
        Summarize the session and the monitor statistics.

        Safe to call at any point: fields whose timestamps are not yet
        recorded (session not started or not finished) are reported as None.

        Returns:
            Dict[str, Any]: ``{"session": {...}, "stats": <monitor stats>}``.
        """
        meta = self.session_metadata
        elapsed = meta.duration
        return {
            "session": {
                "name": self.session_name,
                "duration": precisedelta(elapsed, suppress=['minutes']) if elapsed is not None else None,
                "start_time": meta.started_at.isoformat() if meta.started_at is not None else None,
                "end_time": meta.ended_at.isoformat() if meta.ended_at is not None else None,
                "steps": len(meta.steps),
                "errors": sum(s.error_count for s in meta.steps.values())
            },
            "stats": self.monitor.get_all_stats()
        }
|