levelapp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of levelapp might be problematic. Click here for more details.

Files changed (46) hide show
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +614 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +119 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/ionos.py +116 -0
  10. levelapp/clients/mistral.py +106 -0
  11. levelapp/clients/openai.py +102 -0
  12. levelapp/comparator/__init__.py +5 -0
  13. levelapp/comparator/comparator.py +232 -0
  14. levelapp/comparator/extractor.py +108 -0
  15. levelapp/comparator/schemas.py +61 -0
  16. levelapp/comparator/scorer.py +271 -0
  17. levelapp/comparator/utils.py +136 -0
  18. levelapp/config/__init__.py +5 -0
  19. levelapp/config/endpoint.py +190 -0
  20. levelapp/config/prompts.py +35 -0
  21. levelapp/core/__init__.py +0 -0
  22. levelapp/core/base.py +386 -0
  23. levelapp/core/session.py +214 -0
  24. levelapp/evaluator/__init__.py +3 -0
  25. levelapp/evaluator/evaluator.py +265 -0
  26. levelapp/metrics/__init__.py +67 -0
  27. levelapp/metrics/embedding.py +2 -0
  28. levelapp/metrics/exact.py +182 -0
  29. levelapp/metrics/fuzzy.py +80 -0
  30. levelapp/metrics/token.py +103 -0
  31. levelapp/plugins/__init__.py +0 -0
  32. levelapp/repository/__init__.py +3 -0
  33. levelapp/repository/firestore.py +282 -0
  34. levelapp/simulator/__init__.py +3 -0
  35. levelapp/simulator/schemas.py +89 -0
  36. levelapp/simulator/simulator.py +441 -0
  37. levelapp/simulator/utils.py +201 -0
  38. levelapp/workflow/__init__.py +5 -0
  39. levelapp/workflow/base.py +113 -0
  40. levelapp/workflow/factory.py +51 -0
  41. levelapp/workflow/registration.py +6 -0
  42. levelapp/workflow/schemas.py +121 -0
  43. levelapp-0.1.0.dist-info/METADATA +254 -0
  44. levelapp-0.1.0.dist-info/RECORD +46 -0
  45. levelapp-0.1.0.dist-info/WHEEL +4 -0
  46. levelapp-0.1.0.dist-info/licenses/LICENSE +0 -0
levelapp/core/base.py ADDED
@@ -0,0 +1,386 @@
1
+ """levelapp/core/base.py"""
2
+ import datetime
3
+ import json
4
+
5
+ import httpx
6
+ import requests
7
+
8
+ from abc import ABC, abstractmethod
9
+
10
+ from pydantic import BaseModel
11
+ from typing import List, Dict, Any, Callable, TypeVar, Type
12
+
13
+ from levelapp.aspects import JSONSanitizer
14
+
15
+
16
+ Model = TypeVar("Model", bound=BaseModel)
17
+ Context = TypeVar("Context")
18
+
19
+
20
class BaseProcess(ABC):
    """Contract shared by the evaluation process classes."""

    @abstractmethod
    def run(self, **kwargs) -> Any:
        """Execute the process; concrete subclasses must override this.

        Args:
            **kwargs: Arbitrary keyword arguments defined by the subclass.

        Raises:
            NotImplementedError: Always, when the base implementation is invoked.
        """
        raise NotImplementedError
25
+
26
+
27
+ class BaseEvaluator(ABC):
28
+ """Abstract base class for evaluator components."""
29
+ @abstractmethod
30
+ def evaluate(
31
+ self,
32
+ generated_data: str | Dict[str, Any],
33
+ reference_data: str | Dict[str, Any],
34
+ **kwargs
35
+ ):
36
+ """Evaluate system output to reference output."""
37
+ raise NotImplementedError
38
+
39
+ @abstractmethod
40
+ async def async_evaluate(
41
+ self,
42
+ generated_data: str | Dict[str, Any],
43
+ reference_data: str | Dict[str, Any],
44
+ **kwargs
45
+ ):
46
+ """Asynchronous evaluation method."""
47
+ raise NotImplementedError
48
+
49
+
50
class BaseChatClient(ABC):
    """
    Abstract base class for integrating different LLM provider clients.

    This class defines the common interface and request lifecycle for
    calling chat-based large language models (LLMs). It enforces
    provider-specific implementations for:
    - endpoint path resolution
    - request headers
    - request payload
    - response parsing

    Subclasses (e.g., `OpenAIClient`, `MistralClient`, `AnthropicClient`, `IonosClient`)
    must override the abstract members to handle provider-specific request/response formats.
    """

    # Timeout (seconds) handed to both the synchronous and asynchronous HTTP
    # calls, so neither path can block forever on an unresponsive provider.
    REQUEST_TIMEOUT = 300

    def __init__(self, **kwargs):
        """
        Initialize the base chat client.

        Args:
            **kwargs: Arbitrary keyword arguments. Expected keys include:
                - base_url (str): The base API URL for the LLM provider.
        """
        self.base_url = kwargs.get("base_url")
        self.sanitizer = JSONSanitizer()

    @property
    @abstractmethod
    def endpoint_path(self) -> str:
        """
        API path (relative to `base_url`) for the provider's chat endpoint.

        Example:
            - OpenAI: "/v1/chat/completions"
            - Mistral: "/chat/completions"
            - Anthropic: "/v1/messages"
            - IONOS: "/models/model-id/predictions"

        Returns:
            str: Provider-specific endpoint path.
        """
        raise NotImplementedError

    def _build_endpoint(self) -> str:
        """
        Construct the full request endpoint URL.

        Slashes at the seam are normalized so that exactly one separates
        `base_url` from `endpoint_path`. An unset `base_url` contributes
        nothing instead of the literal text "None".

        Returns:
            str: Complete endpoint URL (base_url + endpoint_path).
        """
        base = "" if self.base_url is None else str(self.base_url).rstrip("/")
        return f"{base}/{self.endpoint_path.lstrip('/')}"

    @abstractmethod
    def _build_headers(self) -> Dict[str, str]:
        """
        Construct HTTP request headers for the provider.

        This typically includes authentication (e.g., API key or Bearer token),
        content type, and provider-specific headers.

        Returns:
            Dict[str, str]: HTTP headers.
        """
        raise NotImplementedError

    @abstractmethod
    def _build_payload(self, message: str) -> Dict[str, Any]:
        """
        Construct the request body payload for the provider.

        Args:
            message (str): User message to send to the LLM.

        Returns:
            Dict[str, Any]: JSON-serializable payload as required by the provider API.
        """
        raise NotImplementedError

    @abstractmethod
    def parse_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse the raw provider response into a normalized format.

        The normalized format should include:
            - "output": str or structured response content
            - "metadata": Dict containing tokens, cost, or other provider stats

        Args:
            response (Dict[str, Any]): Raw JSON response returned by the provider.

        Returns:
            Dict[str, Any]: Normalized output structure.
        """
        raise NotImplementedError

    def call(self, message: str) -> Dict[str, Any]:
        """
        Make a synchronous call to the provider API.

        Args:
            message (str): User input message to send.

        Returns:
            Dict[str, Any]: Provider's raw JSON response.

        Raises:
            requests.exceptions.RequestException: On any network or HTTP error.
        """
        url = self._build_endpoint()
        headers = self._build_headers()
        payload = self._build_payload(message)

        try:
            response = requests.post(
                url,
                headers=headers,
                data=json.dumps(payload),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()

        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            raise
        # Timeout is checked before ConnectionError because ConnectTimeout
        # derives from both and should be reported as a timeout.
        except requests.exceptions.Timeout as timeout_err:
            print(f"Timeout error occurred: {timeout_err}")
            raise
        except requests.exceptions.ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err}")
            raise
        except requests.exceptions.RequestException as req_err:
            print(f"An unexpected error occurred: {req_err}")
            raise

    async def acall(self, message: str) -> Dict[str, Any]:
        """
        Make an asynchronous call to the provider API.

        Args:
            message (str): User input message to send.

        Returns:
            Dict[str, Any]: Provider's raw JSON response.

        Raises:
            httpx.RequestError, httpx.TimeoutException, httpx.HTTPStatusError:
                On any network or HTTP error.
        """
        url = self._build_endpoint()
        headers = self._build_headers()
        payload = self._build_payload(message)
        # Label messages with the concrete client class instead of a fixed provider name.
        label = f"{type(self).__name__}.acall"

        try:
            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT) as client:
                response = await client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                return response.json()

        except httpx.HTTPStatusError as http_err:
            print(f"[{label}] HTTP error: {http_err}")
            raise
        # TimeoutException is a subclass of RequestError, so it has to be
        # handled first or this branch can never run.
        except httpx.TimeoutException as timeout_err:
            print(f"[{label}] Timeout: {timeout_err}")
            raise
        except httpx.RequestError as req_err:
            print(f"[{label}] Request error: {req_err}")
            raise
        except Exception as e:
            print(f"[{label}] Unexpected error: {e}")
            raise
217
+
218
+
219
+ class BaseMetric(ABC):
220
+ """Abstract base class for metrics collection."""
221
+
222
+ def __init__(self, processor: Callable | None = None, score_cutoff: float | None = None):
223
+ """
224
+ Initialize the metric.
225
+
226
+ Args:
227
+ processor (Optional[Callable]): Optional function to preprocess strings before comparison.
228
+ score_cutoff (Optional[float]): Minimum similarity score for an early match cutoff.
229
+ """
230
+ self.processor = processor
231
+ self.score_cutoff = score_cutoff
232
+
233
+ @abstractmethod
234
+ def compute(self, generated: str, reference: str) -> Dict[str, Any]:
235
+ """
236
+ Evaluate the generated text against the reference text.
237
+
238
+ Args:
239
+ generated (str): The generated text to evaluate.
240
+ reference (str): The reference text to compare against.
241
+
242
+ Returns:
243
+ Dict[str, Any]: Evaluation results including match level and justification.
244
+ """
245
+ raise NotImplementedError
246
+
247
+ @property
248
+ def name(self) -> str:
249
+ """
250
+ Get the name of the metric.
251
+
252
+ Returns:
253
+ str: Name of the metric.
254
+ """
255
+ return self.__class__.__name__.lower()
256
+
257
+ # TODO-0: You know what..We can remove this at some point.
258
+ @staticmethod
259
+ def _validate_inputs(generated: str, reference: str) -> None:
260
+ """Validate that both inputs are strings."""
261
+ if not (isinstance(generated, str) and isinstance(reference, str)):
262
+ raise TypeError("Both 'generated' and 'reference' must be strings.")
263
+
264
+ def _get_params(self) -> Dict[str, Any]:
265
+ """Return a serializable dictionary of metric parameters."""
266
+ return {
267
+ "processor": repr(self.processor) if self.processor else None,
268
+ "score_cutoff": self.score_cutoff
269
+ }
270
+
271
+ def _build_metadata(self, **extra_inputs) -> Dict[str, Any]:
272
+ """Construct a consistent metadata dictionary."""
273
+ return {
274
+ "type": self.__class__.__name__,
275
+ "params": self._get_params(),
276
+ "inputs": extra_inputs,
277
+ "timestamp": datetime.datetime.now()
278
+ }
279
+
280
+
281
class BaseRepository(ABC):
    """
    Interface for pluggable NoSQL document stores.

    Documents are addressed by collection / section / sub-collection / document
    identifiers and are exchanged as Pydantic model instances.
    """

    @abstractmethod
    def connect(self) -> None:
        """Set up the connection or client used by the repository."""
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Tear down the connection or client."""
        raise NotImplementedError

    @abstractmethod
    def retrieve_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str,
            model_type: Type[Model]
    ) -> Model | None:
        """
        Fetch one document and parse it into `model_type`.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to retrieve.
            model_type (Type[BaseModel]): Pydantic class to instantiate.

        Returns:
            The parsed model instance, or None when the document was not found.
        """
        raise NotImplementedError

    @abstractmethod
    def store_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str,
            data: Model
    ) -> None:
        """
        Persist a Pydantic model instance as a document.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to store.
            data (Model): Pydantic model instance.
        """
        raise NotImplementedError

    @abstractmethod
    def query_collection(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            filters: Dict[str, Any],
            model_type: Type[Model]
    ) -> List[Model]:
        """
        Query the documents of a collection, optionally narrowed by filters.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            filters (Dict[str, Any]): Filters to apply to the query (implementation dependent).
            model_type (Type[BaseModel]): Pydantic class to instantiate.

        Returns:
            List[Model]: Query results.
        """
        raise NotImplementedError

    @abstractmethod
    def delete_document(
            self,
            collection_id: str,
            section_id: str,
            sub_collection_id: str,
            document_id: str
    ) -> bool:
        """
        Remove a document.

        Args:
            collection_id (str): Collection reference.
            section_id (str): Section reference.
            sub_collection_id (str): Sub-collection reference.
            document_id (str): Reference of the document to delete.

        Returns:
            True if deleted, False if not.
        """
        raise NotImplementedError
levelapp/core/session.py ADDED
@@ -0,0 +1,214 @@
1
+ """levelapp/core/session.py"""
2
+ import threading
3
+
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, List, Any
6
+
7
+ from datetime import datetime
8
+ from humanize import precisedelta
9
+
10
+ from levelapp.workflow import MainFactory
11
+ from levelapp.workflow.base import BaseWorkflow
12
+ from levelapp.workflow.schemas import WorkflowConfig, WorkflowContext
13
+ from levelapp.aspects import FunctionMonitor, MetricType, ExecutionMetrics, MonitoringAspect, logger
14
+
15
+
16
+ @dataclass
17
+ class SessionMetadata:
18
+ """Metadata for an evaluation session."""
19
+ session_name: str
20
+ started_at: datetime | None = None
21
+ ended_at: datetime | None = None
22
+ total_executions: int = 0
23
+ total_duration: float = 0.0
24
+ steps: Dict[str, 'StepMetadata'] = field(default_factory=dict)
25
+
26
+ @property
27
+ def is_active(self) -> bool:
28
+ """Check if the session is currently active."""
29
+ return self.ended_at is None
30
+
31
+ @property
32
+ def duration(self) -> float | None:
33
+ """Calculate the duration of the session in seconds."""
34
+ if not self.is_active:
35
+ return (self.ended_at - self.started_at).total_seconds()
36
+ return None
37
+
38
+
39
@dataclass
class StepMetadata:
    """Metadata for a specific step within an evaluation session."""
    step_name: str
    session_name: str
    # Wall-clock bounds of the step; None until the respective event is recorded.
    started_at: datetime | None = None
    ended_at: datetime | None = None
    memory_peak_mb: float | None = None
    error_count: int = 0
    procedures_stats: List[ExecutionMetrics] | None = None

    @property
    def is_active(self) -> bool:
        """Check if the step is currently active (no end time recorded)."""
        return self.ended_at is None

    @property
    def duration(self) -> float | None:
        """
        Calculate the duration of the step in seconds.

        Returns:
            The elapsed seconds between `started_at` and `ended_at`, or None
            while either timestamp is still missing.
        """
        # Both timestamps are required; without this guard a record that has an
        # end but no start would fail on the subtraction instead of returning None.
        if self.started_at is None or self.ended_at is None:
            return None
        return (self.ended_at - self.started_at).total_seconds()
61
+
62
+
63
class StepContext:
    """Context manager for a single named step of an EvaluationSession."""

    def __init__(self, session: "EvaluationSession", step_name: str, category: MetricType):
        """
        Bind the step to its owning session.

        Args:
            session: Session that owns the lock, the monitor and the metadata.
            step_name: Key under which the step is recorded in the session metadata.
            category: Metric category forwarded to the session monitor.
        """
        self.session = session
        self.step_name = step_name
        self.category = category
        self.step_meta: StepMetadata | None = None
        # Name handed to the monitor: "<session name>.<step name>".
        self.full_step_name = f"{session.session_name}.{step_name}"
        self._monitored_func = None
        self._func_gen = None

    def __enter__(self):
        """Record the step start and open the monitored wrapper."""
        session = self.session
        with session.lock:
            record = StepMetadata(
                step_name=self.step_name,
                session_name=session.session_name,
                started_at=datetime.now()
            )
            self.step_meta = record
            session.session_metadata.steps[self.step_name] = record

        # Decorate the placeholder generator with the session monitor.
        decorate = session.monitor.monitor(
            name=self.full_step_name,
            category=self.category,
            enable_timing=True,
            track_memory=True,
        )
        self._monitored_func = decorate(self._step_wrapper)

        # Call the monitored function and advance its result once.
        # NOTE(review): presumably the result is the generator paused at the
        # `yield` in `_step_wrapper` — confirm against the monitor implementation.
        self._func_gen = self._monitored_func()
        next(self._func_gen)
        return self  # returning self allows nested instrumentation

    def _step_wrapper(self):
        """Placeholder generator; the user's step body runs while it is suspended."""
        yield

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the monitored wrapper and fold the step into the session totals."""
        # Resume the wrapper once more; exhaustion is expected and ignored.
        next(self._func_gen, None)

        with self.session.lock:
            record = self.step_meta
            totals = self.session.session_metadata
            record.ended_at = datetime.now()
            if exc_type:
                record.error_count += 1
            totals.total_executions += 1

            elapsed = record.duration
            if elapsed:
                self.session.monitor.update_procedure_duration(name=self.full_step_name, value=elapsed)
                totals.total_duration += elapsed

        # Never suppress an exception raised inside the step.
        return False
115
+
116
+
117
class EvaluationSession:
    """Context manager for LLM evaluation sessions with integrated monitoring."""

    def __init__(
            self,
            session_name: str = "test-session",
            monitor: FunctionMonitor | None = None,
            workflow_config: WorkflowConfig | None = None
    ):
        """
        Initialize Evaluation Session.

        Args:
            session_name (str): Name of the session
            monitor (FunctionMonitor): Function monitoring aspect
            workflow_config (WorkflowConfig): Workflow configuration.
        """
        self._NAME = self.__class__.__name__

        self.session_name = session_name
        self.monitor = monitor or MonitoringAspect
        self.workflow_config = workflow_config
        # `workflow_config` defaults to None. Tolerating that here lets the
        # explicit ValueError in `__enter__` report the missing configuration
        # instead of an AttributeError at construction time.
        self.workflow_type = workflow_config.workflow if workflow_config is not None else None

        self.workflow: BaseWorkflow | None = None

        self.session_metadata = SessionMetadata(session_name=session_name)
        self._lock = threading.RLock()

    @property
    def lock(self):
        """Re-entrant lock guarding updates to the session metadata."""
        return self._lock

    def __enter__(self):
        """
        Start the session and build the workflow if none is attached yet.

        Raises:
            ValueError: If no workflow configuration was provided.
        """
        self.session_metadata.started_at = datetime.now()

        # Instantiate workflow if not already
        if not self.workflow:
            if not self.workflow_config:
                raise ValueError(f"{self._NAME}: Workflow configuration must be provided")

            context = WorkflowContext(
                config=self.workflow_config,
                repository=MainFactory.create_repository(self.workflow_config),
                evaluators=MainFactory.create_evaluator(self.workflow_config),
                endpoint_config=self.workflow_config.endpoint_config,
                inputs=self.workflow_config.inputs
            )
            self.workflow = MainFactory.create_workflow(self.workflow_type, context)

        logger.info(
            f"[{self._NAME}] Starting evaluation session: {self.session_name}, "
            f"Workflow: '{self.workflow.name}'"
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Record the end time, log the outcome and let any exception propagate."""
        self.session_metadata.ended_at = datetime.now()
        logger.info(
            f"[{self._NAME}] Completed session '{self.session_name}' "
            f"in {self.session_metadata.duration:.2f}s"
        )

        if exc_type:
            logger.error(f"[{self._NAME}] Session ended with error: {exc_val}", exc_info=True)
        return False

    def step(self, step_name: str, category: MetricType = MetricType.CUSTOM) -> StepContext:
        """Create a monitored evaluation step."""
        return StepContext(self, step_name, category)

    def run(self):
        """
        Run the workflow phases in order, each inside its own monitored step.

        Raises:
            RuntimeError: If the workflow has not been initialized.
        """
        if not self.workflow:
            raise RuntimeError(f"{self._NAME} Workflow not initialized")

        with self.step(step_name="setup", category=MetricType.SETUP):
            self.workflow.setup()

        with self.step(step_name="load_data", category=MetricType.DATA_LOADING):
            self.workflow.load_data()

        with self.step(step_name="execute", category=MetricType.EXECUTION):
            self.workflow.execute()

        # Plain step name, like the phases above: StepContext already prefixes
        # the session name when it builds the monitored name.
        with self.step(step_name="collect_results", category=MetricType.RESULTS_COLLECTION):
            self.workflow.collect_results()

    def get_stats(self) -> Dict[str, Any]:
        """
        Summarize the session together with the monitor statistics.

        Returns:
            Dict[str, Any]: A "session" summary and the monitor's "stats".
            The duration and timestamps are None while the corresponding
            value has not been recorded yet.
        """
        meta = self.session_metadata
        started, ended = meta.started_at, meta.ended_at

        # Only format values that exist, so the stats can be read before the
        # session has started or finished.
        duration_text = None
        if started is not None and ended is not None:
            duration_text = precisedelta(meta.duration, suppress=['minutes'])

        return {
            "session": {
                "name": self.session_name,
                "duration": duration_text,
                "start_time": started.isoformat() if started is not None else None,
                "end_time": ended.isoformat() if ended is not None else None,
                "steps": len(meta.steps),
                "errors": sum(s.error_count for s in meta.steps.values())
            },
            "stats": self.monitor.get_all_stats()
        }
levelapp/evaluator/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .evaluator import JudgeEvaluator, MetadataEvaluator
2
+
3
+ __all__ = ['JudgeEvaluator', 'MetadataEvaluator']