levelapp 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- levelapp/aspects/monitor.py +3 -1
- levelapp/clients/__init__.py +0 -1
- levelapp/comparator/scorer.py +0 -2
- levelapp/config/endpoint.py +22 -13
- levelapp/config/endpoint_.py +62 -0
- levelapp/config/prompts.py +22 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +97 -59
- levelapp/evaluator/evaluator.py +42 -14
- levelapp/metrics/__init__.py +1 -5
- levelapp/repository/firestore.py +15 -6
- levelapp/simulator/schemas.py +15 -21
- levelapp/simulator/simulator.py +124 -55
- levelapp/simulator/utils.py +40 -78
- levelapp/workflow/__init__.py +3 -2
- levelapp/workflow/base.py +64 -17
- levelapp/workflow/config.py +92 -0
- levelapp/workflow/context.py +62 -0
- levelapp/workflow/factory.py +32 -41
- levelapp/workflow/registration.py +1 -1
- levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/METADATA +102 -39
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/RECORD +25 -21
- levelapp/workflow/schemas.py +0 -121
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/WHEEL +0 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/licenses/LICENSE +0 -0
levelapp/workflow/base.py
CHANGED
|
@@ -4,13 +4,13 @@ from abc import ABC, abstractmethod
|
|
|
4
4
|
from pydantic import ValidationError
|
|
5
5
|
from functools import partial
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Dict
|
|
8
8
|
|
|
9
9
|
from levelapp.core.base import BaseProcess
|
|
10
10
|
from levelapp.simulator.schemas import ScriptsBatch
|
|
11
11
|
from levelapp.simulator.simulator import ConversationSimulator
|
|
12
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
12
13
|
from levelapp.aspects.loader import DataLoader
|
|
13
|
-
from levelapp.workflow.schemas import WorkflowContext
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class BaseWorkflow(ABC):
|
|
@@ -28,6 +28,7 @@ class BaseWorkflow(ABC):
|
|
|
28
28
|
"""Validate and initialize workflow-specific settings."""
|
|
29
29
|
if self._initialized:
|
|
30
30
|
return
|
|
31
|
+
|
|
31
32
|
self.process = self._setup_process(context=self.context)
|
|
32
33
|
self._initialized = True
|
|
33
34
|
|
|
@@ -56,18 +57,35 @@ class BaseWorkflow(ABC):
|
|
|
56
57
|
else:
|
|
57
58
|
loop = asyncio.get_running_loop()
|
|
58
59
|
func = partial(self.process.run, **self._input_data)
|
|
59
|
-
self._results = await loop.run_in_executor(None, func)
|
|
60
|
+
self._results = await loop.run_in_executor(None, func, None)
|
|
60
61
|
|
|
61
62
|
def collect_results(self) -> Any:
|
|
62
|
-
"""
|
|
63
|
+
"""
|
|
64
|
+
Return unified results structure.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
The simulation results.
|
|
68
|
+
"""
|
|
63
69
|
return self._results
|
|
64
70
|
|
|
65
71
|
@abstractmethod
|
|
66
72
|
def _setup_process(self, context: WorkflowContext) -> BaseProcess:
|
|
73
|
+
"""
|
|
74
|
+
Abstract method for setting up the configured process.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
context (WorkflowContext): The workflow context.
|
|
78
|
+
"""
|
|
67
79
|
raise NotImplementedError
|
|
68
80
|
|
|
69
81
|
@abstractmethod
|
|
70
82
|
def _load_input_data(self, context: WorkflowContext) -> Any:
|
|
83
|
+
"""
|
|
84
|
+
Abstract method for loading reference data.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
context (WorkflowContext): The workflow context.
|
|
88
|
+
"""
|
|
71
89
|
raise NotImplementedError
|
|
72
90
|
|
|
73
91
|
|
|
@@ -76,38 +94,67 @@ class SimulatorWorkflow(BaseWorkflow):
|
|
|
76
94
|
super().__init__(name="ConversationSimulator", context=context)
|
|
77
95
|
|
|
78
96
|
def _setup_process(self, context: WorkflowContext) -> BaseProcess:
|
|
97
|
+
"""
|
|
98
|
+
Concrete implementation for setting up the simulation workflow.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
context (WorkflowContext): The workflow context for the simulation workflow.
|
|
102
|
+
|
|
103
|
+
Returns:
|
|
104
|
+
ConversationSimulator instance.
|
|
105
|
+
"""
|
|
79
106
|
simulator = ConversationSimulator()
|
|
80
107
|
simulator.setup(
|
|
81
108
|
repository=context.repository,
|
|
82
109
|
evaluators=context.evaluators,
|
|
110
|
+
providers=context.providers,
|
|
83
111
|
endpoint_config=context.endpoint_config,
|
|
84
112
|
)
|
|
85
113
|
return simulator
|
|
86
114
|
|
|
87
|
-
def _load_input_data(self, context: WorkflowContext) -> Any:
|
|
115
|
+
def _load_input_data(self, context: WorkflowContext) -> Dict[str, Any]:
|
|
116
|
+
"""
|
|
117
|
+
Concrete implementation for loading the reference data.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
context (WorkflowContext): The workflow context for the simulation workflow.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
Dict[str, Any]: The reference data.
|
|
124
|
+
"""
|
|
88
125
|
loader = DataLoader()
|
|
89
|
-
|
|
90
|
-
|
|
126
|
+
if "reference_data" in context.inputs:
|
|
127
|
+
data_config = context.inputs["reference_data"]
|
|
128
|
+
else:
|
|
129
|
+
reference_data_path = context.inputs.get("reference_data_path", "no-path-provided")
|
|
130
|
+
|
|
131
|
+
if not reference_data_path:
|
|
132
|
+
raise RuntimeError(f"[{self.name}] No reference data available.")
|
|
133
|
+
|
|
134
|
+
file_path = Path(reference_data_path)
|
|
91
135
|
|
|
92
|
-
|
|
93
|
-
|
|
136
|
+
if not file_path.exists():
|
|
137
|
+
raise FileNotFoundError(f"[{self.name}] Reference data file not found.")
|
|
138
|
+
|
|
139
|
+
data_config = loader.load_raw_data(path=reference_data_path)
|
|
94
140
|
|
|
95
|
-
evaluation_params = context.inputs.get("evaluation_params", {})
|
|
96
|
-
data_config = loader.load_raw_data(path=reference_data_path)
|
|
97
141
|
try:
|
|
98
142
|
scripts_batch = ScriptsBatch.model_validate(data_config)
|
|
143
|
+
|
|
99
144
|
except ValidationError as e:
|
|
100
145
|
raise RuntimeError(f"[{self.name}] Validation error: {e}")
|
|
101
146
|
|
|
102
|
-
|
|
147
|
+
attempts = context.config.process.evaluation_params.get("attempts", 1)
|
|
148
|
+
|
|
149
|
+
return {"test_batch": scripts_batch, "attempts": attempts}
|
|
103
150
|
|
|
104
151
|
|
|
105
152
|
class ComparatorWorkflow(BaseWorkflow):
|
|
153
|
+
def __init__(self, context: WorkflowContext) -> None:
|
|
154
|
+
super().__init__(name="MetadataComparator", context=context)
|
|
155
|
+
|
|
106
156
|
def _setup_process(self, context: WorkflowContext) -> BaseProcess:
|
|
107
|
-
|
|
157
|
+
raise NotImplementedError
|
|
108
158
|
|
|
109
159
|
def _load_input_data(self, context: WorkflowContext) -> Any:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def __init__(self, context: WorkflowContext) -> None:
|
|
113
|
-
super().__init__(name="MetadataComparator", context=context)
|
|
160
|
+
raise NotImplementedError
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""levelapp/workflow/config.py: Contains modular workflow configuration components."""
|
|
2
|
+
from typing import List, Dict, Any, Optional
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from levelapp.aspects import logger
|
|
6
|
+
from levelapp.config.endpoint import EndpointConfig
|
|
7
|
+
from levelapp.core.schemas import WorkflowType, RepositoryType, EvaluatorType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ProcessConfig(BaseModel):
|
|
11
|
+
project_name: str
|
|
12
|
+
workflow_type: WorkflowType
|
|
13
|
+
evaluation_params: Dict[str, Any] = Field(default_factory=dict)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EvaluationConfig(BaseModel):
|
|
17
|
+
evaluators: List[EvaluatorType]
|
|
18
|
+
providers: List[str] = Field(default_factory=list)
|
|
19
|
+
metrics_map: Dict[str, str] | None = Field(default_factory=dict)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ReferenceDataConfig(BaseModel):
|
|
23
|
+
path: str | None
|
|
24
|
+
data: Dict[str, Any] | None = Field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RepositoryConfig(BaseModel):
|
|
28
|
+
type: RepositoryType | None = None
|
|
29
|
+
project_id: str | None = None
|
|
30
|
+
database_name: str = Field(default="(default)")
|
|
31
|
+
|
|
32
|
+
class Config:
|
|
33
|
+
extra = "allow"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class WorkflowConfig(BaseModel):
|
|
37
|
+
"""
|
|
38
|
+
Static workflow configuration. Maps directly to YAML sections.
|
|
39
|
+
Supports both file-based loading and in-memory dictionary creation.
|
|
40
|
+
"""
|
|
41
|
+
process: ProcessConfig
|
|
42
|
+
evaluation: EvaluationConfig
|
|
43
|
+
reference_data: ReferenceDataConfig
|
|
44
|
+
endpoint: EndpointConfig
|
|
45
|
+
repository: RepositoryConfig
|
|
46
|
+
|
|
47
|
+
class Config:
|
|
48
|
+
extra = "allow"
|
|
49
|
+
|
|
50
|
+
@classmethod
|
|
51
|
+
def load(cls, path: str | None = None) -> "WorkflowConfig":
|
|
52
|
+
"""
|
|
53
|
+
Load workflow configuration from a YAML/JSON file.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
path (str): YAML/JSON configuration file path.
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
WorkflowConfig: An instance of WorkflowConfig.
|
|
60
|
+
"""
|
|
61
|
+
from levelapp.aspects.loader import DataLoader
|
|
62
|
+
|
|
63
|
+
loader = DataLoader()
|
|
64
|
+
config_dict = loader.load_raw_data(path=path)
|
|
65
|
+
logger.info(f"[{cls.__name__}] Workflow configuration loaded from '{path}' file content")
|
|
66
|
+
return cls.model_validate(config_dict)
|
|
67
|
+
|
|
68
|
+
@classmethod
|
|
69
|
+
def from_dict(cls, content: Dict[str, Any]) -> "WorkflowConfig":
|
|
70
|
+
"""
|
|
71
|
+
Load workflow configuration from an in-memory dict.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
content (dict): Workflow configuration content.
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
WorkflowConfig: An instance of WorkflowConfig.
|
|
78
|
+
"""
|
|
79
|
+
logger.info(f"[{cls.__name__}] Workflow configuration loaded from provided content")
|
|
80
|
+
return cls.model_validate(content)
|
|
81
|
+
|
|
82
|
+
def set_reference_data(self, content: Dict[str, Any]) -> None:
|
|
83
|
+
"""
|
|
84
|
+
Load reference data from an in-memory dict.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
content (dict): Reference data content.
|
|
88
|
+
|
|
89
|
+
"""
|
|
90
|
+
self.reference_data.data = content
|
|
91
|
+
logger.info(f"[{self.__class__.__name__}] Reference data loaded from provided content")
|
|
92
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""levelapp/workflow/context.py: Builds runtime WorkflowContext from WorkflowConfig."""
|
|
2
|
+
from typing import Dict, Callable
|
|
3
|
+
|
|
4
|
+
from levelapp.workflow.config import WorkflowConfig
|
|
5
|
+
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
6
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
7
|
+
from levelapp.core.schemas import EvaluatorType, RepositoryType
|
|
8
|
+
|
|
9
|
+
from levelapp.repository.firestore import FirestoreRepository
|
|
10
|
+
from levelapp.evaluator.evaluator import JudgeEvaluator, MetadataEvaluator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class WorkflowContextBuilder:
|
|
14
|
+
"""Builds a runtime WorkflowContext from a WorkflowConfig."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, config: WorkflowConfig) -> None:
|
|
17
|
+
self.config = config
|
|
18
|
+
|
|
19
|
+
# Map repository type to constructor that accepts the WorkflowConfig
|
|
20
|
+
self.repository_map: Dict[RepositoryType, Callable[[WorkflowConfig], BaseRepository]] = {
|
|
21
|
+
RepositoryType.FIRESTORE: lambda cfg: FirestoreRepository(cfg),
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
# Map evaluator type to constructor that accepts the WorkflowConfig
|
|
25
|
+
self.evaluator_map: Dict[EvaluatorType, Callable[[WorkflowConfig], BaseEvaluator]] = {
|
|
26
|
+
EvaluatorType.JUDGE: lambda cfg: JudgeEvaluator(config=cfg),
|
|
27
|
+
EvaluatorType.REFERENCE: lambda cfg: MetadataEvaluator(config=cfg),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def build(self) -> WorkflowContext:
|
|
31
|
+
"""
|
|
32
|
+
Build a runtime WorkflowContext from the static WorkflowConfig.
|
|
33
|
+
Supports in-memory reference data if provided.
|
|
34
|
+
"""
|
|
35
|
+
# Repository instance
|
|
36
|
+
repository_type = self.config.repository.type
|
|
37
|
+
repository = self.repository_map.get(repository_type)(self.config)
|
|
38
|
+
|
|
39
|
+
# Evaluator instances
|
|
40
|
+
evaluators: Dict[EvaluatorType, BaseEvaluator] = {
|
|
41
|
+
ev: self.evaluator_map.get(ev)(self.config) for ev in self.config.evaluation.evaluators
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
# Providers and endpoint
|
|
45
|
+
providers = self.config.evaluation.providers
|
|
46
|
+
endpoint_config = self.config.endpoint
|
|
47
|
+
|
|
48
|
+
# Inputs include reference data path or in-memory dict
|
|
49
|
+
inputs = {}
|
|
50
|
+
if self.config.reference_data.data:
|
|
51
|
+
inputs["reference_data"] = self.config.reference_data.data
|
|
52
|
+
else:
|
|
53
|
+
inputs["reference_data_path"] = self.config.reference_data.path
|
|
54
|
+
|
|
55
|
+
return WorkflowContext(
|
|
56
|
+
config=self.config,
|
|
57
|
+
repository=repository,
|
|
58
|
+
evaluators=evaluators,
|
|
59
|
+
providers=providers,
|
|
60
|
+
endpoint_config=endpoint_config,
|
|
61
|
+
inputs=inputs,
|
|
62
|
+
)
|
levelapp/workflow/factory.py
CHANGED
|
@@ -1,51 +1,42 @@
|
|
|
1
|
-
|
|
2
|
-
from
|
|
3
|
-
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
4
|
-
from levelapp.workflow.base import BaseWorkflow
|
|
1
|
+
"""levelapp/workflow/factory.py: Creates workflows using WorkflowContext."""
|
|
2
|
+
from typing import Dict, Callable
|
|
5
3
|
|
|
6
|
-
from levelapp.
|
|
7
|
-
from levelapp.
|
|
4
|
+
from levelapp.core.schemas import WorkflowType
|
|
5
|
+
from levelapp.workflow.base import SimulatorWorkflow, ComparatorWorkflow, BaseWorkflow
|
|
6
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
class MainFactory:
|
|
11
|
-
"""Central factory for
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
_evaluator_map: dict[EvaluatorType, Callable[[WorkflowConfig], BaseEvaluator]] = {
|
|
18
|
-
EvaluatorType.JUDGE: lambda cfg: JudgeEvaluator(),
|
|
19
|
-
EvaluatorType.REFERENCE: lambda cfg: MetadataEvaluator(),
|
|
20
|
-
# Next is the RAG evaluator..
|
|
10
|
+
"""Central factory for workflows."""
|
|
11
|
+
_workflow_map: Dict[WorkflowType, Callable[[WorkflowContext], BaseWorkflow]] = {
|
|
12
|
+
WorkflowType.SIMULATOR: lambda ctx: SimulatorWorkflow(ctx),
|
|
13
|
+
WorkflowType.COMPARATOR: lambda ctx: ComparatorWorkflow(ctx),
|
|
21
14
|
}
|
|
22
15
|
|
|
23
|
-
_workflow_map: dict[WorkflowType, Callable[["WorkflowContext"], BaseWorkflow]] = {}
|
|
24
|
-
|
|
25
|
-
@classmethod
|
|
26
|
-
def create_repository(cls, config: WorkflowConfig) -> BaseRepository:
|
|
27
|
-
fn = cls._repository_map.get(config.repository)
|
|
28
|
-
if not fn:
|
|
29
|
-
raise NotImplementedError(f"Repository {config.repository} not implemented")
|
|
30
|
-
return fn(config)
|
|
31
|
-
|
|
32
|
-
@classmethod
|
|
33
|
-
def create_evaluator(cls, config: WorkflowConfig) -> Dict[EvaluatorType, BaseEvaluator]:
|
|
34
|
-
evaluators: dict[EvaluatorType, BaseEvaluator] = {}
|
|
35
|
-
for ev in config.evaluators:
|
|
36
|
-
fn = cls._evaluator_map.get(ev)
|
|
37
|
-
if not fn:
|
|
38
|
-
raise NotImplementedError(f"Evaluator {config.evaluators} not implemented")
|
|
39
|
-
evaluators[ev] = fn(config)
|
|
40
|
-
return evaluators
|
|
41
|
-
|
|
42
16
|
@classmethod
|
|
43
|
-
def create_workflow(cls,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
17
|
+
def create_workflow(cls, context: WorkflowContext) -> BaseWorkflow:
|
|
18
|
+
"""
|
|
19
|
+
Create workflow using the given runtime context.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
context (WorkflowContext): the provided workflow context.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
BaseWorkflow: the built workflow instance from the provided context.
|
|
26
|
+
"""
|
|
27
|
+
wf_type = context.config.process.workflow_type
|
|
28
|
+
builder = cls._workflow_map.get(wf_type)
|
|
29
|
+
if not builder:
|
|
30
|
+
raise NotImplementedError(f"Workflow '{wf_type}' not implemented")
|
|
31
|
+
return builder(context)
|
|
48
32
|
|
|
49
33
|
@classmethod
|
|
50
|
-
def register_workflow(cls, wf_type: WorkflowType, builder: Callable[[
|
|
34
|
+
def register_workflow(cls, wf_type: WorkflowType, builder: Callable[[WorkflowContext], BaseWorkflow]) -> None:
|
|
35
|
+
"""
|
|
36
|
+
Register a new workflow implementation.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
wf_type (WorkflowType): the workflow type.
|
|
40
|
+
builder (Callable[[WorkflowContext], BaseWorkflow]): the workflow builder.
|
|
41
|
+
"""
|
|
51
42
|
cls._workflow_map[wf_type] = builder
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
+
from levelapp.core.schemas import WorkflowType
|
|
1
2
|
from levelapp.workflow.factory import MainFactory
|
|
2
|
-
from levelapp.workflow.schemas import WorkflowType
|
|
3
3
|
from levelapp.workflow.base import SimulatorWorkflow, ComparatorWorkflow
|
|
4
4
|
|
|
5
5
|
MainFactory.register_workflow(WorkflowType.SIMULATOR, lambda ctx: SimulatorWorkflow(ctx))
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""levelapp/workflow/runtime.py: contains the workflow runtime context component."""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict, List, Any
|
|
4
|
+
|
|
5
|
+
from levelapp.config import EndpointConfig
|
|
6
|
+
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
7
|
+
from levelapp.workflow.config import WorkflowConfig
|
|
8
|
+
from levelapp.core.schemas import EvaluatorType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
|
|
12
|
+
class WorkflowContext:
|
|
13
|
+
"""Immutable data holder for workflow execution context."""
|
|
14
|
+
config: WorkflowConfig
|
|
15
|
+
repository: BaseRepository
|
|
16
|
+
evaluators: Dict[EvaluatorType, BaseEvaluator]
|
|
17
|
+
providers: List[str]
|
|
18
|
+
endpoint_config: EndpointConfig
|
|
19
|
+
inputs: Dict[str, Any]
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: levelapp
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
|
|
5
5
|
Project-URL: Homepage, https://github.com/levelapp-org
|
|
6
6
|
Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
|
|
7
7
|
Project-URL: Documentation, https://levelapp.readthedocs.io
|
|
8
8
|
Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
|
|
9
|
-
Author-email:
|
|
9
|
+
Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: ai,evaluation,framework,llm,testing
|
|
12
12
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -17,10 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
18
|
Classifier: Topic :: Software Development :: Testing
|
|
19
19
|
Requires-Python: >=3.12
|
|
20
|
-
Requires-Dist:
|
|
20
|
+
Requires-Dist: google-api-core>=2.25.1
|
|
21
|
+
Requires-Dist: google-auth>=2.40.3
|
|
22
|
+
Requires-Dist: google-cloud-firestore>=2.21.0
|
|
21
23
|
Requires-Dist: httpx>=0.28.1
|
|
24
|
+
Requires-Dist: humanize>=4.13.0
|
|
22
25
|
Requires-Dist: numpy>=2.3.2
|
|
23
|
-
Requires-Dist: openai>=1.99.9
|
|
24
26
|
Requires-Dist: pandas-stubs==2.3.0.250703
|
|
25
27
|
Requires-Dist: pandas>=2.3.1
|
|
26
28
|
Requires-Dist: pydantic>=2.11.7
|
|
@@ -91,38 +93,47 @@ pip install levelapp
|
|
|
91
93
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
92
94
|
|
|
93
95
|
```yaml
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
96
|
+
process:
|
|
97
|
+
project_name: "test-project"
|
|
98
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
99
|
+
evaluation_params:
|
|
100
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
101
|
+
batch_size: 5
|
|
102
|
+
|
|
103
|
+
evaluation:
|
|
104
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
105
|
+
- JUDGE
|
|
106
|
+
- REFERENCE
|
|
107
|
+
providers:
|
|
108
|
+
- openai
|
|
109
|
+
- ionos
|
|
110
|
+
metrics_map:
|
|
111
|
+
field_1: EXACT
|
|
112
|
+
field_2: LEVENSHTEIN
|
|
97
113
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
- JUDGE
|
|
102
|
-
- REFERENCE
|
|
114
|
+
reference_data:
|
|
115
|
+
path:
|
|
116
|
+
data:
|
|
103
117
|
|
|
104
|
-
|
|
118
|
+
endpoint:
|
|
105
119
|
base_url: "http://127.0.0.1:8000"
|
|
106
120
|
url_path: ''
|
|
107
121
|
api_key: "<API-KEY>"
|
|
108
122
|
bearer_token: "<BEARER-TOKEN>"
|
|
109
123
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
110
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
111
124
|
default_request_payload_template:
|
|
125
|
+
# Change the user message field name only according to the request payload schema (example: 'prompt' to 'message').
|
|
112
126
|
prompt: "${user_message}"
|
|
113
127
|
details: "${request_payload}" # Rest of the request payload data.
|
|
114
128
|
default_response_payload_template:
|
|
129
|
+
# Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
|
|
115
130
|
agent_reply: "${agent_reply}"
|
|
116
|
-
guardrail_flag: "${guardrail_flag}"
|
|
117
131
|
generated_metadata: "${generated_metadata}"
|
|
118
132
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
metrics_map:
|
|
124
|
-
field_1: EXACT
|
|
125
|
-
field_2: LEVENSHTEIN
|
|
133
|
+
repository:
|
|
134
|
+
type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
|
|
135
|
+
project_id: "<PROJECT-ID>"
|
|
136
|
+
database_name: "(default)"
|
|
126
137
|
```
|
|
127
138
|
|
|
128
139
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -133,33 +144,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
133
144
|
|
|
134
145
|
```json
|
|
135
146
|
{
|
|
136
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
137
147
|
"scripts": [
|
|
138
148
|
{
|
|
139
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
140
149
|
"interactions": [
|
|
141
150
|
{
|
|
142
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
143
151
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
144
152
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
145
153
|
"interaction_type": "initial",
|
|
146
154
|
"reference_metadata": {},
|
|
147
|
-
"generated_metadata": {},
|
|
148
155
|
"guardrail_flag": false,
|
|
149
156
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
150
157
|
},
|
|
151
158
|
{
|
|
152
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
153
159
|
"user_message": "I need to see a cardiologist.",
|
|
154
160
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
155
161
|
"interaction_type": "intermediate",
|
|
156
162
|
"reference_metadata": {},
|
|
157
|
-
"generated_metadata": {},
|
|
158
163
|
"guardrail_flag": false,
|
|
159
164
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
160
165
|
},
|
|
161
166
|
{
|
|
162
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
163
167
|
"user_message": "I would like to book it for next Monday morning.",
|
|
164
168
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
165
169
|
"interaction_type": "intermediate",
|
|
@@ -168,11 +172,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
168
172
|
"date": "next Monday",
|
|
169
173
|
"time": "10 AM"
|
|
170
174
|
},
|
|
171
|
-
"generated_metadata": {
|
|
172
|
-
"appointment_type": "Cardiology",
|
|
173
|
-
"date": "next Monday",
|
|
174
|
-
"time": "morning"
|
|
175
|
-
},
|
|
176
175
|
"guardrail_flag": false,
|
|
177
176
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
178
177
|
},
|
|
@@ -182,7 +181,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
182
181
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
183
182
|
"interaction_type": "final",
|
|
184
183
|
"reference_metadata": {},
|
|
185
|
-
"generated_metadata": {},
|
|
186
184
|
"guardrail_flag": false,
|
|
187
185
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
188
186
|
}
|
|
@@ -195,9 +193,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
195
193
|
]
|
|
196
194
|
}
|
|
197
195
|
```
|
|
198
|
-
|
|
199
196
|
- **Fields**: Include user messages, reference replies, reference metadata for comparison, guardrail flags, and request payloads.
|
|
200
197
|
|
|
198
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
199
|
+
```
|
|
200
|
+
OPENAI_API_KEY=
|
|
201
|
+
IONOS_API_KEY=
|
|
202
|
+
ANTHROPIC_API_KEY=
|
|
203
|
+
MISTRAL_API_KEY=
|
|
204
|
+
|
|
205
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
206
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
207
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
208
|
+
|
|
209
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
210
|
+
```
|
|
211
|
+
|
|
201
212
|
## Usage Example
|
|
202
213
|
|
|
203
214
|
To run an evaluation:
|
|
@@ -207,14 +218,14 @@ To run an evaluation:
|
|
|
207
218
|
|
|
208
219
|
```python
|
|
209
220
|
if __name__ == "__main__":
|
|
210
|
-
from levelapp.workflow
|
|
221
|
+
from levelapp.workflow import WorkflowConfig
|
|
211
222
|
from levelapp.core.session import EvaluationSession
|
|
212
223
|
|
|
213
224
|
# Load configuration from YAML
|
|
214
225
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
215
226
|
|
|
216
|
-
# Run evaluation session
|
|
217
|
-
with EvaluationSession(session_name="
|
|
227
|
+
# Run evaluation session (You can enable/disable the monitoring aspect)
|
|
228
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
|
|
218
229
|
session.run()
|
|
219
230
|
results = session.workflow.collect_results()
|
|
220
231
|
print("Results:", results)
|
|
@@ -223,6 +234,58 @@ if __name__ == "__main__":
|
|
|
223
234
|
print(f"session stats:\n{stats}")
|
|
224
235
|
```
|
|
225
236
|
|
|
237
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
238
|
+
you can load the data manually as follows:
|
|
239
|
+
```python
|
|
240
|
+
if __name__ == "__main__":
|
|
241
|
+
from levelapp.workflow import WorkflowConfig
|
|
242
|
+
from levelapp.core.session import EvaluationSession
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
config_dict = {
|
|
246
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
247
|
+
"evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
|
|
248
|
+
"reference_data": {"path": "", "data": {}},
|
|
249
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
250
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
content = {
|
|
254
|
+
"scripts": [
|
|
255
|
+
{
|
|
256
|
+
"interactions": [
|
|
257
|
+
{
|
|
258
|
+
"user_message": "Hello!",
|
|
259
|
+
"reference_reply": "Hello, how can I help you!"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"user_message": "I need an apartment",
|
|
263
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
264
|
+
},
|
|
265
|
+
]
|
|
266
|
+
},
|
|
267
|
+
]
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
# Load configuration from a dict variable
|
|
271
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
272
|
+
|
|
273
|
+
# Load reference data from dict variable
|
|
274
|
+
config.set_reference_data(content=content)
|
|
275
|
+
|
|
276
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
277
|
+
|
|
278
|
+
with evaluation_session as session:
|
|
279
|
+
session.run()
|
|
280
|
+
results = session.workflow.collect_results()
|
|
281
|
+
print("Results:", results)
|
|
282
|
+
|
|
283
|
+
stats = session.get_stats()
|
|
284
|
+
print(f"session stats:\n{stats}")
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
|
|
226
289
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
227
290
|
|
|
228
291
|
For more examples, see the `examples/` directory.
|