levelapp 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- levelapp/core/schemas.py +26 -0
- levelapp/core/session.py +68 -56
- levelapp/evaluator/evaluator.py +27 -11
- levelapp/repository/firestore.py +15 -6
- levelapp/simulator/schemas.py +9 -9
- levelapp/simulator/simulator.py +103 -37
- levelapp/workflow/__init__.py +3 -2
- levelapp/workflow/base.py +26 -14
- levelapp/workflow/config.py +65 -0
- levelapp/workflow/context.py +63 -0
- levelapp/workflow/factory.py +18 -40
- levelapp/workflow/registration.py +1 -1
- levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/METADATA +101 -35
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/RECORD +17 -14
- levelapp/workflow/schemas.py +0 -121
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/WHEEL +0 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.1.dist-info}/licenses/LICENSE +0 -0
levelapp/workflow/base.py
CHANGED
|
@@ -9,8 +9,8 @@ from typing import Any
|
|
|
9
9
|
from levelapp.core.base import BaseProcess
|
|
10
10
|
from levelapp.simulator.schemas import ScriptsBatch
|
|
11
11
|
from levelapp.simulator.simulator import ConversationSimulator
|
|
12
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
12
13
|
from levelapp.aspects.loader import DataLoader
|
|
13
|
-
from levelapp.workflow.schemas import WorkflowContext
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class BaseWorkflow(ABC):
|
|
@@ -28,6 +28,7 @@ class BaseWorkflow(ABC):
|
|
|
28
28
|
"""Validate and initialize workflow-specific settings."""
|
|
29
29
|
if self._initialized:
|
|
30
30
|
return
|
|
31
|
+
|
|
31
32
|
self.process = self._setup_process(context=self.context)
|
|
32
33
|
self._initialized = True
|
|
33
34
|
|
|
@@ -56,7 +57,7 @@ class BaseWorkflow(ABC):
|
|
|
56
57
|
else:
|
|
57
58
|
loop = asyncio.get_running_loop()
|
|
58
59
|
func = partial(self.process.run, **self._input_data)
|
|
59
|
-
self._results = await loop.run_in_executor(None, func)
|
|
60
|
+
self._results = await loop.run_in_executor(None, func, None)
|
|
60
61
|
|
|
61
62
|
def collect_results(self) -> Any:
|
|
62
63
|
"""Return unified results structure."""
|
|
@@ -80,34 +81,45 @@ class SimulatorWorkflow(BaseWorkflow):
|
|
|
80
81
|
simulator.setup(
|
|
81
82
|
repository=context.repository,
|
|
82
83
|
evaluators=context.evaluators,
|
|
84
|
+
providers=context.providers,
|
|
83
85
|
endpoint_config=context.endpoint_config,
|
|
84
86
|
)
|
|
85
87
|
return simulator
|
|
86
88
|
|
|
87
89
|
def _load_input_data(self, context: WorkflowContext) -> Any:
|
|
88
90
|
loader = DataLoader()
|
|
89
|
-
|
|
90
|
-
|
|
91
|
+
if "reference_data" in context.inputs:
|
|
92
|
+
data_config = context.inputs["reference_data"]
|
|
93
|
+
else:
|
|
94
|
+
reference_data_path = context.inputs.get("reference_data_path", "no-path-provided")
|
|
95
|
+
|
|
96
|
+
if not reference_data_path:
|
|
97
|
+
raise RuntimeError(f"[{self.name}] No reference data available.")
|
|
98
|
+
|
|
99
|
+
file_path = Path(reference_data_path)
|
|
91
100
|
|
|
92
|
-
|
|
93
|
-
|
|
101
|
+
if not file_path.exists():
|
|
102
|
+
raise FileNotFoundError(f"[{self.name}] Reference data file not found.")
|
|
103
|
+
|
|
104
|
+
data_config = loader.load_raw_data(path=reference_data_path)
|
|
94
105
|
|
|
95
|
-
evaluation_params = context.inputs.get("evaluation_params", {})
|
|
96
|
-
data_config = loader.load_raw_data(path=reference_data_path)
|
|
97
106
|
try:
|
|
98
107
|
scripts_batch = ScriptsBatch.model_validate(data_config)
|
|
108
|
+
|
|
99
109
|
except ValidationError as e:
|
|
100
110
|
raise RuntimeError(f"[{self.name}] Validation error: {e}")
|
|
101
111
|
|
|
102
|
-
|
|
112
|
+
attempts = context.config.process.evaluation_params.get("attempts", 1)
|
|
113
|
+
|
|
114
|
+
return {"test_batch": scripts_batch, "attempts": attempts}
|
|
103
115
|
|
|
104
116
|
|
|
105
117
|
class ComparatorWorkflow(BaseWorkflow):
|
|
118
|
+
def __init__(self, context: WorkflowContext) -> None:
|
|
119
|
+
super().__init__(name="MetadataComparator", context=context)
|
|
120
|
+
|
|
106
121
|
def _setup_process(self, context: WorkflowContext) -> BaseProcess:
|
|
107
|
-
|
|
122
|
+
raise NotImplementedError
|
|
108
123
|
|
|
109
124
|
def _load_input_data(self, context: WorkflowContext) -> Any:
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def __init__(self, context: WorkflowContext) -> None:
|
|
113
|
-
super().__init__(name="MetadataComparator", context=context)
|
|
125
|
+
raise NotImplementedError
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""levelapp/workflow/config.py: Contains modular workflow configuration components."""
|
|
2
|
+
from typing import List, Dict, Any, Optional
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from levelapp.config.endpoint import EndpointConfig
|
|
6
|
+
from levelapp.core.schemas import WorkflowType, RepositoryType, EvaluatorType
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ProcessConfig(BaseModel):
|
|
10
|
+
project_name: str
|
|
11
|
+
workflow_type: WorkflowType
|
|
12
|
+
evaluation_params: Dict[str, Any] = Field(default_factory=dict)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EvaluationConfig(BaseModel):
|
|
16
|
+
evaluators: List[EvaluatorType]
|
|
17
|
+
providers: List[str] = Field(default_factory=list)
|
|
18
|
+
metrics_map: Dict[str, str] | None = Field(default_factory=dict)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ReferenceDataConfig(BaseModel):
|
|
22
|
+
path: str | None
|
|
23
|
+
data: Dict[str, Any] | None = Field(default_factory=dict)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RepositoryConfig(BaseModel):
|
|
27
|
+
type: RepositoryType | None = None
|
|
28
|
+
project_id: str | None = None
|
|
29
|
+
database_name: str = Field(default="(default)")
|
|
30
|
+
|
|
31
|
+
class Config:
|
|
32
|
+
extra = "allow"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class WorkflowConfig(BaseModel):
|
|
36
|
+
"""
|
|
37
|
+
Static workflow configuration. Maps directly to YAML sections.
|
|
38
|
+
Supports both file-based loading and in-memory dictionary creation.
|
|
39
|
+
"""
|
|
40
|
+
process: ProcessConfig
|
|
41
|
+
evaluation: EvaluationConfig
|
|
42
|
+
reference_data: ReferenceDataConfig
|
|
43
|
+
endpoint: EndpointConfig
|
|
44
|
+
repository: RepositoryConfig
|
|
45
|
+
|
|
46
|
+
class Config:
|
|
47
|
+
extra = "allow"
|
|
48
|
+
|
|
49
|
+
@classmethod
|
|
50
|
+
def load(cls, path: Optional[str] = None) -> "WorkflowConfig":
|
|
51
|
+
"""Load workflow configuration from a YAML/JSON file."""
|
|
52
|
+
from levelapp.aspects.loader import DataLoader
|
|
53
|
+
|
|
54
|
+
loader = DataLoader()
|
|
55
|
+
config_dict = loader.load_raw_data(path=path)
|
|
56
|
+
return cls.model_validate(config_dict)
|
|
57
|
+
|
|
58
|
+
@classmethod
|
|
59
|
+
def from_dict(cls, content: Dict[str, Any]) -> "WorkflowConfig":
|
|
60
|
+
"""Load workflow configuration from an in-memory dict."""
|
|
61
|
+
return cls.model_validate(content)
|
|
62
|
+
|
|
63
|
+
def set_reference_data(self, content: Dict[str, Any]) -> None:
|
|
64
|
+
"""Load reference data from an in-memory dict."""
|
|
65
|
+
self.reference_data.data = content
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""levelapp/workflow/context.py: Builds runtime WorkflowContext from WorkflowConfig."""
|
|
2
|
+
from typing import Dict, Callable
|
|
3
|
+
|
|
4
|
+
from levelapp.workflow.config import WorkflowConfig
|
|
5
|
+
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
6
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
7
|
+
from levelapp.core.schemas import EvaluatorType, RepositoryType
|
|
8
|
+
|
|
9
|
+
from levelapp.repository.firestore import FirestoreRepository
|
|
10
|
+
from levelapp.evaluator.evaluator import JudgeEvaluator, MetadataEvaluator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class WorkflowContextBuilder:
|
|
14
|
+
"""Builds a runtime WorkflowContext from a WorkflowConfig."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, config: WorkflowConfig) -> None:
|
|
17
|
+
self.config = config
|
|
18
|
+
|
|
19
|
+
# Map repository type to constructor that accepts the WorkflowConfig
|
|
20
|
+
self.repository_map: Dict[RepositoryType, Callable[[WorkflowConfig], BaseRepository]] = {
|
|
21
|
+
RepositoryType.FIRESTORE: lambda cfg: FirestoreRepository(cfg),
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
# Map evaluator type to constructor that accepts the WorkflowConfig
|
|
25
|
+
self.evaluator_map: Dict[EvaluatorType, Callable[[WorkflowConfig], BaseEvaluator]] = {
|
|
26
|
+
EvaluatorType.JUDGE: lambda cfg: JudgeEvaluator(config=cfg),
|
|
27
|
+
EvaluatorType.REFERENCE: lambda cfg: MetadataEvaluator(config=cfg),
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def build(self) -> WorkflowContext:
|
|
31
|
+
"""
|
|
32
|
+
Build a runtime WorkflowContext from the static WorkflowConfig.
|
|
33
|
+
Supports in-memory reference data if provided.
|
|
34
|
+
"""
|
|
35
|
+
# Repository instance
|
|
36
|
+
repository_type = self.config.repository.type
|
|
37
|
+
repository = self.repository_map.get(repository_type)(self.config)
|
|
38
|
+
|
|
39
|
+
# Evaluator instances
|
|
40
|
+
evaluators: Dict[EvaluatorType, BaseEvaluator] = {
|
|
41
|
+
ev: self.evaluator_map.get(ev)(self.config) for ev in self.config.evaluation.evaluators
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
# Providers and endpoint
|
|
45
|
+
providers = self.config.evaluation.providers
|
|
46
|
+
endpoint_config = self.config.endpoint
|
|
47
|
+
|
|
48
|
+
# Inputs include reference data path or in-memory dict
|
|
49
|
+
inputs = {}
|
|
50
|
+
if self.config.reference_data.data:
|
|
51
|
+
inputs["reference_data"] = self.config.reference_data.data
|
|
52
|
+
else:
|
|
53
|
+
inputs["reference_data_path"] = self.config.reference_data.path
|
|
54
|
+
print(f"[WorkflowContextBuilder] reference data path: {inputs['reference_data_path']}")
|
|
55
|
+
|
|
56
|
+
return WorkflowContext(
|
|
57
|
+
config=self.config,
|
|
58
|
+
repository=repository,
|
|
59
|
+
evaluators=evaluators,
|
|
60
|
+
providers=providers,
|
|
61
|
+
endpoint_config=endpoint_config,
|
|
62
|
+
inputs=inputs,
|
|
63
|
+
)
|
levelapp/workflow/factory.py
CHANGED
|
@@ -1,51 +1,29 @@
|
|
|
1
|
-
|
|
2
|
-
from
|
|
3
|
-
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
4
|
-
from levelapp.workflow.base import BaseWorkflow
|
|
1
|
+
"""levelapp/workflow/factory.py: Creates workflows using WorkflowContext."""
|
|
2
|
+
from typing import Dict, Callable
|
|
5
3
|
|
|
6
|
-
from levelapp.
|
|
7
|
-
from levelapp.
|
|
4
|
+
from levelapp.core.schemas import WorkflowType
|
|
5
|
+
from levelapp.workflow.base import SimulatorWorkflow, ComparatorWorkflow, BaseWorkflow
|
|
6
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
class MainFactory:
|
|
11
|
-
"""Central factory for
|
|
10
|
+
"""Central factory for workflows."""
|
|
12
11
|
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
_workflow_map: Dict[WorkflowType, Callable[[WorkflowContext], BaseWorkflow]] = {
|
|
13
|
+
WorkflowType.SIMULATOR: lambda ctx: SimulatorWorkflow(ctx),
|
|
14
|
+
WorkflowType.COMPARATOR: lambda ctx: ComparatorWorkflow(ctx),
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
-
_evaluator_map: dict[EvaluatorType, Callable[[WorkflowConfig], BaseEvaluator]] = {
|
|
18
|
-
EvaluatorType.JUDGE: lambda cfg: JudgeEvaluator(),
|
|
19
|
-
EvaluatorType.REFERENCE: lambda cfg: MetadataEvaluator(),
|
|
20
|
-
# Next is the RAG evaluator..
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
_workflow_map: dict[WorkflowType, Callable[["WorkflowContext"], BaseWorkflow]] = {}
|
|
24
|
-
|
|
25
|
-
@classmethod
|
|
26
|
-
def create_repository(cls, config: WorkflowConfig) -> BaseRepository:
|
|
27
|
-
fn = cls._repository_map.get(config.repository)
|
|
28
|
-
if not fn:
|
|
29
|
-
raise NotImplementedError(f"Repository {config.repository} not implemented")
|
|
30
|
-
return fn(config)
|
|
31
|
-
|
|
32
|
-
@classmethod
|
|
33
|
-
def create_evaluator(cls, config: WorkflowConfig) -> Dict[EvaluatorType, BaseEvaluator]:
|
|
34
|
-
evaluators: dict[EvaluatorType, BaseEvaluator] = {}
|
|
35
|
-
for ev in config.evaluators:
|
|
36
|
-
fn = cls._evaluator_map.get(ev)
|
|
37
|
-
if not fn:
|
|
38
|
-
raise NotImplementedError(f"Evaluator {config.evaluators} not implemented")
|
|
39
|
-
evaluators[ev] = fn(config)
|
|
40
|
-
return evaluators
|
|
41
|
-
|
|
42
17
|
@classmethod
|
|
43
|
-
def create_workflow(cls,
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
18
|
+
def create_workflow(cls, context: WorkflowContext) -> BaseWorkflow:
|
|
19
|
+
"""Create workflow using the given runtime context."""
|
|
20
|
+
wf_type = context.config.process.workflow_type
|
|
21
|
+
builder = cls._workflow_map.get(wf_type)
|
|
22
|
+
if not builder:
|
|
23
|
+
raise NotImplementedError(f"Workflow '{wf_type}' not implemented")
|
|
24
|
+
return builder(context)
|
|
48
25
|
|
|
49
26
|
@classmethod
|
|
50
|
-
def register_workflow(cls, wf_type: WorkflowType, builder: Callable[[
|
|
27
|
+
def register_workflow(cls, wf_type: WorkflowType, builder: Callable[[WorkflowContext], BaseWorkflow]) -> None:
|
|
28
|
+
"""Register a new workflow implementation."""
|
|
51
29
|
cls._workflow_map[wf_type] = builder
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
+
from levelapp.core.schemas import WorkflowType
|
|
1
2
|
from levelapp.workflow.factory import MainFactory
|
|
2
|
-
from levelapp.workflow.schemas import WorkflowType
|
|
3
3
|
from levelapp.workflow.base import SimulatorWorkflow, ComparatorWorkflow
|
|
4
4
|
|
|
5
5
|
MainFactory.register_workflow(WorkflowType.SIMULATOR, lambda ctx: SimulatorWorkflow(ctx))
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""levelapp/workflow/runtime.py: contains the workflow runtime context component."""
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict, List, Any
|
|
4
|
+
|
|
5
|
+
from levelapp.config import EndpointConfig
|
|
6
|
+
from levelapp.core.base import BaseRepository, BaseEvaluator
|
|
7
|
+
from levelapp.workflow.config import WorkflowConfig
|
|
8
|
+
from levelapp.core.schemas import EvaluatorType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
|
|
12
|
+
class WorkflowContext:
|
|
13
|
+
"""Immutable data holder for workflow execution context."""
|
|
14
|
+
config: WorkflowConfig
|
|
15
|
+
repository: BaseRepository
|
|
16
|
+
evaluators: Dict[EvaluatorType, BaseEvaluator]
|
|
17
|
+
providers: List[str]
|
|
18
|
+
endpoint_config: EndpointConfig
|
|
19
|
+
inputs: Dict[str, Any]
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: levelapp
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
|
|
5
5
|
Project-URL: Homepage, https://github.com/levelapp-org
|
|
6
6
|
Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
|
|
7
7
|
Project-URL: Documentation, https://levelapp.readthedocs.io
|
|
8
8
|
Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
|
|
9
|
-
Author-email:
|
|
9
|
+
Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: ai,evaluation,framework,llm,testing
|
|
12
12
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -18,7 +18,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
18
18
|
Classifier: Topic :: Software Development :: Testing
|
|
19
19
|
Requires-Python: >=3.12
|
|
20
20
|
Requires-Dist: arrow>=1.3.0
|
|
21
|
+
Requires-Dist: google-api-core>=2.25.1
|
|
22
|
+
Requires-Dist: google-auth>=2.40.3
|
|
23
|
+
Requires-Dist: google-cloud-firestore>=2.21.0
|
|
21
24
|
Requires-Dist: httpx>=0.28.1
|
|
25
|
+
Requires-Dist: humanize>=4.13.0
|
|
22
26
|
Requires-Dist: numpy>=2.3.2
|
|
23
27
|
Requires-Dist: openai>=1.99.9
|
|
24
28
|
Requires-Dist: pandas-stubs==2.3.0.250703
|
|
@@ -91,38 +95,47 @@ pip install levelapp
|
|
|
91
95
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
92
96
|
|
|
93
97
|
```yaml
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
98
|
+
process:
|
|
99
|
+
project_name: "test-project"
|
|
100
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
101
|
+
evaluation_params:
|
|
102
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
103
|
+
batch_size: 5
|
|
104
|
+
|
|
105
|
+
evaluation:
|
|
106
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
107
|
+
- JUDGE
|
|
108
|
+
- REFERENCE
|
|
109
|
+
providers:
|
|
110
|
+
- openai
|
|
111
|
+
- ionos
|
|
112
|
+
metrics_map:
|
|
113
|
+
field_1: EXACT
|
|
114
|
+
field_2 : LEVENSHTEIN
|
|
97
115
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
- JUDGE
|
|
102
|
-
- REFERENCE
|
|
116
|
+
reference_data:
|
|
117
|
+
path:
|
|
118
|
+
data:
|
|
103
119
|
|
|
104
|
-
|
|
120
|
+
endpoint:
|
|
105
121
|
base_url: "http://127.0.0.1:8000"
|
|
106
122
|
url_path: ''
|
|
107
123
|
api_key: "<API-KEY>"
|
|
108
124
|
bearer_token: "<BEARER-TOKEN>"
|
|
109
125
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
110
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
111
126
|
default_request_payload_template:
|
|
127
|
+
# Change only the user-message field name so that it matches your request payload schema (example: 'prompt' to 'message').
|
|
112
128
|
prompt: "${user_message}"
|
|
113
129
|
details: "${request_payload}" # Rest of the request payload data.
|
|
114
130
|
default_response_payload_template:
|
|
131
|
+
# Change only the placeholder value so that it matches your response payload schema (example: ${agent_reply} to ${reply}).
|
|
115
132
|
agent_reply: "${agent_reply}"
|
|
116
|
-
guardrail_flag: "${guardrail_flag}"
|
|
117
133
|
generated_metadata: "${generated_metadata}"
|
|
118
134
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
metrics_map:
|
|
124
|
-
field_1: EXACT
|
|
125
|
-
field_2: LEVENSHTEIN
|
|
135
|
+
repository:
|
|
136
|
+
type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB.
|
|
137
|
+
project_id: "(default)"
|
|
138
|
+
database_name: ""
|
|
126
139
|
```
|
|
127
140
|
|
|
128
141
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -133,33 +146,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
133
146
|
|
|
134
147
|
```json
|
|
135
148
|
{
|
|
136
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
137
149
|
"scripts": [
|
|
138
150
|
{
|
|
139
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
140
151
|
"interactions": [
|
|
141
152
|
{
|
|
142
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
143
153
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
144
154
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
145
155
|
"interaction_type": "initial",
|
|
146
156
|
"reference_metadata": {},
|
|
147
|
-
"generated_metadata": {},
|
|
148
157
|
"guardrail_flag": false,
|
|
149
158
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
150
159
|
},
|
|
151
160
|
{
|
|
152
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
153
161
|
"user_message": "I need to see a cardiologist.",
|
|
154
162
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
155
163
|
"interaction_type": "intermediate",
|
|
156
164
|
"reference_metadata": {},
|
|
157
|
-
"generated_metadata": {},
|
|
158
165
|
"guardrail_flag": false,
|
|
159
166
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
160
167
|
},
|
|
161
168
|
{
|
|
162
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
163
169
|
"user_message": "I would like to book it for next Monday morning.",
|
|
164
170
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
165
171
|
"interaction_type": "intermediate",
|
|
@@ -168,11 +174,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
168
174
|
"date": "next Monday",
|
|
169
175
|
"time": "10 AM"
|
|
170
176
|
},
|
|
171
|
-
"generated_metadata": {
|
|
172
|
-
"appointment_type": "Cardiology",
|
|
173
|
-
"date": "next Monday",
|
|
174
|
-
"time": "morning"
|
|
175
|
-
},
|
|
176
177
|
"guardrail_flag": false,
|
|
177
178
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
178
179
|
},
|
|
@@ -182,7 +183,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
182
183
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
183
184
|
"interaction_type": "final",
|
|
184
185
|
"reference_metadata": {},
|
|
185
|
-
"generated_metadata": {},
|
|
186
186
|
"guardrail_flag": false,
|
|
187
187
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
188
188
|
}
|
|
@@ -195,9 +195,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
195
195
|
]
|
|
196
196
|
}
|
|
197
197
|
```
|
|
198
|
-
|
|
199
198
|
- **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
|
|
200
199
|
|
|
200
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
201
|
+
```
|
|
202
|
+
OPENAI_API_KEY=
|
|
203
|
+
IONOS_API_KEY=
|
|
204
|
+
ANTHROPIC_API_KEY=
|
|
205
|
+
MISTRAL_API_KEY=
|
|
206
|
+
|
|
207
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
208
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
209
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
210
|
+
|
|
211
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
212
|
+
```
|
|
213
|
+
|
|
201
214
|
## Usage Example
|
|
202
215
|
|
|
203
216
|
To run an evaluation:
|
|
@@ -214,15 +227,68 @@ if __name__ == "__main__":
|
|
|
214
227
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
215
228
|
|
|
216
229
|
# Run evaluation session
|
|
217
|
-
with EvaluationSession(session_name="
|
|
230
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
|
|
231
|
+
session.run()
|
|
232
|
+
results = session.workflow.collect_results()
|
|
233
|
+
print("Results:", results)
|
|
234
|
+
|
|
235
|
+
stats = session.get_stats()
|
|
236
|
+
print(f"session stats:\n{stats}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
240
|
+
you can load the data manually as follows:
|
|
241
|
+
```python
|
|
242
|
+
if __name__ == "__main__":
|
|
243
|
+
from levelapp.workflow import WorkflowConfig
|
|
244
|
+
from levelapp.core.session import EvaluationSession
|
|
245
|
+
|
|
246
|
+
# Firestore -> retrieve endpoint config -> data => config_dict
|
|
247
|
+
|
|
248
|
+
config_dict = {
|
|
249
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
250
|
+
"evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
|
|
251
|
+
"reference_data": {"path": "", "data": {}},
|
|
252
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
253
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
content = {
|
|
257
|
+
"scripts": [
|
|
258
|
+
{
|
|
259
|
+
"interactions": [
|
|
260
|
+
{
|
|
261
|
+
"user_message": "Hello!",
|
|
262
|
+
"reference_reply": "Hello, how can I help you!"
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"user_message": "I need an apartment",
|
|
266
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
267
|
+
},
|
|
268
|
+
]
|
|
269
|
+
},
|
|
270
|
+
]
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
# Load configuration from a dict variable
|
|
274
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
275
|
+
|
|
276
|
+
# Load reference data from dict variable
|
|
277
|
+
config.set_reference_data(content=content)
|
|
278
|
+
|
|
279
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
280
|
+
|
|
281
|
+
with evaluation_session as session:
|
|
218
282
|
session.run()
|
|
219
283
|
results = session.workflow.collect_results()
|
|
220
284
|
print("Results:", results)
|
|
221
285
|
|
|
222
286
|
stats = session.get_stats()
|
|
223
287
|
print(f"session stats:\n{stats}")
|
|
288
|
+
|
|
224
289
|
```
|
|
225
290
|
|
|
291
|
+
|
|
226
292
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
227
293
|
|
|
228
294
|
For more examples, see the `examples/` directory.
|
|
@@ -20,9 +20,10 @@ levelapp/config/endpoint.py,sha256=ll34rZ0KRmUwI81EWJ3HX9i6pziq2YrQb84kv4ErymI,7
|
|
|
20
20
|
levelapp/config/prompts.py,sha256=crjOk01weLz5_IdF6dDZWPfSmiKNL8SgnbThyf4Jz2o,1345
|
|
21
21
|
levelapp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
levelapp/core/base.py,sha256=oh4OkKgwGxmw_jgjX6wrBoK0KPc1JvCMZfbZP_mGmIg,12453
|
|
23
|
-
levelapp/core/
|
|
23
|
+
levelapp/core/schemas.py,sha256=UrUnU6h90uqS1LpcFqmMEpgWJ3772ZD5QBIytczmAbE,469
|
|
24
|
+
levelapp/core/session.py,sha256=6bP6s3iWxofWL9LT4qv90VGOntAIa6PBJ_EaWZT0Ur4,7918
|
|
24
25
|
levelapp/evaluator/__init__.py,sha256=K-P75Q1FXXLCNqH1wyhT9sf4y2R9a1qR5449AXEsY1k,109
|
|
25
|
-
levelapp/evaluator/evaluator.py,sha256=
|
|
26
|
+
levelapp/evaluator/evaluator.py,sha256=SSveWDIXVg9CTLqexAZJSRpR_wtd5f1bD_s5dG5HJyc,10544
|
|
26
27
|
levelapp/metrics/__init__.py,sha256=1y4gDLOu2Jz4QVIgPH-v9YMgaWOFr263tYLUTiFJ-fc,1965
|
|
27
28
|
levelapp/metrics/embedding.py,sha256=wvlT8Q5DjDT6GrAIFtc5aFbA_80hDLUXMP4RbSpSwHE,115
|
|
28
29
|
levelapp/metrics/exact.py,sha256=Kb13nD2OVLrl3iYHaXrxDfrxDuhW0SMVvLAEXPaJtlY,6235
|
|
@@ -30,17 +31,19 @@ levelapp/metrics/fuzzy.py,sha256=Rg8ashzMxtQwKO-z_LLzdj2PDIRqL4CBw6PGRf9IBrI,259
|
|
|
30
31
|
levelapp/metrics/token.py,sha256=yQi9hxT_fXTGjLiCCemDxQ4Uk2zD-wQYtSnDlI2AuuY,3521
|
|
31
32
|
levelapp/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
33
|
levelapp/repository/__init__.py,sha256=hNmFRZ7kKJN1mMlOHeW9xf0j9Q7gqTXYJ3hMCzk9to4,79
|
|
33
|
-
levelapp/repository/firestore.py,sha256=
|
|
34
|
+
levelapp/repository/firestore.py,sha256=K9JgxsNCelAKtzTDv19c1dHRlitMeRzo7H3caTlKuF8,10369
|
|
34
35
|
levelapp/simulator/__init__.py,sha256=8Dz8g7rbpBZX3WoknVmMVoWm_VT72ZL9BABOF1xFpqs,83
|
|
35
|
-
levelapp/simulator/schemas.py,sha256=
|
|
36
|
-
levelapp/simulator/simulator.py,sha256=
|
|
36
|
+
levelapp/simulator/schemas.py,sha256=eEFNNWiAJ8FAIObWcFWF1pL9LmjWwz_2Y-yfY3pHESc,4061
|
|
37
|
+
levelapp/simulator/simulator.py,sha256=RKzpV-yDmtugJ3ScJTTA9MwxTdD-oEkwRZLm8N_scjg,19972
|
|
37
38
|
levelapp/simulator/utils.py,sha256=qx0JdV1ZDQdTRVKa9xfq278ASrE44GBXSnJZJuhICqo,7365
|
|
38
|
-
levelapp/workflow/__init__.py,sha256=
|
|
39
|
-
levelapp/workflow/base.py,sha256=
|
|
40
|
-
levelapp/workflow/
|
|
41
|
-
levelapp/workflow/
|
|
42
|
-
levelapp/workflow/
|
|
43
|
-
levelapp
|
|
44
|
-
levelapp
|
|
45
|
-
levelapp-0.1.
|
|
46
|
-
levelapp-0.1.
|
|
39
|
+
levelapp/workflow/__init__.py,sha256=27b2obG7ObhR43yd2uH-R0koRB7-DG8Emnvrq8EjsTA,193
|
|
40
|
+
levelapp/workflow/base.py,sha256=t-vJzwv_OJ9W_pORySJwZq9IENGbWAF3-9-7ozaKDPk,4637
|
|
41
|
+
levelapp/workflow/config.py,sha256=ClQaKSWxj7rFcOEQ4budmgOqMBskg6wAibf_gzqUf1o,2142
|
|
42
|
+
levelapp/workflow/context.py,sha256=DzyZEb8WHug6vWfzf7BIjZAwtmv43HqgbaB20Pw3eWo,2660
|
|
43
|
+
levelapp/workflow/factory.py,sha256=PZHp3AVt61Eop3HwGQDfbO0ju5k7rvNDHKy09eywMTQ,1245
|
|
44
|
+
levelapp/workflow/registration.py,sha256=VHUHjLHXad5kjcKukaEOIf7hBZ09bT3HAzVmIT08aLo,359
|
|
45
|
+
levelapp/workflow/runtime.py,sha256=cFyXNWXSuURKbrMDHdkTcjeItM9wHP-5DPljntwYL5g,686
|
|
46
|
+
levelapp-0.1.1.dist-info/METADATA,sha256=ozbAgnWY4gl149zqzPgYS7-qkKGutJFb9qL0CoYHbh0,12500
|
|
47
|
+
levelapp-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
48
|
+
levelapp-0.1.1.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
|
+
levelapp-0.1.1.dist-info/RECORD,,
|