levelapp 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- {levelapp-0.1.0 → levelapp-0.1.1}/PKG-INFO +101 -35
- {levelapp-0.1.0 → levelapp-0.1.1}/README.md +95 -33
- levelapp-0.1.1/levelapp/core/schemas.py +26 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/session.py +68 -56
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/evaluator/evaluator.py +27 -11
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/repository/firestore.py +15 -6
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/schemas.py +9 -9
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/simulator.py +103 -37
- levelapp-0.1.1/levelapp/workflow/__init__.py +6 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/workflow/base.py +26 -14
- levelapp-0.1.1/levelapp/workflow/config.py +65 -0
- levelapp-0.1.1/levelapp/workflow/context.py +63 -0
- levelapp-0.1.1/levelapp/workflow/factory.py +29 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/workflow/registration.py +1 -1
- levelapp-0.1.1/levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/pyproject.toml +6 -2
- levelapp-0.1.1/src/data/workflow_config.yaml +41 -0
- levelapp-0.1.1/src/level_app/main_session.py +46 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/uv.lock +212 -1
- levelapp-0.1.0/levelapp/workflow/__init__.py +0 -5
- levelapp-0.1.0/levelapp/workflow/factory.py +0 -51
- levelapp-0.1.0/levelapp/workflow/schemas.py +0 -121
- levelapp-0.1.0/src/data/workflow_config.yaml +0 -35
- levelapp-0.1.0/src/level_app/main_session.py +0 -16
- {levelapp-0.1.0 → levelapp-0.1.1}/.gitignore +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/.python-version +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/LICENSE +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/MANIFEST.in +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/Makefile +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/examples/example_evaluation.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/monitor.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/scorer.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/endpoint.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/prompts.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/base.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/utils.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/make.bat +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/project_structure.txt +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_comparator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_ionos.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_mistral.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_openai.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_session.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_simulator.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: levelapp
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
|
|
5
5
|
Project-URL: Homepage, https://github.com/levelapp-org
|
|
6
6
|
Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
|
|
7
7
|
Project-URL: Documentation, https://levelapp.readthedocs.io
|
|
8
8
|
Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
|
|
9
|
-
Author-email:
|
|
9
|
+
Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: ai,evaluation,framework,llm,testing
|
|
12
12
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -18,7 +18,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
18
18
|
Classifier: Topic :: Software Development :: Testing
|
|
19
19
|
Requires-Python: >=3.12
|
|
20
20
|
Requires-Dist: arrow>=1.3.0
|
|
21
|
+
Requires-Dist: google-api-core>=2.25.1
|
|
22
|
+
Requires-Dist: google-auth>=2.40.3
|
|
23
|
+
Requires-Dist: google-cloud-firestore>=2.21.0
|
|
21
24
|
Requires-Dist: httpx>=0.28.1
|
|
25
|
+
Requires-Dist: humanize>=4.13.0
|
|
22
26
|
Requires-Dist: numpy>=2.3.2
|
|
23
27
|
Requires-Dist: openai>=1.99.9
|
|
24
28
|
Requires-Dist: pandas-stubs==2.3.0.250703
|
|
@@ -91,38 +95,47 @@ pip install levelapp
|
|
|
91
95
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
92
96
|
|
|
93
97
|
```yaml
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
98
|
+
process:
|
|
99
|
+
project_name: "test-project"
|
|
100
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
101
|
+
evaluation_params:
|
|
102
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
103
|
+
batch_size: 5
|
|
104
|
+
|
|
105
|
+
evaluation:
|
|
106
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
107
|
+
- JUDGE
|
|
108
|
+
- REFERENCE
|
|
109
|
+
providers:
|
|
110
|
+
- openai
|
|
111
|
+
- ionos
|
|
112
|
+
metrics_map:
|
|
113
|
+
field_1: EXACT
|
|
114
|
+
field_2 : LEVENSHTEIN
|
|
97
115
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
- JUDGE
|
|
102
|
-
- REFERENCE
|
|
116
|
+
reference_data:
|
|
117
|
+
path:
|
|
118
|
+
data:
|
|
103
119
|
|
|
104
|
-
|
|
120
|
+
endpoint:
|
|
105
121
|
base_url: "http://127.0.0.1:8000"
|
|
106
122
|
url_path: ''
|
|
107
123
|
api_key: "<API-KEY>"
|
|
108
124
|
bearer_token: "<BEARER-TOKEN>"
|
|
109
125
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
110
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
111
126
|
default_request_payload_template:
|
|
127
|
+
# Change only the user message field name to match the request payload schema (example: 'prompt' to 'message').
|
|
112
128
|
prompt: "${user_message}"
|
|
113
129
|
details: "${request_payload}" # Rest of the request payload data.
|
|
114
130
|
default_response_payload_template:
|
|
131
|
+
# Change only the placeholder value to match the response payload schema (example: ${agent_reply} to ${reply}).
|
|
115
132
|
agent_reply: "${agent_reply}"
|
|
116
|
-
guardrail_flag: "${guardrail_flag}"
|
|
117
133
|
generated_metadata: "${generated_metadata}"
|
|
118
134
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
metrics_map:
|
|
124
|
-
field_1: EXACT
|
|
125
|
-
field_2: LEVENSHTEIN
|
|
135
|
+
repository:
|
|
136
|
+
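type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB. NOTE: `RepositoryType` in levelapp/core/schemas.py currently defines only FIRESTORE and FILESYSTEM; confirm MONGODB support before using it.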
|
|
137
|
+
project_id: "(default)"
|
|
138
|
+
database_name: ""
|
|
126
139
|
```
|
|
127
140
|
|
|
128
141
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -133,33 +146,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
133
146
|
|
|
134
147
|
```json
|
|
135
148
|
{
|
|
136
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
137
149
|
"scripts": [
|
|
138
150
|
{
|
|
139
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
140
151
|
"interactions": [
|
|
141
152
|
{
|
|
142
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
143
153
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
144
154
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
145
155
|
"interaction_type": "initial",
|
|
146
156
|
"reference_metadata": {},
|
|
147
|
-
"generated_metadata": {},
|
|
148
157
|
"guardrail_flag": false,
|
|
149
158
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
150
159
|
},
|
|
151
160
|
{
|
|
152
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
153
161
|
"user_message": "I need to see a cardiologist.",
|
|
154
162
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
155
163
|
"interaction_type": "intermediate",
|
|
156
164
|
"reference_metadata": {},
|
|
157
|
-
"generated_metadata": {},
|
|
158
165
|
"guardrail_flag": false,
|
|
159
166
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
160
167
|
},
|
|
161
168
|
{
|
|
162
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
163
169
|
"user_message": "I would like to book it for next Monday morning.",
|
|
164
170
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
165
171
|
"interaction_type": "intermediate",
|
|
@@ -168,11 +174,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
168
174
|
"date": "next Monday",
|
|
169
175
|
"time": "10 AM"
|
|
170
176
|
},
|
|
171
|
-
"generated_metadata": {
|
|
172
|
-
"appointment_type": "Cardiology",
|
|
173
|
-
"date": "next Monday",
|
|
174
|
-
"time": "morning"
|
|
175
|
-
},
|
|
176
177
|
"guardrail_flag": false,
|
|
177
178
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
178
179
|
},
|
|
@@ -182,7 +183,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
182
183
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
183
184
|
"interaction_type": "final",
|
|
184
185
|
"reference_metadata": {},
|
|
185
|
-
"generated_metadata": {},
|
|
186
186
|
"guardrail_flag": false,
|
|
187
187
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
188
188
|
}
|
|
@@ -195,9 +195,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
195
195
|
]
|
|
196
196
|
}
|
|
197
197
|
```
|
|
198
|
-
|
|
199
198
|
- **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
|
|
200
199
|
|
|
200
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
201
|
+
```
|
|
202
|
+
OPENAI_API_KEY=
|
|
203
|
+
IONOS_API_KEY=
|
|
204
|
+
ANTHROPIC_API_KEY=
|
|
205
|
+
MISTRAL_API_KEY=
|
|
206
|
+
|
|
207
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
208
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
209
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
210
|
+
|
|
211
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
212
|
+
```
|
|
213
|
+
|
|
201
214
|
## Usage Example
|
|
202
215
|
|
|
203
216
|
To run an evaluation:
|
|
@@ -214,15 +227,68 @@ if __name__ == "__main__":
|
|
|
214
227
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
215
228
|
|
|
216
229
|
# Run evaluation session
|
|
217
|
-
with EvaluationSession(session_name="
|
|
230
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
|
|
231
|
+
session.run()
|
|
232
|
+
results = session.workflow.collect_results()
|
|
233
|
+
print("Results:", results)
|
|
234
|
+
|
|
235
|
+
stats = session.get_stats()
|
|
236
|
+
print(f"session stats:\n{stats}")
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
240
|
+
you can load the data manually as follows:
|
|
241
|
+
```python
|
|
242
|
+
if __name__ == "__main__":
|
|
243
|
+
from levelapp.workflow import WorkflowConfig
|
|
244
|
+
from levelapp.core.session import EvaluationSession
|
|
245
|
+
|
|
246
|
+
# Firestore -> retrieve endpoint config -> data => config_dict
|
|
247
|
+
|
|
248
|
+
config_dict = {
|
|
249
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
250
|
+
"evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
|
|
251
|
+
"reference_data": {"path": "", "data": {}},
|
|
252
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
253
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
content = {
|
|
257
|
+
"scripts": [
|
|
258
|
+
{
|
|
259
|
+
"interactions": [
|
|
260
|
+
{
|
|
261
|
+
"user_message": "Hello!",
|
|
262
|
+
"reference_reply": "Hello, how can I help you!"
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"user_message": "I need an apartment",
|
|
266
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
267
|
+
},
|
|
268
|
+
]
|
|
269
|
+
},
|
|
270
|
+
]
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
# Load configuration from a dict variable
|
|
274
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
275
|
+
|
|
276
|
+
# Load reference data from dict variable
|
|
277
|
+
config.set_reference_data(content=content)
|
|
278
|
+
|
|
279
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
280
|
+
|
|
281
|
+
with evaluation_session as session:
|
|
218
282
|
session.run()
|
|
219
283
|
results = session.workflow.collect_results()
|
|
220
284
|
print("Results:", results)
|
|
221
285
|
|
|
222
286
|
stats = session.get_stats()
|
|
223
287
|
print(f"session stats:\n{stats}")
|
|
288
|
+
|
|
224
289
|
```
|
|
225
290
|
|
|
291
|
+
|
|
226
292
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
227
293
|
|
|
228
294
|
For more examples, see the `examples/` directory.
|
|
@@ -43,38 +43,47 @@ pip install levelapp
|
|
|
43
43
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
44
44
|
|
|
45
45
|
```yaml
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
process:
|
|
47
|
+
project_name: "test-project"
|
|
48
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
49
|
+
evaluation_params:
|
|
50
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
51
|
+
batch_size: 5
|
|
52
|
+
|
|
53
|
+
evaluation:
|
|
54
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
55
|
+
- JUDGE
|
|
56
|
+
- REFERENCE
|
|
57
|
+
providers:
|
|
58
|
+
- openai
|
|
59
|
+
- ionos
|
|
60
|
+
metrics_map:
|
|
61
|
+
field_1: EXACT
|
|
62
|
+
field_2 : LEVENSHTEIN
|
|
49
63
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
- JUDGE
|
|
54
|
-
- REFERENCE
|
|
64
|
+
reference_data:
|
|
65
|
+
path:
|
|
66
|
+
data:
|
|
55
67
|
|
|
56
|
-
|
|
68
|
+
endpoint:
|
|
57
69
|
base_url: "http://127.0.0.1:8000"
|
|
58
70
|
url_path: ''
|
|
59
71
|
api_key: "<API-KEY>"
|
|
60
72
|
bearer_token: "<BEARER-TOKEN>"
|
|
61
73
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
62
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
63
74
|
default_request_payload_template:
|
|
75
|
+
# Change only the user message field name to match the request payload schema (example: 'prompt' to 'message').
|
|
64
76
|
prompt: "${user_message}"
|
|
65
77
|
details: "${request_payload}" # Rest of the request payload data.
|
|
66
78
|
default_response_payload_template:
|
|
79
|
+
# Change only the placeholder value to match the response payload schema (example: ${agent_reply} to ${reply}).
|
|
67
80
|
agent_reply: "${agent_reply}"
|
|
68
|
-
guardrail_flag: "${guardrail_flag}"
|
|
69
81
|
generated_metadata: "${generated_metadata}"
|
|
70
82
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
metrics_map:
|
|
76
|
-
field_1: EXACT
|
|
77
|
-
field_2: LEVENSHTEIN
|
|
83
|
+
repository:
|
|
84
|
+
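type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB. NOTE: `RepositoryType` in levelapp/core/schemas.py currently defines only FIRESTORE and FILESYSTEM; confirm MONGODB support before using it.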
|
|
85
|
+
project_id: "(default)"
|
|
86
|
+
database_name: ""
|
|
78
87
|
```
|
|
79
88
|
|
|
80
89
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -85,33 +94,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
85
94
|
|
|
86
95
|
```json
|
|
87
96
|
{
|
|
88
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
89
97
|
"scripts": [
|
|
90
98
|
{
|
|
91
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
92
99
|
"interactions": [
|
|
93
100
|
{
|
|
94
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
95
101
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
96
102
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
97
103
|
"interaction_type": "initial",
|
|
98
104
|
"reference_metadata": {},
|
|
99
|
-
"generated_metadata": {},
|
|
100
105
|
"guardrail_flag": false,
|
|
101
106
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
102
107
|
},
|
|
103
108
|
{
|
|
104
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
105
109
|
"user_message": "I need to see a cardiologist.",
|
|
106
110
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
107
111
|
"interaction_type": "intermediate",
|
|
108
112
|
"reference_metadata": {},
|
|
109
|
-
"generated_metadata": {},
|
|
110
113
|
"guardrail_flag": false,
|
|
111
114
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
112
115
|
},
|
|
113
116
|
{
|
|
114
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
115
117
|
"user_message": "I would like to book it for next Monday morning.",
|
|
116
118
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
117
119
|
"interaction_type": "intermediate",
|
|
@@ -120,11 +122,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
120
122
|
"date": "next Monday",
|
|
121
123
|
"time": "10 AM"
|
|
122
124
|
},
|
|
123
|
-
"generated_metadata": {
|
|
124
|
-
"appointment_type": "Cardiology",
|
|
125
|
-
"date": "next Monday",
|
|
126
|
-
"time": "morning"
|
|
127
|
-
},
|
|
128
125
|
"guardrail_flag": false,
|
|
129
126
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
130
127
|
},
|
|
@@ -134,7 +131,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
134
131
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
135
132
|
"interaction_type": "final",
|
|
136
133
|
"reference_metadata": {},
|
|
137
|
-
"generated_metadata": {},
|
|
138
134
|
"guardrail_flag": false,
|
|
139
135
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
140
136
|
}
|
|
@@ -147,9 +143,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
147
143
|
]
|
|
148
144
|
}
|
|
149
145
|
```
|
|
150
|
-
|
|
151
146
|
- **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
|
|
152
147
|
|
|
148
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
149
|
+
```
|
|
150
|
+
OPENAI_API_KEY=
|
|
151
|
+
IONOS_API_KEY=
|
|
152
|
+
ANTHROPIC_API_KEY=
|
|
153
|
+
MISTRAL_API_KEY=
|
|
154
|
+
|
|
155
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
156
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
157
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
158
|
+
|
|
159
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
160
|
+
```
|
|
161
|
+
|
|
153
162
|
## Usage Example
|
|
154
163
|
|
|
155
164
|
To run an evaluation:
|
|
@@ -166,15 +175,68 @@ if __name__ == "__main__":
|
|
|
166
175
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
167
176
|
|
|
168
177
|
# Run evaluation session
|
|
169
|
-
with EvaluationSession(session_name="
|
|
178
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
|
|
179
|
+
session.run()
|
|
180
|
+
results = session.workflow.collect_results()
|
|
181
|
+
print("Results:", results)
|
|
182
|
+
|
|
183
|
+
stats = session.get_stats()
|
|
184
|
+
print(f"session stats:\n{stats}")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
188
|
+
you can load the data manually as follows:
|
|
189
|
+
```python
|
|
190
|
+
if __name__ == "__main__":
|
|
191
|
+
from levelapp.workflow import WorkflowConfig
|
|
192
|
+
from levelapp.core.session import EvaluationSession
|
|
193
|
+
|
|
194
|
+
# Firestore -> retrieve endpoint config -> data => config_dict
|
|
195
|
+
|
|
196
|
+
config_dict = {
|
|
197
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
198
|
+
"evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
|
|
199
|
+
"reference_data": {"path": "", "data": {}},
|
|
200
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
201
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
content = {
|
|
205
|
+
"scripts": [
|
|
206
|
+
{
|
|
207
|
+
"interactions": [
|
|
208
|
+
{
|
|
209
|
+
"user_message": "Hello!",
|
|
210
|
+
"reference_reply": "Hello, how can I help you!"
|
|
211
|
+
},
|
|
212
|
+
{
|
|
213
|
+
"user_message": "I need an apartment",
|
|
214
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
215
|
+
},
|
|
216
|
+
]
|
|
217
|
+
},
|
|
218
|
+
]
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
# Load configuration from a dict variable
|
|
222
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
223
|
+
|
|
224
|
+
# Load reference data from dict variable
|
|
225
|
+
config.set_reference_data(content=content)
|
|
226
|
+
|
|
227
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
228
|
+
|
|
229
|
+
with evaluation_session as session:
|
|
170
230
|
session.run()
|
|
171
231
|
results = session.workflow.collect_results()
|
|
172
232
|
print("Results:", results)
|
|
173
233
|
|
|
174
234
|
stats = session.get_stats()
|
|
175
235
|
print(f"session stats:\n{stats}")
|
|
236
|
+
|
|
176
237
|
```
|
|
177
238
|
|
|
239
|
+
|
|
178
240
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
179
241
|
|
|
180
242
|
For more examples, see the `examples/` directory.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ExtendedEnum(Enum):
|
|
5
|
+
@classmethod
|
|
6
|
+
def list(cls):
|
|
7
|
+
return [e.value for e in cls]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class WorkflowType(ExtendedEnum):
|
|
11
|
+
SIMULATOR = "SIMULATOR"
|
|
12
|
+
COMPARATOR = "COMPARATOR"
|
|
13
|
+
ASSESSOR = "ASSESSOR"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RepositoryType(ExtendedEnum):
|
|
17
|
+
FIRESTORE = "FIRESTORE"
|
|
18
|
+
FILESYSTEM = "FILESYSTEM"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EvaluatorType(ExtendedEnum):
|
|
22
|
+
JUDGE = "JUDGE"
|
|
23
|
+
REFERENCE = "REFERENCE"
|
|
24
|
+
RAG = "RAG"
|
|
25
|
+
|
|
26
|
+
|