levelapp 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- {levelapp-0.1.0 → levelapp-0.1.2}/PKG-INFO +102 -39
- {levelapp-0.1.0 → levelapp-0.1.2}/README.md +96 -35
- levelapp-0.1.2/docs/media/simulator-module-diagram.PNG +0 -0
- levelapp-0.1.2/docs/media/simulator-sequence-diagram.png +0 -0
- levelapp-0.1.2/examples/README.md +322 -0
- levelapp-0.1.2/examples/conversation_script.json +38 -0
- levelapp-0.1.2/examples/example_chatbot.py +48 -0
- levelapp-0.1.2/examples/example_evaluation.py +28 -0
- levelapp-0.1.2/examples/workflow_configuration.yaml +38 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/aspects/monitor.py +3 -1
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/clients/__init__.py +0 -1
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/scorer.py +0 -2
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/config/endpoint.py +22 -13
- levelapp-0.1.2/levelapp/config/endpoint_.py +62 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/config/prompts.py +22 -0
- levelapp-0.1.2/levelapp/core/schemas.py +24 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/core/session.py +97 -59
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/evaluator/evaluator.py +42 -14
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/metrics/__init__.py +1 -5
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/repository/firestore.py +15 -6
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/simulator/schemas.py +15 -21
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/simulator/simulator.py +124 -55
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/simulator/utils.py +40 -78
- levelapp-0.1.2/levelapp/workflow/__init__.py +6 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/workflow/base.py +64 -17
- levelapp-0.1.2/levelapp/workflow/config.py +92 -0
- levelapp-0.1.2/levelapp/workflow/context.py +62 -0
- levelapp-0.1.2/levelapp/workflow/factory.py +42 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/workflow/registration.py +1 -1
- levelapp-0.1.2/levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/pyproject.toml +6 -4
- levelapp-0.1.2/src/data/evaluation_results.json +1 -0
- levelapp-0.1.2/src/data/workflow_config.yaml +41 -0
- levelapp-0.1.2/src/level_app/main_session.py +48 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/uv.lock +212 -5
- levelapp-0.1.0/examples/example_evaluation.py +0 -0
- levelapp-0.1.0/levelapp/workflow/__init__.py +0 -5
- levelapp-0.1.0/levelapp/workflow/factory.py +0 -51
- levelapp-0.1.0/levelapp/workflow/schemas.py +0 -121
- levelapp-0.1.0/src/data/workflow_config.yaml +0 -35
- levelapp-0.1.0/src/level_app/main_session.py +0 -16
- {levelapp-0.1.0 → levelapp-0.1.2}/.gitignore +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/.python-version +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/LICENSE +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/MANIFEST.in +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/Makefile +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/core/base.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/make.bat +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/project_structure.txt +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/level_app/main.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/__init__.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_comparator.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_ionos.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_mistral.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_openai.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_session.py +0 -0
- {levelapp-0.1.0 → levelapp-0.1.2}/tests/test_simulator.py +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: levelapp
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
|
|
5
5
|
Project-URL: Homepage, https://github.com/levelapp-org
|
|
6
6
|
Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
|
|
7
7
|
Project-URL: Documentation, https://levelapp.readthedocs.io
|
|
8
8
|
Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
|
|
9
|
-
Author-email:
|
|
9
|
+
Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: ai,evaluation,framework,llm,testing
|
|
12
12
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -17,10 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
18
|
Classifier: Topic :: Software Development :: Testing
|
|
19
19
|
Requires-Python: >=3.12
|
|
20
|
-
Requires-Dist:
|
|
20
|
+
Requires-Dist: google-api-core>=2.25.1
|
|
21
|
+
Requires-Dist: google-auth>=2.40.3
|
|
22
|
+
Requires-Dist: google-cloud-firestore>=2.21.0
|
|
21
23
|
Requires-Dist: httpx>=0.28.1
|
|
24
|
+
Requires-Dist: humanize>=4.13.0
|
|
22
25
|
Requires-Dist: numpy>=2.3.2
|
|
23
|
-
Requires-Dist: openai>=1.99.9
|
|
24
26
|
Requires-Dist: pandas-stubs==2.3.0.250703
|
|
25
27
|
Requires-Dist: pandas>=2.3.1
|
|
26
28
|
Requires-Dist: pydantic>=2.11.7
|
|
@@ -91,38 +93,47 @@ pip install levelapp
|
|
|
91
93
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
92
94
|
|
|
93
95
|
```yaml
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
96
|
+
process:
|
|
97
|
+
project_name: "test-project"
|
|
98
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
99
|
+
evaluation_params:
|
|
100
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
101
|
+
batch_size: 5
|
|
102
|
+
|
|
103
|
+
evaluation:
|
|
104
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
105
|
+
- JUDGE
|
|
106
|
+
- REFERENCE
|
|
107
|
+
providers:
|
|
108
|
+
- openai
|
|
109
|
+
- ionos
|
|
110
|
+
metrics_map:
|
|
111
|
+
field_1: EXACT
|
|
112
|
+
field_2 : LEVENSHTEIN
|
|
97
113
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
- JUDGE
|
|
102
|
-
- REFERENCE
|
|
114
|
+
reference_data:
|
|
115
|
+
path:
|
|
116
|
+
data:
|
|
103
117
|
|
|
104
|
-
|
|
118
|
+
endpoint:
|
|
105
119
|
base_url: "http://127.0.0.1:8000"
|
|
106
120
|
url_path: ''
|
|
107
121
|
api_key: "<API-KEY>"
|
|
108
122
|
bearer_token: "<BEARER-TOKEN>"
|
|
109
123
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
110
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
111
124
|
default_request_payload_template:
|
|
125
|
+
# Change the user message field name only according to the request payload schema (example: 'prompt' to 'message').
|
|
112
126
|
prompt: "${user_message}"
|
|
113
127
|
details: "${request_payload}" # Rest of the request payload data.
|
|
114
128
|
default_response_payload_template:
|
|
129
|
+
# Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
|
|
115
130
|
agent_reply: "${agent_reply}"
|
|
116
|
-
guardrail_flag: "${guardrail_flag}"
|
|
117
131
|
generated_metadata: "${generated_metadata}"
|
|
118
132
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
metrics_map:
|
|
124
|
-
field_1: EXACT
|
|
125
|
-
field_2: LEVENSHTEIN
|
|
133
|
+
repository:
|
|
134
|
+
type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
|
|
135
|
+
project_id: "(default)"
|
|
136
|
+
database_name: ""
|
|
126
137
|
```
|
|
127
138
|
|
|
128
139
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -133,33 +144,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
133
144
|
|
|
134
145
|
```json
|
|
135
146
|
{
|
|
136
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
137
147
|
"scripts": [
|
|
138
148
|
{
|
|
139
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
140
149
|
"interactions": [
|
|
141
150
|
{
|
|
142
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
143
151
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
144
152
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
145
153
|
"interaction_type": "initial",
|
|
146
154
|
"reference_metadata": {},
|
|
147
|
-
"generated_metadata": {},
|
|
148
155
|
"guardrail_flag": false,
|
|
149
156
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
150
157
|
},
|
|
151
158
|
{
|
|
152
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
153
159
|
"user_message": "I need to see a cardiologist.",
|
|
154
160
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
155
161
|
"interaction_type": "intermediate",
|
|
156
162
|
"reference_metadata": {},
|
|
157
|
-
"generated_metadata": {},
|
|
158
163
|
"guardrail_flag": false,
|
|
159
164
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
160
165
|
},
|
|
161
166
|
{
|
|
162
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
163
167
|
"user_message": "I would like to book it for next Monday morning.",
|
|
164
168
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
165
169
|
"interaction_type": "intermediate",
|
|
@@ -168,11 +172,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
168
172
|
"date": "next Monday",
|
|
169
173
|
"time": "10 AM"
|
|
170
174
|
},
|
|
171
|
-
"generated_metadata": {
|
|
172
|
-
"appointment_type": "Cardiology",
|
|
173
|
-
"date": "next Monday",
|
|
174
|
-
"time": "morning"
|
|
175
|
-
},
|
|
176
175
|
"guardrail_flag": false,
|
|
177
176
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
178
177
|
},
|
|
@@ -182,7 +181,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
182
181
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
183
182
|
"interaction_type": "final",
|
|
184
183
|
"reference_metadata": {},
|
|
185
|
-
"generated_metadata": {},
|
|
186
184
|
"guardrail_flag": false,
|
|
187
185
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
188
186
|
}
|
|
@@ -195,9 +193,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
195
193
|
]
|
|
196
194
|
}
|
|
197
195
|
```
|
|
198
|
-
|
|
199
196
|
- **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
|
|
200
197
|
|
|
198
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
199
|
+
```
|
|
200
|
+
OPENAI_API_KEY=
|
|
201
|
+
IONOS_API_KEY=
|
|
202
|
+
ANTHROPIC_API_KEY=
|
|
203
|
+
MISTRAL_API_KEY=
|
|
204
|
+
|
|
205
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
206
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
207
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
208
|
+
|
|
209
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
210
|
+
```
|
|
211
|
+
|
|
201
212
|
## Usage Example
|
|
202
213
|
|
|
203
214
|
To run an evaluation:
|
|
@@ -207,14 +218,14 @@ To run an evaluation:
|
|
|
207
218
|
|
|
208
219
|
```python
|
|
209
220
|
if __name__ == "__main__":
|
|
210
|
-
from levelapp.workflow
|
|
221
|
+
from levelapp.workflow import WorkflowConfig
|
|
211
222
|
from levelapp.core.session import EvaluationSession
|
|
212
223
|
|
|
213
224
|
# Load configuration from YAML
|
|
214
225
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
215
226
|
|
|
216
|
-
# Run evaluation session
|
|
217
|
-
with EvaluationSession(session_name="
|
|
227
|
+
# Run evaluation session (You can enable/disable the monitoring aspect)
|
|
228
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
|
|
218
229
|
session.run()
|
|
219
230
|
results = session.workflow.collect_results()
|
|
220
231
|
print("Results:", results)
|
|
@@ -223,6 +234,58 @@ if __name__ == "__main__":
|
|
|
223
234
|
print(f"session stats:\n{stats}")
|
|
224
235
|
```
|
|
225
236
|
|
|
237
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
238
|
+
you can load the data manually as follows:
|
|
239
|
+
```python
|
|
240
|
+
if __name__ == "__main__":
|
|
241
|
+
from levelapp.workflow import WorkflowConfig
|
|
242
|
+
from levelapp.core.session import EvaluationSession
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
config_dict = {
|
|
246
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
247
|
+
"evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
|
|
248
|
+
"reference_data": {"path": "", "data": {}},
|
|
249
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
250
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
content = {
|
|
254
|
+
"scripts": [
|
|
255
|
+
{
|
|
256
|
+
"interactions": [
|
|
257
|
+
{
|
|
258
|
+
"user_message": "Hello!",
|
|
259
|
+
"reference_reply": "Hello, how can I help you!"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"user_message": "I need an apartment",
|
|
263
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
264
|
+
},
|
|
265
|
+
]
|
|
266
|
+
},
|
|
267
|
+
]
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
# Load configuration from a dict variable
|
|
271
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
272
|
+
|
|
273
|
+
# Load reference data from dict variable
|
|
274
|
+
config.set_reference_data(content=content)
|
|
275
|
+
|
|
276
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
277
|
+
|
|
278
|
+
with evaluation_session as session:
|
|
279
|
+
session.run()
|
|
280
|
+
results = session.workflow.collect_results()
|
|
281
|
+
print("Results:", results)
|
|
282
|
+
|
|
283
|
+
stats = session.get_stats()
|
|
284
|
+
print(f"session stats:\n{stats}")
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
|
|
226
289
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
227
290
|
|
|
228
291
|
For more examples, see the `examples/` directory.
|
|
@@ -43,38 +43,47 @@ pip install levelapp
|
|
|
43
43
|
LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
|
|
44
44
|
|
|
45
45
|
```yaml
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
process:
|
|
47
|
+
project_name: "test-project"
|
|
48
|
+
workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
|
|
49
|
+
evaluation_params:
|
|
50
|
+
attempts: 1 # Add the number of simulation attempts.
|
|
51
|
+
batch_size: 5
|
|
52
|
+
|
|
53
|
+
evaluation:
|
|
54
|
+
evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
|
|
55
|
+
- JUDGE
|
|
56
|
+
- REFERENCE
|
|
57
|
+
providers:
|
|
58
|
+
- openai
|
|
59
|
+
- ionos
|
|
60
|
+
metrics_map:
|
|
61
|
+
field_1: EXACT
|
|
62
|
+
field_2 : LEVENSHTEIN
|
|
49
63
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
- JUDGE
|
|
54
|
-
- REFERENCE
|
|
64
|
+
reference_data:
|
|
65
|
+
path:
|
|
66
|
+
data:
|
|
55
67
|
|
|
56
|
-
|
|
68
|
+
endpoint:
|
|
57
69
|
base_url: "http://127.0.0.1:8000"
|
|
58
70
|
url_path: ''
|
|
59
71
|
api_key: "<API-KEY>"
|
|
60
72
|
bearer_token: "<BEARER-TOKEN>"
|
|
61
73
|
model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
|
62
|
-
payload_path: "../../src/data/payload_example_1.yaml"
|
|
63
74
|
default_request_payload_template:
|
|
75
|
+
# Change the user message field name only according to the request payload schema (example: 'prompt' to 'message').
|
|
64
76
|
prompt: "${user_message}"
|
|
65
77
|
details: "${request_payload}" # Rest of the request payload data.
|
|
66
78
|
default_response_payload_template:
|
|
79
|
+
# Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
|
|
67
80
|
agent_reply: "${agent_reply}"
|
|
68
|
-
guardrail_flag: "${guardrail_flag}"
|
|
69
81
|
generated_metadata: "${generated_metadata}"
|
|
70
82
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
metrics_map:
|
|
76
|
-
field_1: EXACT
|
|
77
|
-
field_2: LEVENSHTEIN
|
|
83
|
+
repository:
|
|
84
|
+
type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
|
|
85
|
+
project_id: "(default)"
|
|
86
|
+
database_name: ""
|
|
78
87
|
```
|
|
79
88
|
|
|
80
89
|
- **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
|
|
@@ -85,33 +94,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
85
94
|
|
|
86
95
|
```json
|
|
87
96
|
{
|
|
88
|
-
"id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
|
|
89
97
|
"scripts": [
|
|
90
98
|
{
|
|
91
|
-
"id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
|
|
92
99
|
"interactions": [
|
|
93
100
|
{
|
|
94
|
-
"id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
|
|
95
101
|
"user_message": "Hello, I would like to book an appointment with a doctor.",
|
|
96
102
|
"reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
|
|
97
103
|
"interaction_type": "initial",
|
|
98
104
|
"reference_metadata": {},
|
|
99
|
-
"generated_metadata": {},
|
|
100
105
|
"guardrail_flag": false,
|
|
101
106
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
102
107
|
},
|
|
103
108
|
{
|
|
104
|
-
"id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
|
|
105
109
|
"user_message": "I need to see a cardiologist.",
|
|
106
110
|
"reference_reply": "When would you like to schedule your appointment?",
|
|
107
111
|
"interaction_type": "intermediate",
|
|
108
112
|
"reference_metadata": {},
|
|
109
|
-
"generated_metadata": {},
|
|
110
113
|
"guardrail_flag": false,
|
|
111
114
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
112
115
|
},
|
|
113
116
|
{
|
|
114
|
-
"id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
|
|
115
117
|
"user_message": "I would like to book it for next Monday morning.",
|
|
116
118
|
"reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
|
|
117
119
|
"interaction_type": "intermediate",
|
|
@@ -120,11 +122,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
120
122
|
"date": "next Monday",
|
|
121
123
|
"time": "10 AM"
|
|
122
124
|
},
|
|
123
|
-
"generated_metadata": {
|
|
124
|
-
"appointment_type": "Cardiology",
|
|
125
|
-
"date": "next Monday",
|
|
126
|
-
"time": "morning"
|
|
127
|
-
},
|
|
128
125
|
"guardrail_flag": false,
|
|
129
126
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
130
127
|
},
|
|
@@ -134,7 +131,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
134
131
|
"reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
|
|
135
132
|
"interaction_type": "final",
|
|
136
133
|
"reference_metadata": {},
|
|
137
|
-
"generated_metadata": {},
|
|
138
134
|
"guardrail_flag": false,
|
|
139
135
|
"request_payload": {"user_id": "0001", "user_role": "ADMIN"}
|
|
140
136
|
}
|
|
@@ -147,9 +143,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
|
|
|
147
143
|
]
|
|
148
144
|
}
|
|
149
145
|
```
|
|
150
|
-
|
|
151
146
|
- **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
|
|
152
147
|
|
|
148
|
+
In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
|
|
149
|
+
```
|
|
150
|
+
OPENAI_API_KEY=
|
|
151
|
+
IONOS_API_KEY=
|
|
152
|
+
ANTHROPIC_API_KEY=
|
|
153
|
+
MISTRAL_API_KEY=
|
|
154
|
+
|
|
155
|
+
# For IONOS, you must include the base URL and the model ID.
|
|
156
|
+
IONOS_BASE_URL="https://inference.de-txl.ionos.com"
|
|
157
|
+
IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
|
|
158
|
+
|
|
159
|
+
WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
|
|
160
|
+
```
|
|
161
|
+
|
|
153
162
|
## Usage Example
|
|
154
163
|
|
|
155
164
|
To run an evaluation:
|
|
@@ -159,14 +168,14 @@ To run an evaluation:
|
|
|
159
168
|
|
|
160
169
|
```python
|
|
161
170
|
if __name__ == "__main__":
|
|
162
|
-
from levelapp.workflow
|
|
171
|
+
from levelapp.workflow import WorkflowConfig
|
|
163
172
|
from levelapp.core.session import EvaluationSession
|
|
164
173
|
|
|
165
174
|
# Load configuration from YAML
|
|
166
175
|
config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
167
176
|
|
|
168
|
-
# Run evaluation session
|
|
169
|
-
with EvaluationSession(session_name="
|
|
177
|
+
# Run evaluation session (You can enable/disable the monitoring aspect)
|
|
178
|
+
with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
|
|
170
179
|
session.run()
|
|
171
180
|
results = session.workflow.collect_results()
|
|
172
181
|
print("Results:", results)
|
|
@@ -175,6 +184,58 @@ if __name__ == "__main__":
|
|
|
175
184
|
print(f"session stats:\n{stats}")
|
|
176
185
|
```
|
|
177
186
|
|
|
187
|
+
Alternatively, if you want to pass the configuration and reference data from in-memory variables,
|
|
188
|
+
you can load the data manually as follows:
|
|
189
|
+
```python
|
|
190
|
+
if __name__ == "__main__":
|
|
191
|
+
from levelapp.workflow import WorkflowConfig
|
|
192
|
+
from levelapp.core.session import EvaluationSession
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
config_dict = {
|
|
196
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
197
|
+
"evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
|
|
198
|
+
"reference_data": {"path": "", "data": {}},
|
|
199
|
+
"endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
|
|
200
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
content = {
|
|
204
|
+
"scripts": [
|
|
205
|
+
{
|
|
206
|
+
"interactions": [
|
|
207
|
+
{
|
|
208
|
+
"user_message": "Hello!",
|
|
209
|
+
"reference_reply": "Hello, how can I help you!"
|
|
210
|
+
},
|
|
211
|
+
{
|
|
212
|
+
"user_message": "I need an apartment",
|
|
213
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
214
|
+
},
|
|
215
|
+
]
|
|
216
|
+
},
|
|
217
|
+
]
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
# Load configuration from a dict variable
|
|
221
|
+
config = WorkflowConfig.from_dict(content=config_dict)
|
|
222
|
+
|
|
223
|
+
# Load reference data from dict variable
|
|
224
|
+
config.set_reference_data(content=content)
|
|
225
|
+
|
|
226
|
+
evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
|
|
227
|
+
|
|
228
|
+
with evaluation_session as session:
|
|
229
|
+
session.run()
|
|
230
|
+
results = session.workflow.collect_results()
|
|
231
|
+
print("Results:", results)
|
|
232
|
+
|
|
233
|
+
stats = session.get_stats()
|
|
234
|
+
print(f"session stats:\n{stats}")
|
|
235
|
+
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
|
|
178
239
|
- This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
|
|
179
240
|
|
|
180
241
|
For more examples, see the `examples/` directory.
|
|
Binary file
|
|
Binary file
|