levelapp 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of levelapp might be problematic. Click here for more details.

Files changed (82) hide show
  1. {levelapp-0.1.0 → levelapp-0.1.1}/PKG-INFO +101 -35
  2. {levelapp-0.1.0 → levelapp-0.1.1}/README.md +95 -33
  3. levelapp-0.1.1/levelapp/core/schemas.py +26 -0
  4. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/session.py +68 -56
  5. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/evaluator/evaluator.py +27 -11
  6. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/repository/firestore.py +15 -6
  7. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/schemas.py +9 -9
  8. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/simulator.py +103 -37
  9. levelapp-0.1.1/levelapp/workflow/__init__.py +6 -0
  10. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/workflow/base.py +26 -14
  11. levelapp-0.1.1/levelapp/workflow/config.py +65 -0
  12. levelapp-0.1.1/levelapp/workflow/context.py +63 -0
  13. levelapp-0.1.1/levelapp/workflow/factory.py +29 -0
  14. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/workflow/registration.py +1 -1
  15. levelapp-0.1.1/levelapp/workflow/runtime.py +19 -0
  16. {levelapp-0.1.0 → levelapp-0.1.1}/pyproject.toml +6 -2
  17. levelapp-0.1.1/src/data/workflow_config.yaml +41 -0
  18. levelapp-0.1.1/src/level_app/main_session.py +46 -0
  19. {levelapp-0.1.0 → levelapp-0.1.1}/uv.lock +212 -1
  20. levelapp-0.1.0/levelapp/workflow/__init__.py +0 -5
  21. levelapp-0.1.0/levelapp/workflow/factory.py +0 -51
  22. levelapp-0.1.0/levelapp/workflow/schemas.py +0 -121
  23. levelapp-0.1.0/src/data/workflow_config.yaml +0 -35
  24. levelapp-0.1.0/src/level_app/main_session.py +0 -16
  25. {levelapp-0.1.0 → levelapp-0.1.1}/.gitignore +0 -0
  26. {levelapp-0.1.0 → levelapp-0.1.1}/.python-version +0 -0
  27. {levelapp-0.1.0 → levelapp-0.1.1}/LICENSE +0 -0
  28. {levelapp-0.1.0 → levelapp-0.1.1}/MANIFEST.in +0 -0
  29. {levelapp-0.1.0 → levelapp-0.1.1}/Makefile +0 -0
  30. {levelapp-0.1.0 → levelapp-0.1.1}/examples/example_evaluation.py +0 -0
  31. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/__init__.py +0 -0
  32. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/__init__.py +0 -0
  33. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/loader.py +0 -0
  34. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/logger.py +0 -0
  35. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/monitor.py +0 -0
  36. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/aspects/sanitizer.py +0 -0
  37. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/__init__.py +0 -0
  38. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/anthropic.py +0 -0
  39. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/ionos.py +0 -0
  40. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/mistral.py +0 -0
  41. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/clients/openai.py +0 -0
  42. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/__init__.py +0 -0
  43. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/comparator.py +0 -0
  44. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/extractor.py +0 -0
  45. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/schemas.py +0 -0
  46. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/scorer.py +0 -0
  47. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/comparator/utils.py +0 -0
  48. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/__init__.py +0 -0
  49. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/endpoint.py +0 -0
  50. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/config/prompts.py +0 -0
  51. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/__init__.py +0 -0
  52. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/core/base.py +0 -0
  53. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/evaluator/__init__.py +0 -0
  54. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/__init__.py +0 -0
  55. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/embedding.py +0 -0
  56. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/exact.py +0 -0
  57. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/fuzzy.py +0 -0
  58. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/metrics/token.py +0 -0
  59. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/plugins/__init__.py +0 -0
  60. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/repository/__init__.py +0 -0
  61. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/__init__.py +0 -0
  62. {levelapp-0.1.0 → levelapp-0.1.1}/levelapp/simulator/utils.py +0 -0
  63. {levelapp-0.1.0 → levelapp-0.1.1}/make.bat +0 -0
  64. {levelapp-0.1.0 → levelapp-0.1.1}/project_structure.txt +0 -0
  65. {levelapp-0.1.0 → levelapp-0.1.1}/src/data/conversation_example_1.json +0 -0
  66. {levelapp-0.1.0 → levelapp-0.1.1}/src/data/endpoint_configuration.yaml +0 -0
  67. {levelapp-0.1.0 → levelapp-0.1.1}/src/data/payload_example_1.yaml +0 -0
  68. {levelapp-0.1.0 → levelapp-0.1.1}/src/data/payload_example_2.yaml +0 -0
  69. {levelapp-0.1.0 → levelapp-0.1.1}/src/data/workflow_config_2.json +0 -0
  70. {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/__init__.py +0 -0
  71. {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main.py +0 -0
  72. {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main_monitoring.py +0 -0
  73. {levelapp-0.1.0 → levelapp-0.1.1}/src/level_app/main_simulator.py +0 -0
  74. {levelapp-0.1.0 → levelapp-0.1.1}/tests/__init__.py +0 -0
  75. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_anthropic.py +0 -0
  76. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_comparator.py +0 -0
  77. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_ionos.py +0 -0
  78. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_mistral.py +0 -0
  79. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_monitoring.py +0 -0
  80. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_openai.py +0 -0
  81. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_session.py +0 -0
  82. {levelapp-0.1.0 → levelapp-0.1.1}/tests/test_simulator.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: levelapp
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
5
5
  Project-URL: Homepage, https://github.com/levelapp-org
6
6
  Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
7
7
  Project-URL: Documentation, https://levelapp.readthedocs.io
8
8
  Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
9
- Author-email: KadriSof <kadrisofyen@gmail.com>
9
+ Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
10
10
  License-File: LICENSE
11
11
  Keywords: ai,evaluation,framework,llm,testing
12
12
  Classifier: Development Status :: 3 - Alpha
@@ -18,7 +18,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Classifier: Topic :: Software Development :: Testing
19
19
  Requires-Python: >=3.12
20
20
  Requires-Dist: arrow>=1.3.0
21
+ Requires-Dist: google-api-core>=2.25.1
22
+ Requires-Dist: google-auth>=2.40.3
23
+ Requires-Dist: google-cloud-firestore>=2.21.0
21
24
  Requires-Dist: httpx>=0.28.1
25
+ Requires-Dist: humanize>=4.13.0
22
26
  Requires-Dist: numpy>=2.3.2
23
27
  Requires-Dist: openai>=1.99.9
24
28
  Requires-Dist: pandas-stubs==2.3.0.250703
@@ -91,38 +95,47 @@ pip install levelapp
91
95
  LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
92
96
 
93
97
  ```yaml
94
- project_name: "test-project"
95
- evaluation_params:
96
- attempts: 1 # Number of simulation attempts.
98
+ process:
99
+ project_name: "test-project"
100
+ workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
101
+ evaluation_params:
102
+ attempts: 1 # Add the number of simulation attempts.
103
+ batch_size: 5
104
+
105
+ evaluation:
106
+ evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
107
+ - JUDGE
108
+ - REFERENCE
109
+ providers:
110
+ - openai
111
+ - ionos
112
+ metrics_map:
113
+ field_1: EXACT
114
+ field_2 : LEVENSHTEIN
97
115
 
98
- workflow: SIMULATOR # SIMULATOR, COMPARATOR, ASSESSOR.
99
- repository: FIRESTORE # FIRESTORE, FILESYSTEM, MONGODB.
100
- evaluators: # JUDGE, REFERENCE, RAG.
101
- - JUDGE
102
- - REFERENCE
116
+ reference_data:
117
+ path:
118
+ data:
103
119
 
104
- endpoint_configuration:
120
+ endpoint:
105
121
  base_url: "http://127.0.0.1:8000"
106
122
  url_path: ''
107
123
  api_key: "<API-KEY>"
108
124
  bearer_token: "<BEARER-TOKEN>"
109
125
  model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
110
- payload_path: "../../src/data/payload_example_1.yaml"
111
126
  default_request_payload_template:
127
+ # Change the user message field name only according to the request payload schema (example: 'prompt' to 'message').
112
128
  prompt: "${user_message}"
113
129
  details: "${request_payload}" # Rest of the request payload data.
114
130
  default_response_payload_template:
131
+ # Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
115
132
  agent_reply: "${agent_reply}"
116
- guardrail_flag: "${guardrail_flag}"
117
133
  generated_metadata: "${generated_metadata}"
118
134
 
119
- reference_data:
120
- source: LOCAL # LOCAL or REMOTE.
121
- path: "../../src/data/conversation_example_1.json"
122
-
123
- metrics_map:
124
- field_1: EXACT
125
- field_2: LEVENSHTEIN
135
+ repository:
136
+ type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB.
137
+ project_id: "(default)"
138
+ database_name: ""
126
139
  ```
127
140
 
128
141
  - **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
@@ -133,33 +146,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
133
146
 
134
147
  ```json
135
148
  {
136
- "id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
137
149
  "scripts": [
138
150
  {
139
- "id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
140
151
  "interactions": [
141
152
  {
142
- "id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
143
153
  "user_message": "Hello, I would like to book an appointment with a doctor.",
144
154
  "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
145
155
  "interaction_type": "initial",
146
156
  "reference_metadata": {},
147
- "generated_metadata": {},
148
157
  "guardrail_flag": false,
149
158
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
150
159
  },
151
160
  {
152
- "id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
153
161
  "user_message": "I need to see a cardiologist.",
154
162
  "reference_reply": "When would you like to schedule your appointment?",
155
163
  "interaction_type": "intermediate",
156
164
  "reference_metadata": {},
157
- "generated_metadata": {},
158
165
  "guardrail_flag": false,
159
166
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
160
167
  },
161
168
  {
162
- "id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
163
169
  "user_message": "I would like to book it for next Monday morning.",
164
170
  "reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
165
171
  "interaction_type": "intermediate",
@@ -168,11 +174,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
168
174
  "date": "next Monday",
169
175
  "time": "10 AM"
170
176
  },
171
- "generated_metadata": {
172
- "appointment_type": "Cardiology",
173
- "date": "next Monday",
174
- "time": "morning"
175
- },
176
177
  "guardrail_flag": false,
177
178
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
178
179
  },
@@ -182,7 +183,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
182
183
  "reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
183
184
  "interaction_type": "final",
184
185
  "reference_metadata": {},
185
- "generated_metadata": {},
186
186
  "guardrail_flag": false,
187
187
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
188
188
  }
@@ -195,9 +195,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
195
195
  ]
196
196
  }
197
197
  ```
198
-
199
198
  - **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
200
199
 
200
+ In the `.env` file, add the credentials for the LLM providers that will be used in the evaluation process.
201
+ ```
202
+ OPENAI_API_KEY=
203
+ IONOS_API_KEY=
204
+ ANTHROPIC_API_KEY=
205
+ MISTRAL_API_KEY=
206
+
207
+ # For IONOS, you must include the base URL and the model ID.
208
+ IONOS_BASE_URL="https://inference.de-txl.ionos.com"
209
+ IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
210
+
211
+ WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
212
+ ```
213
+
201
214
  ## Usage Example
202
215
 
203
216
  To run an evaluation:
@@ -214,15 +227,68 @@ if __name__ == "__main__":
214
227
  config = WorkflowConfig.load(path="../data/workflow_config.yaml")
215
228
 
216
229
  # Run evaluation session
217
- with EvaluationSession(session_name="sim-test", workflow_config=config) as session:
230
+ with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
231
+ session.run()
232
+ results = session.workflow.collect_results()
233
+ print("Results:", results)
234
+
235
+ stats = session.get_stats()
236
+ print(f"session stats:\n{stats}")
237
+ ```
238
+
239
+ Alternatively, if you want to pass the configuration and reference data from in-memory variables,
240
+ you can load the data manually as follows:
241
+ ```python
242
+ if __name__ == "__main__":
243
+ from levelapp.workflow import WorkflowConfig
244
+ from levelapp.core.session import EvaluationSession
245
+
246
+ # Firestore -> retrieve endpoint config -> data => config_dict
247
+
248
+ config_dict = {
249
+ "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
250
+ "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
251
+ "reference_data": {"path": "", "data": {}},
252
+ "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
253
+ "repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
254
+ }
255
+
256
+ content = {
257
+ "scripts": [
258
+ {
259
+ "interactions": [
260
+ {
261
+ "user_message": "Hello!",
262
+ "reference_reply": "Hello, how can I help you!"
263
+ },
264
+ {
265
+ "user_message": "I need an apartment",
266
+ "reference_reply": "sorry, but I can only assist you with booking medical appointments."
267
+ },
268
+ ]
269
+ },
270
+ ]
271
+ }
272
+
273
+ # Load configuration from a dict variable
274
+ config = WorkflowConfig.from_dict(content=config_dict)
275
+
276
+ # Load reference data from dict variable
277
+ config.set_reference_data(content=content)
278
+
279
+ evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
280
+
281
+ with evaluation_session as session:
218
282
  session.run()
219
283
  results = session.workflow.collect_results()
220
284
  print("Results:", results)
221
285
 
222
286
  stats = session.get_stats()
223
287
  print(f"session stats:\n{stats}")
288
+
224
289
  ```
225
290
 
291
+
226
292
  - This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
227
293
 
228
294
  For more examples, see the `examples/` directory.
@@ -43,38 +43,47 @@ pip install levelapp
43
43
  LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
44
44
 
45
45
  ```yaml
46
- project_name: "test-project"
47
- evaluation_params:
48
- attempts: 1 # Number of simulation attempts.
46
+ process:
47
+ project_name: "test-project"
48
+ workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
49
+ evaluation_params:
50
+ attempts: 1 # Add the number of simulation attempts.
51
+ batch_size: 5
52
+
53
+ evaluation:
54
+ evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
55
+ - JUDGE
56
+ - REFERENCE
57
+ providers:
58
+ - openai
59
+ - ionos
60
+ metrics_map:
61
+ field_1: EXACT
62
+ field_2 : LEVENSHTEIN
49
63
 
50
- workflow: SIMULATOR # SIMULATOR, COMPARATOR, ASSESSOR.
51
- repository: FIRESTORE # FIRESTORE, FILESYSTEM, MONGODB.
52
- evaluators: # JUDGE, REFERENCE, RAG.
53
- - JUDGE
54
- - REFERENCE
64
+ reference_data:
65
+ path:
66
+ data:
55
67
 
56
- endpoint_configuration:
68
+ endpoint:
57
69
  base_url: "http://127.0.0.1:8000"
58
70
  url_path: ''
59
71
  api_key: "<API-KEY>"
60
72
  bearer_token: "<BEARER-TOKEN>"
61
73
  model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
62
- payload_path: "../../src/data/payload_example_1.yaml"
63
74
  default_request_payload_template:
75
+ # Change the user message field name only according to the request payload schema (example: 'prompt' to 'message').
64
76
  prompt: "${user_message}"
65
77
  details: "${request_payload}" # Rest of the request payload data.
66
78
  default_response_payload_template:
79
+ # Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
67
80
  agent_reply: "${agent_reply}"
68
- guardrail_flag: "${guardrail_flag}"
69
81
  generated_metadata: "${generated_metadata}"
70
82
 
71
- reference_data:
72
- source: LOCAL # LOCAL or REMOTE.
73
- path: "../../src/data/conversation_example_1.json"
74
-
75
- metrics_map:
76
- field_1: EXACT
77
- field_2: LEVENSHTEIN
83
+ repository:
84
+ type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB.
85
+ project_id: "(default)"
86
+ database_name: ""
78
87
  ```
79
88
 
80
89
  - **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, auth, payload templates).
@@ -85,33 +94,26 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
85
94
 
86
95
  ```json
87
96
  {
88
- "id": "1fa6f6ed-3cfe-4c0b-b389-7292f58879d4",
89
97
  "scripts": [
90
98
  {
91
- "id": "65f58cec-d55d-4a24-bf16-fa8327a3aa6b",
92
99
  "interactions": [
93
100
  {
94
- "id": "e99a2898-6a79-4a20-ac85-dfe977ea9935",
95
101
  "user_message": "Hello, I would like to book an appointment with a doctor.",
96
102
  "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
97
103
  "interaction_type": "initial",
98
104
  "reference_metadata": {},
99
- "generated_metadata": {},
100
105
  "guardrail_flag": false,
101
106
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
102
107
  },
103
108
  {
104
- "id": "fe5c539a-d0a1-40ee-97bd-dbe456703ccc",
105
109
  "user_message": "I need to see a cardiologist.",
106
110
  "reference_reply": "When would you like to schedule your appointment?",
107
111
  "interaction_type": "intermediate",
108
112
  "reference_metadata": {},
109
- "generated_metadata": {},
110
113
  "guardrail_flag": false,
111
114
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
112
115
  },
113
116
  {
114
- "id": "2cfdbd1c-a065-48bb-9aa9-b958342154b1",
115
117
  "user_message": "I would like to book it for next Monday morning.",
116
118
  "reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
117
119
  "interaction_type": "intermediate",
@@ -120,11 +122,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
120
122
  "date": "next Monday",
121
123
  "time": "10 AM"
122
124
  },
123
- "generated_metadata": {
124
- "appointment_type": "Cardiology",
125
- "date": "next Monday",
126
- "time": "morning"
127
- },
128
125
  "guardrail_flag": false,
129
126
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
130
127
  },
@@ -134,7 +131,6 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
134
131
  "reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
135
132
  "interaction_type": "final",
136
133
  "reference_metadata": {},
137
- "generated_metadata": {},
138
134
  "guardrail_flag": false,
139
135
  "request_payload": {"user_id": "0001", "user_role": "ADMIN"}
140
136
  }
@@ -147,9 +143,22 @@ For conversation scripts (used in Simulator), provide a JSON file with this sche
147
143
  ]
148
144
  }
149
145
  ```
150
-
151
146
  - **Fields**: Include user messages, reference replies, metadata for comparison, guardrail flags, and request payloads.
152
147
 
148
+ In the `.env` file, add the credentials for the LLM providers that will be used in the evaluation process.
149
+ ```
150
+ OPENAI_API_KEY=
151
+ IONOS_API_KEY=
152
+ ANTHROPIC_API_KEY=
153
+ MISTRAL_API_KEY=
154
+
155
+ # For IONOS, you must include the base URL and the model ID.
156
+ IONOS_BASE_URL="https://inference.de-txl.ionos.com"
157
+ IONOS_MODEL_ID="0b6c4a15-bb8d-4092-82b0-f357b77c59fd"
158
+
159
+ WORKFLOW_CONFIG_PATH="../../src/data/workflow_config_1.yaml"
160
+ ```
161
+
153
162
  ## Usage Example
154
163
 
155
164
  To run an evaluation:
@@ -166,15 +175,68 @@ if __name__ == "__main__":
166
175
  config = WorkflowConfig.load(path="../data/workflow_config.yaml")
167
176
 
168
177
  # Run evaluation session
169
- with EvaluationSession(session_name="sim-test", workflow_config=config) as session:
178
+ with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
179
+ session.run()
180
+ results = session.workflow.collect_results()
181
+ print("Results:", results)
182
+
183
+ stats = session.get_stats()
184
+ print(f"session stats:\n{stats}")
185
+ ```
186
+
187
+ Alternatively, if you want to pass the configuration and reference data from in-memory variables,
188
+ you can load the data manually as follows:
189
+ ```python
190
+ if __name__ == "__main__":
191
+ from levelapp.workflow import WorkflowConfig
192
+ from levelapp.core.session import EvaluationSession
193
+
194
+ # Firestore -> retrieve endpoint config -> data => config_dict
195
+
196
+ config_dict = {
197
+ "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
198
+ "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
199
+ "reference_data": {"path": "", "data": {}},
200
+ "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
201
+ "repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
202
+ }
203
+
204
+ content = {
205
+ "scripts": [
206
+ {
207
+ "interactions": [
208
+ {
209
+ "user_message": "Hello!",
210
+ "reference_reply": "Hello, how can I help you!"
211
+ },
212
+ {
213
+ "user_message": "I need an apartment",
214
+ "reference_reply": "sorry, but I can only assist you with booking medical appointments."
215
+ },
216
+ ]
217
+ },
218
+ ]
219
+ }
220
+
221
+ # Load configuration from a dict variable
222
+ config = WorkflowConfig.from_dict(content=config_dict)
223
+
224
+ # Load reference data from dict variable
225
+ config.set_reference_data(content=content)
226
+
227
+ evaluation_session = EvaluationSession(session_name="test-session-2", workflow_config=config)
228
+
229
+ with evaluation_session as session:
170
230
  session.run()
171
231
  results = session.workflow.collect_results()
172
232
  print("Results:", results)
173
233
 
174
234
  stats = session.get_stats()
175
235
  print(f"session stats:\n{stats}")
236
+
176
237
  ```
177
238
 
239
+
178
240
  - This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
179
241
 
180
242
  For more examples, see the `examples/` directory.
@@ -0,0 +1,26 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ExtendedEnum(Enum):
5
+ @classmethod
6
+ def list(cls):
7
+ return [e.value for e in cls]
8
+
9
+
10
+ class WorkflowType(ExtendedEnum):
11
+ SIMULATOR = "SIMULATOR"
12
+ COMPARATOR = "COMPARATOR"
13
+ ASSESSOR = "ASSESSOR"
14
+
15
+
16
+ class RepositoryType(ExtendedEnum):
17
+ FIRESTORE = "FIRESTORE"
18
+ FILESYSTEM = "FILESYSTEM"
19
+
20
+
21
+ class EvaluatorType(ExtendedEnum):
22
+ JUDGE = "JUDGE"
23
+ REFERENCE = "REFERENCE"
24
+ RAG = "RAG"
25
+
26
+