levelapp 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic.
- {levelapp-0.1.1 → levelapp-0.1.2}/PKG-INFO +8 -11
- {levelapp-0.1.1 → levelapp-0.1.2}/README.md +7 -8
- levelapp-0.1.2/docs/media/simulator-module-diagram.PNG +0 -0
- levelapp-0.1.2/docs/media/simulator-sequence-diagram.png +0 -0
- levelapp-0.1.2/examples/README.md +322 -0
- levelapp-0.1.2/examples/conversation_script.json +38 -0
- levelapp-0.1.2/examples/example_chatbot.py +48 -0
- levelapp-0.1.2/examples/example_evaluation.py +28 -0
- levelapp-0.1.2/examples/workflow_configuration.yaml +38 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/monitor.py +3 -1
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/__init__.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/scorer.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/endpoint.py +22 -13
- levelapp-0.1.2/levelapp/config/endpoint_.py +62 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/prompts.py +22 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/schemas.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/session.py +29 -3
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/evaluator/evaluator.py +16 -4
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/__init__.py +1 -5
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/schemas.py +7 -13
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/simulator.py +21 -18
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/utils.py +40 -78
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/base.py +38 -3
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/config.py +31 -4
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/context.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/factory.py +16 -3
- {levelapp-0.1.1 → levelapp-0.1.2}/pyproject.toml +1 -3
- levelapp-0.1.2/src/data/evaluation_results.json +1 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/workflow_config.yaml +3 -3
- levelapp-0.1.2/src/level_app/main_session.py +48 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/uv.lock +1 -5
- levelapp-0.1.1/examples/example_evaluation.py +0 -0
- levelapp-0.1.1/src/level_app/main_session.py +0 -46
- {levelapp-0.1.1 → levelapp-0.1.2}/.gitignore +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/.python-version +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/LICENSE +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/MANIFEST.in +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/Makefile +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/base.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/repository/firestore.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/registration.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/runtime.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/make.bat +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/project_structure.txt +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_session.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_simulator.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: levelapp
-Version: 0.1.1
+Version: 0.1.2
 Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
 Project-URL: Homepage, https://github.com/levelapp-org
 Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
@@ -17,14 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Testing
 Requires-Python: >=3.12
-Requires-Dist: arrow>=1.3.0
 Requires-Dist: google-api-core>=2.25.1
 Requires-Dist: google-auth>=2.40.3
 Requires-Dist: google-cloud-firestore>=2.21.0
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: humanize>=4.13.0
 Requires-Dist: numpy>=2.3.2
-Requires-Dist: openai>=1.99.9
 Requires-Dist: pandas-stubs==2.3.0.250703
 Requires-Dist: pandas>=2.3.1
 Requires-Dist: pydantic>=2.11.7
@@ -133,7 +131,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -220,14 +218,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
    config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -243,14 +241,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
         "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
@@ -81,7 +81,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -168,14 +168,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
    config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -191,14 +191,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
    config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
         "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
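The corrected `config_dict` above is the dictionary-based alternative to the YAML file. A minimal sketch of how it is meant to be consumed, built only from calls that appear elsewhere in this diff (`WorkflowConfig.from_dict`, `set_reference_data`, `EvaluationSession`); the inline `inline_script` value here is a hypothetical placeholder, not part of the release:

```python
from levelapp.workflow import WorkflowConfig
from levelapp.core.session import EvaluationSession

# In-memory configuration, matching the corrected config_dict shown in the hunk above.
config_dict = {
    "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
    "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
    "reference_data": {"path": "", "data": {}},
    "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
    "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
}

# Hypothetical inline reference data; any dict with a "scripts" array fits the shape used in the examples.
inline_script = {"scripts": [{"interactions": [], "description": "smoke test", "details": {}}]}

config = WorkflowConfig.from_dict(content=config_dict)
config.set_reference_data(content=inline_script)

with EvaluationSession(session_name="dict-config-session", workflow_config=config) as session:
    session.run()
    print("Results:", session.workflow.collect_results())
```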
Binary file
Binary file
@@ -0,0 +1,322 @@
+# Quickstart Guide: Using LevelApp's Conversation Simulator for Developers
+
+---
+#### Welcome to LevelApp Quickstart Guide!
+This guide provides a step-by-step walkthrough for developers to set up and use the Simulator Module in LevelApp.
+<br>
+<br>
+The **Simulator** focuses on black-box testing by simulating dialogues using predefined scripts, evaluating responses against references, and computing metrics on extracted metadata.
+It leverages LLM-as-a-judge for qualitative scoring and supports quantitative metrics like exact matches or fuzzy comparisons.
+<br>
+<figure>
+  <img
+  src="../docs/media/simulator-module-diagram.PNG"
+  alt="Module Diagram">
+  <figcaption>Fig.1 - Simulator Module Diagram</figcaption>
+</figure>
+<br>
+We'll emphasize technical details, including configuration schemas, placeholders, evaluators, metrics, and code execution flow. This assumes you're familiar with Python, YAML/JSON, and REST APIs for LLM endpoints. By the end, you'll have a runnable example for evaluating a chatbot's conversation flow.
+
+---
+## Introduction
+First, let's have a quick introduction to what LevelApp is and what it provides as a framework.
+
+The idea behind LevelApp is to build a framework that helps developers run regression tests on their LLM-powered systems, ensuring that recent changes to code have not negatively impacted existing functionality or introduced new defects. <br>
+Evaluating dialogue systems is cost- and time-intensive, since assessing the quality of a dialogue requires multiple iterations in which a human evaluates each message/reply pair (quite a tedious and boring task, if you ask me!).
+
+Automating the evaluation and introducing LLM-as-a-judge as an approach to evaluate the correctness of responses can
+ease the process and make it more efficient.
+---
+## Walkthrough
+### Step 1: Installation and Prerequisites
+Install LevelApp using pip. This pulls in dependencies like `pydantic`, `numpy`, `python-dotenv`,
+and others for handling LLM clients, data validation, and metrics computation.
+
+```bash
+pip install levelapp
+```
+
+#### Technical Prerequisites:
+
+* **Python Version**: 3.12+. LevelApp uses modern features like type hints and async support (via `asyncio` for potential batch processing).
+* **LLM Provider Credentials**: You'll need API keys for at least one supported provider (e.g., OpenAI, Anthropic, IONOS, Mistral). These are loaded via `python-dotenv` from a `.env` file. Without them, evaluators like JUDGE won't function.
+* **No Extra Dependency Installs**: All core deps are installed automatically; no manual `pip install` is needed beyond the initial command.
+* **Environment Setup**: Create a `.env` file in your project root. Example structure (replace with your actual keys):
+
+```
+IONOS_API_KEY=your-ionos-key
+OPENAI_API_KEY=sk-your-openai-key
+ANTHROPIC_API_KEY=your-anthropic-key
+MISTRAL_API_KEY=your-mistral-key
+
+IONOS_BASE_URL=https://inference.de-txl.ionos.com
+IONOS_MODEL_ID=0b6c4a15-bb8d-4092-82b0-f357b77c59fd
+
+# Optional: Path to workflow config if not loading programmatically
+WORKFLOW_CONFIG_PATH=../data/workflow_config.yaml
+```
+**Note**: For IONOS, the base_url and model_id are mandatory in .env as they aren't always configurable via YAML alone.
+LevelApp uses these to construct API requests.
+
+### Step 2: Understanding the Simulator Workflow
+The Simulator Module simulates conversations by:
+1. Sending user messages (from a JSON script) to your LLM-based system's endpoint.
+2. Capturing generated responses and metadata.
+3. Evaluating them using selected evaluators (e.g., JUDGE for LLM-scored quality, REFERENCE for direct comparison).
+4. Computing metrics on metadata (e.g., EXACT for string matching, LEVENSHTEIN for edit distance).
+
+<figure>
+  <img
+  src="../docs/media/simulator-sequence-diagram.png"
+  alt="Sequence Diagram">
+  <figcaption>Fig.2 - Conversation Simulator Sequence Diagram</figcaption>
+</figure>
+
+#### Key Technical Concepts:
+
+* **Workflow Type**: Set to `SIMULATOR` in YAML. This triggers dialogue simulation logic in `levelapp.workflow`.
+
+* **Evaluators**:
+  * `JUDGE`: Uses an LLM (from providers like OpenAI) to score generated replies against references (e.g., on relevance, fluency). Configurable via the providers list.
+  * `REFERENCE`: Direct comparison without an LLM, using metrics on metadata (used for comparing extracted metadata).
+  * `RAG`: Retrieval-Augmented Generation evaluator (for knowledge-grounded responses; requires additional setup).
+<br>
+<br>
+* **Metrics Map**: A dict mapping metadata fields to comparison methods (e.g., `EXACT` for exact string match, `LEVENSHTEIN` for fuzzy matching with distance thresholds).
+  The full list in the docs includes token-based, embedding (vector similarity), and fuzzy metrics.
+<br>
+<br>
+* **Attempts and Batching**: `attempts` in `evaluation_params` runs each simulation multiple times for averaging scores (useful for non-deterministic LLMs); `batch_size` controls concurrent requests to avoid rate limits.
+<br>
+<br>
+* **Placeholders in Payloads**:
+  * `default_request_payload_template`: For this section, you need to change the **field** names (e.g., change the field name `prompt` to `message`) and not the **placeholder** values. The placeholders are used by the simulator to populate the request body.
+  * `default_response_payload_template`: For this section, you need to change the placeholder values and not the fields, contrary to the request section. The simulator will use the provided placeholder values to extract and map the reply and metadata from the response body.
+
+### Step 3: Creating the YAML Configuration File
+Create `workflow_config.yaml` to define the workflow. This is parsed into a `WorkflowConfig` Pydantic model for validation.
+
+Example `workflow_config.yaml` for the Simulator:
+```YAML
+# PROCESS SECTION:
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR  # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3  # Run each interaction 3 times, average results
+    batch_size: 10  # Process 10 interactions concurrently
+
+# EVALUATION SECTION:
+evaluation:
+  evaluators:  # Array of evaluators to apply
+    - JUDGE
+    - REFERENCE  # REFERENCE evaluator can be used if your dialogue system returns additional metadata.
+  providers:  # LLM providers for JUDGE (at least one must be provided for the JUDGE evaluator)
+    - openai
+    - ionos
+  metrics_map:  # Map metadata fields to metrics
+    appointment_type: EXACT  # Exact match for strings
+    date: LEVENSHTEIN  # Fuzzy match for dates (e.g., tolerates formatting differences)
+    time: TOKEN_BASED  # Token-level overlap
+
+# REFERENCE DATA SECTION:
+reference_data:
+  path: "conversation_script.json"  # Path to JSON script
+  data: {}  # Inline data if not using path (dict of scripts)
+
+# ENDPOINT CONFIGURATION SECTION:
+endpoint:
+  base_url: "http://127.0.0.1:8000"  # Your chatbot's API base URL
+  url_path: "chat"  # Endpoint path (full URL = base_url + url_path)
+  api_key: ""  # Optional; overrides .env if set
+  bearer_token: ""  # For auth
+  model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Model for your endpoint (if applicable).
+  default_request_payload_template:  # Template for POST body
+    message: "${user_message}"  # Adapt to your API (e.g., 'prompt' for some)
+    payload: "${request_payload}"  # Additional data from JSON script
+  default_response_payload_template:  # Extract from API response
+    agent_reply: "${generated_reply}"  # Map to your response field
+    generated_metadata: "${metadata}"  # e.g., extracted entities
+
+# REPOSITORY SECTION (Optional):
+repository:
+  type: FILESYSTEM  # Or FIRESTORE/MONGODB for persistence
+  project_id: ""  # For FIRESTORE
+  database_name: ""  # For FIRESTORE/MONGODB
+  source: "LOCAL"  # Or IN_MEMORY for non-persistent
+```
+
+For the endpoint configuration section (`endpoint`), you essentially need to provide:
+* base_url
+* url_path
+* header data: API key, bearer token, or any additional header data.
+
+As for the request payload, for example, if you have the following request payload schema:
+```JSON
+{
+  "prompt": "Hello, world!",
+  "user_id": "0001",
+  "user_role": "ADMIN"
+}
+```
+You need to configure the `default_request_payload_template` like the following:
+```YAML
+default_request_payload_template:
+  prompt: "${user_message}"  # As you can notice, we only changed the field name and not the placeholder value.
+  payload: "${request_payload}"  # The rest of the data will be fetched from the "request_payload" field in the reference data JSON file.
+```
+while providing the rest of the request payload inside the reference data JSON file content:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, I would like to book an appointment with a doctor.",
+          "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "0001", "user_role": "ADMIN"} // Here we add the rest of the request payload data.
+        }
+        ...
+```
+And for the response payload, if you have the following response payload schema:
+```JSON
+{
+  "response": "Hello, world!",
+  "metadata": {"k1": "v1", "k2": "v2"},
+  "timestamp": "2025-10-14T14:49:00.123Z",
+  "status": "COMPLETE"
+}
+```
+You need to configure the `default_response_payload_template` like the following:
+```YAML
+default_response_payload_template:
+  agent_reply: "${response}"  # We changed the placeholder value here to "response", the field where the reply value is held.
+  generated_metadata: "${metadata}"
+```
+
+### Step 4: Creating the JSON Conversation Script
+The script defines simulation flows. It's a dict with a `scripts` array, each entry containing `interactions` (sequential turns).
+<br>
+Example `conversation_script.json`:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
+```
+#### Technical Notes:
+
+* **Schema Validation**: Interactions are validated against a schema (e.g., user_message: str, reference_metadata: dict).
+* **Metadata Comparison**: generated_metadata from your endpoint is compared to reference_metadata using metrics_map.
+* **Interaction Types**: initial/intermediate/final for flow control; can influence evaluator behavior (e.g., stricter on final turns).
+* **Request Payload**: Merged into the endpoint request template for context (e.g., user auth).
+
+### Step 5: Writing and Running the Python Script
+Use this to load configs, run the simulation, and collect results. LevelApp handles session management via context managers.
+<br>
+Example run_simulation.py:
+```Python
+from dotenv import load_dotenv
+from levelapp.workflow import WorkflowConfig
+from levelapp.core.session import EvaluationSession
+
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # Load YAML config (validates via Pydantic)
+    config = WorkflowConfig.load(path="workflow_config.yaml")
+
+    # Alternative: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...}  # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]})  # Inline script
+
+    # Create session (handles logging, repository init)
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # Run simulation: sends requests, evaluates, stores in repo
+        session.run()
+
+        # Collect the evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
+```
+
+Technical Execution Flow:
+
+1. `WorkflowConfig.load()`: Parses YAML, loads .env secrets, validates.
+2. `EvaluationSession`: Initializes the evaluation session.
+3. `session.run()`: Loops over scripts/interactions:
+   * Substitutes placeholders, sends a POST request to the endpoint.
+   * Extracts the chatbot reply and generated metadata.
+   * Applies evaluators (e.g., JUDGE prompts the LLM with "Score reply on scale 0-3: generated vs reference").
+   * Computes metrics (e.g., Levenshtein distance via numpy).
+4. `collect_results()`: Returns the evaluation results.
+5. `get_stats()`: Retrieves monitoring stats (API call details, caching details, processing time, etc.).
+
+---
+### Let's Test It:
+First, install the packages required to run the example:
+<br>
+(it is always recommended to set up a virtual environment for testing)
+```Bash
+pip install fastapi uvicorn levelapp
+```
+Second, run the chatbot (`example_chatbot.py`) using `uvicorn`:
+<br>
+(don't forget to add your `OPENAI_API_KEY`!)
+```Bash
+uvicorn example_chatbot:app --reload --port 8000
+```
+Next, optionally, run a health check to see if the chatbot is alive:
+```Bash
+curl http://localhost:8000/healthz
+```
+Finally, run the evaluation:
+```Bash
+python example_evaluation.py
+```
+
+That's it! All you need now is to verify and interpret the evaluation results.
+**Good luck!**
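The placeholder convention from Step 3 can be pictured with a small standalone sketch. This is not LevelApp's internal code; it only illustrates, with plain dict handling, how a `${user_message}` slot in the request template gets filled from a scripted interaction and how a response-template value such as `${response}` names the field to read back:

```python
# Hypothetical request template, mirroring the README's default_request_payload_template.
request_template = {"prompt": "${user_message}", "payload": "${request_payload}"}

def render_request(template: dict, interaction: dict) -> dict:
    """Fill ${...} placeholders with values taken from one scripted interaction."""
    rendered = {}
    for field, value in template.items():
        if isinstance(value, str) and value.startswith("${") and value.endswith("}"):
            rendered[field] = interaction.get(value[2:-1])  # "${user_message}" -> interaction["user_message"]
        else:
            rendered[field] = value
    return rendered

interaction = {
    "user_message": "Hello, book a doctor appointment.",
    "request_payload": {"user_id": "123", "role": "user"},
}
print(render_request(request_template, interaction))
# {'prompt': 'Hello, book a doctor appointment.', 'payload': {'user_id': '123', 'role': 'user'}}

# Response side: the placeholder value names the field to extract from the endpoint's response body.
response_template = {"agent_reply": "${response}", "generated_metadata": "${metadata}"}
api_response = {"response": "Sure, what type of doctor?", "metadata": {"intent": "booking"}}
extracted = {key: api_response.get(value[2:-1]) for key, value in response_template.items()}
print(extracted)
# {'agent_reply': 'Sure, what type of doctor?', 'generated_metadata': {'intent': 'booking'}}
```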
@@ -0,0 +1,38 @@
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
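The quickstart's Technical Notes mention that interactions are schema-validated. As a quick sanity check before running a simulation, the script file above can be loaded and its shape verified by hand; this is only a sketch of the structure shown in `conversation_script.json`, not LevelApp's own Pydantic validation:

```python
import json

REQUIRED_KEYS = {"user_message", "reference_reply", "interaction_type",
                 "reference_metadata", "guardrail_flag", "request_payload"}

with open("conversation_script.json", encoding="utf-8") as fh:
    script_file = json.load(fh)

for script in script_file["scripts"]:
    for index, interaction in enumerate(script["interactions"]):
        missing = REQUIRED_KEYS - interaction.keys()
        if missing:
            raise ValueError(f"Interaction {index} is missing keys: {sorted(missing)}")

total = sum(len(script["interactions"]) for script in script_file["scripts"])
print(f"Loaded {total} interactions across {len(script_file['scripts'])} script(s).")
```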
@@ -0,0 +1,48 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from openai import OpenAI
+
+app = FastAPI(title="Tiny Chatbot")
+
+client = OpenAI(api_key="<YOUR-API-KEY-HERE>")
+
+SYSTEM_PROMPT = "Play role as a medical assistant."
+
+
+class ChatRequest(BaseModel):
+    message: str
+
+
+class ChatResponse(BaseModel):
+    reply: str
+
+
+def generate_reply(user_message: str) -> str:
+    try:
+        resp = client.chat.completions.create(
+            model="gpt-4o-mini",  # pick any chat-capable model you have access to
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_message},
+            ],
+            temperature=0.3,
+        )
+        return resp.choices[0].message.content.strip()
+    except Exception as e:
+        raise RuntimeError(f"LLM error: {e}")
+
+
+@app.post("/chat", response_model=ChatResponse)
+def chat(req: ChatRequest):
+    if not req.message:
+        raise HTTPException(status_code=400, detail="`message` is required.")
+    try:
+        reply = generate_reply(req.message)
+        return ChatResponse(reply=reply)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/healthz")
+def health():
+    return {"status": "ok"}
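One practical note on `example_chatbot.py`: the `openai` package is no longer a declared levelapp dependency in 0.1.2, so it may need to be installed alongside `fastapi` and `uvicorn`, and the hard-coded key can be replaced by reading from the same `.env` file the quickstart already sets up. A minimal variant of the client setup, assuming `OPENAI_API_KEY` is defined in `.env`:

```python
import os

from dotenv import load_dotenv
from openai import OpenAI

# Pull OPENAI_API_KEY (and any other provider keys) from the project's .env file.
load_dotenv()

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```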
@@ -0,0 +1,28 @@
+from dotenv import load_dotenv
+from levelapp.core.session import EvaluationSession
+from levelapp.workflow import WorkflowConfig
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # 1. Load YAML config
+    config = WorkflowConfig.load(path="workflow_configuration.yaml")
+
+    # Alternatively: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...}  # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]})  # Inline script
+
+    # 2. Create an evaluation session
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # 2.1. Run session (simulation session)
+        session.run()
+
+        # 2.2. Collect evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        # 3. Get aggregated stats (monitoring stats)
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
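This release also ships a `src/data/evaluation_results.json` artifact, so a natural follow-up to `example_evaluation.py` is persisting the collected results. A small sketch, assuming the object returned by `collect_results()` is JSON-serializable (non-serializable values fall back to `str` here):

```python
import json
from typing import Any

def save_results(results: Any, path: str = "evaluation_results.json") -> None:
    """Write collected evaluation results to disk for later inspection."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2, default=str)

# Usage, inside the session block above:
# save_results(session.workflow.collect_results())
```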
@@ -0,0 +1,38 @@
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR  # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3
+    batch_size: 10
+
+evaluation:
+  evaluators:
+    - JUDGE
+    - REFERENCE
+  providers:
+    - openai
+    - ionos
+  metrics_map:
+    appointment_type: EXACT
+    date: LEVENSHTEIN
+    time: TOKEN_BASED
+
+reference_data:
+  path: "conversation_script.json"
+  data: {}
+
+endpoint:
+  base_url: "http://127.0.0.1:8000"
+  url_path: "chat"
+  api_key: ""
+  bearer_token: ""
+  default_request_payload_template:
+    message: "${user_message}"
+  default_response_payload_template:
+    agent_reply: "${reply}"
+
+repository:
+  type: FIRESTORE  # Keep this value as is for now.
+  project_id: ""
+  database_name: ""
+  source: "LOCAL"
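Worth noticing in `workflow_configuration.yaml`: the request template's `message` field and the response template's `${reply}` placeholder line up with the example chatbot's `ChatRequest.message` and `ChatResponse.reply` models. A small cross-check sketch, assuming PyYAML is available (it is not a declared levelapp dependency, so `pip install pyyaml` if needed):

```python
import yaml  # provided by the pyyaml package

with open("workflow_configuration.yaml", encoding="utf-8") as fh:
    cfg = yaml.safe_load(fh)

endpoint = cfg["endpoint"]
# The chatbot's POST body must contain the field named in the request template...
assert "message" in endpoint["default_request_payload_template"]
# ...and the response template's placeholder must name a field the chatbot actually returns.
assert endpoint["default_response_payload_template"]["agent_reply"] == "${reply}"
print("Template fields line up with the example chatbot's request/response models.")
```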
@@ -343,6 +343,7 @@ class FunctionMonitor:
         category: MetricType,
         enable_timing: bool,
         track_memory: bool,
+        verbose=False
     ) -> Callable[P, T]:
         """
         Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
             name: Unique identifier for the function
             enable_timing: Enable execution time logging
             track_memory: Enable memory tracking
+            verbose: Enable verbose logging
 
         Returns:
             Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:
 
             self._aggregated_stats[name].update(metrics=metrics)
 
-            if enable_timing and metrics.duration is not None:
+            if verbose and enable_timing and metrics.duration is not None:
                 log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
                 if metrics.cache_hit:
                     log_message += " (cache hit)"
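The three `monitor.py` hunks above add a `verbose` switch so the per-call timing log is only emitted when explicitly requested. As a standalone illustration of that pattern (this is not `FunctionMonitor` itself, just a generic decorator showing the same verbose-gated timing idea):

```python
import functools
import time
from typing import Callable, TypeVar

T = TypeVar("T")

def timed(name: str, enable_timing: bool = True, verbose: bool = False) -> Callable:
    """Wrap a function with timing; print the duration only when verbose logging is enabled."""
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> T:
            start = time.perf_counter()
            result = func(*args, **kwargs)
            duration = time.perf_counter() - start
            if verbose and enable_timing:  # mirrors the gated log in the hunk above
                print(f"[timed] Executed '{name}' in {duration:.4f}s")
            return result
        return wrapper
    return decorator

@timed(name="slow_add", verbose=False)  # timing still happens, but nothing is printed
def slow_add(a: int, b: int) -> int:
    time.sleep(0.01)
    return a + b

print(slow_add(1, 2))
```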
@@ -44,7 +44,6 @@ class ClientRegistry:
 
         cls._wrap_client_methods(client_class)
         cls._clients[provider] = client_class
-        logger.info(f"[ClientRegistry] Registered client for provider: {provider}")
 
     @classmethod
     def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None:
@@ -78,7 +78,6 @@ class MetricsManager:
             ValueError: if the scorer is not a callable.
         """
         self._scorers[name] = scorer
-        logger.info(f"[MetricsManager] Registered scorer: {name}")
 
     def get_scorer(self, name: str) -> Callable:
         """
@@ -95,7 +94,6 @@ class MetricsManager:
         """
         try:
             scorer = self._scorers.get(name)
-            logger.info(f"[get_scorer] Retrieved scorer: {name}")
             return scorer
 
         except KeyError: