levelapp 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levelapp-0.1.1 → levelapp-0.1.3}/PKG-INFO +12 -13
- {levelapp-0.1.1 → levelapp-0.1.3}/README.md +7 -8
- levelapp-0.1.3/docs/media/simulator-module-diagram.PNG +0 -0
- levelapp-0.1.3/docs/media/simulator-sequence-diagram.png +0 -0
- levelapp-0.1.3/examples/README.md +322 -0
- levelapp-0.1.3/examples/conversation_script.json +38 -0
- levelapp-0.1.3/examples/example_chatbot.py +48 -0
- levelapp-0.1.3/examples/example_evaluation.py +28 -0
- levelapp-0.1.3/examples/workflow_configuration.yaml +38 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/monitor.py +6 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/__init__.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/scorer.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/endpoint.py +22 -13
- levelapp-0.1.3/levelapp/config/endpoint_.py +62 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/prompts.py +22 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/schemas.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/session.py +29 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/evaluator/evaluator.py +16 -4
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/__init__.py +1 -5
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/schemas.py +7 -13
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/simulator.py +24 -21
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/utils.py +40 -78
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/base.py +38 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/config.py +31 -4
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/context.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/factory.py +16 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/pyproject.toml +67 -67
- levelapp-0.1.3/src/data/evaluation_results.json +1 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/workflow_config.yaml +3 -3
- levelapp-0.1.3/src/level_app/main_session.py +48 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/uv.lock +1 -5
- levelapp-0.1.1/examples/example_evaluation.py +0 -0
- levelapp-0.1.1/src/level_app/main_session.py +0 -46
- {levelapp-0.1.1 → levelapp-0.1.3}/.gitignore +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/.python-version +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/LICENSE +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/MANIFEST.in +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/Makefile +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/base.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/repository/firestore.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/registration.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/runtime.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/make.bat +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/project_structure.txt +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_session.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_simulator.py +0 -0
{levelapp-0.1.1 → levelapp-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: levelapp
-Version: 0.1.1
+Version: 0.1.3
 Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
 Project-URL: Homepage, https://github.com/levelapp-org
 Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
@@ -17,14 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Testing
 Requires-Python: >=3.12
-Requires-Dist: arrow>=1.3.0
 Requires-Dist: google-api-core>=2.25.1
 Requires-Dist: google-auth>=2.40.3
 Requires-Dist: google-cloud-firestore>=2.21.0
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: humanize>=4.13.0
 Requires-Dist: numpy>=2.3.2
-Requires-Dist: openai>=1.99.9
 Requires-Dist: pandas-stubs==2.3.0.250703
 Requires-Dist: pandas>=2.3.1
 Requires-Dist: pydantic>=2.11.7
@@ -35,10 +33,12 @@ Requires-Dist: rapidfuzz>=3.13.0
 Requires-Dist: requests>=2.32.4
 Requires-Dist: tenacity>=9.1.2
 Provides-Extra: dev
-Requires-Dist:
+Requires-Dist: google-api-core>=2.25.1; extra == 'dev'
+Requires-Dist: google-auth>=2.40.3; extra == 'dev'
+Requires-Dist: google-cloud-firestore>=2.21.0; extra == 'dev'
 Requires-Dist: httpx>=0.28.1; extra == 'dev'
+Requires-Dist: humanize>=4.13.0; extra == 'dev'
 Requires-Dist: numpy>=2.3.2; extra == 'dev'
-Requires-Dist: openai>=1.99.9; extra == 'dev'
 Requires-Dist: pandas-stubs==2.3.0.250703; extra == 'dev'
 Requires-Dist: pandas>=2.3.1; extra == 'dev'
 Requires-Dist: pydantic>=2.11.7; extra == 'dev'
@@ -133,7 +133,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -220,14 +220,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
     config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -243,14 +243,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
        "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
{levelapp-0.1.1 → levelapp-0.1.3}/README.md
@@ -81,7 +81,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -168,14 +168,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
     config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -191,14 +191,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
         "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
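For context, the dict-based configuration that the PKG-INFO and README.md hunks above complete (closing the `repository` entry and extending `evaluation` with a `REFERENCE` evaluator and a `metrics_map`) can be read as one piece. The sketch below is assembled only from calls that appear elsewhere in this diff (`WorkflowConfig.from_dict`, `set_reference_data`, `EvaluationSession`); the empty `scripts` list and the `field_1` metric mapping are illustrative placeholders rather than package defaults.

```Python
# Hedged sketch: the updated README snippet assembled end to end, using only
# calls visible in this diff. The inline reference data and "field_1" mapping
# are placeholders, not values shipped by the package.
from levelapp.workflow import WorkflowConfig
from levelapp.core.session import EvaluationSession

config_dict = {
    "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
    "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
    "reference_data": {"path": "", "data": {}},
    "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
    "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
}

if __name__ == "__main__":
    config = WorkflowConfig.from_dict(content=config_dict)
    # Inline reference data instead of a file path (see examples/README.md below).
    config.set_reference_data(content={"scripts": []})

    # enable_monitoring=False skips the monitoring aspect, as in the updated README example.
    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
        session.run()
        print("Results:", session.workflow.collect_results())
```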
levelapp-0.1.3/docs/media/simulator-module-diagram.PNG
Binary file
levelapp-0.1.3/docs/media/simulator-sequence-diagram.png
Binary file
levelapp-0.1.3/examples/README.md
@@ -0,0 +1,322 @@
+# Quickstart Guide: Using LevelApp's Conversation Simulator for Developers
+
+---
+#### Welcome to LevelApp Quickstart Guide!
+This guide provides a step-by-step walkthrough for developers to set up and use the Simulator Module in LevelApp.
+<br>
+<br>
+The **Simulator** focuses on black-box testing by simulating dialogues using predefined scripts, evaluating responses against references, and computing metrics on extracted metadata.
+It leverages LLM-as-a-judge for qualitative scoring and supports quantitative metrics like exact matches or fuzzy comparisons.
+<br>
+<figure>
+<img
+src="../docs/media/simulator-module-diagram.PNG"
+alt="Sequence Diagram">
+<figcaption>Fig.1 - Simulator Module Diagram</figcaption>
+</figure>
+<br>
+We'll emphasize technical details, including configuration schemas, placeholders, evaluators, metrics, and code execution flow. This assumes you're familiar with Python, YAML/JSON, and REST APIs for LLM endpoints. By the end, you'll have a runnable example for evaluating a chatbot's conversation flow.
+
+---
+## Introduction
+First, let's have a quick introduction on what LevelApp is and what it provides as a framework.
+
+The idea behind LevelApp is to build a framework that assists developers to perform regression tests on their LLM-powered systems ensuring that recent changes to code have not negatively impacted existing functionality or introduced new defects. <br>
+The evaluation of dialogue systems is very cost/time intensive and problematic since assessing the quality of a dialogue requires multiple iteration where a human conducts a message/reply evaluation for each interaction (quite tedious and boring task, if you ask me!).
+
+Automating the evaluation and introducing an LLM-as-a-judge as an approach to evaluate the correctness of responses can
+ease the process and render it more efficient.
+---
+## Walkthrough
+### Step1: Installation and Prerequisites
+Install LevelApp using pip. This pulls in dependencies like `pydantic`, `numpy`, `python-dotenv`,
+and others for handling LLM clients, data validation, and metrics computation.
+
+```bash
+pip install levelapp
+```
+
+#### Technical Prerequisites:
+
+* **Python Version**: 3.12+. LevelApp uses modern features like type hints and async support (via `asyncio` for potential batch processing).
+* **LLM Provider Credentials**: You'll need API keys for at least one supported provider (e.g., OpenAI, Anthropic, IONOS, Mistral). These are loaded via `python-dotenv` from a `.env` file. Without them, evaluators like JUDGE won't function.
+* **No Internet for Dependencies**: All core deps are installed automatically; no manual `pip install` needed beyond the initial command.
+* **Environment Setup**: Create a `.env` file in your project root. Example structure (replace with your actual keys):
+
+```
+IONOS_API_KEY=your-ionos-key
+OPENAI_API_KEY=sk-your-openai-key
+ANTHROPIC_API_KEY=your-anthropic-key
+MISTRAL_API_KEY=your-mistral-key
+
+IONOS_BASE_URL=https://inference.de-txl.ionos.com
+IONOS_MODEL_ID=0b6c4a15-bb8d-4092-82b0-f357b77c59fd
+
+# Optional: Path to workflow config if not loading programmatically
+WORKFLOW_CONFIG_PATH=../data/workflow_config.yaml
+```
+**Note**: For IONOS, the base_url and model_id are mandatory in .env as they aren't always configurable via YAML alone.
+LevelApp uses these to construct API requests.
+
+### Step2: Understanding the Simulator Workflow
+The Simulator Module simulates conversations by:
+1. Sending user messages (from a JSON script) to your LLM-based system's endpoint.
+2. Capturing generated responses and metadata.
+3. Evaluating them using selected evaluators (e.g., JUDGE for LLM-scored quality, REFERENCE for direct comparison).
+4. Computing metrics on metadata (e.g., EXACT for string matching, LEVENSHTEIN for edit distance).
+
+<figure>
+<img
+src="../docs/media/simulator-sequence-diagram.png"
+alt="Sequence Diagram">
+<figcaption>Fig.2 - Conversation Simulator Sequence Diagram</figcaption>
+</figure>
+
+#### Key Technical Concepts:
+
+Workflow Type: Set to `SIMULATOR` in YAML. This triggers dialogue simulation logic in `levelapp.workflow`.
+
+* **Evaluators**:
+  * `JUDGE`: Uses an LLM (from providers like OpenAI) to score generated replies against references (e.g., on relevance, fluency). Configurable via providers list.
+  * `REFERENCE`: Direct comparison without LLM, using metrics for metadata (Used for comparing extracted metadata).
+  * `RAG`: Retrieval-Augmented Generation evaluator (for knowledge-grounded responses; requires additional setup).
+<br>
+<br>
+* **Metrics Map**: A dict mapping metadata fields to comparison methods (e.g., `EXACT` for exact string match, `LEVENSHTEIN` for fuzzy matching with distance thresholds).
+Full list in docs: includes Token-based, Embedded (vector similarity), Fuzzy.
+<br>
+<br>
+* **Attempts and Batching**: `evaluation_params` attempts runs simulations multiple times for averaging scores (useful for non-deterministic LLMs). batch_size controls concurrent requests to avoid rate limits.
+<br>
+<br>
+* **Placeholders in Payloads**:
+  * `default_request_payload_template`: For this section, you need to change **field** (e.g,. change the field name `prompt` to `message`) names and not the **placeholder** values. The placeholders are used by the simulator to populate the request body.
+  * `default_response_payload_template`: For this section, you need to change the placeholders values and not the fields, contrary to the request section. The simulator will use the provided placeholder values to extract and map the reply and metadata from the response body.
+
+### Step 3: Creating the YAML Configuration File
+Create `workflow_config.yaml` to define the workflow. This is parsed into a `WorkflowConfig` Pydantic model for validation.
+
+Example `workflow_config.yaml` for Simulator:
+```YAML
+# PROCESS SECTION:
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3 # Run each interaction 3 times, average results
+    batch_size: 10 # Process 10 interactions concurrently
+
+# EVALUATION SECTION:
+evaluation:
+  evaluators: # Array of evaluators to apply
+    - JUDGE
+    - REFERENCE # REFERENCE evaluator can be used if your dialogue system returns additional metadata.
+  providers: # LLM providers for JUDGE (At least one must be provided for the JUDGE evaluator)
+    - openai
+    - ionos
+  metrics_map: # Map metadata fields to metrics
+    appointment_type: EXACT # Exact match for strings
+    date: LEVENSHTEIN # Fuzzy match for dates (e.g., tolerates formatting differences)
+    time: TOKEN_BASED # Token-level overlap
+
+# REFERENCE DATA SECTION:
+reference_data:
+  path: "conversation_script.json" # Path to JSON script
+  data: {} # Inline data if not using path (dict of scripts)
+
+# ENDPOINT CONFIGURATION SECTION:
+endpoint:
+  base_url: "http://127.0.0.1:8000" # Your chatbot's API base URL
+  url_path: "chat" # Endpoint path (full URL = base_url + url_path)
+  api_key: "" # Optional; overrides .env if set
+  bearer_token: "" # For auth
+  model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct" # Model for your endpoint (if applicable).
+  default_request_payload_template: # Template for POST body
+    message: "${user_message}" # Adapt to your API (e.g., 'prompt' for some)
+    payload: "${request_payload}" # Additional data from JSON script
+  default_response_payload_template: # Extract from API response
+    agent_reply: "${generated_reply}" # Map to your response field
+    generated_metadata: "${metadata}" # e.g., extracted entities
+
+# REPOSITORY SECTION (Optional):
+repository:
+  type: FILESYSTEM # Or FIRESTORE/MONGODB for persistence
+  project_id: "" # For FIRESTORE
+  database_name: "" # For FIRESTORE/MONGODB
+  source: "LOCAL" # Or IN_MEMORY for non-persistent
+```
+
+For the endpoint configuration section (`endpoint`), essentially, you need to provide:
+* base_url
+* url_path
+* headers data: API Key, Bearer Token, or any additional header data.
+
+As for the request payload, for example, if you have the following request payload schema:
+```JSON
+{
+  "prompt": "Hello, world!",
+  "user_id": "0001",
+  "user_role": "ADMIN",
+}
+```
+You need to configure the `default_request_payload_template` like the following:
+```YAML
+default_request_payload_template:
+  prompt: "${user_message}" # As you can notice, we only changed the field name and not the placeholder value.
+  payload: "${request_payload}" # The rest of the data will be fetched from the "request_payload" field in the reference data JSON file.
+```
+while providing the rest of the payload request inside the reference data JSON file content:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, I would like to book an appointment with a doctor.",
+          "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "0001", "user_role": "ADMIN"} // Here we add the rest of the request payload data.
+        }
+        ...
+```
+And for the response payload, if you have the following response payload schema:
+```JSON
+{
+  "response": "Hello, world!",
+  "metadata": {"k1": "v1", "k2": "v2"},
+  "timestamp": "2025-10-14T14:49:00.123Z",
+  "status": "COMPLETE"
+}
+```
+You need to configure the `default_response_payload_template` like the following:
+```YAML
+default_response_payload_template:
+  agent_reply: "${response}" # we changed the placeholder value here by adding "response" field where the reply value is held.
+  generated_metadata: "${metadata}"
+```
+
+### Step 4: Creating the JSON Conversation Script
+The script defines simulation flows. It's a dict with a `scripts` array, each containing `interactions` (sequential turns).
+<br>
+Example `conversation_script.json`:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
+```
+#### Technical Notes:
+
+* **Schema Validation**: Interactions are validated against a schema (e.g., user_message: str, reference_metadata: dict).
+* **Metadata Comparison**: generated_metadata from your endpoint is compared to reference_metadata using metrics_map.
+* **Interaction Types**: initial/intermediate/final for flow control; can influence evaluator behavior (e.g., stricter on final turns).
+* **Request Payload**: Merged into the endpoint request template for context (e.g., user auth).
+
+### Step 5: Writing and Running the Python Script
+Use this to load configs, run the simulation, and collect results. LevelApp handles session management via context managers.
+<br>
+Example run_simulation.py:
+```Python
+from dotenv import load_dotenv
+from levelapp.workflow import WorkflowConfig
+from levelapp.core.session import EvaluationSession
+
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # Load YAML config (validates via Pydantic)
+    config = WorkflowConfig.load(path="workflow_config.yaml")
+
+    # Alternative: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...} # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]}) # Inline script
+
+    # Create session (handles logging, repository init)
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # Run simulation: Sends requests, evaluates, stores in repo
+        session.run()
+
+        # Collect the evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
+```
+
+Technical Execution Flow:
+
+1. `WorkflowConfig.load()`: Parses YAML, loads .env secrets, validates.
+2. `EvaluationSession`: Initializes the evaluation session.
+3. `session.run()`: Loops over scripts/interactions:
+   * Substitutes placeholders, sends POST to endpoint.
+   * Extracts chatbot reply and generated metadata.
+   * Applies evaluators (e.g., JUDGE prompts LLM with "Score reply on scale 0-3: generated vs reference").
+   * Computes metrics (e.g., Levenshtein distance via numpy).
+4. `collect_results()`: Returns the evaluation results.
+5. `get_stats()`: Retrieves monitoring stats (API calls details, caching details, processing time, etc.).
+
+---
+### Let's Test It:
+First, install the packages required to run the examples test:
+<br>
+(it is always recommended to set up a virtual environment for testing)
+```Bash
+pip install fastapi uvicorn levelapp
+```
+Second, run the chatbot (`example_chatbot.py`) using `uvicorn`:
+<br>
+(don't forget to add your `OPENAI_API_KEY`!)
+```Bash
+uvicorn example_chatbot:app --reload --port 8000
+```
+Next, optionally, run a health test to see if the chatbot is alive:
+```Bash
+curl http://localhost:8000/healthz
+```
+Finally, run the evaluation:
+```Bash
+python example_evaluation.py
+```
+
+That's it! All you need now is to verify and interpret the evaluation results.
+**Good luck!**
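The quickstart above leans on the `${...}` placeholder convention for `default_request_payload_template` and `default_response_payload_template`. As a reading aid only (this is not the simulator's actual code), the sketch below shows the substitution idea: the field names are yours to adapt, while the placeholder values name the interaction or response fields being mapped.

```Python
# Conceptual sketch only, not levelapp's implementation: resolves "${...}"
# placeholders in a payload template against an interaction's fields.
from typing import Any

def fill_template(template: dict[str, Any], values: dict[str, Any]) -> dict[str, Any]:
    """Replace "${name}" strings in a payload template with values[name]."""
    filled: dict[str, Any] = {}
    for field, placeholder in template.items():
        if isinstance(placeholder, str) and placeholder.startswith("${") and placeholder.endswith("}"):
            filled[field] = values.get(placeholder[2:-1])
        else:
            filled[field] = placeholder
    return filled

request_template = {"message": "${user_message}", "payload": "${request_payload}"}
interaction = {"user_message": "Hello, book a doctor appointment.",
               "request_payload": {"user_id": "123", "role": "user"}}
print(fill_template(request_template, interaction))
# {'message': 'Hello, book a doctor appointment.', 'payload': {'user_id': '123', 'role': 'user'}}
```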
levelapp-0.1.3/examples/conversation_script.json
@@ -0,0 +1,38 @@
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
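The script added above follows the structure documented in Step 4 of the quickstart. A small standalone check like the one below (illustrative, not part of the package; it assumes you run it from the `examples/` directory) can catch missing keys before a simulation run.

```Python
# Illustrative sanity check of conversation_script.json, independent of levelapp.
import json

with open("conversation_script.json", encoding="utf-8") as f:
    data = json.load(f)

required = {"user_message", "reference_reply", "interaction_type",
            "reference_metadata", "guardrail_flag", "request_payload"}
for script in data["scripts"]:
    for turn in script["interactions"]:
        missing = required - turn.keys()
        assert not missing, f"interaction missing keys: {missing}"
    print(f"{script['description']}: {len(script['interactions'])} interactions OK")
```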
levelapp-0.1.3/examples/example_chatbot.py
@@ -0,0 +1,48 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from openai import OpenAI
+
+app = FastAPI(title="Tiny Chatbot")
+
+client = OpenAI(api_key="<YOUR-API-KEY-HERE>")
+
+SYSTEM_PROMPT = "Play role as a medical assistant."
+
+
+class ChatRequest(BaseModel):
+    message: str
+
+
+class ChatResponse(BaseModel):
+    reply: str
+
+
+def generate_reply(user_message: str) -> str:
+    try:
+        resp = client.chat.completions.create(
+            model="gpt-4o-mini",  # pick any chat-capable model you have access to
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_message},
+            ],
+            temperature=0.3,
+        )
+        return resp.choices[0].message.content.strip()
+    except Exception as e:
+        raise RuntimeError(f"LLM error: {e}")
+
+
+@app.post("/chat", response_model=ChatResponse)
+def chat(req: ChatRequest):
+    if not req.message:
+        raise HTTPException(status_code=400, detail="`message` is required.")
+    try:
+        reply = generate_reply(req.message)
+        return ChatResponse(reply=reply)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/healthz")
+def health():
+    return {"status": "ok"}
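Besides the `curl` health check shown in the quickstart, the `/chat` route above can be exercised directly. The short client below is illustrative and assumes the app is running locally on port 8000 (`uvicorn example_chatbot:app --port 8000`); it uses `httpx`, which is already a levelapp dependency, but any HTTP client works.

```Python
# Quick manual check of the example /chat endpoint defined above.
import httpx

resp = httpx.post(
    "http://127.0.0.1:8000/chat",
    json={"message": "Hello, I would like to book an appointment."},
    timeout=30.0,
)
resp.raise_for_status()
print(resp.json()["reply"])  # ChatResponse exposes a single "reply" field
```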
levelapp-0.1.3/examples/example_evaluation.py
@@ -0,0 +1,28 @@
+from dotenv import load_dotenv
+from levelapp.core.session import EvaluationSession
+from levelapp.workflow import WorkflowConfig
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # 1. Load YAML config
+    config = WorkflowConfig.load(path="workflow_configuration.yaml")
+
+    # Alternatively: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...} # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]}) # Inline script
+
+    # 2. Create an evaluation session
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # 2.1. Run session (simulation session)
+        session.run()
+
+        # 2.2. Collect evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        # 3. Get aggregated stats (monitoring stats)
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
levelapp-0.1.3/examples/workflow_configuration.yaml
@@ -0,0 +1,38 @@
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3
+    batch_size: 10
+
+evaluation:
+  evaluators:
+    - JUDGE
+    - REFERENCE
+  providers:
+    - openai
+    - ionos
+  metrics_map:
+    appointment_type: EXACT
+    date: LEVENSHTEIN
+    time: TOKEN_BASED
+
+reference_data:
+  path: "conversation_script.json"
+  data: {}
+
+endpoint:
+  base_url: "http://127.0.0.1:8000"
+  url_path: "chat"
+  api_key: ""
+  bearer_token: ""
+  default_request_payload_template:
+    message: "${user_message}"
+  default_response_payload_template:
+    agent_reply: "${reply}"
+
+repository:
+  type: FIRESTORE # Keep this value as if for now.
+  project_id: ""
+  database_name: ""
+  source: "LOCAL"
{levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/monitor.py
@@ -343,6 +343,7 @@ class FunctionMonitor:
         category: MetricType,
         enable_timing: bool,
         track_memory: bool,
+        verbose=False
     ) -> Callable[P, T]:
         """
         Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
             name: Unique identifier for the function
             enable_timing: Enable execution time logging
             track_memory: Enable memory tracking
+            verbose: Enable verbose logging
 
         Returns:
             Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:
 
             self._aggregated_stats[name].update(metrics=metrics)
 
-            if enable_timing and metrics.duration is not None:
+            if verbose and enable_timing and metrics.duration is not None:
                 log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
                 if metrics.cache_hit:
                     log_message += " (cache hit)"
@@ -420,7 +422,8 @@ class FunctionMonitor:
         maxsize: int | None = 128,
         enable_timing: bool = True,
         track_memory: bool = True,
-        collectors: List[Type[MetricsCollector]] | None = None
+        collectors: List[Type[MetricsCollector]] | None = None,
+        verbose: bool = False
     ) -> Callable[[Callable[P, T]], Callable[P, T]]:
         """
         Decorator factory for monitoring functions.
@@ -454,7 +457,7 @@ class FunctionMonitor:
             )
 
         with self._lock:
-            if name in self._monitored_procedures:
+            if name in self._monitored_procedures and verbose:
                 raise ValueError(f"Function '{name}' is already registered.")
 
             self._monitored_procedures[name] = monitored_func
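The monitor.py hunks above add an opt-in `verbose` flag so the per-call "[FunctionMonitor] Executed ..." timing log is emitted only on request, while aggregated stats are still collected. The sketch below only illustrates the intended usage pattern; the decorator factory's public name is not visible in these hunks, so `monitor.monitor(...)` and the direct `FunctionMonitor()` construction are hypothetical stand-ins, and any required arguments not shown in the hunks (such as a registration name) are omitted.

```Python
# Hypothetical usage sketch for the new `verbose` flag introduced in this diff.
# NOTE: the factory name `monitor` and the FunctionMonitor() constructor call are
# assumptions; only the keyword arguments enable_timing, track_memory, collectors,
# maxsize, and verbose appear in the hunks above.
from levelapp.aspects.monitor import FunctionMonitor  # module path taken from the file list

monitor = FunctionMonitor()

@monitor.monitor(enable_timing=True, track_memory=True, verbose=True)
def score_reply(generated: str, reference: str) -> float:
    # ... evaluation work ...
    return 1.0

score_reply("Booked.", "Booked for 10 AM next Monday.")
# With verbose=False (the new default) the per-call timing log line is suppressed;
# verbose=True restores the pre-0.1.3 logging behavior.
```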
{levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/__init__.py
@@ -44,7 +44,6 @@ class ClientRegistry:
 
         cls._wrap_client_methods(client_class)
         cls._clients[provider] = client_class
-        logger.info(f"[ClientRegistry] Registered client for provider: {provider}")
 
     @classmethod
     def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None: