levelapp 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of levelapp might be problematic.

Files changed (87)
  1. {levelapp-0.1.1 → levelapp-0.1.2}/PKG-INFO +8 -11
  2. {levelapp-0.1.1 → levelapp-0.1.2}/README.md +7 -8
  3. levelapp-0.1.2/docs/media/simulator-module-diagram.PNG +0 -0
  4. levelapp-0.1.2/docs/media/simulator-sequence-diagram.png +0 -0
  5. levelapp-0.1.2/examples/README.md +322 -0
  6. levelapp-0.1.2/examples/conversation_script.json +38 -0
  7. levelapp-0.1.2/examples/example_chatbot.py +48 -0
  8. levelapp-0.1.2/examples/example_evaluation.py +28 -0
  9. levelapp-0.1.2/examples/workflow_configuration.yaml +38 -0
  10. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/monitor.py +3 -1
  11. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/__init__.py +0 -1
  12. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/scorer.py +0 -2
  13. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/endpoint.py +22 -13
  14. levelapp-0.1.2/levelapp/config/endpoint_.py +62 -0
  15. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/prompts.py +22 -0
  16. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/schemas.py +0 -2
  17. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/session.py +29 -3
  18. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/evaluator/evaluator.py +16 -4
  19. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/__init__.py +1 -5
  20. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/schemas.py +7 -13
  21. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/simulator.py +21 -18
  22. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/utils.py +40 -78
  23. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/base.py +38 -3
  24. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/config.py +31 -4
  25. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/context.py +0 -1
  26. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/factory.py +16 -3
  27. {levelapp-0.1.1 → levelapp-0.1.2}/pyproject.toml +1 -3
  28. levelapp-0.1.2/src/data/evaluation_results.json +1 -0
  29. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/workflow_config.yaml +3 -3
  30. levelapp-0.1.2/src/level_app/main_session.py +48 -0
  31. {levelapp-0.1.1 → levelapp-0.1.2}/uv.lock +1 -5
  32. levelapp-0.1.1/examples/example_evaluation.py +0 -0
  33. levelapp-0.1.1/src/level_app/main_session.py +0 -46
  34. {levelapp-0.1.1 → levelapp-0.1.2}/.gitignore +0 -0
  35. {levelapp-0.1.1 → levelapp-0.1.2}/.python-version +0 -0
  36. {levelapp-0.1.1 → levelapp-0.1.2}/LICENSE +0 -0
  37. {levelapp-0.1.1 → levelapp-0.1.2}/MANIFEST.in +0 -0
  38. {levelapp-0.1.1 → levelapp-0.1.2}/Makefile +0 -0
  39. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/__init__.py +0 -0
  40. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/__init__.py +0 -0
  41. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/loader.py +0 -0
  42. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/logger.py +0 -0
  43. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/aspects/sanitizer.py +0 -0
  44. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/anthropic.py +0 -0
  45. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/ionos.py +0 -0
  46. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/mistral.py +0 -0
  47. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/clients/openai.py +0 -0
  48. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/__init__.py +0 -0
  49. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/comparator.py +0 -0
  50. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/extractor.py +0 -0
  51. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/schemas.py +0 -0
  52. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/comparator/utils.py +0 -0
  53. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/config/__init__.py +0 -0
  54. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/__init__.py +0 -0
  55. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/core/base.py +0 -0
  56. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/evaluator/__init__.py +0 -0
  57. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/embedding.py +0 -0
  58. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/exact.py +0 -0
  59. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/fuzzy.py +0 -0
  60. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/metrics/token.py +0 -0
  61. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/plugins/__init__.py +0 -0
  62. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/repository/__init__.py +0 -0
  63. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/repository/firestore.py +0 -0
  64. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/simulator/__init__.py +0 -0
  65. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/__init__.py +0 -0
  66. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/registration.py +0 -0
  67. {levelapp-0.1.1 → levelapp-0.1.2}/levelapp/workflow/runtime.py +0 -0
  68. {levelapp-0.1.1 → levelapp-0.1.2}/make.bat +0 -0
  69. {levelapp-0.1.1 → levelapp-0.1.2}/project_structure.txt +0 -0
  70. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/conversation_example_1.json +0 -0
  71. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/endpoint_configuration.yaml +0 -0
  72. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/payload_example_1.yaml +0 -0
  73. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/payload_example_2.yaml +0 -0
  74. {levelapp-0.1.1 → levelapp-0.1.2}/src/data/workflow_config_2.json +0 -0
  75. {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/__init__.py +0 -0
  76. {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main.py +0 -0
  77. {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main_monitoring.py +0 -0
  78. {levelapp-0.1.1 → levelapp-0.1.2}/src/level_app/main_simulator.py +0 -0
  79. {levelapp-0.1.1 → levelapp-0.1.2}/tests/__init__.py +0 -0
  80. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_anthropic.py +0 -0
  81. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_comparator.py +0 -0
  82. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_ionos.py +0 -0
  83. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_mistral.py +0 -0
  84. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_monitoring.py +0 -0
  85. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_openai.py +0 -0
  86. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_session.py +0 -0
  87. {levelapp-0.1.1 → levelapp-0.1.2}/tests/test_simulator.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: levelapp
- Version: 0.1.1
+ Version: 0.1.2
  Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
  Project-URL: Homepage, https://github.com/levelapp-org
  Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
@@ -17,14 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Software Development :: Testing
  Requires-Python: >=3.12
- Requires-Dist: arrow>=1.3.0
  Requires-Dist: google-api-core>=2.25.1
  Requires-Dist: google-auth>=2.40.3
  Requires-Dist: google-cloud-firestore>=2.21.0
  Requires-Dist: httpx>=0.28.1
  Requires-Dist: humanize>=4.13.0
  Requires-Dist: numpy>=2.3.2
- Requires-Dist: openai>=1.99.9
  Requires-Dist: pandas-stubs==2.3.0.250703
  Requires-Dist: pandas>=2.3.1
  Requires-Dist: pydantic>=2.11.7
@@ -133,7 +131,7 @@ endpoint:
    generated_metadata: "${generated_metadata}"

  repository:
-   type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB.
+   type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
    project_id: "(default)"
    database_name: ""
  ```
@@ -220,14 +218,14 @@ To run an evaluation:

  ```python
  if __name__ == "__main__":
-     from levelapp.workflow.schemas import WorkflowConfig
+     from levelapp.workflow import WorkflowConfig
      from levelapp.core.session import EvaluationSession

      # Load configuration from YAML
      config = WorkflowConfig.load(path="../data/workflow_config.yaml")

-     # Run evaluation session
-     with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+     # Run evaluation session (You can enable/disable the monitoring aspect)
+     with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
          session.run()
          results = session.workflow.collect_results()
          print("Results:", results)
@@ -243,14 +241,13 @@ if __name__ == "__main__":
      from levelapp.workflow import WorkflowConfig
      from levelapp.core.session import EvaluationSession

-     # Firestore -> retrieve endpoint config -> data => config_dict
-
+
      config_dict = {
          "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-         "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+         "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
          "reference_data": {"path": "", "data": {}},
          "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-         "repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
+         "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
      }

      content = {
@@ -81,7 +81,7 @@ endpoint:
    generated_metadata: "${generated_metadata}"

  repository:
-   type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM, MONGODB.
+   type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
    project_id: "(default)"
    database_name: ""
  ```
@@ -168,14 +168,14 @@ To run an evaluation:

  ```python
  if __name__ == "__main__":
-     from levelapp.workflow.schemas import WorkflowConfig
+     from levelapp.workflow import WorkflowConfig
      from levelapp.core.session import EvaluationSession

      # Load configuration from YAML
      config = WorkflowConfig.load(path="../data/workflow_config.yaml")

-     # Run evaluation session
-     with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+     # Run evaluation session (You can enable/disable the monitoring aspect)
+     with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
          session.run()
          results = session.workflow.collect_results()
          print("Results:", results)
@@ -191,14 +191,13 @@ if __name__ == "__main__":
      from levelapp.workflow import WorkflowConfig
      from levelapp.core.session import EvaluationSession

-     # Firestore -> retrieve endpoint config -> data => config_dict
-
+
      config_dict = {
          "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-         "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+         "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
          "reference_data": {"path": "", "data": {}},
          "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-         "repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
+         "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
      }

      content = {
@@ -0,0 +1,322 @@
+ # Quickstart Guide: Using LevelApp's Conversation Simulator for Developers
+
+ ---
+ #### Welcome to the LevelApp Quickstart Guide!
+ This guide provides a step-by-step walkthrough for developers to set up and use the Simulator Module in LevelApp.
+ <br>
+ <br>
+ The **Simulator** focuses on black-box testing by simulating dialogues using predefined scripts, evaluating responses against references, and computing metrics on extracted metadata.
+ It leverages LLM-as-a-judge for qualitative scoring and supports quantitative metrics like exact matches or fuzzy comparisons.
+ <br>
+ <figure>
+ <img
+ src="../docs/media/simulator-module-diagram.PNG"
+ alt="Simulator Module Diagram">
+ <figcaption>Fig.1 - Simulator Module Diagram</figcaption>
+ </figure>
+ <br>
+ We'll emphasize technical details, including configuration schemas, placeholders, evaluators, metrics, and code execution flow. This guide assumes you're familiar with Python, YAML/JSON, and REST APIs for LLM endpoints. By the end, you'll have a runnable example for evaluating a chatbot's conversation flow.
+
+ ---
+ ## Introduction
+ First, a quick introduction to what LevelApp is and what it provides as a framework.
+
+ The idea behind LevelApp is to build a framework that helps developers run regression tests on their LLM-powered systems, ensuring that recent code changes have not negatively impacted existing functionality or introduced new defects. <br>
+ Evaluating dialogue systems is cost- and time-intensive: assessing the quality of a dialogue requires multiple iterations in which a human evaluates each message/reply pair (quite a tedious and boring task, if you ask me!).
+
+ Automating the evaluation and introducing LLM-as-a-judge as an approach to assess the correctness of responses can
+ ease the process and make it more efficient.
+
+ ---
+ ## Walkthrough
+ ### Step 1: Installation and Prerequisites
+ Install LevelApp using pip. This pulls in dependencies like `pydantic`, `numpy`, `python-dotenv`,
+ and others for handling LLM clients, data validation, and metrics computation.
+
+ ```bash
+ pip install levelapp
+ ```
+
+ #### Technical Prerequisites:
+
+ * **Python Version**: 3.12+. LevelApp uses modern features like type hints and async support (via `asyncio` for potential batch processing).
+ * **LLM Provider Credentials**: You'll need API keys for at least one supported provider (e.g., OpenAI, Anthropic, IONOS, Mistral). These are loaded via `python-dotenv` from a `.env` file. Without them, evaluators like JUDGE won't function.
+ * **No Extra Installs**: All core dependencies are pulled in automatically; no manual `pip install` is needed beyond the initial command.
+ * **Environment Setup**: Create a `.env` file in your project root. Example structure (replace with your actual keys):
+
+ ```
+ IONOS_API_KEY=your-ionos-key
+ OPENAI_API_KEY=sk-your-openai-key
+ ANTHROPIC_API_KEY=your-anthropic-key
+ MISTRAL_API_KEY=your-mistral-key
+
+ IONOS_BASE_URL=https://inference.de-txl.ionos.com
+ IONOS_MODEL_ID=0b6c4a15-bb8d-4092-82b0-f357b77c59fd
+
+ # Optional: Path to workflow config if not loading programmatically
+ WORKFLOW_CONFIG_PATH=../data/workflow_config.yaml
+ ```
+ **Note**: For IONOS, the `base_url` and `model_id` are mandatory in `.env`, as they aren't always configurable via YAML alone.
+ LevelApp uses these to construct API requests.
+
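+ Before going further, you can sanity-check that the keys are visible to Python. This is a minimal sketch, not a LevelApp API: it only uses `python-dotenv` (installed with LevelApp) and the variable names from the example `.env` above.
+ ```Python
+ # Sanity check: confirm credentials are loaded from .env before running evaluations.
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ for var in ("OPENAI_API_KEY", "IONOS_API_KEY", "IONOS_BASE_URL", "IONOS_MODEL_ID"):
+     print(var, "set" if os.getenv(var) else "MISSING")
+ ```
+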
+ ### Step 2: Understanding the Simulator Workflow
+ The Simulator Module simulates conversations by:
+ 1. Sending user messages (from a JSON script) to your LLM-based system's endpoint.
+ 2. Capturing generated responses and metadata.
+ 3. Evaluating them using selected evaluators (e.g., JUDGE for LLM-scored quality, REFERENCE for direct comparison).
+ 4. Computing metrics on metadata (e.g., EXACT for string matching, LEVENSHTEIN for edit distance).
+
+ <figure>
+ <img
+ src="../docs/media/simulator-sequence-diagram.png"
+ alt="Sequence Diagram">
+ <figcaption>Fig.2 - Conversation Simulator Sequence Diagram</figcaption>
+ </figure>
+
+ #### Key Technical Concepts:
+
+ * **Workflow Type**: Set to `SIMULATOR` in YAML. This triggers the dialogue simulation logic in `levelapp.workflow`.
+ <br>
+ <br>
+ * **Evaluators**:
+   * `JUDGE`: Uses an LLM (from providers like OpenAI) to score generated replies against references (e.g., on relevance and fluency). Configurable via the providers list.
+   * `REFERENCE`: Direct comparison without an LLM, using metrics on metadata (used for comparing extracted metadata).
+   * `RAG`: Retrieval-Augmented Generation evaluator (for knowledge-grounded responses; requires additional setup).
+ <br>
+ <br>
+ * **Metrics Map**: A dict mapping metadata fields to comparison methods (e.g., `EXACT` for exact string match, `LEVENSHTEIN` for fuzzy matching with distance thresholds).
+ The full list in the docs includes token-based, embedding (vector similarity), and fuzzy metrics. A small standalone sketch of how such a map drives field-level scoring follows this list.
+ <br>
+ <br>
+ * **Attempts and Batching**: In `evaluation_params`, `attempts` runs each simulation multiple times for averaging scores (useful for non-deterministic LLMs), while `batch_size` controls concurrent requests to avoid rate limits.
+ <br>
+ <br>
+ * **Placeholders in Payloads**:
+   * `default_request_payload_template`: In this section, change the **field** names (e.g., rename the field `prompt` to `message`) and not the **placeholder** values. The placeholders are used by the simulator to populate the request body.
+   * `default_response_payload_template`: In this section, change the placeholder values and not the fields, contrary to the request section. The simulator uses the provided placeholder values to extract and map the reply and metadata from the response body.
+
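+ To make the metrics map concrete, here is a minimal, self-contained sketch of field-level scoring. It is **not** the LevelApp API: `score_metadata` is a hypothetical helper, and `difflib.SequenceMatcher` merely stands in for the real LEVENSHTEIN/TOKEN_BASED scorers.
+ ```Python
+ # Hypothetical illustration of metrics-map scoring (not part of LevelApp).
+ from difflib import SequenceMatcher
+
+ def score_metadata(reference: dict, generated: dict, metrics_map: dict) -> dict:
+     scores = {}
+     for field, metric in metrics_map.items():
+         ref, gen = str(reference.get(field, "")), str(generated.get(field, ""))
+         if metric == "EXACT":
+             scores[field] = 1.0 if ref == gen else 0.0
+         else:  # stand-in for fuzzy metrics such as LEVENSHTEIN
+             scores[field] = SequenceMatcher(None, ref, gen).ratio()
+     return scores
+
+ print(score_metadata(
+     {"appointment_type": "Cardiology", "date": "2025-10-20"},
+     {"appointment_type": "Cardiology", "date": "2025/10/20"},
+     {"appointment_type": "EXACT", "date": "LEVENSHTEIN"},
+ ))  # {'appointment_type': 1.0, 'date': 0.8}
+ ```
+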
+ ### Step 3: Creating the YAML Configuration File
+ Create `workflow_config.yaml` to define the workflow. This is parsed into a `WorkflowConfig` Pydantic model for validation.
+
+ Example `workflow_config.yaml` for the Simulator:
+ ```YAML
+ # PROCESS SECTION:
+ process:
+   project_name: "chatbot-evaluation"
+   workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+   evaluation_params:
+     attempts: 3 # Run each interaction 3 times, average results
+     batch_size: 10 # Process 10 interactions concurrently
+
+ # EVALUATION SECTION:
+ evaluation:
+   evaluators: # Array of evaluators to apply
+     - JUDGE
+     - REFERENCE # REFERENCE can be used if your dialogue system returns additional metadata.
+   providers: # LLM providers for JUDGE (at least one must be provided for the JUDGE evaluator)
+     - openai
+     - ionos
+   metrics_map: # Map metadata fields to metrics
+     appointment_type: EXACT # Exact match for strings
+     date: LEVENSHTEIN # Fuzzy match for dates (e.g., tolerates formatting differences)
+     time: TOKEN_BASED # Token-level overlap
+
+ # REFERENCE DATA SECTION:
+ reference_data:
+   path: "conversation_script.json" # Path to JSON script
+   data: {} # Inline data if not using path (dict of scripts)
+
+ # ENDPOINT CONFIGURATION SECTION:
+ endpoint:
+   base_url: "http://127.0.0.1:8000" # Your chatbot's API base URL
+   url_path: "chat" # Endpoint path (full URL = base_url + url_path)
+   api_key: "" # Optional; overrides .env if set
+   bearer_token: "" # For auth
+   model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct" # Model for your endpoint (if applicable).
+   default_request_payload_template: # Template for POST body
+     message: "${user_message}" # Adapt to your API (e.g., 'prompt' for some)
+     payload: "${request_payload}" # Additional data from the JSON script
+   default_response_payload_template: # Extract from API response
+     agent_reply: "${generated_reply}" # Map to your response field
+     generated_metadata: "${metadata}" # e.g., extracted entities
+
+ # REPOSITORY SECTION (Optional):
+ repository:
+   type: FILESYSTEM # Or FIRESTORE/MONGODB for persistence
+   project_id: "" # For FIRESTORE
+   database_name: "" # For FIRESTORE/MONGODB
+   source: "LOCAL" # Or IN_MEMORY for non-persistent
+ ```
+
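+ Loading the file is also the quickest way to validate it: since the config is parsed into a Pydantic model, a malformed section fails fast. A minimal check, assuming only the `WorkflowConfig.load` API shown in this guide:
+ ```Python
+ # Quick validation check: loading a config runs Pydantic validation.
+ from levelapp.workflow import WorkflowConfig
+
+ try:
+     config = WorkflowConfig.load(path="workflow_config.yaml")
+     print("Config OK:", type(config).__name__)
+ except Exception as exc:  # e.g., pydantic.ValidationError for schema issues
+     print("Invalid config:", exc)
+ ```
+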
+ For the endpoint configuration section (`endpoint`), you essentially need to provide:
+ * base_url
+ * url_path
+ * headers data: API key, bearer token, or any additional header data.
+
+ As for the request payload: for example, if you have the following request payload schema:
+ ```JSON
+ {
+   "prompt": "Hello, world!",
+   "user_id": "0001",
+   "user_role": "ADMIN"
+ }
+ ```
+ you need to configure the `default_request_payload_template` like the following:
+ ```YAML
+ default_request_payload_template:
+   prompt: "${user_message}" # As you can notice, we only changed the field name and not the placeholder value.
+   payload: "${request_payload}" # The rest of the data will be fetched from the "request_payload" field in the reference data JSON file.
+ ```
+ while providing the rest of the request payload inside the reference data JSON file content:
+ ```JSON
+ {
+   "scripts": [
+     {
+       "interactions": [
+         {
+           "user_message": "Hello, I would like to book an appointment with a doctor.",
+           "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
+           "interaction_type": "initial",
+           "reference_metadata": {},
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "0001", "user_role": "ADMIN"} // Here we add the rest of the request payload data.
+         }
+         ...
+ ```
+ And for the response payload: if you have the following response payload schema:
+ ```JSON
+ {
+   "response": "Hello, world!",
+   "metadata": {"k1": "v1", "k2": "v2"},
+   "timestamp": "2025-10-14T14:49:00.123Z",
+   "status": "COMPLETE"
+ }
+ ```
+ you need to configure the `default_response_payload_template` like the following:
+ ```YAML
+ default_response_payload_template:
+   agent_reply: "${response}" # Here we changed the placeholder value to "response", the field that holds the reply.
+   generated_metadata: "${metadata}"
+ ```
+
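+ Under the hood this is plain placeholder substitution plus a merge of `request_payload`. A minimal sketch of the idea, using Python's `string.Template`; this is hypothetical, LevelApp performs the real substitution internally:
+ ```Python
+ # Hypothetical sketch of what the simulator does with the request template.
+ from string import Template
+
+ template = {"prompt": "${user_message}"}
+ script_payload = {"user_id": "0001", "user_role": "ADMIN"}  # from request_payload
+
+ body = {field: Template(value).safe_substitute(user_message="Hello, world!")
+         for field, value in template.items()}
+ body.update(script_payload)  # merge the extra fields from the reference data
+ print(body)  # {'prompt': 'Hello, world!', 'user_id': '0001', 'user_role': 'ADMIN'}
+ ```
+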
+ ### Step 4: Creating the JSON Conversation Script
+ The script defines simulation flows. It's a dict with a `scripts` array, each entry containing `interactions` (sequential turns).
+ <br>
+ Example `conversation_script.json`:
+ ```JSON
+ {
+   "scripts": [
+     {
+       "interactions": [
+         {
+           "user_message": "Hello, book a doctor appointment.",
+           "reference_reply": "What type of doctor?",
+           "interaction_type": "initial",
+           "reference_metadata": {},
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         },
+         {
+           "user_message": "Cardiologist.",
+           "reference_reply": "When?",
+           "interaction_type": "intermediate",
+           "reference_metadata": {"type": "Cardiology"},
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         },
+         {
+           "user_message": "Next Monday at 10 AM.",
+           "reference_reply": "Booked for 10 AM next Monday.",
+           "interaction_type": "final",
+           "reference_metadata": {
+             "appointment_type": "Cardiology",
+             "date": "2025-10-20",
+             "time": "10:00"
+           },
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         }
+       ],
+       "description": "Doctor booking flow",
+       "details": {"context": "Medical chatbot"}
+     }
+   ]
+ }
+ ```
+ #### Technical Notes:
+
+ * **Schema Validation**: Interactions are validated against a schema (e.g., `user_message`: str, `reference_metadata`: dict); see the sketch after this list for the approximate shape.
+ * **Metadata Comparison**: `generated_metadata` from your endpoint is compared to `reference_metadata` using the `metrics_map`.
+ * **Interaction Types**: initial/intermediate/final for flow control; these can influence evaluator behavior (e.g., stricter on final turns).
+ * **Request Payload**: Merged into the endpoint request template for context (e.g., user auth).
+
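+ For reference, the interaction shape can be approximated with a Pydantic model like the one below. This is an illustration inferred from the example script; the actual validation models ship in `levelapp/simulator/schemas.py`:
+ ```Python
+ # Approximate shape of one interaction, inferred from the example script above.
+ from pydantic import BaseModel
+
+ class Interaction(BaseModel):
+     user_message: str
+     reference_reply: str
+     interaction_type: str = "intermediate"
+     reference_metadata: dict = {}
+     guardrail_flag: bool = False
+     request_payload: dict = {}
+
+ # Raises pydantic.ValidationError if a turn in your script is malformed.
+ Interaction.model_validate({
+     "user_message": "Cardiologist.",
+     "reference_reply": "When?",
+     "interaction_type": "intermediate",
+     "reference_metadata": {"type": "Cardiology"},
+     "guardrail_flag": False,
+     "request_payload": {"user_id": "123", "role": "user"},
+ })
+ ```
+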
+ ### Step 5: Writing and Running the Python Script
+ Use this to load configs, run the simulation, and collect results. LevelApp handles session management via context managers.
+ <br>
+ Example `run_simulation.py`:
+ ```Python
+ from dotenv import load_dotenv
+ from levelapp.workflow import WorkflowConfig
+ from levelapp.core.session import EvaluationSession
+
+
+ # Load .env (automatically done in LevelApp, but explicit for clarity)
+ load_dotenv()
+
+ if __name__ == "__main__":
+     # Load YAML config (validates via Pydantic)
+     config = WorkflowConfig.load(path="workflow_config.yaml")
+
+     # Alternative: Load from dict for in-memory config (e.g., from DB)
+     # config_dict = {...}  # As in README
+     # config = WorkflowConfig.from_dict(content=config_dict)
+     # config.set_reference_data(content={"scripts": [...]})  # Inline script
+
+     # Create session (handles logging, repository init)
+     with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+         # Run simulation: sends requests, evaluates, stores in repo
+         session.run()
+
+         # Collect the evaluation results
+         results = session.workflow.collect_results()
+         print("Evaluation Results:", results)
+
+         stats = session.get_stats()
+         print("Session Stats:\n", stats)
+ ```
+
+ Technical Execution Flow:
+
+ 1. `WorkflowConfig.load()`: Parses the YAML, loads `.env` secrets, validates.
+ 2. `EvaluationSession`: Initializes the evaluation session.
+ 3. `session.run()`: Loops over scripts/interactions (see the sketch after this list):
+    * Substitutes placeholders, sends a POST to the endpoint.
+    * Extracts the chatbot reply and generated metadata.
+    * Applies evaluators (e.g., JUDGE prompts the LLM with "Score reply on scale 0-3: generated vs reference").
+    * Computes metrics (e.g., Levenshtein distance via numpy).
+ 4. `collect_results()`: Returns the evaluation results.
+ 5. `get_stats()`: Retrieves monitoring stats (API call details, caching details, processing time, etc.).
+
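+ Stripped of the evaluation logic, each simulated turn boils down to one HTTP round trip. A minimal sketch using `httpx` (already a LevelApp dependency), assuming the `/chat` endpoint from this guide is running locally:
+ ```Python
+ # What one simulated turn reduces to: POST the message, read the reply field.
+ import httpx
+
+ resp = httpx.post(
+     "http://127.0.0.1:8000/chat",  # base_url + url_path from the endpoint section
+     json={"message": "Hello, book a doctor appointment."},
+     timeout=30.0,
+ )
+ resp.raise_for_status()
+ print(resp.json()["reply"])  # extracted via the "${reply}" response placeholder
+ ```
+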
+ ---
+ ### Let's Test It:
+ First, install the packages required to run the example test:
+ <br>
+ (it is always recommended to set up a virtual environment for testing)
+ ```Bash
+ pip install fastapi uvicorn levelapp
+ ```
+ Second, run the chatbot (`example_chatbot.py`) using `uvicorn`:
+ <br>
+ (don't forget to add your `OPENAI_API_KEY`!)
+ ```Bash
+ uvicorn example_chatbot:app --reload --port 8000
+ ```
+ Next, optionally, run a health check to see if the chatbot is alive:
+ ```Bash
+ curl http://localhost:8000/healthz
+ ```
+ Finally, run the evaluation:
+ ```Bash
+ python example_evaluation.py
+ ```
+
+ That's it! All you need now is to verify and interpret the evaluation results.
+ **Good luck!**
@@ -0,0 +1,38 @@
+ {
+   "scripts": [
+     {
+       "interactions": [
+         {
+           "user_message": "Hello, book a doctor appointment.",
+           "reference_reply": "What type of doctor?",
+           "interaction_type": "initial",
+           "reference_metadata": {},
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         },
+         {
+           "user_message": "Cardiologist.",
+           "reference_reply": "When?",
+           "interaction_type": "intermediate",
+           "reference_metadata": {"type": "Cardiology"},
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         },
+         {
+           "user_message": "Next Monday at 10 AM.",
+           "reference_reply": "Booked for 10 AM next Monday.",
+           "interaction_type": "final",
+           "reference_metadata": {
+             "appointment_type": "Cardiology",
+             "date": "2025-10-20",
+             "time": "10:00"
+           },
+           "guardrail_flag": false,
+           "request_payload": {"user_id": "123", "role": "user"}
+         }
+       ],
+       "description": "Doctor booking flow",
+       "details": {"context": "Medical chatbot"}
+     }
+   ]
+ }
@@ -0,0 +1,48 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from openai import OpenAI
+
+ app = FastAPI(title="Tiny Chatbot")
+
+ client = OpenAI(api_key="<YOUR-API-KEY-HERE>")
+
+ SYSTEM_PROMPT = "Play role as a medical assistant."
+
+
+ class ChatRequest(BaseModel):
+     message: str
+
+
+ class ChatResponse(BaseModel):
+     reply: str
+
+
+ def generate_reply(user_message: str) -> str:
+     try:
+         resp = client.chat.completions.create(
+             model="gpt-4o-mini",  # pick any chat-capable model you have access to
+             messages=[
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": user_message},
+             ],
+             temperature=0.3,
+         )
+         return resp.choices[0].message.content.strip()
+     except Exception as e:
+         raise RuntimeError(f"LLM error: {e}")
+
+
+ @app.post("/chat", response_model=ChatResponse)
+ def chat(req: ChatRequest):
+     if not req.message:
+         raise HTTPException(status_code=400, detail="`message` is required.")
+     try:
+         reply = generate_reply(req.message)
+         return ChatResponse(reply=reply)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/healthz")
+ def health():
+     return {"status": "ok"}
@@ -0,0 +1,28 @@
+ from dotenv import load_dotenv
+ from levelapp.core.session import EvaluationSession
+ from levelapp.workflow import WorkflowConfig
+
+ # Load .env (automatically done in LevelApp, but explicit for clarity)
+ load_dotenv()
+
+ if __name__ == "__main__":
+     # 1. Load YAML config
+     config = WorkflowConfig.load(path="workflow_configuration.yaml")
+
+     # Alternatively: Load from dict for in-memory config (e.g., from DB)
+     # config_dict = {...}  # As in README
+     # config = WorkflowConfig.from_dict(content=config_dict)
+     # config.set_reference_data(content={"scripts": [...]})  # Inline script
+
+     # 2. Create an evaluation session
+     with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+         # 2.1. Run session (simulation session)
+         session.run()
+
+         # 2.2. Collect evaluation results
+         results = session.workflow.collect_results()
+         print("Evaluation Results:", results)
+
+         # 3. Get aggregated stats (monitoring stats)
+         stats = session.get_stats()
+         print("Session Stats:\n", stats)
@@ -0,0 +1,38 @@
+ process:
+   project_name: "chatbot-evaluation"
+   workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+   evaluation_params:
+     attempts: 3
+     batch_size: 10
+
+ evaluation:
+   evaluators:
+     - JUDGE
+     - REFERENCE
+   providers:
+     - openai
+     - ionos
+   metrics_map:
+     appointment_type: EXACT
+     date: LEVENSHTEIN
+     time: TOKEN_BASED
+
+ reference_data:
+   path: "conversation_script.json"
+   data: {}
+
+ endpoint:
+   base_url: "http://127.0.0.1:8000"
+   url_path: "chat"
+   api_key: ""
+   bearer_token: ""
+   default_request_payload_template:
+     message: "${user_message}"
+   default_response_payload_template:
+     agent_reply: "${reply}"
+
+ repository:
+   type: FIRESTORE # Keep this value as is for now.
+   project_id: ""
+   database_name: ""
+   source: "LOCAL"
@@ -343,6 +343,7 @@ class FunctionMonitor:
      category: MetricType,
      enable_timing: bool,
      track_memory: bool,
+     verbose=False
  ) -> Callable[P, T]:
      """
      Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
          name: Unique identifier for the function
          enable_timing: Enable execution time logging
          track_memory: Enable memory tracking
+         verbose: Enable verbose logging

      Returns:
          Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:

      self._aggregated_stats[name].update(metrics=metrics)

-     if enable_timing and metrics.duration is not None:
+     if verbose and enable_timing and metrics.duration is not None:
          log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
          if metrics.cache_hit:
              log_message += " (cache hit)"
@@ -44,7 +44,6 @@ class ClientRegistry:

          cls._wrap_client_methods(client_class)
          cls._clients[provider] = client_class
-         logger.info(f"[ClientRegistry] Registered client for provider: {provider}")

      @classmethod
      def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None:
@@ -78,7 +78,6 @@ class MetricsManager:
              ValueError: if the scorer is not a callable.
          """
          self._scorers[name] = scorer
-         logger.info(f"[MetricsManager] Registered scorer: {name}")

      def get_scorer(self, name: str) -> Callable:
          """
@@ -95,7 +94,6 @@ class MetricsManager:
          """
          try:
              scorer = self._scorers.get(name)
-             logger.info(f"[get_scorer] Retrieved scorer: {name}")
              return scorer

          except KeyError: