levelapp 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. levelapp/__init__.py +0 -0
  2. levelapp/aspects/__init__.py +8 -0
  3. levelapp/aspects/loader.py +253 -0
  4. levelapp/aspects/logger.py +59 -0
  5. levelapp/aspects/monitor.py +617 -0
  6. levelapp/aspects/sanitizer.py +168 -0
  7. levelapp/clients/__init__.py +122 -0
  8. levelapp/clients/anthropic.py +112 -0
  9. levelapp/clients/gemini.py +130 -0
  10. levelapp/clients/groq.py +101 -0
  11. levelapp/clients/huggingface.py +162 -0
  12. levelapp/clients/ionos.py +126 -0
  13. levelapp/clients/mistral.py +106 -0
  14. levelapp/clients/openai.py +116 -0
  15. levelapp/comparator/__init__.py +5 -0
  16. levelapp/comparator/comparator.py +232 -0
  17. levelapp/comparator/extractor.py +108 -0
  18. levelapp/comparator/schemas.py +61 -0
  19. levelapp/comparator/scorer.py +269 -0
  20. levelapp/comparator/utils.py +136 -0
  21. levelapp/config/__init__.py +5 -0
  22. levelapp/config/endpoint.py +199 -0
  23. levelapp/config/prompts.py +57 -0
  24. levelapp/core/__init__.py +0 -0
  25. levelapp/core/base.py +386 -0
  26. levelapp/core/schemas.py +24 -0
  27. levelapp/core/session.py +336 -0
  28. levelapp/endpoint/__init__.py +0 -0
  29. levelapp/endpoint/client.py +188 -0
  30. levelapp/endpoint/client_test.py +41 -0
  31. levelapp/endpoint/manager.py +114 -0
  32. levelapp/endpoint/parsers.py +119 -0
  33. levelapp/endpoint/schemas.py +38 -0
  34. levelapp/endpoint/tester.py +52 -0
  35. levelapp/evaluator/__init__.py +3 -0
  36. levelapp/evaluator/evaluator.py +307 -0
  37. levelapp/metrics/__init__.py +63 -0
  38. levelapp/metrics/embedding.py +56 -0
  39. levelapp/metrics/embeddings/__init__.py +0 -0
  40. levelapp/metrics/embeddings/sentence_transformer.py +30 -0
  41. levelapp/metrics/embeddings/torch_based.py +56 -0
  42. levelapp/metrics/exact.py +182 -0
  43. levelapp/metrics/fuzzy.py +80 -0
  44. levelapp/metrics/token.py +103 -0
  45. levelapp/plugins/__init__.py +0 -0
  46. levelapp/repository/__init__.py +3 -0
  47. levelapp/repository/filesystem.py +203 -0
  48. levelapp/repository/firestore.py +291 -0
  49. levelapp/simulator/__init__.py +3 -0
  50. levelapp/simulator/schemas.py +116 -0
  51. levelapp/simulator/simulator.py +531 -0
  52. levelapp/simulator/utils.py +134 -0
  53. levelapp/visualization/__init__.py +7 -0
  54. levelapp/visualization/charts.py +358 -0
  55. levelapp/visualization/dashboard.py +240 -0
  56. levelapp/visualization/exporter.py +167 -0
  57. levelapp/visualization/templates/base.html +158 -0
  58. levelapp/visualization/templates/comparator_dashboard.html +57 -0
  59. levelapp/visualization/templates/simulator_dashboard.html +111 -0
  60. levelapp/workflow/__init__.py +6 -0
  61. levelapp/workflow/base.py +192 -0
  62. levelapp/workflow/config.py +96 -0
  63. levelapp/workflow/context.py +64 -0
  64. levelapp/workflow/factory.py +42 -0
  65. levelapp/workflow/registration.py +6 -0
  66. levelapp/workflow/runtime.py +19 -0
  67. levelapp-0.1.15.dist-info/METADATA +571 -0
  68. levelapp-0.1.15.dist-info/RECORD +70 -0
  69. levelapp-0.1.15.dist-info/WHEEL +4 -0
  70. levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
@@ -0,0 +1,571 @@
1
+ Metadata-Version: 2.4
2
+ Name: levelapp
3
+ Version: 0.1.15
4
+ Summary: LevelApp is an evaluation framework for AI/LLM-based software applications. [Powered by Norma]
5
+ Project-URL: Homepage, https://github.com/levelapp-org
6
+ Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
7
+ Project-URL: Documentation, https://levelapp.readthedocs.io
8
+ Project-URL: Issues, https://github.com/levelapp-org/levelapp-framework/issues
9
+ Author-email: Mohamed Sofiene KADRI <ms.kadri.dev@gmail.com>
10
+ License-File: LICENSE
11
+ Keywords: ai,evaluation,framework,llm,testing
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: backoff>=2.2.1
21
+ Requires-Dist: google-api-core>=2.25.1
22
+ Requires-Dist: google-auth>=2.40.3
23
+ Requires-Dist: google-cloud-firestore>=2.21.0
24
+ Requires-Dist: httpx>=0.28.1
25
+ Requires-Dist: humanize>=4.13.0
26
+ Requires-Dist: numpy>=2.3.2
27
+ Requires-Dist: pandas-stubs==2.3.0.250703
28
+ Requires-Dist: pandas>=2.3.1
29
+ Requires-Dist: pydantic>=2.11.7
30
+ Requires-Dist: python-dotenv>=1.1.1
31
+ Requires-Dist: pyyaml>=6.0.2
32
+ Requires-Dist: rapid>=0.0.3
33
+ Requires-Dist: rapidfuzz>=3.13.0
34
+ Requires-Dist: requests>=2.32.4
35
+ Requires-Dist: tenacity>=9.1.2
36
+ Provides-Extra: dev
37
+ Requires-Dist: google-api-core>=2.25.1; extra == 'dev'
38
+ Requires-Dist: google-auth>=2.40.3; extra == 'dev'
39
+ Requires-Dist: google-cloud-firestore>=2.21.0; extra == 'dev'
40
+ Requires-Dist: httpx>=0.28.1; extra == 'dev'
41
+ Requires-Dist: humanize>=4.13.0; extra == 'dev'
42
+ Requires-Dist: numpy>=2.3.2; extra == 'dev'
43
+ Requires-Dist: pandas-stubs==2.3.0.250703; extra == 'dev'
44
+ Requires-Dist: pandas>=2.3.1; extra == 'dev'
45
+ Requires-Dist: pydantic>=2.11.7; extra == 'dev'
46
+ Requires-Dist: python-dotenv>=1.1.1; extra == 'dev'
47
+ Requires-Dist: pyyaml>=6.0.2; extra == 'dev'
48
+ Requires-Dist: rapid>=0.0.3; extra == 'dev'
49
+ Requires-Dist: rapidfuzz>=3.13.0; extra == 'dev'
50
+ Requires-Dist: requests>=2.32.4; extra == 'dev'
51
+ Requires-Dist: tenacity>=9.1.2; extra == 'dev'
52
+ Provides-Extra: embeddings
53
+ Requires-Dist: bert-score>=0.3; extra == 'embeddings'
54
+ Requires-Dist: torch>=2.0; extra == 'embeddings'
55
+ Requires-Dist: transformers>=4.40; extra == 'embeddings'
56
+ Description-Content-Type: text/markdown
57
+
58
+ # LevelApp: AI/LLM Evaluation Framework for Regression Testing
59
+
60
+ [![PyPI version](https://badge.fury.io/py/levelapp.svg)](https://badge.fury.io/py/levelapp)
61
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
62
+ [![Python Version](https://img.shields.io/badge/python-3.12%2B-blue.svg)](https://www.python.org/downloads/)
63
+
64
+ ## Overview
65
+
66
+ LevelApp is an evaluation framework designed for regression testing (black-box) of already built LLM-based systems in production or testing phases. It focuses on assessing the performance and reliability of AI/LLM applications through simulation and comparison modules. Powered by Norma.
67
+
68
+ Key benefits:
69
+ - Configuration-driven: Minimal coding required; define evaluations via YAML files.
70
+ - Supports LLM-as-a-judge for qualitative assessments and quantitative metrics for metadata evaluation.
71
+ - Modular architecture for easy extension to new workflows, evaluators, and repositories.
72
+
73
+ ## Features
74
+
75
+ - **Simulator Module**: Evaluates dialogue systems by simulating conversations using predefined scripts. It uses an LLM as a judge to score replies against references and supports metrics (e.g., Exact, Embedded, Token-based, Fuzzy) for comparing extracted metadata to ground truth.
76
+ - **Comparator Module**: Evaluates metadata extraction from JSON outputs (e.g., from legal/financial document processing with LLMs) by comparing against reference/ground-truth data.
77
+ - **Configuration-Based Workflow**: Users provide YAML configs for endpoints, parameters, data sources, and metrics, reducing the need for custom code.
78
+ - **Supported Workflows**: SIMULATOR, COMPARATOR, ASSESSOR (coming soon!).
79
+ - **Repositories**: FIRESTORE, FILESYSTEM, MONGODB.
80
+ - **Evaluators**: JUDGE, REFERENCE, RAG.
81
+ - **Metrics**: Exact, Levenshtein, and more (see docs for full list).
82
+ - **Data Sources**: Local or remote JSON for conversation scripts.
83
+
84
+ ## Installation
85
+
86
+ Install LevelApp via pip:
87
+
88
+ ```bash
89
+ pip install levelapp
90
+ ```
91
+
92
+ ### Prerequisites
93
+ - Python 3.12 or higher.
94
+ - API keys for LLM providers (e.g., OpenAI, Anthropic) if using external clients—store in a `.env` file.
95
+ - Optional: Google Cloud credentials for Firestore repository.
96
+ - Dependencies are installed automatically, including `pydantic`, `numpy`, `httpx`, etc. (see `pyproject.toml` for the full list).
97
+
98
+ ## Configuration
99
+
100
+ LevelApp uses a YAML configuration file to define the evaluation setup. Create a `workflow_config.yaml` with the following structure:
101
+
102
+ ```yaml
103
+ process:
104
+ project_name: "test-project"
105
+ workflow_type: SIMULATOR # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
106
+ evaluation_params:
107
+ attempts: 1 # Add the number of simulation attempts.
108
+ batch_size: 5
109
+
110
+ evaluation:
111
+ evaluators: # Select from the following: JUDGE, REFERENCE, RAG.
112
+ - JUDGE
113
+ - REFERENCE
114
+ providers:
115
+ - openai
116
+ - ionos
117
+ - mistral
118
+ - grok
119
+ - gemini
120
+ metrics_map:
121
+ field_1: EXACT
122
+ field_2 : LEVENSHTEIN
123
+
124
+ reference_data:
125
+ path: "../data/conversation_example_1.json"
126
+ data:
127
+
128
+ endpoint:
129
+ name: conversational-agent
130
+ base_url: http://127.0.0.1:8000
131
+ path: /v1/chat
132
+ method: POST
133
+ timeout: 60
134
+ retry_count: 3
135
+ retry_backoff: 0.5
136
+ headers:
137
+ - name: model_id
138
+ value: meta-llama/Meta-Llama-3-8B-Instruct
139
+ secure: false
140
+ - name: x-api-key
141
+ value: API_KEY # Load from .env file using python-dotenv.
142
+ secure: true
143
+ - name: Content-Type
144
+ value: application/json
145
+ secure: false
146
+ request_schema:
147
+ # Static field to be included in every request.
148
+ - field_path: message.source
149
+ value: system
150
+ value_type: static
151
+ required: true
152
+
153
+ # Dynamic field to be populated from runtime context.
154
+ - field_path: message.text
155
+ value: message_text # the key from the runtime context.
156
+ value_type: dynamic
157
+ required: true
158
+
159
+ # Env-based field (from OS environment variables).
160
+ - field_path: metadata.env
161
+ value: ENV_VAR_NAME
162
+ value_type: env
163
+ required: false
164
+
165
+ response_mapping:
166
+ # Map the response fields that will be extracted.
167
+ - field_path: reply.text
168
+ extract_as: agent_reply # The simulator requires this key: 'agent_reply'.
169
+ - field_path: reply.metadata
170
+ extract_as: generated_metadata # The simulator requires this key: 'generated_metadata'.
171
+ - field_path: reply.guardrail_flag
172
+ extract_as: guardrail_flag # The simulator requires this key: 'guardrail_flag'.
173
+
174
+ repository:
175
+ type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
176
+ project_id: "(default)"
177
+ database_name: ""
178
+ ```
179
+
180
+ - **Endpoint Configuration**: Define how to interact with your LLM-based system (base URL, headers, request/response payload schema).
181
+ - **Placeholders**: For dynamic request schema fields, the `value` entry names the key in the runtime context whose content populates the field at runtime (e.g., with `value: message_text`, pass `context = {'message_text': "Hello, world!"}`).
182
+ - **Secrets**: Store API keys in `.env` and load via `python-dotenv` (e.g., `API_KEY=your_key_here`).
183
+
184
+ For conversation scripts (used in Simulator), provide a JSON file with this schema:
185
+
186
+ ```json
187
+ {
188
+ "scripts": [
189
+ {
190
+ "variable_request_schema": false,
191
+ "interactions": [
192
+ {
193
+ "user_message": "Hello, I would like to book an appointment with a doctor.",
194
+ "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
195
+ "interaction_type": "initial",
196
+ "reference_metadata": {},
197
+ "guardrail_flag": false,
198
+ "request_payload": {}
199
+ },
200
+ {
201
+ "user_message": "I need to see a cardiologist.",
202
+ "reference_reply": "When would you like to schedule your appointment?",
203
+ "interaction_type": "intermediate",
204
+ "reference_metadata": {},
205
+ "guardrail_flag": false,
206
+ "request_payload": {}
207
+ },
208
+ {
209
+ "user_message": "I would like to book it for next Monday morning.",
210
+ "reference_reply": "We have an available slot at 10 AM next Monday. Does that work for you?",
211
+ "interaction_type": "intermediate",
212
+ "reference_metadata": {
213
+ "appointment_type": "Cardiology",
214
+ "date": "next Monday",
215
+ "time": "10 AM"
216
+ },
217
+ "guardrail_flag": false,
218
+ "request_payload": {}
219
+ },
220
+ {
221
+ "id": "f4f2dd35-71d7-4b75-ba2b-93a4f546004a",
222
+ "user_message": "Yes, please book it for 10 AM then.",
223
+ "reference_reply": "Your appointment with the cardiologist is booked for 10 AM next Monday. Is there anything else I can help you with?",
224
+ "interaction_type": "final",
225
+ "reference_metadata": {},
226
+ "guardrail_flag": false,
227
+ "request_payload": {}
228
+ }
229
+ ],
230
+ "description": "A conversation about booking a doctor appointment.",
231
+ "details": {
232
+ "context": "Booking a doctor appointment"
233
+ }
234
+ }
235
+ ]
236
+ }
237
+ ```
238
+ - **Fields**:
239
+ - **Scripts Level**:
240
+ - **description**: a brief description of the script.
241
+ - **details**: any additional information.
242
+ - **variable_request_schema**: a flag variable that defaults to `False`.
243
+ When changed to True, it allows the user to pass the request payload content directly from the reference file
244
+ ignoring any configuration made in the YAML.
245
+ - **Interactions**: A list of single-turn conversation data for the simulation and evaluation process:
246
+ - **user_message_path**: If `variable_request_schema` is `True`, the user must indicate the path of the user message
247
+ in the attached **request_payload** dict. Example: ```"user_message_path": "user.message"```
248
+ for "request_payload": ```{"user": {"message": "Hello, world!", "role": "user"}}```.
249
+ - **user_message**: The text content that will be used as a user message for the simulation.
250
+ - **reference_reply**: the text content of the reference reply.
251
+ - **reference_metadata**: a dict containing the reference metadata.
252
+ - **guardrail_flag**: Guardrail flag (`True`/`False`).
253
+ - **request_payload**: A dict containing the request payload that must be sent for each turn.
254
+
255
+ In the `.env` file, add the credentials of the LLM providers that will be used for the evaluation process.
256
+ ```
257
+ # Add the API key for any used provider:
258
+ OPENAI_API_KEY=
259
+ IONOS_API_KEY=
260
+ ANTHROPIC_API_KEY=
261
+ MISTRAL_API_KEY=
262
+ GEMINI_API_KEY=
263
+ GROK_API_KEY=
264
+
265
+ # Include the model of choice for any used provider:
266
+ OPENAI_MODEL= "gpt-4o-mini"
267
+ GROK_MODEL = "llama-3.3-70b-versatile"
268
+ GEMINI_MODEL = "gemini-2.5-flash"
269
+
270
+ # For IONOS, you must include the base URL and the model ID.
271
+ IONOS_BASE_URL="https://openai.inference.de-txl.ionos.com"
272
+ IONOS_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct"
273
+ ```
274
+
275
+ ## Usage Example
276
+
277
+ To run an evaluation:
278
+
279
+ 1. Prepare your YAML config and JSON data files.
280
+ 2. Use the following Python script:
281
+
282
+ ```python
283
+ if __name__ == "__main__":
284
+ from levelapp.workflow import WorkflowConfig
285
+ from levelapp.core.session import EvaluationSession
286
+
287
+ # Load configuration from YAML
288
+ config = WorkflowConfig.load(path="../data/workflow_config.yaml")
289
+
290
+ # Run evaluation session (You can enable/disable the monitoring aspect)
291
+ with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
292
+ session.run()
293
+ results = session.workflow.collect_results()
294
+ print("Results:", results)
295
+
296
+ stats = session.get_stats()
297
+ print(f"session stats:\n{stats}")
298
+ ```
299
+
300
+ Alternatively, if you want to pass the configuration and reference data from in-memory variables,
301
+ you can manually load the data like the following:
302
+ ```python
303
+ if __name__ == "__main__":
304
+ from levelapp.workflow import WorkflowConfig
305
+ from levelapp.core.session import EvaluationSession
306
+
307
+
308
+ config_dict = {
309
+ "process": {
310
+ "project_name": "test-project",
311
+ "workflow_type": "SIMULATOR", # Pick one of the following workflows: SIMULATOR, COMPARATOR, ASSESSOR.
312
+ "evaluation_params": {
313
+ "attempts": 1, # Add the number of simulation attempts.
314
+ }
315
+ },
316
+ "evaluation": {
317
+ "evaluators": ["JUDGE", "REFERENCE"], # Select from the following: JUDGE, REFERENCE, RAG.
318
+ "providers": ["openai", "ionos"],
319
+ "metrics_map": {
320
+ "field_1": "EXACT",
321
+ "field_2": "LEVENSHTEIN"
322
+ }
323
+ },
324
+ "reference_data": {
325
+ "path": "../data/conversation_example_1.json",
326
+ "data": None
327
+ },
328
+ "endpoint": {
329
+ "name": "conversational-agent",
330
+ "base_url": "http://127.0.0.1:8000",
331
+ "path": "/v1/chat",
332
+ "method": "POST",
333
+ "timeout": 60,
334
+ "retry_count": 3,
335
+ "retry_backoff": 0.5,
336
+ "headers": [
337
+ {
338
+ "name": "model_id",
339
+ "value": "meta-llama/Meta-Llama-3.1-8B-Instruct",
340
+ "secure": False
341
+ },
342
+ {
343
+ "name": "x-api-key",
344
+ "value": "API_KEY", # Load from .env file using python-dotenv.
345
+ "secure": True
346
+ },
347
+ {
348
+ "name": "Content-Type",
349
+ "value": "application/json",
350
+ "secure": False
351
+ }
352
+ ],
353
+ "request_schema": [
354
+ {
355
+ "field_path": "message.source",
356
+ "value": "system",
357
+ "value_type": "static",
358
+ "required": True
359
+ },
360
+ {
361
+ "field_path": "message.text",
362
+ "value": "message_text", # the key from the runtime context.
363
+ "value_type": "dynamic",
364
+ "required": True
365
+ },
366
+ {
367
+ "field_path": "metadata.env",
368
+ "value": "ENV_VAR_NAME",
369
+ "value_type": "env",
370
+ "required": False
371
+ }
372
+ ],
373
+ "response_mapping": [
374
+ {
375
+ "field_path": "reply.text",
376
+ "extract_as": "agent_reply" # Remember that the simulator requires this key: 'agent_reply'.
377
+ },
378
+ {
379
+ "field_path": "reply.metadata",
380
+ "extract_as": "generated_metadata" # Remember that the simulator requires this key: 'generated_metadata'.
381
+ },
382
+ {
383
+ "field_path": "reply.guardrail_flag",
384
+ "extract_as": "guardrail_flag" # Remember that the simulator requires this key: 'guardrail_flag'.
385
+ }
386
+ ]
387
+ },
388
+ "repository": {
389
+ "type": "FIRESTORE", # Pick one of the following: FIRESTORE, FILESYSTEM
390
+ "project_id": "(default)",
391
+ "database_name": ""
392
+ }
393
+ }
394
+
395
+ content = {
396
+ "scripts": [
397
+ {
398
+ "interactions": [
399
+ {
400
+ "user_message": "Hello!",
401
+ "reference_reply": "Hello, how can I help you!"
402
+ },
403
+ {
404
+ "user_message": "I need an apartment",
405
+ "reference_reply": "sorry, but I can only assist you with booking medical appointments."
406
+ },
407
+ ]
408
+ },
409
+ ]
410
+ }
411
+
412
+ # Load configuration from a dict variable
413
+ config = WorkflowConfig.from_dict(content=config_dict)
414
+
415
+ # Load reference data from dict variable
416
+ config.set_reference_data(content=content)
417
+
418
+ evaluation_session = EvaluationSession(
419
+ session_name="test-session",
420
+ workflow_config=config,
421
+ enable_monitoring=True # To disable the monitoring aspect, set this to False.
422
+ )
423
+
424
+ with evaluation_session as session:
425
+ # Optional: Run connectivity test before the full evaluation
426
+ test_results = session.run_connectivity_test(
427
+ context={"user_message": "I want to book an appointment with a dentist."}
428
+ )
429
+ print(f"Connectivity Test Results:\n{test_results}\n---")
430
+ session.run()
431
+ results = session.workflow.collect_results()
432
+ print("Results:", results)
433
+
434
+ stats = session.get_stats()
435
+ print(f"session stats:\n{stats}")
436
+
437
+ ```
438
+
439
+
440
+ - This loads the config, runs the specified workflow (e.g., Simulator), collects results, and prints stats.
441
+
442
+ For more examples, see the `examples/` directory.
443
+
444
+ Or check the following Colab notebook for a quick and easy demo:<br>
445
+ | Notebook | Description | |
446
+ |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
447
+ | [Quick-tour of LevelApp framework](https://github.com/levelapp-org/levelapp-framework/blob/dev/examples/conversation_evaluation_example/LevelApp_Conversation_Simulator_Notebook.ipynb) | Tutorial Notebook with UI widgets |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tD2ljiBkrTxSfeRObTBrc2UmZvzqEuRU?usp=sharing) |
448
+
449
+ ## Visualization
450
+
451
+ LevelApp includes powerful visualization capabilities to help you analyze and present evaluation results through interactive charts and dashboards.
452
+
453
+ ### Features
454
+
455
+ - **Automatic Dashboard Generation**: Create comprehensive HTML dashboards with all evaluation metrics
456
+ - **Multi-Format Export**: Export visualizations in HTML and PNG formats.
457
+ - **Interactive Charts**: Generate interactive Plotly charts for detailed analysis
458
+ - **Provider Comparison**: Compare performance across different LLM providers
459
+ - **Score Trends**: Visualize score trends across conversation scripts
460
+ - **Distribution Analysis**: Analyze score distributions for individual providers
461
+ - **Summary Metrics**: Display key performance indicators and statistics
462
+
463
+ ### Installation
464
+
465
+ To use visualization features, install the required dependencies:
466
+
467
+ ```bash
468
+ pip install plotly kaleido jinja2
469
+ ```
470
+
471
+ These dependencies enable:
472
+ - `plotly`: Interactive chart generation
473
+ - `kaleido`: Static image export (PNG, PDF)
474
+ - `jinja2`: HTML dashboard templating
475
+
476
+ ### Basic Usage
477
+
478
+ Generate visualizations directly from an evaluation session:
479
+
480
+ ```python
481
+ from levelapp.core.session import EvaluationSession
482
+ from levelapp.workflow import WorkflowConfig
483
+
484
+ # Load configuration
485
+ config = WorkflowConfig.load(path="workflow_config.yaml")
486
+
487
+ # Run evaluation with visualization
488
+ with EvaluationSession(
489
+ session_name="my-evaluation",
490
+ workflow_config=config,
491
+ enable_monitoring=True
492
+ ) as session:
493
+ # Run the evaluation
494
+ session.run()
495
+
496
+ # Generate visualizations
497
+ files = session.visualize_results(
498
+ output_dir="./visualization_output",
499
+ formats=["html", "png"]
500
+ )
501
+
502
+ # Access generated files
503
+ print(f"Dashboard: {files['html']}")
504
+ print(f"Charts: {files['png']}")
505
+ ```
506
+
507
+ ### Available Chart Types
508
+
509
+ 1. **Provider Comparison**: Bar charts comparing average scores across LLM providers
510
+ 2. **Score Trend**: Line charts showing score progression across conversation scripts
511
+ 3. **Score Distribution**: Histograms showing score distribution for specific providers
512
+ 4. **Summary Metrics**: Key performance indicators and aggregate statistics
513
+
514
+ ### Customization
515
+
516
+ Customize visualizations by:
517
+
518
+ - **Themes**: Choose from Plotly themes (`plotly`, `plotly_white`, `plotly_dark`, `ggplot2`, `seaborn`, etc.)
519
+ - **Export Formats**: Select from `html` or `png`.
520
+ - **Output Directory**: Specify custom paths for generated files
521
+ - **Chart Layout**: Modify chart properties through the ChartGenerator API
522
+
523
+ Example with custom theme:
524
+
525
+ ```python
526
+ # Use dark theme for all charts
527
+ chart_gen = ChartGenerator(theme="plotly_dark")
528
+
529
+ # Generate with custom settings
530
+ files = session.visualize_results(
531
+ output_dir="./reports",
532
+ formats=["html", "png", "pdf"],
533
+ theme="plotly_dark"
534
+ )
535
+ ```
536
+
537
+ ### Example Output
538
+
539
+ The visualization module generates:
540
+
541
+ - **Interactive HTML Dashboard**: Complete evaluation report with all charts and metrics
542
+ - **Static Images**: PNG/PDF exports for presentations and reports
543
+ - **JSON Data**: Raw data export for custom processing
544
+
545
+ For complete examples, see the `examples/visualization_example/` directory.
546
+
547
+ ## Documentation
548
+
549
+ Detailed docs are in the `docs/` directory, including API references and advanced configuration.
550
+
551
+ ## Contributing
552
+
553
+ Contributions are welcome! Please follow these steps:
554
+ - Fork the repository on GitHub.
555
+ - Create a feature branch (`git checkout -b feature/new-feature`).
556
+ - Commit changes (`git commit -am 'Add new feature'`).
557
+ - Push to the branch (`git push origin feature/new-feature`).
558
+ - Open a pull request.
559
+
560
+ Report issues via GitHub Issues. Follow the code of conduct (if applicable).
561
+
562
+ ## Acknowledgments
563
+
564
+ - Powered by Norma.
565
+ - Thanks to contributors and open-source libraries like Pydantic, NumPy, and OpenAI SDK.
566
+
567
+ ## License
568
+
569
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
570
+
571
+ ---
@@ -0,0 +1,70 @@
1
+ levelapp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ levelapp/aspects/__init__.py,sha256=_OaPcjTWBizqcUdDVj5aYue7lG9ytjQGLhPvReriKnU,326
3
+ levelapp/aspects/loader.py,sha256=IB2sZTmTdAvYHQZlH7PdZGQHh3r86P-zX3rIp0PyG2M,9577
4
+ levelapp/aspects/logger.py,sha256=MJ9HphyHYkTE5-ajA_WuMUTM0qQzd0WIP243vF-pj3M,1698
5
+ levelapp/aspects/monitor.py,sha256=ibUk01Y5y67_qBJRA5YzvjMX8QrRkMTJ-mN77ztuLlo,22113
6
+ levelapp/aspects/sanitizer.py,sha256=zUqgb76tXJ8UUYtHp0Rz7q9PZjAHpSpHPPFfGTjjQNg,5229
7
+ levelapp/clients/__init__.py,sha256=p2xgIhWuDHxMck_b0PcCo1NOGEYjpX5-2FVLGT4SHF4,4234
8
+ levelapp/clients/anthropic.py,sha256=Bxp-HffcIPLwM9BLcTR7n-D8ZXYVWCmbr2oH33fKV04,4030
9
+ levelapp/clients/gemini.py,sha256=nVPbntRN-t-hrPvF59wadX1Ptqp7eOlwsHUP2_PuQfI,4594
10
+ levelapp/clients/groq.py,sha256=W8NNN6enl-wkQfKsCULBKYQmdnOorFUAHJaURlMDbZk,3669
11
+ levelapp/clients/huggingface.py,sha256=uSo7EYpiUlgo3OGWcLZ2P77eFMaTVrIijwFU2RPZ_Oo,5301
12
+ levelapp/clients/ionos.py,sha256=PIZIA6IqwJfs1GYmD8lXXkgfth-nmhe6d-m7-269zes,4275
13
+ levelapp/clients/mistral.py,sha256=e1NRvP9qN7O2zWAzBbgdQmmUDHQfCRLtVKDJCrh0DNA,3777
14
+ levelapp/clients/openai.py,sha256=lrz8eZNbJGJgXTB0rKkMVhUArilrThtBPq2rkgGf5Gk,3887
15
+ levelapp/comparator/__init__.py,sha256=ynmc0mrx-JbcCqLH-z4hOVezqGocDbDQGqgbhWy2xzI,187
16
+ levelapp/comparator/comparator.py,sha256=HONX6I_uzKlQWClP3Bj8gUkk2zsAV2ywxcqjlr3i7HI,8160
17
+ levelapp/comparator/extractor.py,sha256=vJ9iEoWAtXo2r9r7X72uUQPKW3UZE9Kx3uIjCufEp9k,3910
18
+ levelapp/comparator/schemas.py,sha256=qP_77sWW1SgpM7_u2rVsFXxUl1ogZ9wrOIrNv8QYR8M,1809
19
+ levelapp/comparator/scorer.py,sha256=goxgZ-usGIpTPut6j1TcXDf9PWMpyQpW5swK0QSRrUE,9041
20
+ levelapp/comparator/utils.py,sha256=Eu48nDrNzFr0lwAJJS0aNhKsAWQ72syTEWYMNYfg764,4331
21
+ levelapp/config/__init__.py,sha256=9oaajE5zW-OVWOszUzMAG6nHDSbLQWa3KT6bVoSvzRA,137
22
+ levelapp/config/endpoint.py,sha256=B-uIEKF-0_Y6Vo8MZ8eoCZocRkghijrdpwT3zq0FDLk,7647
23
+ levelapp/config/prompts.py,sha256=NXOKRp5l1VQ9LO0pUojVH6TDJhWyZImsAvZEz2QiD9k,2206
24
+ levelapp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ levelapp/core/base.py,sha256=sEm_tsU-GjQWzSkHA8iZighC713pNkEFSOrL1U0pmB8,12597
26
+ levelapp/core/schemas.py,sha256=E47d93MMOj4eRYZIqUyLBiE5Ye7WgwkOJPOWQ6swRmo,465
27
+ levelapp/core/session.py,sha256=DCu0L5BMMBwpFNmIkoF2SkyM5PF1E9QiBYNpp5V5c80,11576
28
+ levelapp/endpoint/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ levelapp/endpoint/client.py,sha256=kDGA-eKB_q45u2EyYdB3OHsQt0jpEMUCKLalPV-iJpo,6363
30
+ levelapp/endpoint/client_test.py,sha256=RpV9338Uab2QVen5dHixurPJlwdVdJZNz-HTNr0KSgc,1170
31
+ levelapp/endpoint/manager.py,sha256=7o-rAzsQ7O9fz4KITos4FXEB9YVbtbWQSc-3bbo_EsM,4218
32
+ levelapp/endpoint/parsers.py,sha256=YuT5Ua3FHGNUuXbYh6fLn-sDWlnrscW5IwZR5sWjVRc,4225
33
+ levelapp/endpoint/schemas.py,sha256=V0tpXC8aawHpX5zatderYa0fB14_QEkPiKRvfsuGZRM,851
34
+ levelapp/endpoint/tester.py,sha256=nfKx09Uk4EHNuajZg0SYSH6jHf7FkxrwEJGZBY9PrVk,2048
35
+ levelapp/evaluator/__init__.py,sha256=K-P75Q1FXXLCNqH1wyhT9sf4y2R9a1qR5449AXEsY1k,109
36
+ levelapp/evaluator/evaluator.py,sha256=lSRxCunkzycP7bRDzKx63nPACSw6MP-jGV8dbNRgWaY,11278
37
+ levelapp/metrics/__init__.py,sha256=x8iTaeDezJyQ9-NFe8GGvzwIBhyAJHWSRfBE3JRX-PE,1878
38
+ levelapp/metrics/embedding.py,sha256=a3PNKBGedVS0Z9Ow5cOc0_aWgyoa8l_dAYRF4ViSMiI,2075
39
+ levelapp/metrics/exact.py,sha256=Kb13nD2OVLrl3iYHaXrxDfrxDuhW0SMVvLAEXPaJtlY,6235
40
+ levelapp/metrics/fuzzy.py,sha256=Rg8ashzMxtQwKO-z_LLzdj2PDIRqL4CBw6PGRf9IBrI,2598
41
+ levelapp/metrics/token.py,sha256=yQi9hxT_fXTGjLiCCemDxQ4Uk2zD-wQYtSnDlI2AuuY,3521
42
+ levelapp/metrics/embeddings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
+ levelapp/metrics/embeddings/sentence_transformer.py,sha256=S0XgSr_kiQN9Dk9o1dm_t9Z9J0nGh_krdBbfreSo61o,1179
44
+ levelapp/metrics/embeddings/torch_based.py,sha256=qQ3Y06H8OPhqFLSozr2PYwxxmWcFAF_pVSxHrdtyyR8,2219
45
+ levelapp/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
+ levelapp/repository/__init__.py,sha256=hNmFRZ7kKJN1mMlOHeW9xf0j9Q7gqTXYJ3hMCzk9to4,79
47
+ levelapp/repository/filesystem.py,sha256=-C2oVThZt16K41iZNSEaM2qO3tTPsFKVsDYQQiwo1Bk,7475
48
+ levelapp/repository/firestore.py,sha256=K9JgxsNCelAKtzTDv19c1dHRlitMeRzo7H3caTlKuF8,10369
49
+ levelapp/simulator/__init__.py,sha256=8Dz8g7rbpBZX3WoknVmMVoWm_VT72ZL9BABOF1xFpqs,83
50
+ levelapp/simulator/schemas.py,sha256=0rZz5XfLxxfYF-7nk9f0EwHzGURidgKOYWz-U4qGu1Y,5860
51
+ levelapp/simulator/simulator.py,sha256=jya4HmACv1y-TcrNRp62VA42aKNlmhdN7_IkHhUHVA8,21029
52
+ levelapp/simulator/utils.py,sha256=smSrZ8praKINK0wFpKl3tmqr21OUz_dheUOTH0miTys,4882
53
+ levelapp/visualization/__init__.py,sha256=MniD9ECzFoFb1KnLU2aqEUJcZSpae_Xm869yFbmDdR0,269
54
+ levelapp/visualization/charts.py,sha256=JZDtN4bp994iiBrqhV6TO99qiRewmMkUu8BjifJlX4A,11399
55
+ levelapp/visualization/dashboard.py,sha256=ZboEh0wbHctEYcItXly47Q7LkuQaN6b0j692ikFtq-Y,8426
56
+ levelapp/visualization/exporter.py,sha256=AC5GX3n61j1mLNZrZ7LpUqwxw-M_kkOnxukjb42rFEw,5831
57
+ levelapp/visualization/templates/base.html,sha256=0lye-LdIxvxcX_WFYZIeGocOdGZT9q9ibMIso656-0Q,4057
58
+ levelapp/visualization/templates/comparator_dashboard.html,sha256=L_0M7y_ng-wnLZhrvSt8SD15tfUZxKD-swkD7cqDbT8,2508
59
+ levelapp/visualization/templates/simulator_dashboard.html,sha256=tD3HD7d7BqEf98QWaQGqsATwI8tV78Fn25dbun3He5E,3321
60
+ levelapp/workflow/__init__.py,sha256=27b2obG7ObhR43yd2uH-R0koRB7-DG8Emnvrq8EjsTA,193
61
+ levelapp/workflow/base.py,sha256=dL85mrFDB42113ONIHAhmIfhNsIp9767uU9SxwauDMw,6753
62
+ levelapp/workflow/config.py,sha256=hgch9KV-TMtbesEFi00eibFdk5JJStJwb5TGO5m-o-M,3124
63
+ levelapp/workflow/context.py,sha256=KOXm_5HJWYyWfl9C83BqT37X7QVuUYSS-ZoDpZgXFQw,2696
64
+ levelapp/workflow/factory.py,sha256=z1ttJmI59sU9HgOvPo3ixUJ_oPv838XgehfuOorlTt8,1634
65
+ levelapp/workflow/registration.py,sha256=VHUHjLHXad5kjcKukaEOIf7hBZ09bT3HAzVmIT08aLo,359
66
+ levelapp/workflow/runtime.py,sha256=a3REqikh3-QHj0uYikqx0b4xQjq-w6VNyiUandL5GWw,690
67
+ levelapp-0.1.15.dist-info/METADATA,sha256=_FHrkNsuZlJdxrx5GENLWg_4kYhsDk51NOVoLe0QnFw,22194
68
+ levelapp-0.1.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
69
+ levelapp-0.1.15.dist-info/licenses/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
+ levelapp-0.1.15.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
File without changes