levelapp 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of levelapp might be problematic. Click here for more details.
- {levelapp-0.1.2 → levelapp-0.1.4}/PKG-INFO +5 -3
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/aspects/monitor.py +3 -2
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/simulator/simulator.py +3 -3
- levelapp-0.1.4/levelapp/simulator/utils.py +257 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/pyproject.toml +67 -65
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/workflow_config.yaml +2 -2
- levelapp-0.1.4/src/level_app/main_session.py +73 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/uv.lock +9 -124
- levelapp-0.1.2/levelapp/simulator/utils.py +0 -163
- levelapp-0.1.2/src/level_app/main_session.py +0 -48
- {levelapp-0.1.2 → levelapp-0.1.4}/.gitignore +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/.python-version +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/LICENSE +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/MANIFEST.in +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/Makefile +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/README.md +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/docs/media/simulator-module-diagram.PNG +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/docs/media/simulator-sequence-diagram.png +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/examples/README.md +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/examples/conversation_script.json +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/examples/example_chatbot.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/examples/example_evaluation.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/examples/workflow_configuration.yaml +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/clients/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/scorer.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/config/endpoint.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/config/endpoint_.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/config/prompts.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/core/base.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/core/schemas.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/core/session.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/evaluator/evaluator.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/metrics/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/repository/firestore.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/simulator/schemas.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/base.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/config.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/context.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/factory.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/registration.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/levelapp/workflow/runtime.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/make.bat +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/project_structure.txt +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/evaluation_results.json +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/level_app/main.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/__init__.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_comparator.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_ionos.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_mistral.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_openai.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_session.py +0 -0
- {levelapp-0.1.2 → levelapp-0.1.4}/tests/test_simulator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: levelapp
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
|
|
5
5
|
Project-URL: Homepage, https://github.com/levelapp-org
|
|
6
6
|
Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
|
|
@@ -33,10 +33,12 @@ Requires-Dist: rapidfuzz>=3.13.0
|
|
|
33
33
|
Requires-Dist: requests>=2.32.4
|
|
34
34
|
Requires-Dist: tenacity>=9.1.2
|
|
35
35
|
Provides-Extra: dev
|
|
36
|
-
Requires-Dist:
|
|
36
|
+
Requires-Dist: google-api-core>=2.25.1; extra == 'dev'
|
|
37
|
+
Requires-Dist: google-auth>=2.40.3; extra == 'dev'
|
|
38
|
+
Requires-Dist: google-cloud-firestore>=2.21.0; extra == 'dev'
|
|
37
39
|
Requires-Dist: httpx>=0.28.1; extra == 'dev'
|
|
40
|
+
Requires-Dist: humanize>=4.13.0; extra == 'dev'
|
|
38
41
|
Requires-Dist: numpy>=2.3.2; extra == 'dev'
|
|
39
|
-
Requires-Dist: openai>=1.99.9; extra == 'dev'
|
|
40
42
|
Requires-Dist: pandas-stubs==2.3.0.250703; extra == 'dev'
|
|
41
43
|
Requires-Dist: pandas>=2.3.1; extra == 'dev'
|
|
42
44
|
Requires-Dist: pydantic>=2.11.7; extra == 'dev'
|
|
@@ -422,7 +422,8 @@ class FunctionMonitor:
|
|
|
422
422
|
maxsize: int | None = 128,
|
|
423
423
|
enable_timing: bool = True,
|
|
424
424
|
track_memory: bool = True,
|
|
425
|
-
collectors: List[Type[MetricsCollector]] | None = None
|
|
425
|
+
collectors: List[Type[MetricsCollector]] | None = None,
|
|
426
|
+
verbose: bool = False
|
|
426
427
|
) -> Callable[[Callable[P, T]], Callable[P, T]]:
|
|
427
428
|
"""
|
|
428
429
|
Decorator factory for monitoring functions.
|
|
@@ -456,7 +457,7 @@ class FunctionMonitor:
|
|
|
456
457
|
)
|
|
457
458
|
|
|
458
459
|
with self._lock:
|
|
459
|
-
if name in self._monitored_procedures:
|
|
460
|
+
if name in self._monitored_procedures and verbose:
|
|
460
461
|
raise ValueError(f"Function '{name}' is already registered.")
|
|
461
462
|
|
|
462
463
|
self._monitored_procedures[name] = monitored_func
|
|
@@ -396,7 +396,7 @@ class ConversationSimulator(BaseProcess):
|
|
|
396
396
|
evaluation_results=evaluation_results,
|
|
397
397
|
)
|
|
398
398
|
else:
|
|
399
|
-
logger.info(f"
|
|
399
|
+
logger.info(f"{_LOG} Judge evaluation skipped (no evaluator or no providers).")
|
|
400
400
|
|
|
401
401
|
if metadata_evaluator and reference_metadata:
|
|
402
402
|
self._metadata_evaluation(
|
|
@@ -406,7 +406,7 @@ class ConversationSimulator(BaseProcess):
|
|
|
406
406
|
evaluation_results=evaluation_results,
|
|
407
407
|
)
|
|
408
408
|
else:
|
|
409
|
-
logger.info(f"
|
|
409
|
+
logger.info(f"{_LOG} Metadata evaluation skipped (no evaluator or no reference metadata).")
|
|
410
410
|
|
|
411
411
|
evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
|
|
412
412
|
|
|
@@ -480,7 +480,7 @@ class ConversationSimulator(BaseProcess):
|
|
|
480
480
|
reference_data=reference_metadata,
|
|
481
481
|
)
|
|
482
482
|
except Exception as e:
|
|
483
|
-
logger.error(f"
|
|
483
|
+
logger.error(f"{_LOG} Metadata evaluation failed:\n{e}", exc_info=e)
|
|
484
484
|
|
|
485
485
|
@staticmethod
|
|
486
486
|
def store_evaluation_results(
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
|
|
3
|
+
"""
|
|
4
|
+
import re
|
|
5
|
+
import ast
|
|
6
|
+
import json
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
from string import Template
|
|
11
|
+
from typing import Any, Dict, List, Union, Iterable
|
|
12
|
+
|
|
13
|
+
from pydantic import ValidationError
|
|
14
|
+
|
|
15
|
+
from levelapp.clients import ClientRegistry
|
|
16
|
+
from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
|
|
17
|
+
from levelapp.simulator.schemas import InteractionResults
|
|
18
|
+
from levelapp.aspects import MonitoringAspect, MetricType, logger
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class UUIDEncoder(json.JSONEncoder):
|
|
22
|
+
def default(self, obj):
|
|
23
|
+
if isinstance(obj, UUID):
|
|
24
|
+
return str(obj)
|
|
25
|
+
return json.JSONEncoder.default(self, obj)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_PLACEHOLDER_RE = re.compile(r"\$\{([^}]+)\}") # captures inner name(s) of ${...}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _traverse_path(d: Dict[str, Any], path: str):
|
|
32
|
+
"""Traverse a dot-separated path (payload.metadata.budget) and return value or None."""
|
|
33
|
+
parts = path.split(".")
|
|
34
|
+
cur = d
|
|
35
|
+
try:
|
|
36
|
+
for p in parts:
|
|
37
|
+
if isinstance(cur, dict) and p in cur:
|
|
38
|
+
cur = cur[p]
|
|
39
|
+
else:
|
|
40
|
+
return None
|
|
41
|
+
return cur
|
|
42
|
+
except Exception:
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _recursive_find(container: Any, target_key: str):
|
|
47
|
+
"""
|
|
48
|
+
Recursively search container (dicts/lists) for the first occurrence of target_key.
|
|
49
|
+
Returns the value if found, else None.
|
|
50
|
+
"""
|
|
51
|
+
if isinstance(container, dict):
|
|
52
|
+
# direct hit
|
|
53
|
+
if target_key in container:
|
|
54
|
+
return container[target_key]
|
|
55
|
+
# recurse into values
|
|
56
|
+
for v in container.values():
|
|
57
|
+
found = _recursive_find(v, target_key)
|
|
58
|
+
if found is not None:
|
|
59
|
+
return found
|
|
60
|
+
return None
|
|
61
|
+
|
|
62
|
+
if isinstance(container, list):
|
|
63
|
+
for item in container:
|
|
64
|
+
found = _recursive_find(item, target_key)
|
|
65
|
+
if found is not None:
|
|
66
|
+
return found
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
# not a container
|
|
70
|
+
return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _extract_placeholders(template_str: str) -> Iterable[str]:
|
|
74
|
+
"""Return list of placeholder names in a template string (inner contents of ${...})."""
|
|
75
|
+
return [m.group(1) for m in _PLACEHOLDER_RE.finditer(template_str)]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def extract_interaction_details(
|
|
79
|
+
response: str | Dict[str, Any],
|
|
80
|
+
template: Dict[str, Any],
|
|
81
|
+
) -> InteractionResults:
|
|
82
|
+
"""
|
|
83
|
+
Parse response (str or dict), look up placeholders recursively in the response and
|
|
84
|
+
use Template.safe_substitute with a mapping built from those lookups.
|
|
85
|
+
"""
|
|
86
|
+
try:
|
|
87
|
+
response_dict = response if isinstance(response, dict) else json.loads(response)
|
|
88
|
+
print(f"response:\n{response_dict}\n--")
|
|
89
|
+
if not isinstance(response_dict, dict):
|
|
90
|
+
raise ValueError("Response is not a valid dictionary")
|
|
91
|
+
|
|
92
|
+
output: Dict[str, Any] = {}
|
|
93
|
+
|
|
94
|
+
for out_key, tpl_str in template.items():
|
|
95
|
+
# Build mapping for placeholders found in tpl_str
|
|
96
|
+
placeholders = _extract_placeholders(tpl_str)
|
|
97
|
+
mapping: Dict[str, str] = {}
|
|
98
|
+
|
|
99
|
+
for ph in placeholders:
|
|
100
|
+
value = None
|
|
101
|
+
|
|
102
|
+
# 1) If ph looks like a dotted path, try explicit path traversal first
|
|
103
|
+
if "." in ph:
|
|
104
|
+
value = _traverse_path(response_dict, ph)
|
|
105
|
+
|
|
106
|
+
# 2) If not found yet, try recursive search for the bare key (last path segment)
|
|
107
|
+
if value is None:
|
|
108
|
+
bare = ph.split(".")[-1]
|
|
109
|
+
value = _recursive_find(response_dict, bare)
|
|
110
|
+
|
|
111
|
+
# Prepare mapping value for Template substitution:
|
|
112
|
+
# - dict/list -> JSON string (so substitution yields valid JSON text)
|
|
113
|
+
# - None -> empty string
|
|
114
|
+
# - otherwise -> str(value)
|
|
115
|
+
if isinstance(value, (dict, list)):
|
|
116
|
+
try:
|
|
117
|
+
mapping[ph] = json.dumps(value, ensure_ascii=False)
|
|
118
|
+
except Exception:
|
|
119
|
+
mapping[ph] = str(value)
|
|
120
|
+
elif value is None:
|
|
121
|
+
mapping[ph] = ""
|
|
122
|
+
else:
|
|
123
|
+
mapping[ph] = str(value)
|
|
124
|
+
|
|
125
|
+
# Perform substitution using Template (safe_substitute: missing keys left intact)
|
|
126
|
+
substituted = Template(tpl_str).safe_substitute(mapping)
|
|
127
|
+
output[out_key] = substituted
|
|
128
|
+
|
|
129
|
+
# Post-process generated_metadata if present: convert JSON text back to dict/list when possible
|
|
130
|
+
raw_meta = output.get("generated_metadata", {})
|
|
131
|
+
if isinstance(raw_meta, str) and raw_meta:
|
|
132
|
+
# Try json first (since we used json.dumps above for mapping)
|
|
133
|
+
try:
|
|
134
|
+
output["generated_metadata"] = json.loads(raw_meta)
|
|
135
|
+
except Exception:
|
|
136
|
+
# fallback to ast.literal_eval (handles Python dict strings)
|
|
137
|
+
try:
|
|
138
|
+
output["generated_metadata"] = ast.literal_eval(raw_meta)
|
|
139
|
+
except Exception:
|
|
140
|
+
# if parsing fails, keep the original raw string or use an empty dict
|
|
141
|
+
output["generated_metadata"] = raw_meta
|
|
142
|
+
|
|
143
|
+
# If generated_metadata is empty string, normalize to {}
|
|
144
|
+
if output.get("generated_metadata") == "":
|
|
145
|
+
output["generated_metadata"] = {}
|
|
146
|
+
|
|
147
|
+
print(f"output:\n{output}\n---")
|
|
148
|
+
# Return validated model
|
|
149
|
+
return InteractionResults.model_validate(output)
|
|
150
|
+
|
|
151
|
+
except json.JSONDecodeError as e:
|
|
152
|
+
logger.error(f"[extract_interaction_details] Failed to parse JSON response: {e}")
|
|
153
|
+
return InteractionResults()
|
|
154
|
+
|
|
155
|
+
except ValidationError as e:
|
|
156
|
+
logger.exception(f"[extract_interaction_details] InteractionResults validation failed: {e}")
|
|
157
|
+
return InteractionResults()
|
|
158
|
+
|
|
159
|
+
except Exception as e:
|
|
160
|
+
logger.exception(f"[extract_interaction_details] Unexpected error: {e}")
|
|
161
|
+
return InteractionResults()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
@MonitoringAspect.monitor(name="interaction_request", category=MetricType.API_CALL)
|
|
165
|
+
async def async_interaction_request(
|
|
166
|
+
url: str,
|
|
167
|
+
headers: Dict[str, str],
|
|
168
|
+
payload: Dict[str, Any],
|
|
169
|
+
) -> httpx.Response | None:
|
|
170
|
+
"""
|
|
171
|
+
Perform an asynchronous interaction request.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
url (str): The URL to send the request to.
|
|
175
|
+
headers (Dict[str, str]): The headers to include in the request.
|
|
176
|
+
payload (Dict[str, Any]): The payload to send in the request.
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
httpx.Response: The response from the interaction request, or None if an error occurred.
|
|
180
|
+
"""
|
|
181
|
+
try:
|
|
182
|
+
async with httpx.AsyncClient(timeout=180) as client:
|
|
183
|
+
response = await client.post(url=url, headers=headers, json=payload)
|
|
184
|
+
response.raise_for_status()
|
|
185
|
+
|
|
186
|
+
return response
|
|
187
|
+
|
|
188
|
+
except httpx.HTTPStatusError as http_err:
|
|
189
|
+
logger.error(f"[async_interaction_request] HTTP error: {http_err.response.text}", exc_info=True)
|
|
190
|
+
|
|
191
|
+
except httpx.RequestError as req_err:
|
|
192
|
+
logger.error(f"[async_interaction_request] Request error: {str(req_err)}", exc_info=True)
|
|
193
|
+
|
|
194
|
+
return None
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@MonitoringAspect.monitor(
|
|
198
|
+
name="average_calc",
|
|
199
|
+
category=MetricType.SCORING,
|
|
200
|
+
cached=True,
|
|
201
|
+
maxsize=1000
|
|
202
|
+
)
|
|
203
|
+
def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Dict[str, float]:
|
|
204
|
+
"""
|
|
205
|
+
Helper function that calculates the average scores for a dictionary of score lists.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
scores (Dict[str, List[float]]): A dictionary where keys are identifiers and values are lists of scores.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
Dict[str, float]: A dictionary with average scores rounded to three decimal places.
|
|
212
|
+
"""
|
|
213
|
+
result: Dict[str, float] = {}
|
|
214
|
+
for field, value in scores.items():
|
|
215
|
+
if isinstance(value, (int, float)):
|
|
216
|
+
result[field] = value
|
|
217
|
+
elif isinstance(value, list):
|
|
218
|
+
result[field] = round((sum(value) / len(value)), 3) if value else 0.0
|
|
219
|
+
else:
|
|
220
|
+
raise TypeError(f"[calculate_average_scores] Unexpected type '{type(value)}' for field '{field}")
|
|
221
|
+
|
|
222
|
+
return result
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
@MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
|
|
226
|
+
def summarize_verdicts(
|
|
227
|
+
verdicts: List[str],
|
|
228
|
+
judge: str,
|
|
229
|
+
max_bullets: int = 5
|
|
230
|
+
) -> List[str]:
|
|
231
|
+
client_registry = ClientRegistry()
|
|
232
|
+
client = client_registry.get(provider=judge)
|
|
233
|
+
|
|
234
|
+
try:
|
|
235
|
+
verdicts = chr(10).join(verdicts)
|
|
236
|
+
prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=verdicts)
|
|
237
|
+
response = client.call(message=prompt)
|
|
238
|
+
parsed = client.parse_response(response=response)
|
|
239
|
+
striped = parsed.get("output", "").strip("")
|
|
240
|
+
bullet_points = [point.strip() for point in striped.split("- ") if point.strip()]
|
|
241
|
+
|
|
242
|
+
return bullet_points[:max_bullets]
|
|
243
|
+
|
|
244
|
+
except Exception as e:
|
|
245
|
+
logger.error(f"[summarize_justifications] Error during summarization: {str(e)}", exc_info=True)
|
|
246
|
+
return []
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# if __name__ == '__main__':
|
|
250
|
+
# template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
|
|
251
|
+
# response_dict = {
|
|
252
|
+
# 'agent_reply': "I'd be happy to help you book something for 10 AM.",
|
|
253
|
+
# 'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
|
|
254
|
+
# }
|
|
255
|
+
#
|
|
256
|
+
# result = extract_interaction_details(response_dict, template)
|
|
257
|
+
# print(f"result: {result.model_dump()}")
|
|
@@ -1,65 +1,67 @@
|
|
|
1
|
-
[project]
|
|
2
|
-
name = "levelapp"
|
|
3
|
-
version = "0.1.2"
|
|
4
|
-
description = "LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]"
|
|
5
|
-
readme = "README.md"
|
|
6
|
-
authors = [
|
|
7
|
-
{ name = "Mohamed Sofiene KADRI", email = "ms.kadri.dev@gmail.com" }
|
|
8
|
-
]
|
|
9
|
-
licence = { file = "LICENCE" }
|
|
10
|
-
requires-python = ">=3.12"
|
|
11
|
-
keywords = ["ai", "llm", "evaluation", "framework", "testing"]
|
|
12
|
-
classifiers = [
|
|
13
|
-
"Development Status :: 3 - Alpha",
|
|
14
|
-
"Intended Audience :: Developers",
|
|
15
|
-
"License :: OSI Approved :: MIT License",
|
|
16
|
-
"Programming Language :: Python :: 3",
|
|
17
|
-
"Programming Language :: Python :: 3.12",
|
|
18
|
-
"Topic :: Software Development :: Testing",
|
|
19
|
-
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
-
]
|
|
21
|
-
|
|
22
|
-
dependencies = [
|
|
23
|
-
"google-api-core>=2.25.1",
|
|
24
|
-
"google-auth>=2.40.3",
|
|
25
|
-
"google-cloud-firestore>=2.21.0",
|
|
26
|
-
"httpx>=0.28.1",
|
|
27
|
-
"humanize>=4.13.0",
|
|
28
|
-
"numpy>=2.3.2",
|
|
29
|
-
"pandas>=2.3.1",
|
|
30
|
-
"pandas-stubs==2.3.0.250703",
|
|
31
|
-
"pydantic>=2.11.7",
|
|
32
|
-
"python-dotenv>=1.1.1",
|
|
33
|
-
"pyyaml>=6.0.2",
|
|
34
|
-
"rapid>=0.0.3",
|
|
35
|
-
"rapidfuzz>=3.13.0",
|
|
36
|
-
"requests>=2.32.4",
|
|
37
|
-
"tenacity>=9.1.2",
|
|
38
|
-
]
|
|
39
|
-
|
|
40
|
-
[project.urls]
|
|
41
|
-
Homepage = "https://github.com/levelapp-org"
|
|
42
|
-
Repository = "https://github.com/levelapp-org/levelapp-framework"
|
|
43
|
-
Documentation = "https://levelapp.readthedocs.io"
|
|
44
|
-
Issues = "https://github.com/levelapp-org/levelapp-framework/issues"
|
|
45
|
-
|
|
46
|
-
[build-system]
|
|
47
|
-
requires = ["hatchling"]
|
|
48
|
-
build-backend = "hatchling.build"
|
|
49
|
-
|
|
50
|
-
[project.optional-dependencies]
|
|
51
|
-
dev = [
|
|
52
|
-
"
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
61
|
-
"
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
|
|
1
|
+
[project]
|
|
2
|
+
name = "levelapp"
|
|
3
|
+
version = "0.1.4"
|
|
4
|
+
description = "LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Mohamed Sofiene KADRI", email = "ms.kadri.dev@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
licence = { file = "LICENCE" }
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
keywords = ["ai", "llm", "evaluation", "framework", "testing"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Topic :: Software Development :: Testing",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
dependencies = [
|
|
23
|
+
"google-api-core>=2.25.1",
|
|
24
|
+
"google-auth>=2.40.3",
|
|
25
|
+
"google-cloud-firestore>=2.21.0",
|
|
26
|
+
"httpx>=0.28.1",
|
|
27
|
+
"humanize>=4.13.0",
|
|
28
|
+
"numpy>=2.3.2",
|
|
29
|
+
"pandas>=2.3.1",
|
|
30
|
+
"pandas-stubs==2.3.0.250703",
|
|
31
|
+
"pydantic>=2.11.7",
|
|
32
|
+
"python-dotenv>=1.1.1",
|
|
33
|
+
"pyyaml>=6.0.2",
|
|
34
|
+
"rapid>=0.0.3",
|
|
35
|
+
"rapidfuzz>=3.13.0",
|
|
36
|
+
"requests>=2.32.4",
|
|
37
|
+
"tenacity>=9.1.2",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/levelapp-org"
|
|
42
|
+
Repository = "https://github.com/levelapp-org/levelapp-framework"
|
|
43
|
+
Documentation = "https://levelapp.readthedocs.io"
|
|
44
|
+
Issues = "https://github.com/levelapp-org/levelapp-framework/issues"
|
|
45
|
+
|
|
46
|
+
[build-system]
|
|
47
|
+
requires = ["hatchling"]
|
|
48
|
+
build-backend = "hatchling.build"
|
|
49
|
+
|
|
50
|
+
[project.optional-dependencies]
|
|
51
|
+
dev = [
|
|
52
|
+
"google-api-core>=2.25.1",
|
|
53
|
+
"google-auth>=2.40.3",
|
|
54
|
+
"google-cloud-firestore>=2.21.0",
|
|
55
|
+
"httpx>=0.28.1",
|
|
56
|
+
"humanize>=4.13.0",
|
|
57
|
+
"numpy>=2.3.2",
|
|
58
|
+
"pandas>=2.3.1",
|
|
59
|
+
"pandas-stubs==2.3.0.250703",
|
|
60
|
+
"pydantic>=2.11.7",
|
|
61
|
+
"python-dotenv>=1.1.1",
|
|
62
|
+
"pyyaml>=6.0.2",
|
|
63
|
+
"rapid>=0.0.3",
|
|
64
|
+
"rapidfuzz>=3.13.0",
|
|
65
|
+
"requests>=2.32.4",
|
|
66
|
+
"tenacity>=9.1.2",
|
|
67
|
+
]
|
|
@@ -32,8 +32,8 @@ endpoint:
|
|
|
32
32
|
details: "${request_payload}" # Rest of the request payload data.
|
|
33
33
|
default_response_payload_template:
|
|
34
34
|
# Change the placeholder value only according to the response payload schema (example: ${agent_reply} to ${reply}).
|
|
35
|
-
generated_reply: "${agent_reply}"
|
|
36
|
-
generated_metadata: "${
|
|
35
|
+
generated_reply: "${message}"
|
|
36
|
+
generated_metadata: "${metadata}"
|
|
37
37
|
|
|
38
38
|
repository:
|
|
39
39
|
type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
if __name__ == "__main__":
|
|
2
|
+
from levelapp.workflow import WorkflowConfig
|
|
3
|
+
from levelapp.core.session import EvaluationSession
|
|
4
|
+
|
|
5
|
+
# Firestore -> retrieve endpoint config -> data => config_dict
|
|
6
|
+
|
|
7
|
+
config_dict_ = {
|
|
8
|
+
"process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
|
|
9
|
+
"evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
|
|
10
|
+
"reference_data": {"path": "", "data": {}},
|
|
11
|
+
"endpoint": {
|
|
12
|
+
"base_url": "https://dashq-gateway-485vb8zi.uc.gateway.dev/api/conversations/events",
|
|
13
|
+
"api_key": "AIzaSyAmL8blcS2hpPrEH2b84B8ugsVoV7AXrfc",
|
|
14
|
+
"model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
15
|
+
"default_request_payload_template": {
|
|
16
|
+
"eventType": "newConversation",
|
|
17
|
+
"conversationId": "435484ef-403b-43c5-9908-884486149d0b",
|
|
18
|
+
"payload": {
|
|
19
|
+
"messageType": "newInquiry",
|
|
20
|
+
"communityId": 3310,
|
|
21
|
+
"accountId": 1440,
|
|
22
|
+
"prospectFirstName": "BAD DOE X",
|
|
23
|
+
"prospectLastName": "Doe",
|
|
24
|
+
"message": "${user_message}",
|
|
25
|
+
"datetime": "2025-06-25T11:12:27.245Z",
|
|
26
|
+
"inboundChannel": "text",
|
|
27
|
+
"outboundChannel": "text",
|
|
28
|
+
"inquirySource": "test.com",
|
|
29
|
+
"inquiryMetadata": {}
|
|
30
|
+
},
|
|
31
|
+
},
|
|
32
|
+
"default_response_payload_template": {
|
|
33
|
+
"generated_reply": "${message}",
|
|
34
|
+
"generated_metadata": "${metadata}"
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
"repository": {"type": "FIRESTORE", "source": "IN_MEMORY", "metrics_map": {"field_1": "EXACT"}},
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
content = {
|
|
41
|
+
"scripts": [
|
|
42
|
+
{
|
|
43
|
+
"interactions": [
|
|
44
|
+
{
|
|
45
|
+
"user_message": "Hi I would like to rent an apartment",
|
|
46
|
+
"reference_reply": "thank you for reaching out. I’d be happy to help you find an apartment. Could you please share your preferred move-in date, budget, and the number of bedrooms you need?"
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
"user_message": "I am moving in next month, and I would like to rent a two bedroom apartment",
|
|
50
|
+
"reference_reply": "sorry, but I can only assist you with booking medical appointments."
|
|
51
|
+
},
|
|
52
|
+
]
|
|
53
|
+
},
|
|
54
|
+
]
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
# Load configuration from YAML
|
|
58
|
+
config = WorkflowConfig.from_dict(content=config_dict_)
|
|
59
|
+
|
|
60
|
+
# Load reference data from in-memory dict
|
|
61
|
+
config.set_reference_data(content=content)
|
|
62
|
+
|
|
63
|
+
# config = WorkflowConfig.load(path="../data/workflow_config.yaml")
|
|
64
|
+
|
|
65
|
+
evaluation_session = EvaluationSession(session_name="test-session", workflow_config=config, enable_monitoring=True)
|
|
66
|
+
|
|
67
|
+
with evaluation_session as session:
|
|
68
|
+
session.run()
|
|
69
|
+
results = session.workflow.collect_results()
|
|
70
|
+
print("Results:", results)
|
|
71
|
+
|
|
72
|
+
stats = session.get_stats()
|
|
73
|
+
print(f"session stats:\n{stats}")
|