levelapp-0.1.0-py3-none-any.whl → levelapp-0.1.2-py3-none-any.whl
This diff shows the content of publicly available package versions as published to their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of levelapp might be problematic.
- levelapp/aspects/monitor.py +3 -1
- levelapp/clients/__init__.py +0 -1
- levelapp/comparator/scorer.py +0 -2
- levelapp/config/endpoint.py +22 -13
- levelapp/config/endpoint_.py +62 -0
- levelapp/config/prompts.py +22 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +97 -59
- levelapp/evaluator/evaluator.py +42 -14
- levelapp/metrics/__init__.py +1 -5
- levelapp/repository/firestore.py +15 -6
- levelapp/simulator/schemas.py +15 -21
- levelapp/simulator/simulator.py +124 -55
- levelapp/simulator/utils.py +40 -78
- levelapp/workflow/__init__.py +3 -2
- levelapp/workflow/base.py +64 -17
- levelapp/workflow/config.py +92 -0
- levelapp/workflow/context.py +62 -0
- levelapp/workflow/factory.py +32 -41
- levelapp/workflow/registration.py +1 -1
- levelapp/workflow/runtime.py +19 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/METADATA +102 -39
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/RECORD +25 -21
- levelapp/workflow/schemas.py +0 -121
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/WHEEL +0 -0
- {levelapp-0.1.0.dist-info → levelapp-0.1.2.dist-info}/licenses/LICENSE +0 -0
levelapp/repository/firestore.py
CHANGED
@@ -1,7 +1,5 @@
 """levelapp/repository/firestore.py"""
-import google.auth
-
-from typing import List, Dict, Any, Type
+from typing import List, Dict, Any, Type, TYPE_CHECKING
 from pydantic import ValidationError
 
 from google.cloud import firestore_v1
@@ -13,15 +11,25 @@ from levelapp.core.base import BaseRepository, Model
 from levelapp.aspects import logger
 
 
+if TYPE_CHECKING:
+    from levelapp.workflow.config import WorkflowConfig
+
+
 class FirestoreRepository(BaseRepository):
     """
     Firestore implementation of BaseRepository.
     (Uses hierarchical path: {user_id}/{collection_id}/{document_id}
     """
 
-    def __init__(self,
-
-
+    def __init__(self, config: "WorkflowConfig | None"):
+        if config:
+            self.config = config
+            self.project_id: str | Any = config.repository.project_id
+            self.database_name: str | Any = config.repository.database_name
+        else:
+            self.project_id: str | Any = None
+            self.database_name: str | Any = '(default)'
+
         self.client: firestore_v1.Client | None = None
 
     def connect(self) -> None:
@@ -29,6 +37,7 @@ class FirestoreRepository(BaseRepository):
         Connects to Firestore, prioritizing the project ID passed to the constructor.
         """
         try:
+            import google.auth
             credentials, default_project_id = google.auth.default()
 
             if not credentials:
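The constructor now reads its connection settings from a WorkflowConfig (imported only under TYPE_CHECKING to avoid a circular import) and falls back to project_id=None and the '(default)' database when no config is given. Below is a minimal sketch of the two paths, using a duck-typed stand-in namespace in place of the real WorkflowConfig; the project and database names are invented placeholders.

# Illustrative only: a stand-in for WorkflowConfig, not the package's own model.
from types import SimpleNamespace

from levelapp.repository.firestore import FirestoreRepository

config = SimpleNamespace(
    repository=SimpleNamespace(project_id="my-gcp-project", database_name="eval-db")
)

repo = FirestoreRepository(config=config)        # project_id / database_name taken from the config
default_repo = FirestoreRepository(config=None)  # project_id=None, database_name='(default)'

# repo.connect() would then resolve credentials via google.auth.default(),
# which requires Application Default Credentials to be configured in the environment.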
levelapp/simulator/schemas.py
CHANGED
@@ -8,8 +8,8 @@ from enum import Enum
 from uuid import UUID, uuid4
 from datetime import datetime
 
-from typing import
-from pydantic import BaseModel, Field, computed_field
+from typing import Dict, Any, List
+from pydantic import BaseModel, Field, computed_field, field_validator
 
 from levelapp.evaluator.evaluator import JudgeEvaluationResults
 
@@ -25,11 +25,11 @@ class Interaction(BaseModel):
     """Represents a single interaction within a conversation."""
     id: UUID = Field(default_factory=uuid4, description="Interaction identifier")
     user_message: str = Field(..., description="The user's query message")
-    generated_reply: str = Field(..., description="The agent's reply message")
+    # generated_reply: str = Field(..., description="The agent's reply message")
     reference_reply: str = Field(..., description="The preset reference message")
-    interaction_type: InteractionLevel = Field(
-    reference_metadata:
-    generated_metadata:
+    interaction_type: InteractionLevel = Field(default=InteractionLevel.INITIAL, description="Type of interaction")
+    reference_metadata: Dict[str, Any] = Field(default_factory=dict, description="Expected metadata")
+    # generated_metadata: Dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
     guardrail_flag: bool = Field(default=False, description="Flag for guardrail signaling")
     request_payload: Dict[str, Any] = Field(default_factory=dict, description="Additional request payload")
 
@@ -38,7 +38,7 @@ class ConversationScript(BaseModel):
     """Represents a basic conversation with multiple interactions."""
     id: UUID = Field(default_factory=uuid4, description="Conversation identifier")
     interactions: List[Interaction] = Field(default_factory=list, description="List of interactions")
-    description: str = Field(
+    description: str = Field(default="no-description", description="A short description of the conversation")
     details: Dict[str, str] = Field(default_factory=dict, description="Conversation details")
 
 
@@ -58,32 +58,26 @@ class InteractionResults(BaseModel):
 
 class InteractionEvaluationResults(BaseModel):
     """Model representing the evaluation result of an interaction."""
-    judge_evaluations: Dict[str, JudgeEvaluationResults] = Field(default_factory=dict)
-    metadata_evaluation: Dict[str, float] = Field(default_factory=dict)
+    judge_evaluations: Dict[str, JudgeEvaluationResults] | None = Field(default_factory=dict)
+    metadata_evaluation: Dict[str, float] | None = Field(default_factory=dict)
     guardrail_flag: int = Field(default=0)
 
 
 class SimulationResults(BaseModel):
-    # Initial data
-    project_id: str = Field(default_factory=uuid4, description="Project identifier")
-    user_id: str = Field(default_factory=uuid4, description="User identifier")
-    batch_id: str = Field(default_factory=uuid4, description="Batch identifier")
     # Collected data
     started_at: datetime = datetime.now()
     finished_at: datetime
     # Collected Results
     evaluation_summary: Dict[str, Any] | None = Field(default_factory=dict, description="Evaluation result")
     average_scores: Dict[str, Any] | None = Field(default_factory=dict, description="Average scores")
+    interaction_results: List[Dict[str, Any]] | None = Field(default_factory=list, description="detailed results")
+
+    @computed_field
+    @property
+    def batch_id(self) -> str:
+        return str(uuid4())
 
     @computed_field
     @property
     def elapsed_time(self) -> float:
         return (self.finished_at - self.started_at).total_seconds()
-
-
-class TestResults(BaseModel):
-    api_host: str = Field(..., alias="apiHost")
-    ionos_model_name: str = Field(..., alias="ionosModelName")
-    test_name: str = Field(..., alias="testName")
-    test_type: str = Field(..., alias="testType")
-    batch_details: Optional[SimulationResults] = Field(..., alias="results")
levelapp/simulator/simulator.py
CHANGED
@@ -1,6 +1,7 @@
 """
 'simulators/service.py': Service layer to manage conversation simulation and evaluation.
 """
+import json
 import time
 import asyncio
 
@@ -23,7 +24,7 @@ from levelapp.simulator.utils import (
     summarize_verdicts,
 )
 from levelapp.aspects import logger
-from levelapp.
+from levelapp.core.schemas import EvaluatorType
 
 
 class ConversationSimulator(BaseProcess):
@@ -33,6 +34,7 @@ class ConversationSimulator(BaseProcess):
         self,
         repository: BaseRepository | None = None,
         evaluators: Dict[EvaluatorType, BaseEvaluator] | None = None,
+        providers: List[str] | None = None,
         endpoint_config: EndpointConfig | None = None,
     ):
         """
@@ -47,6 +49,7 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
         self.endpoint_config = endpoint_config
 
         self._url: str | None = None
@@ -60,7 +63,8 @@ class ConversationSimulator(BaseProcess):
     def setup(
         self,
         repository: BaseRepository,
-        evaluators: Dict[
+        evaluators: Dict[EvaluatorType, BaseEvaluator],
+        providers: List[str],
         endpoint_config: EndpointConfig,
     ) -> None:
         """
@@ -69,6 +73,7 @@ class ConversationSimulator(BaseProcess):
         Args:
             repository (BaseRepository): Repository object for storing simulation results.
             evaluators (Dict[str, BaseEvaluator]): List of evaluator objects for evaluating interactions.
+            providers (List[str]): List of LLM provider names.
             endpoint_config (EndpointConfig): Configuration object for VLA.
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.setup.__name__}]"
@@ -76,6 +81,11 @@ class ConversationSimulator(BaseProcess):
 
         self.repository = repository
         self.evaluators = evaluators
+        self.providers = providers
+
+        if not self.providers:
+            logger.warning(f"{_LOG} No LLM providers were provided. The Judge Evaluation process will not be executed.")
+
         self.endpoint_config = endpoint_config
 
         self._url = endpoint_config.full_url
@@ -83,17 +93,27 @@ class ConversationSimulator(BaseProcess):
         self._headers = endpoint_config.headers
 
     def get_evaluator(self, name: EvaluatorType) -> BaseEvaluator:
+        """
+        Retrieve an evaluator by name.
+
+        Args:
+            name (EvaluatorType): Name of evaluator.
+
+        Returns:
+            An evaluator object.
+        """
         _LOG: str = f"[{self._CLASS_NAME}][{self.get_evaluator.__name__}]"
 
         if name not in self.evaluators:
            raise KeyError(f"{_LOG} Evaluator {name} not registered.")
+
         return self.evaluators[name]
 
     async def run(
         self,
         test_batch: ScriptsBatch,
         attempts: int = 1,
-    ) ->
+    ) -> Any:
         """
         Run a batch test for the given batch name and details.
 
@@ -119,9 +139,10 @@ class ConversationSimulator(BaseProcess):
             finished_at=finished_at,
             evaluation_summary=self.verdict_summaries,
             average_scores=results.get("average_scores", {}),
+            interaction_results=results.get("results")
         )
 
-        return
+        return results.model_dump_json(indent=2)
 
     async def simulate_conversation(self, attempts: int = 1) -> Dict[str, Any]:
         """
@@ -161,10 +182,11 @@ class ConversationSimulator(BaseProcess):
                 verdicts=verdicts, judge=judge
             )
 
-        return {"
+        return {"results": results, "average_scores": overall_average_scores}
 
     async def simulate_single_scenario(
-        self, script: ConversationScript,
+        self, script: ConversationScript,
+        attempts: int = 1
     ) -> Dict[str, Any]:
         """
         Simulate a single scenario with the given number of attempts, concurrently.
@@ -183,19 +205,18 @@ class ConversationSimulator(BaseProcess):
         all_attempts_verdicts: Dict[str, List[str]] = defaultdict(list)
 
         async def simulate_attempt(attempt_number: int) -> Dict[str, Any]:
-            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}")
+            logger.info(f"{_LOG} Running attempt: {attempt_number + 1}/{attempts}\n---")
             start_time = time.time()
 
             collected_scores: Dict[str, List[Any]] = defaultdict(list)
             collected_verdicts: Dict[str, List[str]] = defaultdict(list)
 
-
+            interaction_results = await self.simulate_interactions(
                 script=script,
                 evaluation_verdicts=collected_verdicts,
                 collected_scores=collected_scores,
             )
 
-            logger.info(f"{_LOG} collected_scores: {collected_scores}\n---")
             single_attempt_scores = calculate_average_scores(collected_scores)
 
             for target, scores in single_attempt_scores.items():
@@ -215,7 +236,7 @@ class ConversationSimulator(BaseProcess):
                 "attempt": attempt_number + 1,
                 "script_id": script.id,
                 "total_duration": elapsed_time,
-                "interaction_results":
+                "interaction_results": interaction_results,
                 "evaluation_verdicts": collected_verdicts,
                 "average_scores": single_attempt_scores,
             }
@@ -228,10 +249,6 @@ class ConversationSimulator(BaseProcess):
         for judge_, verdicts_ in all_attempts_verdicts.items():
             self.evaluation_verdicts[judge_].extend(verdicts_)
 
-        logger.info(
-            f"{_LOG} average scores:\n{average_scores}\n---"
-        )
-
         return {
             "script_id": script.id,
             "attempts": attempt_results,
@@ -314,8 +331,6 @@ class ConversationSimulator(BaseProcess):
             reference_guardrail=reference_guardrail_flag,
         )
 
-        logger.info(f"{_LOG} Evaluation results:\n{evaluation_results.model_dump()}\n")
-
         self.store_evaluation_results(
             results=evaluation_results,
             evaluation_verdicts=evaluation_verdicts,
@@ -323,9 +338,7 @@ class ConversationSimulator(BaseProcess):
         )
 
         elapsed_time = time.time() - start_time
-        logger.info(
-            f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---"
-        )
+        logger.info(f"{_LOG} Interaction simulation complete in {elapsed_time:.2f} seconds.\n---")
 
         result = {
             "user_message": user_message,
@@ -368,50 +381,106 @@ class ConversationSimulator(BaseProcess):
         """
         _LOG: str = f"[{self._CLASS_NAME}][{self.evaluate_interaction.__name__}]"
 
-        judge_evaluator = self.evaluators.get(EvaluatorType.JUDGE)
-        metadata_evaluator = self.evaluators.get(EvaluatorType.REFERENCE)
+        judge_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.JUDGE, None)
+        metadata_evaluator: BaseEvaluator | None = self.evaluators.get(EvaluatorType.REFERENCE, None)
 
-
-            raise ValueError(f"{_LOG} No Judge Evaluator found.")
+        evaluation_results = InteractionEvaluationResults()
 
-
-
-
-
-
-
+        if judge_evaluator and self.providers:
+            await self._judge_evaluation(
+                user_input=user_input,
+                generated_reply=generated_reply,
+                reference_reply=reference_reply,
+                providers=self.providers,
+                judge_evaluator=judge_evaluator,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Judge evaluation skipped (no evaluator or no providers).")
 
-
-
-
-
-
-
+        if metadata_evaluator and reference_metadata:
+            self._metadata_evaluation(
+                metadata_evaluator=metadata_evaluator,
+                generated_metadata=generated_metadata,
+                reference_metadata=reference_metadata,
+                evaluation_results=evaluation_results,
+            )
+        else:
+            logger.info(f"[{_LOG}] Metadata evaluation skipped (no evaluator or no reference metadata).")
 
-
-            openai_eval_task, ionos_eval_task
-        )
+        evaluation_results.guardrail_flag = 1 if generated_guardrail == reference_guardrail else 0
 
-
-            raise ValueError(f"{_LOG} No Metadata Evaluator found.")
+        return evaluation_results
 
-
-
-
-
-
+    async def _judge_evaluation(
+        self,
+        user_input: str,
+        generated_reply: str,
+        reference_reply: str,
+        providers: List[str],
+        judge_evaluator: BaseEvaluator,
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run LLM-as-a-judge evaluation using multiple providers (async).
+
+        Args:
+            user_input (str): The user input message.
+            generated_reply (str): The generated agent reply.
+            reference_reply (str): The reference agent reply.
+            providers (List[str]): List of judge provider names.
+            judge_evaluator (BaseEvaluator): Evaluator instance.
+            evaluation_results (InteractionEvaluationResults): Results container (Pydantic model).
+
+        Returns:
+            None
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][judge_evaluation]"
+
+        tasks = {
+            provider: judge_evaluator.async_evaluate(
+                generated_data=generated_reply,
+                reference_data=reference_reply,
+                user_input=user_input,
+                provider=provider,
            )
+            for provider in providers
+        }
 
-
+        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
 
-
-
-
-
-
-
-
+        for provider, result in zip(tasks.keys(), results):
+            if isinstance(result, Exception):
+                logger.error(f"{_LOG} Provider '{provider}' failed to perform Judge Evaluation.")
+                continue
+
+            evaluation_results.judge_evaluations[provider] = result
+
+    def _metadata_evaluation(
+        self,
+        metadata_evaluator: BaseEvaluator,
+        generated_metadata: Dict[str, Any],
+        reference_metadata: Dict[str, Any],
+        evaluation_results: InteractionEvaluationResults,
+    ) -> None:
+        """
+        Run metadata evaluation using the provided evaluator.
+
+        Args:
+            metadata_evaluator (BaseEvaluator): Evaluator for metadata comparison.
+            generated_metadata (Dict[str, Any]): The generated metadata.
+            reference_metadata (Dict[str, Any]): The reference metadata.
+            evaluation_results (InteractionEvaluationResults): Results container.
+        """
+        _LOG: str = f"[{self._CLASS_NAME}][metadata_evaluation]"
+
+        try:
+            evaluation_results.metadata_evaluation = metadata_evaluator.evaluate(
+                generated_data=generated_metadata,
+                reference_data=reference_metadata,
+            )
+        except Exception as e:
+            logger.error(f"[{_LOG}] Metadata evaluation failed:\n{e}", exc_info=e)
 
     @staticmethod
     def store_evaluation_results(
@@ -428,7 +497,7 @@ class ConversationSimulator(BaseProcess):
             collected_scores (Dict[str, List[Any]]): The collected scores.
         """
         for provider in results.judge_evaluations.keys():
-            evaluation_verdicts[f"{provider}
+            evaluation_verdicts[f"{provider}"].append(
                 results.judge_evaluations.get(provider, "").justification
            )
 
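The rewritten evaluation path fans out one judge call per provider, awaits them with asyncio.gather(..., return_exceptions=True), and drops providers whose call raised instead of failing the whole interaction. Here is a stand-alone sketch of that pattern, using a dummy evaluate() coroutine in place of BaseEvaluator.async_evaluate; the provider names are placeholders.

# Sketch of the per-provider fan-out with graceful failure handling (illustrative only).
import asyncio


async def evaluate(provider: str) -> str:
    # Dummy judge call: one provider is made to fail to show the error path.
    if provider == "flaky":
        raise RuntimeError("provider unavailable")
    return f"verdict from {provider}"


async def judge_all(providers: list[str]) -> dict[str, str]:
    # One coroutine per provider, keyed so results can be matched back to their provider.
    tasks = {provider: evaluate(provider) for provider in providers}
    results = await asyncio.gather(*tasks.values(), return_exceptions=True)

    verdicts: dict[str, str] = {}
    for provider, result in zip(tasks.keys(), results):
        if isinstance(result, Exception):
            # A failing judge is skipped; the remaining verdicts are still collected.
            continue
        verdicts[provider] = result
    return verdicts


print(asyncio.run(judge_all(["openai", "flaky", "ionos"])))
# {'openai': 'verdict from openai', 'ionos': 'verdict from ionos'}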
levelapp/simulator/utils.py
CHANGED
@@ -1,17 +1,18 @@
 """
 'simulators/aspects.py': Utility functions for handling VLA interactions and requests.
 """
+import ast
 import json
-
 import httpx
-import arrow
 
 from uuid import UUID
-from
+from string import Template
+from typing import Any, Dict, List, Union
 
-from openai import OpenAI
 from pydantic import ValidationError
 
+from levelapp.clients import ClientRegistry
+from levelapp.config.prompts import SUMMARIZATION_PROMPT_TEMPLATE
 from levelapp.simulator.schemas import InteractionResults
 from levelapp.aspects import MonitoringAspect, MetricType, logger
 
@@ -48,7 +49,14 @@ def extract_interaction_details(
         missing_keys = required_keys - response_dict.keys()
         logger.warning(f"[extract_interaction_details] Missing data: {missing_keys}]")
 
-
+        output = {}
+        for k, v in template.items():
+            output[k] = Template(v).safe_substitute(response_dict)
+
+        raw_value = output.get("generated_metadata", {})
+        output["generated_metadata"] = ast.literal_eval(raw_value) if isinstance(raw_value, str) else raw_value
+
+        return InteractionResults.model_validate(output)
 
     except json.JSONDecodeError as e:
         logger.error(f"[extract_interaction_details] Failed to extract details:\n{e}")
@@ -64,7 +72,7 @@ async def async_interaction_request(
     url: str,
     headers: Dict[str, str],
     payload: Dict[str, Any],
-) ->
+) -> httpx.Response | None:
     """
     Perform an asynchronous interaction request.
 
@@ -74,7 +82,7 @@ async def async_interaction_request(
         payload (Dict[str, Any]): The payload to send in the request.
 
     Returns:
-
+        httpx.Response: The response from the interaction request, or None if an error occurred.
     """
     try:
         async with httpx.AsyncClient(timeout=180) as client:
@@ -92,42 +100,6 @@ async def async_interaction_request(
         return None
 
 
-def parse_date_value(raw_date_value: Optional[str], default_date_value: Optional[str] = "") -> str:
-    """
-    Cleans and parses a dehumanized relative date string to ISO format.
-
-    Args:
-        raw_date_value (Optional[str]): The raw date value to parse.
-        default_date_value (Optional[str]): The default value to return if parsing fails. Defaults to an empty string.
-
-    Returns:
-        str: The parsed date in ISO format, or the default value if parsing fails.
-    """
-    if not raw_date_value:
-        logger.info(f"[parse_date_value] No raw value provided. returning default: '{default_date_value}'")
-        return default_date_value
-
-    clean = raw_date_value.replace("{{", "").replace("}}", "").replace("_", " ").strip()
-    clean += 's' if not clean.endswith('s') else clean
-
-    try:
-        arw = arrow.utcnow()
-        parsed_date = arw.dehumanize(clean).utcnow().format('YYYY-MM-DD')
-        return parsed_date
-
-    except arrow.parser.ParserError as e:
-        logger.error(f"[parse_date_value] Failed to parse date: '{clean}'\nParserError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except ValueError as e:
-        logger.error(f"[parse_date_value] Invalid date value: '{clean}'\nValueError: {str(e)}", exc_info=True)
-        return default_date_value
-
-    except Exception as e:
-        logger.error(f"[parse_date_value] Unexpected error.\nException: {str(e)}", exc_info=True)
-        return default_date_value
-
-
 @MonitoringAspect.monitor(
     name="average_calc",
     category=MetricType.SCORING,
@@ -157,45 +129,35 @@ def calculate_average_scores(scores: Dict[str, Union[List[float], float]]) -> Di
 
 
 @MonitoringAspect.monitor(name="summarization", category=MetricType.API_CALL)
-def summarize_verdicts(
-
-
-
-
-
-
-    max_bullets (int): The maximum number of bullets allowed per judge.
-
-    Returns:
-        List[str]: The summarized justifications.
-    """
-    if not verdicts:
-        return []
-
-    prompt = f"""
-    You are reviewing evaluation justifications from LL judges about replies generated by a virtual leasing agent.\n
-    Each justification contains the judge's assessment of how well the agent's response matched the expected reply.\n
-    Your task is to identify and summarize only the **negative points**, such as errors, misunderstandings,
-    missing information, or failure to meet expectations.\n
-    Return up to {max_bullets} bullet points. Be concise and start each point with '- '\n\n
-    ---
-    - Judge: {judge}
-    - Justifications:\n{chr(10).join(verdicts)}\n
-    """
-
-    client = OpenAI()
+def summarize_verdicts(
+    verdicts: List[str],
+    judge: str,
+    max_bullets: int = 5
+) -> List[str]:
+    client_registry = ClientRegistry()
+    client = client_registry.get(provider=judge)
 
     try:
-
-
-
-
-    ).
+        verdicts = chr(10).join(verdicts)
+        prompt = SUMMARIZATION_PROMPT_TEMPLATE.format(max_bullets=max_bullets, judge=judge, verdicts=verdicts)
+        response = client.call(message=prompt)
+        parsed = client.parse_response(response=response)
+        striped = parsed.get("output", "").strip("")
+        bullet_points = [point.strip() for point in striped.split("- ") if point.strip()]
 
-        bullet_points
-
-        return bullet_points
+        return bullet_points[:max_bullets]
 
     except Exception as e:
         logger.error(f"[summarize_justifications] Error during summarization: {str(e)}", exc_info=True)
         return []
+
+
+# if __name__ == '__main__':
+#     template = {'generated_reply': '${agent_reply}', 'generated_metadata': '${generated_metadata}'}
+#     response_dict = {
#         'agent_reply': "I'd be happy to help you book something for 10 AM.",
#         'generated_metadata': {'appointment_type': 'Cardiology', 'date': 'next Monday', 'time': '10 AM'}
#     }
#
#     result = extract_interaction_details(response_dict, template)
#     print(f"result: {result.model_dump()}")
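The new extract_interaction_details logic builds its output by substituting ${...} placeholders from the raw response into a caller-supplied template via string.Template.safe_substitute, then re-parses the stringified metadata with ast.literal_eval. The commented-out __main__ block above hints at the intended shape; here is a runnable sketch of just that substitution step (the real function additionally validates the result into InteractionResults).

# Stand-alone sketch of the template-substitution step (illustrative values).
import ast
from string import Template

template = {"generated_reply": "${agent_reply}", "generated_metadata": "${generated_metadata}"}
response_dict = {
    "agent_reply": "I'd be happy to help you book something for 10 AM.",
    "generated_metadata": {"appointment_type": "Cardiology", "time": "10 AM"},
}

output = {k: Template(v).safe_substitute(response_dict) for k, v in template.items()}

# safe_substitute stringifies the metadata dict, so it is parsed back into a Python object here.
raw = output.get("generated_metadata", {})
output["generated_metadata"] = ast.literal_eval(raw) if isinstance(raw, str) else raw

print(output)
# {'generated_reply': "I'd be happy to help you book something for 10 AM.",
#  'generated_metadata': {'appointment_type': 'Cardiology', 'time': '10 AM'}}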
levelapp/workflow/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 from . import registration
-from .
+from .base import BaseWorkflow
+from .config import WorkflowConfig
 from .factory import MainFactory
 
-__all__ = ["
+__all__ = ["BaseWorkflow", "WorkflowConfig", "MainFactory"]