veadk-python 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veadk/a2a/remote_ve_agent.py +56 -1
- veadk/agent.py +79 -26
- veadk/agents/loop_agent.py +22 -9
- veadk/agents/parallel_agent.py +21 -9
- veadk/agents/sequential_agent.py +18 -9
- veadk/auth/veauth/apmplus_veauth.py +32 -39
- veadk/auth/veauth/ark_veauth.py +3 -1
- veadk/auth/veauth/utils.py +12 -0
- veadk/auth/veauth/viking_mem0_veauth.py +91 -0
- veadk/cli/cli.py +5 -1
- veadk/cli/cli_create.py +62 -1
- veadk/cli/cli_deploy.py +36 -1
- veadk/cli/cli_eval.py +55 -0
- veadk/cli/cli_init.py +44 -3
- veadk/cli/cli_kb.py +36 -1
- veadk/cli/cli_pipeline.py +66 -1
- veadk/cli/cli_prompt.py +16 -1
- veadk/cli/cli_uploadevalset.py +15 -1
- veadk/cli/cli_web.py +35 -4
- veadk/cloud/cloud_agent_engine.py +142 -25
- veadk/cloud/cloud_app.py +219 -12
- veadk/configs/database_configs.py +4 -0
- veadk/configs/model_configs.py +5 -1
- veadk/configs/tracing_configs.py +2 -2
- veadk/evaluation/adk_evaluator/adk_evaluator.py +77 -17
- veadk/evaluation/base_evaluator.py +219 -3
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +116 -1
- veadk/evaluation/eval_set_file_loader.py +20 -0
- veadk/evaluation/eval_set_recorder.py +54 -0
- veadk/evaluation/types.py +32 -0
- veadk/evaluation/utils/prometheus.py +61 -0
- veadk/knowledgebase/backends/base_backend.py +14 -1
- veadk/knowledgebase/backends/in_memory_backend.py +10 -1
- veadk/knowledgebase/backends/opensearch_backend.py +26 -0
- veadk/knowledgebase/backends/redis_backend.py +29 -2
- veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +43 -5
- veadk/knowledgebase/knowledgebase.py +173 -12
- veadk/memory/long_term_memory.py +148 -4
- veadk/memory/long_term_memory_backends/mem0_backend.py +11 -0
- veadk/memory/short_term_memory.py +119 -5
- veadk/runner.py +412 -1
- veadk/tools/builtin_tools/llm_shield.py +381 -0
- veadk/tools/builtin_tools/mcp_router.py +9 -2
- veadk/tools/builtin_tools/run_code.py +25 -5
- veadk/tools/builtin_tools/web_search.py +38 -154
- veadk/tracing/base_tracer.py +28 -1
- veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +105 -1
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +260 -0
- veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +69 -0
- veadk/tracing/telemetry/attributes/extractors/types.py +78 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +157 -0
- veadk/tracing/telemetry/exporters/base_exporter.py +8 -0
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +60 -1
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +118 -1
- veadk/tracing/telemetry/exporters/tls_exporter.py +66 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +111 -1
- veadk/tracing/telemetry/telemetry.py +118 -2
- veadk/version.py +1 -1
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/METADATA +1 -1
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/RECORD +64 -62
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/WHEEL +0 -0
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/entry_points.txt +0 -0
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {veadk_python-0.2.16.dist-info → veadk_python-0.2.17.dist-info}/top_level.txt +0 -0

veadk/evaluation/base_evaluator.py CHANGED

@@ -29,12 +29,44 @@ from veadk.utils.misc import formatted_timestamp
 
 
 class ToolInvocation(BaseModel):
+    """Represents a single tool invocation in agent execution.
+
+    This model holds tool name, arguments, and result.
+    Used in tracking tool usage during evaluation.
+
+    Attributes:
+        tool_name (str): Name of the tool called.
+        tool_args (dict[str, Any]): Arguments passed to the tool. Defaults to empty dict.
+        tool_result (Any): Result from tool execution. Defaults to None.
+
+    Note:
+        Flexible for various tool types and results.
+    """
+
     tool_name: str
     tool_args: dict[str, Any] = {}
     tool_result: Any = None
 
 
 class Invocation(BaseModel):
+    """Models a single invocation in the evaluation process.
+
+    This class stores input, expected and actual outputs, tools, and latency.
+    Essential for comparing agent behavior.
+
+    Attributes:
+        invocation_id (str): Unique ID for the invocation. Defaults to empty.
+        input (str): User input prompt.
+        actual_output (str): Actual response from agent.
+        expected_output (str): Expected response.
+        actual_tool (list[dict]): List of actual tools called with details.
+        expected_tool (list[dict]): List of expected tools.
+        latency (str): Execution time in ms. Defaults to empty.
+
+    Note:
+        Tools are dicts with 'name' and 'args'.
+    """
+
     invocation_id: str = ""
     input: str
     actual_output: str
@@ -45,10 +77,37 @@ class Invocation(BaseModel):
 
 
 class EvalTestCase(BaseModel):
+    """Groups invocations for a single test case.
+
+    This model contains a list of invocations for one evaluation scenario.
+    Used to structure test data.
+
+    Attributes:
+        invocations (list[Invocation]): List of invocation objects in the case.
+
+    Note:
+        Each case corresponds to one session or conversation.
+    """
+
     invocations: list[Invocation]
 
 
 class MetricResult(BaseModel):
+    """Stores result of a single metric evaluation.
+
+    This model holds the outcome of one metric application.
+    Includes success, score, and reason.
+
+    Attributes:
+        metric_type (str): Type or name of the metric.
+        success (bool): If the metric passed.
+        score (float): Numerical score from evaluation.
+        reason (str): Explanation for the score.
+
+    Note:
+        Score is float between 0 and 1 typically.
+    """
+
     metric_type: str
     success: bool
     score: float
@@ -56,32 +115,102 @@ class MetricResult(BaseModel):
 
 
 class EvalResultData(BaseModel):
+    """Aggregates metric results for an evaluation.
+
+    This class collects multiple metric results and computes averages.
+    Used for overall case scoring.
+
+    Attributes:
+        metric_results (list[MetricResult]): List of individual metric outcomes.
+        average_score (float): Computed average score. Defaults to 0.0.
+        total_reason (str): Combined reasons. Defaults to empty.
+
+    Note:
+        Call call_before_append to compute averages and reasons.
+    """
+
     metric_results: list[MetricResult]
     average_score: float = 0.0
     total_reason: str = ""
 
     def calculate_average_score(self):
+        """Calculates the average score from metric results.
+
+        This method sums scores and divides by count.
+        Updates average_score attribute.
+
+        Returns:
+            None: Updates internal state.
+
+        Raises:
+            ZeroDivisionError: If no metrics.
+        """
         total_score = sum(result.score for result in self.metric_results)
         self.average_score = (
             total_score / len(self.metric_results) if self.metric_results else 0.0
         )
 
     def generate_total_reason(self):
+        """Generates a combined reason string from all metrics.
+
+        This method joins reasons with metric types.
+        Updates total_reason attribute.
+
+        Returns:
+            None: Updates internal state.
+
+        Note:
+            Format: 'metric_type: reason'
+        """
         self.total_reason = "\n".join(
             f"{result.metric_type:}:{result.reason}" for result in self.metric_results
         )
 
     def call_before_append(self):
+        """Computes average score and total reason before adding to list.
+
+        This method calls calculate_average_score and generate_total_reason.
+        Ensures data is ready for storage.
+
+        Returns:
+            None: Updates internal state.
+        """
        self.calculate_average_score()
        self.generate_total_reason()
 
 
 class BaseEvaluator:
+    """Base class for all evaluators in the system.
+
+    This abstract class provides common functionality for evaluation.
+    Handles building eval sets, generating outputs, and abstract evaluate.
+
+    Attributes:
+        name (str): Name of the evaluator.
+        agent: The agent being evaluated.
+        invocation_list (list[EvalTestCase]): List of test cases.
+        result_list (list[EvalResultData]): List of evaluation results.
+        agent_information_list (list[dict]): List of agent config info.
+
+    Note:
+        Subclasses must implement evaluate method.
+        Supports JSON and tracing formats for input.
+    """
+
     def __init__(
         self,
         agent,
         name: str,
     ):
+        """Initializes the base evaluator with agent and name.
+
+        Args:
+            agent: Agent instance to evaluate.
+            name (str): Identifier for the evaluator.
+
+        Raises:
+            ValueError: If agent or name invalid.
+        """
         self.name = name
         self.agent = agent
         self.invocation_list: list[EvalTestCase] = []
@@ -89,11 +218,41 @@ class BaseEvaluator:
         self.agent_information_list: list[dict] = []
 
     def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
+        """Builds eval set from standard eval JSON file.
+
+        This private method loads using file loader.
+
+        Args:
+            eval_json_path (str): Path to JSON file.
+
+        Returns:
+            EvalSet: Loaded set.
+
+        Raises:
+            ValueError: If loading fails.
+        """
         from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file
 
         return load_eval_set_from_file(eval_json_path)
 
     def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
+        """Builds eval set from tracing JSON spans.
+
+        This private method parses spans, groups by trace, extracts tools and conversation.
+
+        Args:
+            tracing_json_path (str): Path to tracing JSON.
+
+        Returns:
+            EvalSet: Constructed set from traces.
+
+        Raises:
+            ValueError: If JSON invalid or parsing fails.
+            json.JSONDecodeError: For malformed JSON.
+
+        Note:
+            Assumes spans have gen_ai attributes for tools and content.
+        """
         try:
             with open(tracing_json_path, "r") as f:
                 tracing_data = json.load(f)
@@ -213,7 +372,21 @@ class BaseEvaluator:
     def build_eval_set(
         self, eval_set: Optional[EvalSet] = None, file_path: Optional[str] = None
     ):
-        """
+        """Builds invocation list from eval set or file.
+
+        This method parses input, extracts invocations with expected data.
+        Supports eval JSON and tracing JSON formats.
+
+        Args:
+            eval_set (Optional[EvalSet]): Direct eval set object.
+            file_path (Optional[str]): Path to file for loading.
+
+        Raises:
+            ValueError: If neither provided or format unsupported.
+
+        Note:
+            Generates random session IDs for isolation.
+        """
 
         if eval_set is None and file_path is None:
             raise ValueError("eval_set or file_path is required")
@@ -294,6 +467,21 @@ class BaseEvaluator:
         self.invocation_list = eval_case_data_list
 
     async def generate_actual_outputs(self):
+        """Generates actual outputs by running the agent on inputs.
+
+        This method uses Runner to execute agent for each invocation.
+        Captures outputs, tools, and latency.
+
+        Returns:
+            None: Updates invocation actual fields.
+
+        Raises:
+            Exception: If runner or execution fails.
+
+        Note:
+            Uses InMemorySessionService for isolation.
+            Supports long-term memory if present.
+        """
         for eval_case_data, agent_information in zip(
             self.invocation_list, self.agent_information_list
         ):
@@ -366,7 +554,17 @@ class BaseEvaluator:
                 invocation.latency = _latency
 
     def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
-        """
+        """Retrieves combined evaluation information.
+
+        This method merges invocations and results into dict lists.
+        Useful for reporting.
+
+        Returns:
+            list[list[dict[str, Any]]]: Nested list of case data dicts.
+
+        Note:
+            Defaults to empty results if not evaluated yet.
+        """
         result = []
         for i, eval_case in enumerate(self.invocation_list):
             case_data = []
@@ -399,5 +597,23 @@ class BaseEvaluator:
         eval_set_file_path: Optional[str],
         eval_id: str,
     ):
-        """
+        """Abstract method for performing the evaluation.
+
+        Subclasses implement specific metric evaluation logic.
+
+        Args:
+            metrics (list[Any]): Metrics to apply.
+            eval_set (Optional[EvalSet]): Eval set.
+            eval_set_file_path (Optional[str]): File path.
+            eval_id (str): Evaluation ID.
+
+        Returns:
+            Any: Evaluation results specific to subclass.
+
+        Raises:
+            NotImplementedError: If not overridden.
+
+        Note:
+            Must populate result_list after evaluation.
+        """
         pass
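
The aggregation added in these hunks is small enough to exercise directly. Below is a minimal sketch, not part of the diff, assuming the classes live in veadk/evaluation/base_evaluator.py (as the changed-file list above suggests) and that MetricResult also declares the `reason` field listed in its docstring.

```python
# Hedged sketch; import path assumed from the changed-file list above.
from veadk.evaluation.base_evaluator import EvalResultData, MetricResult

result_data = EvalResultData(
    metric_results=[
        MetricResult(metric_type="GEval", success=True, score=0.9,
                     reason="answer matches reference"),
        MetricResult(metric_type="ToolCorrectness", success=False, score=0.5,
                     reason="missing tool call"),
    ]
)

# call_before_append() runs calculate_average_score() and generate_total_reason().
result_data.call_before_append()
print(result_data.average_score)  # 0.7
print(result_data.total_reason)   # "GEval:answer matches reference\nToolCorrectness:missing tool call"
```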

veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py CHANGED

@@ -37,11 +37,44 @@ logger = get_logger(__name__)
 
 
 def formatted_timestamp():
+    """Generates a formatted timestamp string in YYYYMMDDHHMMSS format.
+
+    This function creates a string representation of the current time.
+    It uses local time for formatting.
+
+    Returns:
+        str: Timestamp string like '20251028123045'.
+    """
     # YYYYMMDDHHMMSS
     return time.strftime("%Y%m%d%H%M%S", time.localtime())
 
 
 class DeepevalEvaluator(BaseEvaluator):
+    """Evaluates agents using DeepEval metrics with Prometheus export.
+
+    This class uses DeepEval to test agent performance.
+    It runs agents on test cases and scores them.
+    Results can be sent to Prometheus for monitoring.
+
+    Attributes:
+        judge_model_name (str): Name of the model that judges the agent.
+        judge_model (LocalModel): The judge model instance.
+        prometheus_config (PrometheusPushgatewayConfig | None): Settings for
+            Prometheus export. If None, no export happens.
+
+    Note:
+        Needs judge model credentials from environment if not given.
+        Turns off cache to get fresh results each time.
+
+    Examples:
+        ```python
+        agent = Agent(tools=[get_city_weather])
+        evaluator = DeepevalEvaluator(agent=agent)
+        metrics = [GEval(threshold=0.8)]
+        results = await evaluator.evaluate(metrics, eval_set_file_path="test.json")
+        ```
+    """
+
     def __init__(
         self,
         agent,
@@ -51,6 +84,32 @@ class DeepevalEvaluator(BaseEvaluator):
         name: str = "veadk_deepeval_evaluator",
         prometheus_config: PrometheusPushgatewayConfig | None = None,
     ):
+        """Sets up the DeepEval evaluator with agent and judge model.
+
+        Args:
+            agent: The agent to test.
+            judge_model_api_key: API key for the judge model. If empty,
+                gets from MODEL_JUDGE_API_KEY environment variable.
+            judge_model_name: Name of the judge model. If empty,
+                gets from MODEL_JUDGE_NAME environment variable.
+            judge_model_api_base: Base URL for judge model API. If empty,
+                gets from MODEL_JUDGE_API_BASE environment variable.
+            name: Name for this evaluator. Defaults to 'veadk_deepeval_evaluator'.
+            prometheus_config: Settings for Prometheus export. If None,
+                no export happens.
+
+        Raises:
+            ValueError: If model settings are wrong.
+            EnvironmentError: If environment variables are missing.
+
+        Examples:
+            ```python
+            evaluator = DeepevalEvaluator(
+                agent=my_agent,
+                judge_model_api_key="sk-...",
+                prometheus_config=prometheus_config)
+            ```
+        """
         super().__init__(agent=agent, name=name)
 
         if not judge_model_api_key:
@@ -83,7 +142,38 @@ class DeepevalEvaluator(BaseEvaluator):
         eval_set_file_path: Optional[str] = None,
         eval_id: str = f"test_{formatted_timestamp()}",
     ):
-        """
+        """Tests agent using DeepEval on given test cases.
+
+        This method does these steps:
+        1. Loads test cases from memory or file
+        2. Runs agent to get actual responses
+        3. Converts to DeepEval test format
+        4. Runs metrics evaluation
+        5. Sends results to Prometheus if needed
+
+        Args:
+            metrics: List of DeepEval metrics to use for scoring.
+            eval_set: Test cases in memory. If given, used first.
+            eval_set_file_path: Path to test case file. Used if no eval_set.
+            eval_id: Unique name for this test run. Used for tracking.
+
+        Returns:
+            EvaluationResult: Results from DeepEval with scores and details.
+
+        Raises:
+            ValueError: If no test cases found.
+            FileNotFoundError: If test file not found.
+            EvaluationError: If agent fails or metrics fail.
+
+        Examples:
+            ```python
+            metrics = [GEval(threshold=0.8), ToolCorrectnessMetric(threshold=0.5)]
+            results = await evaluator.evaluate(
+                metrics=metrics,
+                eval_set_file_path="test_cases.json")
+            print(f"Test cases run: {len(results.test_results)}")
+            ```
+        """
         # Get evaluation data by parsing eval set file
         self.build_eval_set(eval_set, eval_set_file_path)
 
@@ -162,6 +252,31 @@ class DeepevalEvaluator(BaseEvaluator):
         return test_results
 
     def export_results(self, eval_id: str, test_results: EvaluationResult):
+        """Sends evaluation results to Prometheus for monitoring.
+
+        This method takes test results, counts passes and failures,
+        and sends metrics to Prometheus.
+
+        Args:
+            eval_id: Unique name for this test. Used as label in Prometheus.
+            test_results: Results from DeepEval evaluation.
+
+        Returns:
+            None: Results are sent directly to Prometheus.
+
+        Raises:
+            PrometheusConnectionError: If cannot connect to Prometheus.
+            PrometheusPushError: If sending data fails.
+
+        Note:
+            Uses fixed thresholds for now: case_threshold=0.5, diff_threshold=0.2.
+            These may change later.
+
+        Examples:
+            ```python
+            evaluator.export_results("test_20240101", test_results)
+            ```
+        """
         # fixed attributions
         test_name = eval_id
         test_cases_total = len(test_results.test_results)
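
The new docstrings name the fallback environment variables for the judge model. The sketch below, which is not part of the diff, shows one hedged way to wire them up outside an async context; the credential values, the `Agent()` default construction, and the empty metrics list are placeholders.

```python
import asyncio
import os

# Placeholder credentials; per the __init__ docstring above, these variables are
# read when the judge_model_* arguments are left empty.
os.environ["MODEL_JUDGE_NAME"] = "<judge-model-name>"
os.environ["MODEL_JUDGE_API_KEY"] = "<judge-api-key>"
os.environ["MODEL_JUDGE_API_BASE"] = "<judge-api-base-url>"

from veadk.agent import Agent  # assumed import path (veadk/agent.py in the file list)
from veadk.evaluation.deepeval_evaluator.deepeval_evaluator import DeepevalEvaluator

agent = Agent()  # placeholder agent; the class docstring example passes tools here
evaluator = DeepevalEvaluator(agent=agent)

# evaluate() is awaited in the docstring examples, so drive it with asyncio.run;
# populate metrics with DeepEval metric instances as shown in those examples.
results = asyncio.run(
    evaluator.evaluate(metrics=[], eval_set_file_path="test_cases.json")
)
```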

veadk/evaluation/eval_set_file_loader.py CHANGED

@@ -19,6 +19,26 @@ from google.adk.evaluation.local_eval_sets_manager import (
 
 
 def load_eval_set_from_file(eval_set_file_path: str) -> EvalSet:
+    """Loads an evaluation set from a JSON file.
+
+    This function uses ADK's loader to parse the file into an EvalSet object.
+    It handles errors in file reading or parsing.
+
+    Args:
+        eval_set_file_path (str): Path to the JSON eval set file.
+
+    Returns:
+        EvalSet: Loaded evaluation set object.
+
+    Raises:
+        Exception: If file loading or parsing fails, with details.
+
+    Examples:
+        ```python
+        eval_set = load_eval_set_from_file("my_eval.json")
+        print(len(eval_set.eval_cases))
+        ```
+    """
     try:
         eval_set = adk_load_eval_set_from_file(eval_set_file_path, eval_set_file_path)
     except Exception as e:

veadk/evaluation/eval_set_recorder.py CHANGED

@@ -27,9 +27,32 @@ logger = get_logger(__name__)
 
 
 class EvalSetRecorder(LocalEvalSetsManager):
+    """Records evaluation sets from sessions for later use in testing.
+
+    This class extends LocalEvalSetsManager to add sessions to eval sets.
+    It handles dumping eval sets to files from session data.
+
+    Attributes:
+        eval_set_id (str): ID of the eval set. Defaults to 'default'.
+        session_service (BaseSessionService): Service for session management.
+
+    Note:
+        Uses temporary directory for storing eval sets.
+        Creates eval cases from session invocations.
+    """
+
     def __init__(
         self, session_service: BaseSessionService, eval_set_id: str = "default"
     ):
+        """Initializes the eval set recorder with session service and ID.
+
+        Args:
+            session_service (BaseSessionService): Service to retrieve sessions.
+            eval_set_id (str): ID for the eval set. Defaults to 'default'.
+
+        Raises:
+            ValueError: If eval_set_id is invalid.
+        """
         super().__init__(agents_dir=get_temp_dir())
         self.eval_set_id = eval_set_id if eval_set_id != "" else "default"
         self.session_service: BaseSessionService = session_service
@@ -42,6 +65,21 @@ class EvalSetRecorder(LocalEvalSetsManager):
         session_id: str,
         user_id: str,
     ):
+        """Adds a session to the evaluation set as an eval case.
+
+        This method retrieves a session and converts it to eval invocations.
+        It creates a new eval case with timestamp.
+
+        Args:
+            app_name (str): Name of the app for the session.
+            eval_set_id (str): ID of the eval set to add to.
+            session_id (str): ID of the session to add.
+            user_id (str): ID of the user owning the session.
+
+        Raises:
+            AssertionError: If session not found.
+            ValueError: If adding eval case fails.
+        """
         eval_id = f"veadk_eval_{formatted_timestamp()}"
 
         # Get the session
@@ -74,6 +112,22 @@ class EvalSetRecorder(LocalEvalSetsManager):
         user_id: str,
         session_id: str,
     ) -> str:
+        """Dumps the current eval set to a file path.
+
+        This method creates the eval set if needed and adds the session.
+        It ensures directory exists and logs the dump path.
+
+        Args:
+            app_name (str): Name of the app.
+            user_id (str): ID of the user.
+            session_id (str): ID of the session to dump.
+
+        Returns:
+            str: Path where the eval set was dumped.
+
+        Raises:
+            ValueError: If dump operation fails.
+        """
         dump_path = self._get_eval_set_file_path(app_name, self.eval_set_id)
         Path(dump_path).parent.mkdir(parents=True, exist_ok=True)
 
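
The constructor is fully visible in the hunk above, but the names of the session-adding and dump methods are truncated in this diff, so the sketch below (not part of the diff) only shows the documented wiring; the `InMemorySessionService` import is an assumption based on the ADK session service mentioned elsewhere in these docstrings.

```python
from google.adk.sessions import InMemorySessionService  # assumed concrete BaseSessionService
from veadk.evaluation.eval_set_recorder import EvalSetRecorder  # import path from the file list

session_service = InMemorySessionService()

# An empty eval_set_id falls back to "default", per __init__ above.
recorder = EvalSetRecorder(session_service, eval_set_id="weather_demo")
```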

veadk/evaluation/types.py CHANGED

@@ -17,6 +17,25 @@ from dataclasses import dataclass
 
 @dataclass
 class EvalResultCaseData:
+    """Holds data for a single evaluation case result.
+
+    This dataclass stores input, outputs, score, and status for one test case.
+    Used in evaluation reporting and metrics export.
+
+    Attributes:
+        id (str): Unique ID of the case.
+        input (str): User input for the case.
+        actual_output (str): Actual agent response.
+        expected_output (str): Expected agent response.
+        score (str): Score as string from evaluation.
+        reason (str): Reason for the score.
+        status (str): Status like 'PASSED' or 'FAILURE'.
+        latency (str): Latency in milliseconds as string.
+
+    Note:
+        Score and latency are strings for compatibility with external systems.
+    """
+
     id: str
     input: str
     actual_output: str
@@ -29,5 +48,18 @@ class EvalResultCaseData:
 
 @dataclass
 class EvalResultMetadata:
+    """Stores metadata about the evaluation run.
+
+    This dataclass captures model information for the evaluation.
+    Used in reporting and tracing.
+
+    Attributes:
+        tested_model (str): Name of the model being tested.
+        judge_model (str): Name of the judge model used.
+
+    Note:
+        Simple structure for quick metadata access.
+    """
+
     tested_model: str
     judge_model: str
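
Both dataclasses are plain containers. A minimal sketch of filling them (not part of the diff), assuming the attribute lists in the docstrings above are complete; all values are illustrative placeholders.

```python
from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata

case = EvalResultCaseData(
    id="case-1",
    input="What is the weather in Beijing?",
    actual_output="Sunny, 25 degrees.",
    expected_output="Sunny.",
    score="0.85",      # score and latency stay strings, per the Note above
    reason="Answer is close to the reference.",
    status="PASSED",
    latency="1320",
)

metadata = EvalResultMetadata(
    tested_model="<agent-model-name>",
    judge_model="<judge-model-name>",
)
```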

veadk/evaluation/utils/prometheus.py CHANGED

@@ -22,6 +22,23 @@ from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
 
 
 class PrometheusPushgatewayConfig:
+    """Configures connection to Prometheus Pushgateway for metrics export.
+
+    This class holds settings for pushing evaluation metrics to Prometheus.
+    It uses environment variables for default values.
+
+    Attributes:
+        url (str): URL of the Prometheus Pushgateway endpoint.
+            Defaults to OBSERVABILITY_PROMETHEUS_PUSHGATEWAY_URL environment variable.
+        username (str): Username for authentication.
+            Defaults to OBSERVABILITY_PROMETHEUS_USERNAME environment variable.
+        password (str): Password for authentication.
+            Defaults to OBSERVABILITY_PROMETHEUS_PASSWORD environment variable.
+
+    Note:
+        All fields are optional and use environment variables if not provided.
+    """
+
     url: str = Field(
         default_factory=lambda: getenv(
             "OBSERVABILITY_PROMETHEUS_PUSHGATEWAY_URL",
@@ -87,6 +104,26 @@ def post_pushgateway(
     registry: CollectorRegistry,
     grouping_key: dict[str, str] | None = None,
 ):
+    """Pushes metrics to Prometheus Pushgateway with authentication.
+
+    This function sends collected metrics to the specified Pushgateway URL.
+    It uses basic authentication and optional grouping keys.
+
+    Args:
+        pushgateway_url (str): URL of the Pushgateway endpoint.
+        username (str): Authentication username.
+        password (str): Authentication password.
+        job_name (str): Name of the job for metrics labeling.
+        registry (CollectorRegistry): Registry containing metrics to push.
+        grouping_key (dict[str, str] | None): Optional key-value pairs for grouping.
+
+    Raises:
+        Exception: If push operation fails due to network or auth issues.
+
+    Note:
+        Authentication handler is created internally using provided credentials.
+    """
+
     def auth_handler(url, method, timeout, headers, data):
         return basic_auth_handler(
             url, method, timeout, headers, data, username, password
@@ -114,6 +151,30 @@ def push_to_prometheus(
     username: str = "",
     password: str = "",
 ):
+    """Sets and pushes evaluation metrics to Prometheus.
+
+    This function updates gauge metrics with evaluation results and pushes them.
+    It handles counts, thresholds, and specific data labels.
+
+    Args:
+        test_name (str): Name of the test for grouping.
+        test_cases_total (int): Total number of test cases.
+        test_cases_failure (int): Number of failed test cases.
+        test_cases_pass (int): Number of passed test cases.
+        test_data_list (list[EvalResultCaseData]): List of case data for labeling.
+        eval_data (EvalResultMetadata): Metadata for evaluation.
+        case_threshold (float): Threshold value for cases. Defaults to 0.5.
+        diff_threshold (float): Diff threshold value. Defaults to 0.2.
+        url (str): Pushgateway URL. Defaults to empty.
+        username (str): Auth username. Defaults to empty.
+        password (str): Auth password. Defaults to empty.
+
+    Returns:
+        None: Metrics are set and pushed directly.
+
+    Raises:
+        ValueError: If required data is invalid.
+    """
     test_cases_total_metric.set(test_cases_total)
     test_cases_failure_metric.set(test_cases_failure)
     test_cases_pass_metric.set(test_cases_pass)