ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +33 -29
- wxo_agentic_evaluation/analyze_run.py +805 -344
- wxo_agentic_evaluation/arg_configs.py +10 -1
- wxo_agentic_evaluation/description_quality_checker.py +11 -2
- wxo_agentic_evaluation/evaluation_package.py +8 -3
- wxo_agentic_evaluation/inference_backend.py +46 -79
- wxo_agentic_evaluation/llm_matching.py +14 -2
- wxo_agentic_evaluation/main.py +1 -1
- wxo_agentic_evaluation/metrics/__init__.py +1 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +43 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +4 -2
- wxo_agentic_evaluation/quick_eval.py +7 -9
- wxo_agentic_evaluation/record_chat.py +2 -5
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
- wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +7 -0
- wxo_agentic_evaluation/type.py +1 -1
- wxo_agentic_evaluation/utils/__init__.py +3 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +131 -16
- wxo_agentic_evaluation/wxo_client.py +80 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
     )

     @model_validator(mode="after")
-    def compute_overall(
+    def compute_overall(self) -> Self:
         """
         After validation, compute overall_valid as AND of:
           • all semantic is_correct flags
           • if transform exists: all execution_success flags
         """
-        static: StaticResult =
+        static: StaticResult = self.static
         if static:
             # static checks
             ok = static.final_decision

-        sem: SemanticResult =
+        sem: SemanticResult = self.semantic
         if sem:
             # semantic checks
             if sem.general and sem.general.metrics:
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
         if param_avgs:
             cat_avgs.append(sum(param_avgs) / len(param_avgs))

-
+        self.overall_avg_score = (
             sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
         )
-
-        return
+        self.overall_valid = ok
+        return self


     # ----------------------------------------------------------------------
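In plain terms, the validator now persists `overall_avg_score` on the model as a mean of per-category means. A one-line illustration (values are made up):

    cat_avgs = [0.8, 1.0]  # per-category averages, illustrative
    overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None  # 0.9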
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
     )

     @model_validator(mode="after")
-    def _parse_arguments(
+    def _parse_arguments(self) -> Self:
         """
         After model construction, parse the `arguments` JSON string
         into `parsed_arguments`, or raise a ValidationError.
         """
         try:
-            raw =
-
+            raw = self.arguments
+            self.parsed_arguments = json.loads(raw)
         except json.JSONDecodeError as e:
             raise ValidationError(f"Invalid JSON in arguments: {e}") from e
-        return
+        return self


 class ToolCall(BaseModel):
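Both hunks above migrate to Pydantic v2 instance-style after-validators, which receive the constructed model and must return it. For reference, a minimal self-contained sketch of the same pattern (the `Call` model here is hypothetical, not from the package):

    import json

    from pydantic import BaseModel, model_validator
    from typing_extensions import Self

    class Call(BaseModel):
        arguments: str
        parsed_arguments: dict | None = None

        @model_validator(mode="after")
        def _parse(self) -> Self:
            # mode="after" validators run on the built instance and must return it
            self.parsed_arguments = json.loads(self.arguments)
            return self

    call = Call(arguments='{"city": "Austin"}')
    assert call.parsed_arguments == {"city": "Austin"}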
@@ -17,6 +17,11 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message

+DEFAULT_GENERATION_PARAMS = {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}

 class ReferencelessEvaluation:
     """
@@ -31,19 +36,16 @@ class ReferencelessEvaluation:
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
-        messages: List[Message],
         model_id: str,
         task_n: str,
         dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params=DEFAULT_GENERATION_PARAMS,
     ):

         self.metrics_client = get_provider(
             model_id=model_id,
-            params={
-                "min_new_tokens": 0,
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
+            params=generation_params,
             referenceless_eval=True,
         )
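The hard-coded decoding settings are now an overridable constructor argument. A hedged usage sketch (argument values are illustrative, not from the package):

    evaluator = ReferencelessEvaluation(
        api_spec=tool_specs,  # list of tool-spec mappings, illustrative
        model_id="meta-llama/llama-3-3-70b-instruct",  # hypothetical model id
        task_n="1",
        dataset_name="data3",
        runtime_pipeline=True,
        generation_params={**DEFAULT_GENERATION_PARAMS, "max_new_tokens": 2048},
    )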
@@ -52,39 +54,45 @@ class ReferencelessEvaluation:
             general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
             function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
             parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
         )

         self.task_n = task_n
         self.dataset_name = dataset_name

         self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
-        self.messages = messages
-
-    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
-        results = []
-        for example in examples:
-            result = self.pipeline.run_sync(
-                conversation=example["context"],
-                inventory=self.apis_specs,
-                call=example["call"],
-                continue_on_static=False,
-                retries=2,
-            )
-            result_dict = result.model_dump()
-            results.append(result_dict)

-
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call

-
+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and preceding items
+        in the messages array are the context.
+        """
         examples = []
-
         processed_data = [
             {
                 k: msg.model_dump().get(k)
                 for k in ["role", "content", "type"]
                 if k in msg.model_dump()
             }
-            for msg in
+            for msg in messages
         ]

         for idx, message in enumerate(processed_data):
@@ -96,23 +104,48 @@ class ReferencelessEvaluation:
             tool_call_msg = json.loads(content)
             if tool_call_msg["name"].startswith("transfer_to"):
                 continue
-
-            call = {
-                "call": {
-                    "id": tool_call_msg.get("id", "1"),
-                    "type": "function",
-                    "function": {
-                        "name": tool_call_msg["name"],
-                        "arguments": json.dumps(tool_call_msg["args"]),
-                    },
-                },
-                "context": context,
-            }
+
+            call = ReferencelessEvaluation.fmt_tool_call(
+                tool_id=tool_call_msg.get("id", "1"),
+                tool_call_name=tool_call_msg["name"],
+                arguments=json.dumps(tool_call_msg["args"]),
+                context=context
+            )
             examples.append(call)

-
-
-
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
         examples = [
             {
                 "call": ToolCall.model_validate(ex["call"]),
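For reference, a hedged sketch of building one `examples` entry with the new helper (tool name, arguments, and context are illustrative):

    import json

    example = ReferencelessEvaluation.fmt_tool_call(
        tool_id="1",
        tool_call_name="get_weather",  # hypothetical tool name
        arguments=json.dumps({"city": "Austin"}),
        context=[{"role": "user", "content": "What's the weather in Austin?"}],
    )
    # example == {"call": {"id": "1", "type": "function",
    #                      "function": {"name": "get_weather",
    #                                   "arguments": '{"city": "Austin"}'}},
    #             "context": [...]}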
@@ -120,6 +153,11 @@ class ReferencelessEvaluation:
             }
             for ex in examples
         ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
         results = self._run_pipeline(examples)

         return results
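With the new `verbose` flag, `run` reports how many examples will be analyzed before dispatching them to the pipeline. A one-line sketch, reusing the `evaluator` and `example` from the sketches above:

    results = evaluator.run(examples=[example], verbose=True)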
@@ -1,6 +1,7 @@
 from collections import defaultdict

-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient


 class ResourceMap:
@@ -34,6 +35,7 @@ class ResourceMap:

         if resp.status_code == 200:
             agents = resp.json()
+            self.all_agent_objs = agents
             for agent in agents:
                 agent_name = agent["name"]
                 tools = [tool_map[id] for id in agent["tools"]]
@@ -247,6 +247,13 @@ def tenant_setup(

     context["active_environment"] = tenant_name

+    # Ensure parent directories exist so tests (which may run in clean envs)
+    # can write these files without raising FileNotFoundError.
+    auth_dir = os.path.dirname(auth_config_path)
+    env_dir = os.path.dirname(env_config_path)
+    os.makedirs(auth_dir, exist_ok=True)
+    os.makedirs(env_dir, exist_ok=True)
+
     with open(auth_config_path, "w") as f:
         yaml.dump(auth_config, f)
     with open(env_config_path, "w") as f:
wxo_agentic_evaluation/type.py CHANGED

@@ -1,4 +1,7 @@
 import json
+from wxo_agentic_evaluation.utils.utils import TestCaseResources, add_line_seperator, list_run_files, load_run_metrics, N_A
+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import ToolExtractionOpenAIFormat
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser


 def json_dump(output_path, object):
@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+
+from wxo_agentic_evaluation.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+
+class ReferencelessEvalParser:
+    @staticmethod
+    def static_parser(
+        static_metrics: Mapping[str, Mapping[str, Any]],
+    ) -> List[FailedStaticTestCases]:
+        """
+        static.metrics
+        """
+
+        failed_test_cases = []
+
+        for metric, metric_data in static_metrics.items():
+            if not metric_data.get("valid", False):
+                fail = FailedStaticTestCases(
+                    metric_name=metric,
+                    description=metric_data.get("description"),
+                    explanation=metric_data.get("explanation"),
+                )
+
+                failed_test_cases.append(fail)
+
+        return failed_test_cases
+
+    @staticmethod
+    def parse_annotations(
+        actionable_reccomendations, filters: List[str]
+    ) -> Optional[List[Annotation]]:
+        annotations = [
+            Annotation(
+                parameter_name=recc.get("parameter_name"),
+                recommendation=recc.get("recommendation"),
+                details=recc.get("details"),
+                quote=recc.get("quote"),
+            )
+            for recc in actionable_reccomendations
+            if recc.get("recommendation") in filters
+        ]
+
+        annotations = annotations if annotations else None
+
+        return annotations
+
+    @staticmethod
+    def semantic_parser(
+        metric_name, data, annotation_filters: Optional[List[str]]
+    ):
+        semantic_metric = FailedSemanticTestCases(
+            metric_name=metric_name,
+            evidence=data.get("evidence"),
+            explanation=data.get("explanation"),
+            output=data.get("output"),
+            confidence=data.get("confidence"),
+        )
+
+        if annotation_filters and (
+            annotations := ReferencelessEvalParser.parse_annotations(
+                data.get("actionable_recommendations"), annotation_filters
+            )
+        ):
+            semantic_metric.annotations = annotations
+
+        return semantic_metric
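A hedged sketch of how `static_parser` consumes a static-check payload (field names follow the code above; the metric name and texts are illustrative):

    static_metrics = {
        "schema_check": {  # hypothetical metric name
            "valid": False,
            "description": "Arguments must match the tool schema",
            "explanation": "Required parameter 'city' is missing",
        },
    }
    failed = ReferencelessEvalParser.static_parser(static_metrics)
    assert failed[0].metric_name == "schema_check"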
@@ -2,10 +2,14 @@ import glob
 import json
 import os
 import re
-
+import csv
+from collections import defaultdict
+from pathlib import Path
+from typing import List, Optional, Union, Mapping, Tuple, Any
 from urllib.parse import urlparse

 import yaml
+import rich
 from rich import box, print
 from rich.console import Console, Group
 from rich.panel import Panel
@@ -18,14 +22,20 @@ from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
     ReferenceLessEvalMetrics,
     ToolCallAndRoutingMetrics,
+    EnhancedAnalyzeMetrics,
 )
 from wxo_agentic_evaluation.type import (
     ConversationalConfidenceThresholdScore,
     Message,
+    ExtendedMessage,
 )

 console = Console()

+RUN_FILE_RE = re.compile(
+    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+)
+N_A = "N/A"

 class AttackResultsTable:
     def __init__(self, attack_results: dict):
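The run-file pattern is now compiled once at module scope. A quick sketch of what it matches:

    m = RUN_FILE_RE.match("data3.run2.messages.analyze.json")
    assert m is not None
    assert m.group("base") == "data3"
    assert m.group("run") == "2"
    assert m.group("kind") == "messages.analyze"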
@@ -65,6 +75,94 @@ class AttackResultsTable:
         console.print(self.table)


+class TestCaseResources:
+    def __init__(self, output_dir: str):
+        """Todo flesh out for all resources that are saved"""
+        self.output_dir = Path(output_dir)
+
+    @property
+    def get_summary(self):
+        summary = []
+
+        with open(self.output_dir / "summary_metrics.csv", "r") as f:
+            reader = csv.reader(f)
+            header = next(reader)
+            for row in reader:
+                summary.append(dict(zip(header, row)))
+
+        return summary
+
+    def get_analyze_messages(
+        self, test_case_name=None, path=None
+    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No analyze file found at {path}")
+            raise Exception(f"No analyze file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+        meta = None
+        if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
+            meta = temp[-1]["meta"]
+            temp = temp[:-1]
+
+        for entry in temp:
+            msg = ExtendedMessage(**entry)
+            test_messages.append(msg)
+
+        return test_messages, meta
+
+    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No messages file found at {path}")
+            raise Exception(f"No messages file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+            for entry in temp:
+                msg = Message(**entry)
+                test_messages.append(msg)
+
+        return test_messages
+
+    def get_test_metrics(
+        self, test_case_name=None, path=None
+    ) -> ToolCallAndRoutingMetrics:
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.metrics.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No metrics file found at {path}")
+            raise Exception(f"No metrics file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+        return metrics
+
+
 class AgentMetricsTable:
     def __init__(self, data):
         self.table = Table(
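A hedged usage sketch of the new accessor class (the output directory and test-case names are illustrative):

    resources = TestCaseResources("results")  # hypothetical output dir
    summary = resources.get_summary           # note: a property, accessed without parentheses
    messages = resources.get_messages(test_case_name="data3.run1")
    metrics = resources.get_test_metrics(test_case_name="data3.run1")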
@@ -123,18 +221,27 @@ def is_ibm_cloud_url(service_url: str) -> bool:

 def add_line_seperator(
     style_config: Optional[Union[str, Style]] = None,
+    print=True,
 ):
+    """
+    Adds a line separator with the given style config.
+    `print` indicates whether the separator should go to stdout immediately or be returned as an object.
+    Set `print` to False when the separator is printed later, for example as part of the pager view.
+    """

     if not style_config:
         style = "grey42"
     else:
         style = style_config

-    console.print(
-        Rule(
-            style=style,
+    if print:
+        console.print(
+            Rule(
+                style=style,
+            )
         )
-    )
+    else:
+        return Rule(style=style, characters="==")


 class FaithfulnessTable:
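A one-line sketch of the deferred mode (the returned Rule can be composed into a renderable group for the pager view):

    rule = add_line_seperator(style_config="grey42", print=False)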
@@ -347,6 +454,7 @@ class ReferencelessEvalPanel:

 # Function to load messages from JSON file
 def load_messages(file_path):
+    """TODO: replace in favor of TestCaseResources.get_messages(...)"""
     with open(file_path, "r") as f:
         try:
             message_data = json.load(f)
@@ -362,7 +470,7 @@ def load_messages(file_path):
     return None


-def load_agents(agents_path: str):
+def load_agents_from_disk(agents_path: str):
     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))

@@ -379,32 +487,39 @@ def load_agents(agents_path: str):
     return agents


-RUN_FILE_RE = re.compile(
-    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
-)
-
-
-def list_run_files(messages_dir: str, dataset_base: str):
+def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
     """
     Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
     (We only need analyze+metrics for this feature.)
+
+    `filter_run` restricts the result to the files for that run. If it is -1, all run files are retrieved.
+    For example, if there are `data3.run1.messages.json` and `data3.run2.messages.json`, and filter_run is 2,
+    then only the files related to the second run are retrieved.
+
     """
-    runs = {}
+    runs = defaultdict(
+        lambda: {"analyze": None, "metrics": None, "messages": None}
+    )
     for fn in os.listdir(messages_dir):
         m = RUN_FILE_RE.match(fn)
         if not m or m.group("base") != dataset_base:
             continue
         run_id = int(m.group("run"))
+        if filter_run != -1 and run_id != filter_run:
+            continue
+
         kind = m.group("kind")
-        entry = runs.setdefault(run_id, {"analyze": None, "metrics": None})
         full = os.path.join(messages_dir, fn)
         if kind == "messages.analyze":
-            entry["analyze"] = full
+            runs[run_id]["analyze"] = full
         elif kind == "metrics":
-            entry["metrics"] = full
+            runs[run_id]["metrics"] = full
+        elif kind == "messages":
+            runs[run_id]["messages"] = full
     return runs


 def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
+    """Todo remove in a later PR"""
     with open(metrics_path, "r", encoding="utf-8") as f:
         return ToolCallAndRoutingMetrics(**json.load(f))
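A hedged sketch of the new filter (paths are illustrative):

    runs = list_run_files("results/messages", "data3", filter_run=2)
    # Only run 2 is returned, e.g.
    # {2: {"analyze": ".../data3.run2.messages.analyze.json",
    #      "metrics": ".../data3.run2.metrics.json",
    #      "messages": ".../data3.run2.messages.json"}}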
@@ -0,0 +1,80 @@
+import os
+import requests
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from typing import Dict, Any, Optional
+
+from wxo_agentic_evaluation.service_instance import tenant_setup
+
+
+class WXOClient:
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
+        self.service_url = service_url
+        self.api_key = api_key
+
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
+
+        if not self._verify_ssl:
+            urllib3.disable_warnings(InsecureRequestWarning)
+
+    def _get_headers(self) -> dict:
+        headers = {}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        return headers
+
+    def post(self, payload: dict, path: str, stream=False):
+        url = f"{self.service_url}/{path}"
+        return requests.post(
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
+        )
+
+    def get(self, path: str, params: dict = None):
+        url = f"{self.service_url}/{path}"
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
+
+
+def get_wxo_client(
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
+) -> WXOClient:
+
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
+    return wxo_client
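A hedged usage sketch of the new client (the tenant name and request path are illustrative, not endpoints documented by the package):

    client = get_wxo_client(service_url=None, tenant_name="dev-tenant")  # hypothetical tenant
    resp = client.get("v1/agents")  # hypothetical path
    resp.raise_for_status()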