ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
- wxo_agentic_evaluation/analyze_run.py +822 -344
- wxo_agentic_evaluation/arg_configs.py +39 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +29 -4
- wxo_agentic_evaluation/evaluation_package.py +197 -18
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +105 -108
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -0
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +64 -1
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +20 -2
- wxo_agentic_evaluation/quick_eval.py +23 -11
- wxo_agentic_evaluation/record_chat.py +18 -10
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
- wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +12 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +15 -5
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +140 -20
- wxo_agentic_evaluation/wxo_client.py +81 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,47 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
2
5
|
|
|
6
|
+
from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
|
|
7
|
+
ToolExtractionOpenAIFormat,
|
|
8
|
+
)
|
|
9
|
+
from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
|
|
10
|
+
from wxo_agentic_evaluation.utils.utils import (
|
|
11
|
+
N_A,
|
|
12
|
+
TestCaseResources,
|
|
13
|
+
add_line_seperator,
|
|
14
|
+
list_run_files,
|
|
15
|
+
load_run_metrics,
|
|
16
|
+
)
|
|
3
17
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
18
|
+
|
|
19
|
+
def json_dump(output_path, obj):
    """
    Atomically dump `obj` as JSON to `output_path`.

    The payload is written to a temporary file created in the same directory
    as the target, flushed and fsync'ed, and only then moved over the target
    with `os.replace`. A reader therefore sees either the previous file or
    the complete new one, never a half-written document.

    Args:
        output_path: Destination file (str or path-like). Missing parent
            directories are created.
        obj: Any object `json.dump` can serialize.

    Raises:
        TypeError, ValueError: if `obj` cannot be serialized to JSON.
        OSError: if the temporary file cannot be created, written or renamed.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fd, tmp_path = tempfile.mkstemp(
        dir=output_path.parent,
        prefix=output_path.stem,
        suffix=".tmp",
        text=True,
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, output_path)
    except BaseException:
        # BaseException rather than Exception: an interrupt (Ctrl-C,
        # SystemExit) in the middle of a dump must not leave a stray
        # temporary file next to the target either.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation discovery mechanism.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for discovering classes that inherit from Evaluation.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import importlib.util
|
|
8
|
+
import inspect
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
    """
    Import every Python file under `directory` and collect concrete subclasses.

    Each `*.py` file whose name does not start with `__` is executed as a
    stand-alone module. A class is collected when one of its ancestors
    (the class itself excluded) is named `base_class_name` and the class is
    not abstract.

    Args:
        directory: Root directory that is walked recursively.
        base_class_name: Name of the base class to look for in the MRO.

    Returns:
        A list of class objects, without duplicates, in a deterministic
        order (directories and files are visited in sorted order).
    """
    subclasses = []
    # Identity of classes already collected. `inspect.getmembers` reports a
    # class once per attribute name, so an alias such as `Alias = Concrete`
    # would otherwise add the same class twice.
    seen_ids = set()

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".py") or file.startswith("__"):
                continue

            filepath = os.path.join(root, file)
            module_name = os.path.splitext(file)[0]

            spec = importlib.util.spec_from_file_location(module_name, filepath)
            if not (spec and spec.loader):
                continue

            module = importlib.util.module_from_spec(spec)
            try:
                spec.loader.exec_module(module)
            except Exception as e:
                # Best effort: one broken file must not abort discovery.
                print(f"Skipping {filepath} due to import error: {e}")
                continue

            for _, obj in inspect.getmembers(module, inspect.isclass):
                if id(obj) in seen_ids:
                    continue
                inherits_base = any(
                    base.__name__ == base_class_name for base in obj.__mro__[1:]
                )
                if inherits_base and not inspect.isabstract(obj):
                    seen_ids.add(id(obj))
                    subclasses.append(obj)

    return subclasses
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
|
|
4
|
+
from wxo_agentic_evaluation.arg_configs import AuthConfig
|
|
5
|
+
from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
|
|
6
|
+
from wxo_agentic_evaluation.wxo_client import get_wxo_client
|
|
7
|
+
|
|
8
|
+
# Default auth settings, read from the environment once at import time.
# WXO_URL and WXO_TENANT fall back to the values below; WXO_TOKEN is
# optional and resolves to None when unset.
WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
    url=os.environ.get("WXO_URL", "http://localhost:4321"),
    tenant_name=os.environ.get("WXO_TENANT", "wxo-dev"),
    token=os.environ.get("WXO_TOKEN"),
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@lru_cache(maxsize=1)
def _get_cached_wxo_client():
    """Build the client from WXO_AUTH_CONFIG_DEFAULTS once and reuse it."""
    # TODO: remove this once the client is implemented as a Singleton.
    defaults = WXO_AUTH_CONFIG_DEFAULTS
    return get_wxo_client(defaults.url, defaults.tenant_name, defaults.token)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_provider_kwargs(**base_kwargs: dict) -> dict:
    """
    Complete provider keyword arguments with gateway connection details.

    The arguments are returned untouched when the gateway model provider is
    not in use, or when the caller already supplied both `instance_url` and
    `token`. Otherwise `instance_url` and `token` are taken from the cached
    WXO client and override any value passed for those two keys.
    """
    caller_supplied_both = "instance_url" in base_kwargs and "token" in base_kwargs
    if not USE_GATEWAY_MODEL_PROVIDER or caller_supplied_both:
        return base_kwargs

    client = _get_cached_wxo_client()

    provider_kwargs = dict(base_kwargs)
    provider_kwargs["instance_url"] = client.service_url
    provider_kwargs["token"] = client.api_key
    return provider_kwargs
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from wxo_agentic_evaluation.type import ContentType, Message
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ParsedMessages(BaseModel):
    """
    A parsed history of messages with shortcuts to its text endpoints.
    """

    messages: list[Message] = Field(description="The list of messages")

    @property
    def user_input(self) -> Optional[str]:
        """Return the first text message with role "user", or None."""
        return next(
            (
                str(msg.content)
                for msg in self.messages
                if msg.role == "user" and msg.type == ContentType.text
            ),
            None,
        )

    @property
    def agent_response(self) -> Optional[str]:
        """Return the last text message with role "assistant", or None."""
        return next(
            (
                str(msg.content)
                for msg in reversed(self.messages)
                if msg.role == "assistant" and msg.type == ContentType.text
            ),
            None,
        )
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import Any, List, Mapping, Optional
|
|
2
|
+
|
|
3
|
+
from wxo_agentic_evaluation.metrics import (
|
|
4
|
+
Annotation,
|
|
5
|
+
FailedSemanticTestCases,
|
|
6
|
+
FailedStaticTestCases,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ReferencelessEvalParser:
    """Converts raw referenceless-evaluation output into metric objects."""

    @staticmethod
    def static_parser(
        static_metrics: Mapping[str, Mapping[str, Any]],
    ) -> List[FailedStaticTestCases]:
        """
        Collect the static metrics that did not pass.

        A metric counts as failed when its "valid" entry is missing or falsy.

        Args:
            static_metrics: Mapping of metric name to that metric's data.

        Returns:
            One `FailedStaticTestCases` per failed metric, carrying the
            metric's "description" and "explanation" entries.
        """
        return [
            FailedStaticTestCases(
                metric_name=metric,
                description=metric_data.get("description"),
                explanation=metric_data.get("explanation"),
            )
            for metric, metric_data in static_metrics.items()
            if not metric_data.get("valid", False)
        ]

    @staticmethod
    def parse_annotations(
        actionable_reccomendations, filters: List[str]
    ) -> Optional[List[Annotation]]:
        """
        Build annotations for the recommendations selected by `filters`.

        Args:
            actionable_reccomendations: Iterable of recommendation mappings,
                or None when the evaluation produced none.
            filters: Values of the "recommendation" entry to keep.

        Returns:
            The matching annotations, or None when nothing matched.
        """
        annotations = [
            Annotation(
                parameter_name=recc.get("parameter_name"),
                recommendation=recc.get("recommendation"),
                details=recc.get("details"),
                quote=recc.get("quote"),
            )
            # `semantic_parser` passes the result of `dict.get`, which is
            # None when the key is absent; treat that as "no recommendations".
            for recc in actionable_reccomendations or []
            if recc.get("recommendation") in filters
        ]

        return annotations or None

    @staticmethod
    def semantic_parser(
        metric_name, data, annotation_filters: Optional[List[str]]
    ):
        """
        Build a `FailedSemanticTestCases` from one semantic metric result.

        Annotations are attached only when `annotation_filters` is non-empty
        and at least one recommendation in `data` matches it.
        """
        semantic_metric = FailedSemanticTestCases(
            metric_name=metric_name,
            evidence=data.get("evidence"),
            explanation=data.get("explanation"),
            output=data.get("output"),
            confidence=data.get("confidence"),
        )

        if annotation_filters and (
            annotations := ReferencelessEvalParser.parse_annotations(
                data.get("actionable_recommendations"), annotation_filters
            )
        ):
            semantic_metric.annotations = annotations

        return semantic_metric
|
|
@@ -1,10 +1,14 @@
|
|
|
1
|
+
import csv
|
|
1
2
|
import glob
|
|
2
3
|
import json
|
|
3
4
|
import os
|
|
4
5
|
import re
|
|
5
|
-
from
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, List, Mapping, Optional, Tuple, Union
|
|
6
9
|
from urllib.parse import urlparse
|
|
7
10
|
|
|
11
|
+
import rich
|
|
8
12
|
import yaml
|
|
9
13
|
from rich import box, print
|
|
10
14
|
from rich.console import Console, Group
|
|
@@ -15,17 +19,24 @@ from rich.table import Table
|
|
|
15
19
|
|
|
16
20
|
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
17
21
|
from wxo_agentic_evaluation.metrics.metrics import (
|
|
22
|
+
EnhancedAnalyzeMetrics,
|
|
18
23
|
KnowledgeBaseMetricSummary,
|
|
19
24
|
ReferenceLessEvalMetrics,
|
|
20
25
|
ToolCallAndRoutingMetrics,
|
|
21
26
|
)
|
|
22
27
|
from wxo_agentic_evaluation.type import (
|
|
23
28
|
ConversationalConfidenceThresholdScore,
|
|
29
|
+
ExtendedMessage,
|
|
24
30
|
Message,
|
|
25
31
|
)
|
|
26
32
|
|
|
27
33
|
console = Console()
|
|
28
34
|
|
|
35
|
+
# File names of per-run artifacts: `<base>.run<N>.<kind>.json`, where kind is
# `messages`, `messages.analyze` or `metrics`.
RUN_FILE_RE = re.compile(
    r"^(?P<base>.+)"
    r"\.run(?P<run>\d+)"
    r"\.(?P<kind>messages(?:\.analyze)?|metrics)"
    r"\.json$"
)
N_A = "N/A"
|
|
39
|
+
|
|
29
40
|
|
|
30
41
|
class AttackResultsTable:
|
|
31
42
|
def __init__(self, attack_results: dict):
|
|
@@ -65,10 +76,100 @@ class AttackResultsTable:
|
|
|
65
76
|
console.print(self.table)
|
|
66
77
|
|
|
67
78
|
|
|
79
|
+
class TestCaseResources:
    """Read access to the files saved under an evaluation output directory."""

    def __init__(self, output_dir: str):
        """Todo flesh out for all resources that are saved"""
        self.output_dir = Path(output_dir)

    def _resolve_path(self, test_case_name, path, file_suffix: str, label: str):
        """
        Return the file to read, raising when it does not exist.

        When `test_case_name` is given it takes precedence over `path` and
        the file is looked up as
        `<output_dir>/messages/<test_case_name>.<file_suffix>`.
        """
        if test_case_name:
            path = os.path.join(
                self.output_dir,
                "messages",
                f"{test_case_name}.{file_suffix}",
            )

        if not Path(str(path)).is_file():
            rich.print(f"[r]No {label} file found at {path}")
            raise Exception(f"No {label} file found at {path}")

        return path

    @property
    def get_summary(self):
        """
        Rows of `<output_dir>/summary_metrics.csv` as dicts keyed by header.

        Returns an empty list when the file has no header row.
        """
        # newline="" is what the csv module expects from its file object;
        # without it, newlines embedded in quoted fields are altered.
        with open(self.output_dir / "summary_metrics.csv", "r", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return []
            return [dict(zip(header, row)) for row in reader]

    def get_analyze_messages(
        self, test_case_name=None, path=None
    ) -> Tuple[List[ExtendedMessage], Optional[Mapping[str, Any]]]:
        """
        Load `<test_case_name>.messages.analyze.json` (or the file at `path`).

        Returns:
            `(messages, meta)`. `meta` is the value of the "meta" key of the
            last JSON entry when that entry is a dict containing it (the
            entry is then not parsed as a message); otherwise None.

        Raises:
            Exception: if the file does not exist.
        """
        path = self._resolve_path(
            test_case_name, path, "messages.analyze.json", "analyze"
        )

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        meta = None
        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
            meta = entries[-1]["meta"]
            entries = entries[:-1]

        return [ExtendedMessage(**entry) for entry in entries], meta

    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
        """
        Load `<test_case_name>.messages.json` (or the file at `path`).

        Raises:
            Exception: if the file does not exist.
        """
        path = self._resolve_path(test_case_name, path, "messages.json", "messages")

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        return [Message(**entry) for entry in entries]

    def get_test_metrics(
        self, test_case_name=None, path=None
    ) -> ToolCallAndRoutingMetrics:
        """
        Load `<test_case_name>.metrics.json` (or the file at `path`).

        Raises:
            Exception: if the file does not exist.
        """
        path = self._resolve_path(test_case_name, path, "metrics.json", "metrics")

        with open(path, "r", encoding="utf-8") as f:
            return ToolCallAndRoutingMetrics(**json.load(f))
|
|
165
|
+
|
|
166
|
+
|
|
68
167
|
class AgentMetricsTable:
|
|
69
|
-
def __init__(self, data):
|
|
168
|
+
def __init__(self, data, title: Optional[str] = None):
|
|
169
|
+
if title is None:
|
|
170
|
+
title = "Agent Metrics"
|
|
70
171
|
self.table = Table(
|
|
71
|
-
title=
|
|
172
|
+
title=title,
|
|
72
173
|
box=box.ROUNDED,
|
|
73
174
|
show_lines=True,
|
|
74
175
|
)
|
|
@@ -89,7 +190,9 @@ class AgentMetricsTable:
|
|
|
89
190
|
console.print(self.table)
|
|
90
191
|
|
|
91
192
|
|
|
92
|
-
def create_table(
|
|
193
|
+
def create_table(
|
|
194
|
+
data: List[dict], title: Optional[str] = None
|
|
195
|
+
) -> AgentMetricsTable:
|
|
93
196
|
"""
|
|
94
197
|
Generate a Rich table from a list of dictionaries.
|
|
95
198
|
Returns the AgentMetricsTable instance.
|
|
@@ -101,7 +204,7 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
|
|
|
101
204
|
print("create_table() received an empty dataset. No table generated.")
|
|
102
205
|
return None
|
|
103
206
|
|
|
104
|
-
return AgentMetricsTable(data)
|
|
207
|
+
return AgentMetricsTable(data, title=title)
|
|
105
208
|
|
|
106
209
|
|
|
107
210
|
def safe_divide(nom, denom):
|
|
@@ -123,18 +226,27 @@ def is_ibm_cloud_url(service_url: str) -> bool:
|
|
|
123
226
|
|
|
124
227
|
def add_line_seperator(
    style_config: Optional[Union[str, Style]] = None,
    print=True,
):
    """
    Emit or build a horizontal rule in the given style.

    `style_config` falls back to "grey42" when empty. With `print=True`
    (the default) the rule is written to the console immediately and
    nothing is returned. With `print=False` a `Rule` drawn with "=="
    characters is returned instead, so the caller can render it later,
    for example as part of a pager view.
    """
    style = style_config if style_config else "grey42"

    if not print:
        return Rule(style=style, characters="==")

    console.print(Rule(style=style))
|
|
138
250
|
|
|
139
251
|
|
|
140
252
|
class FaithfulnessTable:
|
|
@@ -347,6 +459,7 @@ class ReferencelessEvalPanel:
|
|
|
347
459
|
|
|
348
460
|
# Function to load messages from JSON file
|
|
349
461
|
def load_messages(file_path):
|
|
462
|
+
"""TODO: replace in favor of TestCaseResources.get_messages(...)"""
|
|
350
463
|
with open(file_path, "r") as f:
|
|
351
464
|
try:
|
|
352
465
|
message_data = json.load(f)
|
|
@@ -362,7 +475,7 @@ def load_messages(file_path):
|
|
|
362
475
|
return None
|
|
363
476
|
|
|
364
477
|
|
|
365
|
-
def
|
|
478
|
+
def load_agents_from_disk(agents_path: str):
|
|
366
479
|
agents_json = glob.glob(os.path.join(agents_path, "*.json"))
|
|
367
480
|
agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
|
|
368
481
|
|
|
@@ -379,32 +492,39 @@ def load_agents(agents_path: str):
|
|
|
379
492
|
return agents
|
|
380
493
|
|
|
381
494
|
|
|
382
|
-
|
|
383
|
-
r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
def list_run_files(messages_dir: str, dataset_base: str):
|
|
495
|
+
def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
    """
    Group the per-run files of one dataset by run id.

    Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None,
    "messages": path|None}

    `filter_run` restricts the result to a single run; -1 (the default)
    keeps every run. For example, with `data3.run1.messages.json` and
    `data3.run2.messages.json` on disk and `filter_run=2`, only the files
    of the second run are returned.
    """
    # Regex "kind" group -> key in the per-run entry.
    slot_for_kind = {
        "messages.analyze": "analyze",
        "metrics": "metrics",
        "messages": "messages",
    }

    runs = defaultdict(
        lambda: {"analyze": None, "metrics": None, "messages": None}
    )
    for fn in os.listdir(messages_dir):
        match = RUN_FILE_RE.match(fn)
        if match is None or match.group("base") != dataset_base:
            continue

        run_id = int(match.group("run"))
        if filter_run not in (-1, run_id):
            continue

        slot = slot_for_kind[match.group("kind")]
        runs[run_id][slot] = os.path.join(messages_dir, fn)

    return runs
|
|
406
525
|
|
|
407
526
|
|
|
408
527
|
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
    """Read a metrics JSON file into ToolCallAndRoutingMetrics.

    Todo remove in a later PR
    """
    with open(metrics_path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    return ToolCallAndRoutingMetrics(**payload)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
import urllib3
|
|
6
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
7
|
+
|
|
8
|
+
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class WXOClient:
    """Small `requests` wrapper that talks to one service URL with a bearer token."""

    def __init__(
        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            service_url: Base URL every request path is appended to.
            api_key: Token sent as `Authorization: Bearer <api_key>`; no
                header is sent when it is empty.
            env: Optional settings mapping; its "verify" and "bypass_ssl"
                entries control TLS certificate verification.
        """
        self.service_url = service_url
        self.api_key = api_key
        self._verify_ssl = self._resolve_verify_ssl(env)

        if not self._verify_ssl:
            urllib3.disable_warnings(InsecureRequestWarning)

    @staticmethod
    def _resolve_verify_ssl(env: Optional[Dict[str, Any]]) -> bool:
        """Decide whether TLS certificates are verified.

        Precedence:
        1. WO_SSL_VERIFY environment variable, when it is "true" or "false"
           (case-insensitive).
        2. `env["bypass_ssl"]` being True or the string "true": no verification.
        3. `env["verify"]` missing, None, "none" or "null": no verification.
        4. `env["verify"]` as a bool is used as is; any other value means verify.
        """
        override = os.getenv("WO_SSL_VERIFY")
        if override and override.strip().lower() in ("true", "false"):
            return override.strip().lower() == "true"

        verify = env.get("verify") if env else None
        bypass = env.get("bypass_ssl") if env else None

        if bypass is True or (
            isinstance(bypass, str) and bypass.strip().lower() == "true"
        ):
            return False

        # NOTE(review): an absent "verify" setting turns verification OFF.
        # Kept as is because callers may rely on it; confirm it is intended.
        if verify is None or (
            isinstance(verify, str) and verify.strip().lower() in {"none", "null"}
        ):
            return False

        return verify if isinstance(verify, bool) else True

    def _url(self, path: str) -> str:
        """Join the base URL and `path` with exactly one slash between them."""
        return f"{str(self.service_url).rstrip('/')}/{str(path).lstrip('/')}"

    def _get_headers(self) -> dict:
        """Return the auth header, or an empty dict when no API key is set."""
        if not self.api_key:
            return {}
        return {"Authorization": f"Bearer {self.api_key}"}

    def post(self, payload: dict, path: str, stream=False):
        """POST `payload` as JSON to `path` and return the `requests` response."""
        return requests.post(
            url=self._url(path),
            headers=self._get_headers(),
            json=payload,
            stream=stream,
            verify=self._verify_ssl,
        )

    def get(self, path: str, params: Optional[dict] = None):
        """GET `path` with optional query `params` and return the `requests` response."""
        return requests.get(
            self._url(path),
            params=params,
            headers=self._get_headers(),
            verify=self._verify_ssl,
        )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> WXOClient:
    """
    Create a `WXOClient` for the given tenant.

    `tenant_setup` is always called to obtain the tenant's token, URL and
    env settings. Values passed explicitly take precedence over the ones it
    resolves.

    Args:
        service_url: Service base URL; when empty, the URL resolved by
            `tenant_setup` is used.
        tenant_name: Tenant to set up.
        token: Token to authenticate with; when empty, the token returned
            by `tenant_setup` is used.

    Raises:
        ValueError: if no service URL was passed and none could be resolved.
    """
    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)

    # Honor a caller-supplied token instead of silently overwriting it.
    token = token or resolved_token
    service_url = service_url or resolved_url

    if not (service_url and str(service_url).strip()):
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )

    return WXOClient(service_url=service_url, api_key=token, env=env)
|
|
File without changes
|