ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import Any, List, Mapping, Optional
|
|
2
|
+
|
|
3
|
+
from wxo_agentic_evaluation.metrics.metrics import (
|
|
4
|
+
Annotation,
|
|
5
|
+
FailedSemanticTestCases,
|
|
6
|
+
FailedStaticTestCases,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ReferencelessEvalParser:
    """Convert raw referenceless-evaluation results into failed-test-case models."""

    @staticmethod
    def static_parser(
        static_metrics: Mapping[str, Mapping[str, Any]],
    ) -> List[FailedStaticTestCases]:
        """
        Collect the static metrics that did not pass.

        Args:
            static_metrics: Mapping of metric name to that metric's result data.
                A metric counts as failed when its "valid" entry is missing or falsy.

        Returns:
            One FailedStaticTestCases per failed metric (empty list when all pass).
        """
        return [
            FailedStaticTestCases(
                metric_name=metric,
                description=metric_data.get("description"),
                explanation=metric_data.get("explanation"),
            )
            for metric, metric_data in static_metrics.items()
            if not metric_data.get("valid", False)
        ]

    @staticmethod
    def parse_annotations(
        actionable_reccomendations, filters: List[str]
    ) -> Optional[List[Annotation]]:
        """
        Build annotations for the recommendations whose type is in ``filters``.

        Args:
            actionable_reccomendations: Iterable of recommendation dicts, or None
                when the metric produced no recommendations.
            filters: Recommendation values to keep.

        Returns:
            The matching annotations, or None when nothing matches.
        """
        # The recommendations come from ``dict.get`` in semantic_parser and are
        # therefore None whenever the key is absent; treat that as "no matches"
        # instead of failing on iteration.
        if not actionable_reccomendations or not filters:
            return None

        annotations = [
            Annotation(
                parameter_name=recc.get("parameter_name"),
                recommendation=recc.get("recommendation"),
                details=recc.get("details"),
                quote=recc.get("quote"),
            )
            for recc in actionable_reccomendations
            if recc.get("recommendation") in filters
        ]

        return annotations or None

    @staticmethod
    def semantic_parser(
        metric_name, data, annotation_filters: Optional[List[str]]
    ):
        """
        Build a FailedSemanticTestCases entry from one semantic metric result.

        Args:
            metric_name: Name of the semantic metric.
            data: Result mapping; "evidence", "explanation", "output",
                "confidence" and "actionable_recommendations" are read from it.
            annotation_filters: Recommendation values to attach as annotations;
                when empty or None no annotations are attached.

        Returns:
            The populated FailedSemanticTestCases instance.
        """
        semantic_metric = FailedSemanticTestCases(
            metric_name=metric_name,
            evidence=data.get("evidence"),
            explanation=data.get("explanation"),
            output=data.get("output"),
            confidence=data.get("confidence"),
        )

        if annotation_filters and (
            annotations := ReferencelessEvalParser.parse_annotations(
                data.get("actionable_recommendations"), annotation_filters
            )
        ):
            semantic_metric.annotations = annotations

        return semantic_metric
|
|
@@ -1,10 +1,15 @@
|
|
|
1
|
+
import csv
|
|
1
2
|
import glob
|
|
2
3
|
import json
|
|
4
|
+
import math
|
|
3
5
|
import os
|
|
4
6
|
import re
|
|
5
|
-
from
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
|
6
10
|
from urllib.parse import urlparse
|
|
7
11
|
|
|
12
|
+
import rich
|
|
8
13
|
import yaml
|
|
9
14
|
from rich import box, print
|
|
10
15
|
from rich.console import Console, Group
|
|
@@ -17,14 +22,25 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
|
17
22
|
from wxo_agentic_evaluation.metrics.metrics import (
|
|
18
23
|
KnowledgeBaseMetricSummary,
|
|
19
24
|
ReferenceLessEvalMetrics,
|
|
25
|
+
ToolCallAndRoutingMetrics,
|
|
20
26
|
)
|
|
21
27
|
from wxo_agentic_evaluation.type import (
|
|
22
28
|
ConversationalConfidenceThresholdScore,
|
|
29
|
+
ExtendedMessage,
|
|
23
30
|
Message,
|
|
24
31
|
)
|
|
25
32
|
|
|
26
33
|
console = Console()
|
|
27
34
|
|
|
35
|
+
RUN_FILE_RE = re.compile(
|
|
36
|
+
r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
|
|
37
|
+
)
|
|
38
|
+
N_A = "N/A"
|
|
39
|
+
|
|
40
|
+
# File name constants
|
|
41
|
+
REFERENCE_FILE_NAME = "reference"
|
|
42
|
+
EXPERIMENT_FILE_NAME = "experiment"
|
|
43
|
+
|
|
28
44
|
|
|
29
45
|
class AttackResultsTable:
|
|
30
46
|
def __init__(self, attack_results: dict):
|
|
@@ -64,10 +80,100 @@ class AttackResultsTable:
|
|
|
64
80
|
console.print(self.table)
|
|
65
81
|
|
|
66
82
|
|
|
83
|
+
class TestCaseResources:
    """Read the artifacts saved for test cases under an evaluation output directory."""

    def __init__(self, output_dir: str):
        """TODO: flesh out for all resources that are saved."""
        self.output_dir = Path(output_dir)

    def _resolve_path(self, test_case_name, path, suffix: str, label: str):
        """Return the artifact path to read, raising when no file exists there.

        A truthy ``test_case_name`` takes precedence over ``path`` and resolves
        to ``<output_dir>/messages/<test_case_name>.<suffix>``.
        """
        if test_case_name:
            path = os.path.join(
                self.output_dir,
                "messages",
                f"{test_case_name}.{suffix}",
            )

        if not Path(str(path)).is_file():
            rich.print(f"[r]No {label} file found at {path}")
            # FileNotFoundError is still an Exception, so existing handlers keep working.
            raise FileNotFoundError(f"No {label} file found at {path}")

        return path

    @property
    def get_summary(self):
        """Rows of ``summary_metrics.csv`` as dicts keyed by the header row.

        Returns an empty list when the file has no header row.
        """
        with open(self.output_dir / "summary_metrics.csv", "r", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return []
            return [dict(zip(header, row)) for row in reader]

    def get_analyze_messages(
        self, test_case_name=None, path=None
    ) -> Tuple[List[ExtendedMessage], Optional[Mapping[str, Any]]]:
        """Load the analyzed messages of a test case.

        Args:
            test_case_name: Test case whose ``.messages.analyze.json`` file is read.
            path: Explicit file path, used only when ``test_case_name`` is falsy.

        Returns:
            ``(messages, meta)``; ``meta`` is taken from a trailing
            ``{"meta": ...}`` entry and is None when there is no such entry.

        Raises:
            FileNotFoundError: When the resolved path is not a file.
        """
        path = self._resolve_path(
            test_case_name, path, "messages.analyze.json", "analyze"
        )

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        meta = None
        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
            meta = entries[-1]["meta"]
            entries = entries[:-1]

        return [ExtendedMessage(**entry) for entry in entries], meta

    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
        """Load the recorded messages of a test case.

        Args:
            test_case_name: Test case whose ``.messages.json`` file is read.
            path: Explicit file path, used only when ``test_case_name`` is falsy.

        Raises:
            FileNotFoundError: When the resolved path is not a file.
        """
        path = self._resolve_path(
            test_case_name, path, "messages.json", "messages"
        )

        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        return [Message(**entry) for entry in entries]

    def get_test_metrics(
        self, test_case_name=None, path=None
    ) -> ToolCallAndRoutingMetrics:
        """Load the saved metrics of a test case.

        Args:
            test_case_name: Test case whose ``.metrics.json`` file is read.
            path: Explicit file path, used only when ``test_case_name`` is falsy.

        Raises:
            FileNotFoundError: When the resolved path is not a file.
        """
        path = self._resolve_path(
            test_case_name, path, "metrics.json", "metrics"
        )

        with open(path, "r", encoding="utf-8") as f:
            return ToolCallAndRoutingMetrics(**json.load(f))
|
|
169
|
+
|
|
170
|
+
|
|
67
171
|
class AgentMetricsTable:
|
|
68
|
-
def __init__(self, data):
|
|
172
|
+
def __init__(self, data, title: Optional[str] = None):
|
|
173
|
+
if title is None:
|
|
174
|
+
title = "Agent Metrics"
|
|
69
175
|
self.table = Table(
|
|
70
|
-
title=
|
|
176
|
+
title=title,
|
|
71
177
|
box=box.ROUNDED,
|
|
72
178
|
show_lines=True,
|
|
73
179
|
)
|
|
@@ -88,7 +194,9 @@ class AgentMetricsTable:
|
|
|
88
194
|
console.print(self.table)
|
|
89
195
|
|
|
90
196
|
|
|
91
|
-
def create_table(
|
|
197
|
+
def create_table(
|
|
198
|
+
data: List[dict], title: Optional[str] = None
|
|
199
|
+
) -> AgentMetricsTable:
|
|
92
200
|
"""
|
|
93
201
|
Generate a Rich table from a list of dictionaries.
|
|
94
202
|
Returns the AgentMetricsTable instance.
|
|
@@ -100,7 +208,47 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
|
|
|
100
208
|
print("create_table() received an empty dataset. No table generated.")
|
|
101
209
|
return None
|
|
102
210
|
|
|
103
|
-
return AgentMetricsTable(data)
|
|
211
|
+
return AgentMetricsTable(data, title=title)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def mean(vals: List[float]) -> float:
    """
    Return the arithmetic mean of the given values, rounded to two decimals.

    Args:
        vals: Values to average

    Returns:
        Rounded mean, or 0.0 when ``vals`` is empty
    """
    if not vals:
        return 0.0

    total = sum(vals)
    return round(total / len(vals), 2)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def to_pct(value: Optional[float], decimals: int = 0) -> str:
    """
    Convert a ratio to a percentage string.

    The value is multiplied by 100 and rounded with ``round(..., decimals)``,
    so the result keeps a float rendering (``0.5`` becomes ``"50.0%"``).

    Args:
        value: Value to convert; anything accepted by ``float()``
        decimals: Number of decimal places passed to ``round``

    Returns:
        Percentage string, or "NA" when the value is None or cannot be converted
    """
    if value is None:
        return "NA"
    try:
        return f"{round(float(value) * 100, decimals)}%"
    except (TypeError, ValueError, OverflowError):
        # Only conversion/rounding failures mean "not a number"; anything else
        # is a programming error and should surface.
        return "NA"
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def average(array) -> float:
    """Return the unrounded mean of ``array``, or NaN when it is empty."""
    count = len(array)
    return sum(array) / count if count != 0 else math.nan
|
|
104
252
|
|
|
105
253
|
|
|
106
254
|
def safe_divide(nom, denom):
|
|
@@ -122,20 +270,114 @@ def is_ibm_cloud_url(service_url: str) -> bool:
|
|
|
122
270
|
|
|
123
271
|
def add_line_seperator(
    style_config: Optional[Union[str, Style]] = None,
    print=True,
):
    """
    Draw a horizontal rule in the given style (default "grey42").

    When `print` is True the rule is written to the module console immediately
    and nothing is returned. When `print` is False a rule made of "==" characters
    is returned instead, so the caller can render it later, for example as part
    of a pager view.
    """
    style = style_config or "grey42"

    if not print:
        return Rule(style=style, characters="==")

    console.print(Rule(style=style))
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def get_reference_column(base_name: str) -> str:
    """Return ``base_name`` suffixed with the reference file name."""
    return "{}_{}".format(base_name, REFERENCE_FILE_NAME)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def get_experiment_column(base_name: str) -> str:
    """Return ``base_name`` suffixed with the experiment file name."""
    return "{}_{}".format(base_name, EXPERIMENT_FILE_NAME)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def get_diff_column(base_name: str) -> str:
    """Return the name of the diff column for ``base_name``."""
    return "{}_diff".format(base_name)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def get_column_value(
    row: Dict[str, Any], base_name: str, file_type: str
) -> Any:
    """Look up the reference or experiment variant of a column in a row.

    Args:
        row: The data row
        base_name: The base column name
        file_type: Either 'reference' or 'experiment' (case-insensitive)

    Returns:
        The value from the suffixed column, or None if the row lacks it

    Raises:
        ValueError: When ``file_type`` is neither 'reference' nor 'experiment'
    """
    kind = file_type.lower()
    if kind not in ("reference", "experiment"):
        raise ValueError(f"Invalid file_type: {file_type}")

    if kind == "reference":
        column = get_reference_column(base_name)
    else:
        column = get_experiment_column(base_name)

    return row.get(column)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def has_column_in_both(row: Dict[str, Any], base_name: str) -> bool:
    """Return True when the row has both the reference and experiment column."""
    required = (
        get_reference_column(base_name),
        get_experiment_column(base_name),
    )
    return all(column in row for column in required)
|
|
137
340
|
|
|
138
341
|
|
|
342
|
+
def format_ratio(ratio: Optional[float]) -> str:
    """Render a ratio as a one-decimal percentage, or "N/A" when it is None."""
    return "N/A" if ratio is None else "{:.1f}%".format(ratio * 100)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def read_file(path: str, type: str = "csv") -> List[Dict[str, Any]]:
    """Load a file as structured rows; only the "csv" type is implemented.

    Raises:
        NotImplementedError: For type "json", which is a placeholder
        ValueError: For any other unsupported type
    """
    if type == "json":
        # Add JSON reading logic if needed
        raise NotImplementedError("JSON reading not yet implemented")

    if type != "csv":
        raise ValueError(f"Unsupported file type: {type}")

    return read_csv_file(path)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def read_csv_file(file_path: str) -> List[Dict[str, Any]]:
    """Read a CSV file and return its rows as dictionaries with coerced values.

    Conversion rules per column:
      * "dataset_name" and "text_match" are kept as read;
      * "is_success" becomes True only for the text "true" (case-insensitive);
      * every other value is converted to float when possible and otherwise
        left unchanged.

    Cells missing from a short row are reported by ``csv.DictReader`` as None;
    they stay None (False for "is_success") instead of raising.

    Args:
        file_path: Path of the CSV file; its first row is the header

    Returns:
        One dictionary per data row
    """
    text_columns = ("dataset_name", "text_match")
    data = []
    # newline="" lets the csv module do its own newline handling.
    with open(file_path, "r", newline="") as f:
        for raw_row in csv.DictReader(f):
            # Build a fresh dict rather than assigning into the row being iterated.
            row = {}
            for key, value in raw_row.items():
                if key in text_columns:
                    row[key] = value
                elif key == "is_success":
                    row[key] = isinstance(value, str) and value.lower() == "true"
                else:
                    try:
                        row[key] = float(value)
                    except (TypeError, ValueError):
                        # Non-numeric text, or a missing cell (None): keep as is.
                        row[key] = value
            data.append(row)
    return data
|
|
379
|
+
|
|
380
|
+
|
|
139
381
|
class FaithfulnessTable:
|
|
140
382
|
def __init__(
|
|
141
383
|
self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
|
|
@@ -346,6 +588,7 @@ class ReferencelessEvalPanel:
|
|
|
346
588
|
|
|
347
589
|
# Function to load messages from JSON file
|
|
348
590
|
def load_messages(file_path):
|
|
591
|
+
"""TODO: replace in favor of TestCaseResources.get_messages(...)"""
|
|
349
592
|
with open(file_path, "r") as f:
|
|
350
593
|
try:
|
|
351
594
|
message_data = json.load(f)
|
|
@@ -361,7 +604,7 @@ def load_messages(file_path):
|
|
|
361
604
|
return None
|
|
362
605
|
|
|
363
606
|
|
|
364
|
-
def
|
|
607
|
+
def load_agents_from_disk(agents_path: str):
|
|
365
608
|
agents_json = glob.glob(os.path.join(agents_path, "*.json"))
|
|
366
609
|
agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
|
|
367
610
|
|
|
@@ -376,3 +619,64 @@ def load_agents(agents_path: str):
|
|
|
376
619
|
agents.append(yaml.safe_load(f))
|
|
377
620
|
|
|
378
621
|
return agents
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
    """
    Group the per-run files of one dataset found in ``messages_dir``.

    Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None, "messages": path|None}

    File names must match ``<dataset_base>.run<N>.<kind>.json`` where kind is
    `messages`, `messages.analyze` or `metrics`; anything else is ignored.

    `filter_run` restricts the result to a single run. If it is -1, the files of all runs are returned.
    For example, with `data3.run1.messages.json` and `data3.run2.messages.json` present and filter_run set to 2,
    only the files of the second run are returned.
    """
    slot_for_kind = {
        "messages.analyze": "analyze",
        "metrics": "metrics",
        "messages": "messages",
    }
    runs = defaultdict(
        lambda: {"analyze": None, "metrics": None, "messages": None}
    )

    for file_name in os.listdir(messages_dir):
        matched = RUN_FILE_RE.match(file_name)
        if matched is None or matched.group("base") != dataset_base:
            continue

        run_id = int(matched.group("run"))
        if filter_run not in (-1, run_id):
            continue

        slot = slot_for_kind.get(matched.group("kind"))
        if slot is not None:
            runs[run_id][slot] = os.path.join(messages_dir, file_name)

    return runs
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
    """Build the metrics model from a JSON file. TODO: remove in a later PR."""
    with open(metrics_path, "r", encoding="utf-8") as metrics_file:
        payload = json.load(metrics_file)
    return ToolCallAndRoutingMetrics(**payload)
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def csv_dump(file_path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
    """
    Write rows to a CSV file, creating parent directories as needed.

    The header is the union of the keys of all rows, in order of first
    appearance, so rows may carry different keys; cells a row does not
    provide are written empty. Nothing is written when ``rows`` is empty.

    Args:
        file_path: Path to the output CSV file
        rows: List of dictionaries representing CSV rows
    """
    if not rows:
        return

    # Ensure the parent directory exists
    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # Collect the header before opening the file: taking it from rows[0] alone
    # makes DictWriter raise on any later row with an extra key, after the
    # target has already been truncated.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    # Write to CSV
    with open(file_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
import urllib3
|
|
6
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
7
|
+
|
|
8
|
+
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class WXOClient:
    """Minimal HTTP client that sends bearer-authenticated requests to a service URL."""

    def __init__(
        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
    ):
        """
        Args:
            service_url: Base URL that request paths are appended to.
            api_key: Token sent as ``Authorization: Bearer <api_key>``; no
                Authorization header is sent when it is falsy.
            env: Optional settings mapping; its "verify" and "bypass_ssl"
                entries are consulted to decide on TLS verification.
        """
        self.service_url = service_url
        self.api_key = api_key
        self._verify_ssl = self._resolve_verify_ssl(env)

        if not self._verify_ssl:
            # Process-wide: silences urllib3's insecure-request warnings.
            urllib3.disable_warnings(InsecureRequestWarning)

    @staticmethod
    def _resolve_verify_ssl(env: Optional[Dict[str, Any]]) -> bool:
        """Decide whether TLS certificates are verified.

        Precedence:
          1. ``WO_SSL_VERIFY`` environment variable, when it is "true"/"false"
             (case-insensitive, surrounding whitespace ignored).
          2. ``env["bypass_ssl"]`` being True or the text "true" disables verification.
          3. ``env["verify"]`` being absent/None or the text "none"/"null"
             disables verification.
          4. Otherwise a boolean ``env["verify"]`` is used as is, and any
             other value enables verification.

        NOTE(review): rule 3 means verification is OFF when no settings are
        supplied at all. Kept as is because callers may rely on it; confirm
        that this default is intended.
        """
        override = os.getenv("WO_SSL_VERIFY")
        if override:
            normalized = override.strip().lower()
            if normalized in ("true", "false"):
                return normalized == "true"

        verify = env.get("verify") if env else None
        bypass = env.get("bypass_ssl") if env else None

        if bypass is True:
            return False
        if isinstance(bypass, str) and bypass.strip().lower() == "true":
            return False
        if verify is None:
            return False
        if isinstance(verify, str) and verify.strip().lower() in {"none", "null"}:
            return False

        return verify if isinstance(verify, bool) else True

    def _get_headers(self) -> dict:
        """Return the request headers: a bearer Authorization header when a key is set."""
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def post(self, payload: dict, path: str, stream=False):
        """POST ``payload`` as JSON to ``<service_url>/<path>`` and return the response."""
        url = f"{self.service_url}/{path}"
        return requests.post(
            url=url,
            headers=self._get_headers(),
            json=payload,
            stream=stream,
            verify=self._verify_ssl,
        )

    def get(self, path: str, params: Optional[dict] = None):
        """GET ``<service_url>/<path>`` with optional query params and return the response."""
        url = f"{self.service_url}/{path}"
        return requests.get(
            url,
            params=params,
            headers=self._get_headers(),
            verify=self._verify_ssl,
        )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> WXOClient:
    """Create a WXOClient for a tenant.

    Args:
        service_url: Service base URL; when falsy, the URL returned by
            ``tenant_setup`` for the tenant is used.
        tenant_name: Tenant passed to ``tenant_setup``.
        token: Optional token to authenticate with; when not given, the token
            returned by ``tenant_setup`` is used.

    Returns:
        A client configured with the resolved URL, token and environment settings.

    Raises:
        ValueError: When no non-blank service URL can be determined.
    """
    # Keep the result under its own name: unpacking straight into ``token``
    # silently discarded whatever the caller passed in.
    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)
    service_url = service_url or resolved_url
    token = token or resolved_token

    if not (service_url and str(service_url).strip()):
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )

    return WXOClient(service_url=service_url, api_key=token, env=env)
|
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
|
|
3
|
-
wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
|
|
4
|
-
wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
|
|
5
|
-
wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
|
|
6
|
-
wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
|
|
7
|
-
wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
|
|
8
|
-
wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
|
|
9
|
-
wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
|
|
10
|
-
wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
|
|
11
|
-
wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
|
|
12
|
-
wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
|
|
13
|
-
wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
|
|
14
|
-
wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
|
|
15
|
-
wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
|
|
16
|
-
wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
|
|
17
|
-
wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
|
|
18
|
-
wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
|
|
19
|
-
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
20
|
-
wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
|
|
21
|
-
wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
|
|
22
|
-
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
|
|
23
|
-
wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
|
|
24
|
-
wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
|
|
25
|
-
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
|
|
26
|
-
wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
|
|
27
|
-
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
|
|
28
|
-
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
|
|
29
|
-
wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
|
|
30
|
-
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
-
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
|
|
32
|
-
wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
|
|
33
|
-
wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
|
|
34
|
-
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
|
|
35
|
-
wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
|
|
36
|
-
wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
|
|
37
|
-
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
|
-
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
39
|
-
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
40
|
-
wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
|
|
41
|
-
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
42
|
-
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
43
|
-
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
44
|
-
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
45
|
-
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
|
|
46
|
-
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
|
|
47
|
-
wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
|
|
48
|
-
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
|
|
49
|
-
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
50
|
-
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
51
|
-
wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
|
|
52
|
-
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
53
|
-
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
54
|
-
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
55
|
-
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
56
|
-
wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
|
|
57
|
-
wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
|
|
58
|
-
wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
|
|
59
|
-
wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
|
|
60
|
-
wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
|
|
61
|
-
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
|
|
62
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
|
|
64
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
|
|
66
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
|
|
67
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
68
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
|
|
69
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
|
|
70
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
|
|
72
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
|
|
73
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
|
|
75
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
|
|
76
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
|
|
77
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
|
|
78
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
|
|
79
|
-
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
|
|
80
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
|
|
81
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
|
|
82
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
|
|
83
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
|
|
84
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
|
|
85
|
-
wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
|
|
86
|
-
wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
|
-
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
|
|
88
|
-
wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
|
|
89
|
-
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
|
|
90
|
-
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
|
|
91
|
-
wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
|
|
92
|
-
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
|
|
93
|
-
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
|
|
94
|
-
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
95
|
-
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
|
|
96
|
-
wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
|
|
97
|
-
wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
|
|
98
|
-
wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
|
|
99
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
|
|
100
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
101
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
102
|
-
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,
|