ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/annotate.py:

```diff
@@ -6,7 +6,7 @@ from jsonargparse import CLI
 
 from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.type import
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
 
 
 def main(config: TestCaseGenerationConfig):
@@ -17,7 +17,7 @@ def main(config: TestCaseGenerationConfig):
         messages.append(Message.model_validate(entry))
 
     with open(config.seed_data_path, "r") as f:
-        evaluation_data =
+        evaluation_data = OrchestrateDataset(**json.load(f))
 
     # Generate annonated dataset
     annotator = DataAnnotator(
```
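Both changed lines follow the usual pydantic loading pattern: `Message.model_validate(...)` validates each entry, and `OrchestrateDataset(**json.load(f))` expands the parsed seed file into the dataset model. A minimal sketch of that pattern, with illustrative field names and a placeholder file path (the real schemas live in wxo_agentic_evaluation/type.py):

```python
import json

from pydantic import BaseModel


class Message(BaseModel):
    # Illustrative fields; the real Message schema is in wxo_agentic_evaluation/type.py.
    role: str
    content: str


class OrchestrateDataset(BaseModel):
    # Illustrative fields only.
    agent: str
    starting_sentence: str


# model_validate() parses and validates a dict into the model;
# OrchestrateDataset(**data) does the same via keyword expansion.
msg = Message.model_validate({"role": "user", "content": "hi"})
with open("seed_data.json", "r") as f:  # placeholder path
    dataset = OrchestrateDataset(**json.load(f))
```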
wxo_agentic_evaluation/arg_configs.py:

```diff
@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import List, Optional, Union
 
 from wxo_agentic_evaluation import __file__
@@ -30,7 +31,33 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+
+
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)
+
+
+
+class ControllerConfig:
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
 
 
 @dataclass
@@ -41,11 +68,21 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extractors_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
+    n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False
+    strict_topological_matching: bool = True
+    enable_recursive_search: bool = False
+    skip_legacy_evaluation: bool = False  # Skip legacy evaluation and only run user/agent simulations
 
 
 @dataclass
@@ -58,22 +95,32 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True
 
 
 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None
 
 
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
+
+
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
@@ -117,3 +164,14 @@ class BatchAnnotateConfig:
     stories_path: str
     output_dir: str
     num_variants: int = 2
+
+
+@dataclass
+class CompareRunsConfig:
+    reference_file_location: str
+    experiment_file_location: str
+    csv_output: Optional[str] = None
+    column_stats_csv: Optional[str] = (
+        "column_by_column_summary_stats_comparison.csv"
+    )
+    verbose: bool = False
```
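Two details in the arg_configs changes are easy to miss: `from enum import StrEnum` requires Python 3.11+, and the new `provider` default uses `default_factory`, so the environment variable is re-read every time a `ProviderConfig` is instantiated rather than once at import. A quick sketch of the resulting behavior, assuming the dataclass above:

```python
import os

from wxo_agentic_evaluation.arg_configs import ProviderConfig

os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "TRUE"  # comparison is case-insensitive
assert ProviderConfig().provider == "gateway"

os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "no"    # anything other than "true"
assert ProviderConfig().provider == "watsonx"      # falls back to watsonx
```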
wxo_agentic_evaluation/base_user.py (new file):

```diff
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from wxo_agentic_evaluation.type import Message
+
+
+class BaseUserSimulator(ABC):
+    """Abstract base class for user simulators."""
+
+    @abstractmethod
+    def generate_user_input(
+        self, user_story: str, conversation_history: List[Message], **kwargs
+    ) -> Message:
+        """
+        Generate user input based on the user story and conversation history.
+
+        Args:
+            user_story: The user's story or goal
+            conversation_history: List of previous messages in the conversation
+            **kwargs: Additional parameters specific to the simulator implementation
+
+        Returns:
+            Message: The generated user input message
+        """
+        pass
```
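To make the contract concrete, here is a hypothetical implementation of the new ABC; the `CannedUser` class and the `role`/`content` fields assumed on `Message` are illustrative, not part of the package:

```python
from typing import List

from wxo_agentic_evaluation.base_user import BaseUserSimulator
from wxo_agentic_evaluation.type import Message


class CannedUser(BaseUserSimulator):
    """Replays a fixed script instead of calling an LLM."""

    def __init__(self, script: List[str]):
        self.script = script

    def generate_user_input(
        self, user_story: str, conversation_history: List[Message], **kwargs
    ) -> Message:
        # One scripted reply per user turn already present in the history;
        # repeat the last line once the script runs out.
        turn = sum(1 for m in conversation_history if m.role == "user")
        reply = self.script[min(turn, len(self.script) - 1)]
        return Message(role="user", content=reply)
```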
wxo_agentic_evaluation/batch_annotate.py:

```diff
@@ -125,9 +125,11 @@ def generate_multiple_in_one(
     output_dir,
     starting_index,
     model_id="meta-llama/llama-3-405b-instruct",
+    # model_id="gpt-4o",
 ):
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Legacy provider (e.g., watsonx)
     provider = get_provider(
         model_id=model_id,
         params={
@@ -135,9 +137,24 @@ def generate_multiple_in_one(
             "decoding_method": "greedy",
             "max_new_tokens": 3000,
         },
+        use_portkey_provider=False,
     )
-
-
+    response = provider.chat(prompt).text
+
+    # # OpenAI provider
+    # provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"))
+    # response = provider.chat(prompt).choices[0].message.content
+
+    # # Azure OpenAI provider
+    # provider = get_provider(
+    #     provider = "azure-openai",
+    #     azure_model_name = model_id,
+    #     azure_deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
+    #     azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE_NAME"),
+    #     azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+    #     api_key = f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"
+    # )
+    # response = provider.chat(prompt).choices[0].message.content
 
     try:
         raw_text = response
```
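The commented-out alternatives show the intended provider-swap pattern: the same `get_provider` call site can target watsonx, OpenAI, or Azure OpenAI, though each backend returns a differently shaped response (`.text` versus `.choices[0].message.content`). A hypothetical sketch of this kind of name-based dispatch; the factory names below are illustrative, not the package's actual `get_provider` implementation:

```python
from typing import Any, Callable, Dict


def _make_watsonx(**kwargs: Any) -> Any:
    ...  # build a watsonx-backed provider


def _make_openai(**kwargs: Any) -> Any:
    ...  # build an OpenAI-backed provider


def _make_azure_openai(**kwargs: Any) -> Any:
    ...  # build an Azure OpenAI-backed provider


_FACTORIES: Dict[str, Callable[..., Any]] = {
    "watsonx": _make_watsonx,
    "openai": _make_openai,
    "azure-openai": _make_azure_openai,
}


def get_provider(provider: str = "watsonx", **kwargs: Any) -> Any:
    """Dispatch to a provider factory by name."""
    try:
        return _FACTORIES[provider](**kwargs)
    except KeyError:
        raise ValueError(f"unknown provider: {provider!r}") from None
```

Normalizing the response shape behind a single accessor would let the commented blocks swap in without touching the parsing code below the call.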
wxo_agentic_evaluation/clients.py (new file):

```diff
@@ -0,0 +1,103 @@
+import copy
+from dataclasses import asdict, dataclass
+
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
+
+
+@dataclass
+class Clients:
+    wxo_client: WXOClient
+    llmaaj_provider: Provider
+    resource_map: ResourceMap
+    inference_backend: WXORuntimeAdapter
+    llm_user: LLMUser
+
+
+def bootstrap_clients(config: TestConfig) -> Clients:
+    """
+    Bootstrap all clients needed for the evaluation.
+
+    Args:
+        config: The test configuration
+
+    Returns:
+        A tuple containing:
+        - wxo_client: The WXO client
+        - llmaaj_provider: The provider for custom metrics
+        - resource_map: The resource map
+        - inference_backend: The inference backend
+        - llm_user: The LLM user
+    """
+    # Initialize WXO client
+    wxo_client = get_wxo_client(
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
+    )
+
+    # Initialize provider for custom metrics
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
+    # Initialize resource map
+    resource_map = ResourceMap(wxo_client)
+
+    # Initialize inference backend
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+
+    # Initialize LLM user
+    llm_user = LLMUser(
+        wai_client=get_provider(**provider_kwargs),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
+    return Clients(
+        wxo_client=wxo_client,
+        llmaaj_provider=llmaaj_provider,
+        resource_map=resource_map,
+        inference_backend=inference_backend,
+        llm_user=llm_user,
+    )
```
wxo_agentic_evaluation/compare_runs/__init__.py: file added without changes (empty).
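Note that despite the docstring's "tuple" wording, `bootstrap_clients` returns the `Clients` dataclass, so call sites use attribute access. A hedged usage sketch; the surrounding run function is hypothetical:

```python
from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.clients import Clients, bootstrap_clients


def run_evaluation(config: TestConfig) -> None:
    # One bootstrap call wires up every downstream handle.
    clients: Clients = bootstrap_clients(config)

    backend = clients.inference_backend  # WXORuntimeAdapter for agent calls
    user = clients.llm_user              # simulated user driving the chat
    judge = clients.llmaaj_provider      # LLM-as-a-judge for custom metrics

    # ... drive conversations with `user` against `backend`,
    # then score the transcripts with `judge` ...
```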
wxo_agentic_evaluation/compare_runs/compare_2_runs.py (new file):

```diff
@@ -0,0 +1,74 @@
+import csv
+import os
+import statistics
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from jsonargparse import CLI
+
+from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
+from wxo_agentic_evaluation.compare_runs.diff import DiffResults
+from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+from wxo_agentic_evaluation.utils.utils import create_table, read_file
+
+
+def main(config: CompareRunsConfig):
+    """Main function to compare two run result files."""
+    # Extract values from config
+    reference_file = config.reference_file_location
+    experiment_file = config.experiment_file_location
+    csv_output = config.csv_output
+    column_stats_csv = config.column_stats_csv
+    verbose = config.verbose
+
+    try:
+        # Read the files
+        obj1 = read_file(reference_file)
+        obj2 = read_file(experiment_file)
+
+        # Create evaluation results
+        result1 = EvaluationResult.from_csv(obj1)
+        result2 = EvaluationResult.from_csv(obj2)
+
+        # Create diff results
+        diff_results = DiffResults(result1, result2)
+
+        # Display summary statistics
+        summary_stats = diff_results.summary_statistics()
+        summary_table = create_table(summary_stats, title="Summary Statistics")
+        print(
+            "\nALL metrics are computed on OVERLAPPING test cases, ie cases that exist in both the Reference and Experiment runs\n"
+        )
+        print(
+            "If Experiment - Reference is Positive, that's an increase in the metric. If Experiment - Reference is Negative, that's a decrease in the metric.\n"
+        )
+        summary_table.print()
+
+        # Display exclusive tests
+        if verbose:
+            diff_results.display_exclusive_tests()
+
+        # Display test cases with differing summary match and success status
+        diff_results.display_differing_summary_matches()
+
+        # Display tabular diff
+        diff_results.compute_tabular_diff(verbose=verbose)
+
+        # Write results to CSV if specified
+        if csv_output:
+            diff_results.to_csv(csv_output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    args = CLI(CompareRunsConfig, as_positional=False)
+    sys.exit(main(args))
+
+# Made with Bob
```