ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
|
-
|
|
4
|
-
import requests
|
|
5
3
|
import shutil
|
|
6
4
|
from pathlib import Path
|
|
7
|
-
from typing import
|
|
5
|
+
from typing import Any, Dict, Iterable, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
8
|
import yaml
|
|
9
9
|
|
|
10
10
|
from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url, is_saas_url
|
|
@@ -143,7 +143,7 @@ class ServiceInstance:
|
|
|
143
143
|
def create_tenant_if_not_exist(self) -> str:
|
|
144
144
|
if self.is_saas:
|
|
145
145
|
logger.info(
|
|
146
|
-
"SaaS mode: running against Remote Service and skipping tenant creation"
|
|
146
|
+
"[d b]SaaS mode: running against Remote Service and skipping tenant creation"
|
|
147
147
|
)
|
|
148
148
|
return None
|
|
149
149
|
|
|
@@ -151,20 +151,24 @@ class ServiceInstance:
|
|
|
151
151
|
default_tenant = self.get_default_tenant(user_auth_token)
|
|
152
152
|
|
|
153
153
|
if not default_tenant:
|
|
154
|
-
logger.info(
|
|
154
|
+
logger.info(
|
|
155
|
+
"[d b]no local tenant found. A default tenant is created"
|
|
156
|
+
)
|
|
155
157
|
self.create_eval_tenant(user_auth_token)
|
|
156
158
|
default_tenant = self.get_default_tenant(user_auth_token)
|
|
157
159
|
else:
|
|
158
|
-
logger.info("local tenant found")
|
|
160
|
+
logger.info("[d b]local tenant found")
|
|
159
161
|
|
|
160
162
|
return default_tenant["id"]
|
|
161
163
|
|
|
164
|
+
|
|
162
165
|
def get_env_settings(
|
|
163
|
-
tenant_name: str,
|
|
164
|
-
env_config_path: Optional[str] = None
|
|
166
|
+
tenant_name: str, env_config_path: Optional[str] = None
|
|
165
167
|
) -> Dict[str, Any]:
|
|
166
168
|
if env_config_path is None:
|
|
167
|
-
env_config_path =
|
|
169
|
+
env_config_path = (
|
|
170
|
+
f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
|
|
171
|
+
)
|
|
168
172
|
|
|
169
173
|
try:
|
|
170
174
|
with open(env_config_path, "r", encoding="utf-8") as f:
|
|
@@ -180,12 +184,11 @@ def get_env_settings(
|
|
|
180
184
|
return dict(merged)
|
|
181
185
|
|
|
182
186
|
|
|
183
|
-
|
|
184
187
|
def apply_env_overrides(
|
|
185
188
|
base: Dict[str, Any],
|
|
186
189
|
tenant_name: str,
|
|
187
190
|
keys: Optional[Iterable[str]] = None,
|
|
188
|
-
env_config_path: Optional[str] = None
|
|
191
|
+
env_config_path: Optional[str] = None,
|
|
189
192
|
) -> Dict[str, Any]:
|
|
190
193
|
"""
|
|
191
194
|
Returns a new dict where base is overridden by tenant-defined values.
|
|
@@ -202,8 +205,9 @@ def apply_env_overrides(
|
|
|
202
205
|
return merged
|
|
203
206
|
|
|
204
207
|
|
|
205
|
-
|
|
206
|
-
|
|
208
|
+
def tenant_setup(
|
|
209
|
+
service_url: Optional[str], tenant_name: str
|
|
210
|
+
) -> Tuple[Optional[str], Optional[str], Dict[str, Any]]:
|
|
207
211
|
# service_instance = ServiceInstance(
|
|
208
212
|
# service_url=service_url,
|
|
209
213
|
# tenant_name=tenant_name
|
|
@@ -245,17 +249,22 @@ def tenant_setup(service_url: Optional[str], tenant_name: str) -> Tuple[Optional
|
|
|
245
249
|
|
|
246
250
|
context["active_environment"] = tenant_name
|
|
247
251
|
|
|
252
|
+
# Ensure parent directories exist so tests (which may run in clean envs)
|
|
253
|
+
# can write these files without raising FileNotFoundError.
|
|
254
|
+
auth_dir = os.path.dirname(auth_config_path)
|
|
255
|
+
env_dir = os.path.dirname(env_config_path)
|
|
256
|
+
os.makedirs(auth_dir, exist_ok=True)
|
|
257
|
+
os.makedirs(env_dir, exist_ok=True)
|
|
258
|
+
|
|
248
259
|
with open(auth_config_path, "w") as f:
|
|
249
260
|
yaml.dump(auth_config, f)
|
|
250
261
|
with open(env_config_path, "w") as f:
|
|
251
262
|
yaml.dump(env_config, f)
|
|
252
263
|
|
|
253
264
|
token = (
|
|
254
|
-
auth_config.get("auth", {})
|
|
255
|
-
.get(tenant_name, {})
|
|
256
|
-
.get("wxo_mcsp_token")
|
|
265
|
+
auth_config.get("auth", {}).get(tenant_name, {}).get("wxo_mcsp_token")
|
|
257
266
|
)
|
|
258
267
|
|
|
259
268
|
env_merged = get_env_settings(tenant_name, env_config_path=env_config_path)
|
|
260
269
|
|
|
261
|
-
return token, resolved_service_url, env_merged
|
|
270
|
+
return token, resolved_service_url, env_merged
|
|
@@ -1,6 +1,13 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import os
|
|
2
3
|
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
from rich.logging import RichHandler
|
|
6
|
+
|
|
3
7
|
from wxo_agentic_evaluation.arg_configs import ProviderConfig
|
|
8
|
+
from wxo_agentic_evaluation.service_provider.gateway_provider import (
|
|
9
|
+
GatewayProvider,
|
|
10
|
+
)
|
|
4
11
|
from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
|
|
5
12
|
ModelProxyProvider,
|
|
6
13
|
)
|
|
@@ -8,6 +15,7 @@ from wxo_agentic_evaluation.service_provider.ollama_provider import (
|
|
|
8
15
|
OllamaProvider,
|
|
9
16
|
)
|
|
10
17
|
from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
|
|
18
|
+
GatewayProviderLLMKitWrapper,
|
|
11
19
|
ModelProxyProviderLLMKitWrapper,
|
|
12
20
|
WatsonXLLMKitWrapper,
|
|
13
21
|
)
|
|
@@ -15,24 +23,131 @@ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
|
|
|
15
23
|
WatsonXProvider,
|
|
16
24
|
)
|
|
17
25
|
|
|
26
|
+
try:
|
|
27
|
+
from wxo_agentic_evaluation.service_provider.portkey_provider import (
|
|
28
|
+
PortkeyProvider,
|
|
29
|
+
)
|
|
30
|
+
except:
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
USE_GATEWAY_MODEL_PROVIDER: bool = (
|
|
35
|
+
os.environ.get("USE_GATEWAY_MODEL_PROVIDER", "FALSE").upper() == "TRUE"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_logging_console = Console(stderr=True)
|
|
40
|
+
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_log_level_from_env():
|
|
45
|
+
|
|
46
|
+
level_env = os.getenv("WXO_EVALUATION_LOGLEVEL")
|
|
47
|
+
return level_env
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
LOGGING_ENABLED = get_log_level_from_env() is not None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def configure_logging_for_package_from_env(
|
|
54
|
+
package_name: str = "wxo_agentic_evaluation",
|
|
55
|
+
ensure_output: bool = True,
|
|
56
|
+
) -> None:
|
|
57
|
+
"""
|
|
58
|
+
Configure logging using the env var WXO_EVALUATION_LOGLEVEL - no logging if that's not set
|
|
59
|
+
"""
|
|
60
|
+
try:
|
|
61
|
+
level_env = get_log_level_from_env()
|
|
62
|
+
if not level_env:
|
|
63
|
+
return
|
|
64
|
+
|
|
65
|
+
level = None
|
|
66
|
+
upper = level_env.strip().upper()
|
|
67
|
+
if hasattr(logging, upper):
|
|
68
|
+
level = getattr(logging, upper, None)
|
|
69
|
+
|
|
70
|
+
pkg_logger = logging.getLogger(package_name)
|
|
71
|
+
pkg_logger.setLevel(level)
|
|
72
|
+
|
|
73
|
+
if ensure_output:
|
|
74
|
+
if not pkg_logger.handlers:
|
|
75
|
+
handler = RichHandler(
|
|
76
|
+
console=_logging_console,
|
|
77
|
+
rich_tracebacks=True,
|
|
78
|
+
show_time=False,
|
|
79
|
+
show_level=False,
|
|
80
|
+
show_path=False,
|
|
81
|
+
markup=True,
|
|
82
|
+
enable_link_path=True,
|
|
83
|
+
omit_repeated_times=True,
|
|
84
|
+
tracebacks_theme="github-dark",
|
|
85
|
+
)
|
|
86
|
+
handler.setFormatter(
|
|
87
|
+
logging.Formatter("%(levelname)s %(message)s")
|
|
88
|
+
)
|
|
89
|
+
handler.setLevel(logging.NOTSET)
|
|
90
|
+
pkg_logger.addHandler(handler)
|
|
91
|
+
pkg_logger.propagate = False
|
|
92
|
+
|
|
93
|
+
# Quiet common noisy debug libs
|
|
94
|
+
for name in (
|
|
95
|
+
"urllib3",
|
|
96
|
+
"urllib3.connectionpool",
|
|
97
|
+
"requests.packages.urllib3",
|
|
98
|
+
):
|
|
99
|
+
logging.getLogger(name).setLevel(logging.WARNING)
|
|
100
|
+
except:
|
|
101
|
+
logger.warning("Input log level %s not valid", level_env)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
configure_logging_for_package_from_env()
|
|
105
|
+
|
|
18
106
|
|
|
19
107
|
def _instantiate_provider(
|
|
20
108
|
config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
|
|
21
109
|
):
|
|
110
|
+
|
|
22
111
|
if config.provider == "watsonx":
|
|
112
|
+
logger.info("Instantiate watsonx provider")
|
|
23
113
|
if is_referenceless_eval:
|
|
24
114
|
provider = WatsonXLLMKitWrapper
|
|
25
115
|
else:
|
|
26
116
|
provider = WatsonXProvider
|
|
27
|
-
return provider(
|
|
117
|
+
return provider(
|
|
118
|
+
model_id=config.model_id,
|
|
119
|
+
embedding_model_id=config.embedding_model_id,
|
|
120
|
+
**kwargs,
|
|
121
|
+
)
|
|
28
122
|
elif config.provider == "ollama":
|
|
123
|
+
logger.info("Instantiate Ollama")
|
|
29
124
|
return OllamaProvider(model_id=config.model_id, **kwargs)
|
|
125
|
+
|
|
126
|
+
elif config.provider == "gateway":
|
|
127
|
+
logger.info("Instantiate gateway inference provider")
|
|
128
|
+
if is_referenceless_eval:
|
|
129
|
+
provider = GatewayProviderLLMKitWrapper
|
|
130
|
+
else:
|
|
131
|
+
provider = GatewayProvider
|
|
132
|
+
return provider(
|
|
133
|
+
model_id=config.model_id,
|
|
134
|
+
embedding_model_id=config.embedding_model_id,
|
|
135
|
+
**kwargs,
|
|
136
|
+
)
|
|
137
|
+
|
|
30
138
|
elif config.provider == "model_proxy":
|
|
139
|
+
logger.info("Instantiate model proxy provider")
|
|
31
140
|
if is_referenceless_eval:
|
|
32
141
|
provider = ModelProxyProviderLLMKitWrapper
|
|
33
142
|
else:
|
|
34
143
|
provider = ModelProxyProvider
|
|
35
|
-
|
|
144
|
+
|
|
145
|
+
return provider(
|
|
146
|
+
model_id=config.model_id,
|
|
147
|
+
embedding_model_id=config.embedding_model_id,
|
|
148
|
+
**kwargs,
|
|
149
|
+
)
|
|
150
|
+
|
|
36
151
|
else:
|
|
37
152
|
raise RuntimeError(
|
|
38
153
|
f"target provider is not supported {config.provider}"
|
|
@@ -42,23 +157,44 @@ def _instantiate_provider(
|
|
|
42
157
|
def get_provider(
|
|
43
158
|
config: ProviderConfig = None,
|
|
44
159
|
model_id: str = None,
|
|
160
|
+
embedding_model_id: str = None,
|
|
45
161
|
referenceless_eval: bool = False,
|
|
162
|
+
provider: str = None,
|
|
163
|
+
api_key: str = None,
|
|
164
|
+
use_portkey_provider: bool = False,
|
|
46
165
|
**kwargs,
|
|
47
166
|
):
|
|
167
|
+
|
|
168
|
+
if use_portkey_provider:
|
|
169
|
+
return PortkeyProvider(
|
|
170
|
+
provider=provider, model_id=model_id, api_key=api_key, **kwargs
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
if config:
|
|
174
|
+
return _instantiate_provider(config, **kwargs)
|
|
175
|
+
|
|
48
176
|
if not model_id:
|
|
49
177
|
raise ValueError("model_id must be provided if config is not supplied")
|
|
50
178
|
|
|
179
|
+
if USE_GATEWAY_MODEL_PROVIDER:
|
|
180
|
+
logger.info("[d b]Using gateway inference provider override")
|
|
181
|
+
config = ProviderConfig(provider="gateway", model_id=model_id)
|
|
182
|
+
return _instantiate_provider(config, referenceless_eval, **kwargs)
|
|
183
|
+
|
|
51
184
|
if "WATSONX_APIKEY" in os.environ and "WATSONX_SPACE_ID" in os.environ:
|
|
52
|
-
|
|
185
|
+
logger.info("[d b]Using watsonx inference provider")
|
|
186
|
+
config = ProviderConfig(
|
|
187
|
+
provider="watsonx",
|
|
188
|
+
model_id=model_id,
|
|
189
|
+
embedding_model_id=embedding_model_id,
|
|
190
|
+
)
|
|
53
191
|
return _instantiate_provider(config, referenceless_eval, **kwargs)
|
|
54
192
|
|
|
55
193
|
if "WO_INSTANCE" in os.environ:
|
|
194
|
+
logger.info("[d b]Using model_proxy inference provider")
|
|
56
195
|
config = ProviderConfig(provider="model_proxy", model_id=model_id)
|
|
57
196
|
return _instantiate_provider(config, referenceless_eval, **kwargs)
|
|
58
197
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
raise RuntimeError(
|
|
63
|
-
"No provider found. Please either provide a config or set the required environment variables."
|
|
64
|
-
)
|
|
198
|
+
logger.info("[d b]Using gateway inference provider default")
|
|
199
|
+
config = ProviderConfig(provider="gateway", model_id=model_id)
|
|
200
|
+
return _instantiate_provider(config, referenceless_eval, **kwargs)
|