ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,10 @@
1
1
  import logging
2
2
  import os
3
-
4
- import requests
5
3
  import shutil
6
4
  from pathlib import Path
7
- from typing import Optional, Any, Dict, Iterable, Tuple
5
+ from typing import Any, Dict, Iterable, Optional, Tuple
6
+
7
+ import requests
8
8
  import yaml
9
9
 
10
10
  from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url, is_saas_url
@@ -143,7 +143,7 @@ class ServiceInstance:
143
143
  def create_tenant_if_not_exist(self) -> str:
144
144
  if self.is_saas:
145
145
  logger.info(
146
- "SaaS mode: running against Remote Service and skipping tenant creation"
146
+ "[d b]SaaS mode: running against Remote Service and skipping tenant creation"
147
147
  )
148
148
  return None
149
149
 
@@ -151,20 +151,24 @@ class ServiceInstance:
151
151
  default_tenant = self.get_default_tenant(user_auth_token)
152
152
 
153
153
  if not default_tenant:
154
- logger.info("no local tenant found. A default tenant is created")
154
+ logger.info(
155
+ "[d b]no local tenant found. A default tenant is created"
156
+ )
155
157
  self.create_eval_tenant(user_auth_token)
156
158
  default_tenant = self.get_default_tenant(user_auth_token)
157
159
  else:
158
- logger.info("local tenant found")
160
+ logger.info("[d b]local tenant found")
159
161
 
160
162
  return default_tenant["id"]
161
163
 
164
+
162
165
  def get_env_settings(
163
- tenant_name: str,
164
- env_config_path: Optional[str] = None
166
+ tenant_name: str, env_config_path: Optional[str] = None
165
167
  ) -> Dict[str, Any]:
166
168
  if env_config_path is None:
167
- env_config_path = f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
169
+ env_config_path = (
170
+ f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
171
+ )
168
172
 
169
173
  try:
170
174
  with open(env_config_path, "r", encoding="utf-8") as f:
@@ -180,12 +184,11 @@ def get_env_settings(
180
184
  return dict(merged)
181
185
 
182
186
 
183
-
184
187
  def apply_env_overrides(
185
188
  base: Dict[str, Any],
186
189
  tenant_name: str,
187
190
  keys: Optional[Iterable[str]] = None,
188
- env_config_path: Optional[str] = None
191
+ env_config_path: Optional[str] = None,
189
192
  ) -> Dict[str, Any]:
190
193
  """
191
194
  Returns a new dict where base is overridden by tenant-defined values.
@@ -202,8 +205,9 @@ def apply_env_overrides(
202
205
  return merged
203
206
 
204
207
 
205
-
206
- def tenant_setup(service_url: Optional[str], tenant_name: str) -> Tuple[Optional[str], Optional[str], Dict[str, Any]]:
208
+ def tenant_setup(
209
+ service_url: Optional[str], tenant_name: str
210
+ ) -> Tuple[Optional[str], Optional[str], Dict[str, Any]]:
207
211
  # service_instance = ServiceInstance(
208
212
  # service_url=service_url,
209
213
  # tenant_name=tenant_name
@@ -245,17 +249,22 @@ def tenant_setup(service_url: Optional[str], tenant_name: str) -> Tuple[Optional
245
249
 
246
250
  context["active_environment"] = tenant_name
247
251
 
252
+ # Ensure parent directories exist so tests (which may run in clean envs)
253
+ # can write these files without raising FileNotFoundError.
254
+ auth_dir = os.path.dirname(auth_config_path)
255
+ env_dir = os.path.dirname(env_config_path)
256
+ os.makedirs(auth_dir, exist_ok=True)
257
+ os.makedirs(env_dir, exist_ok=True)
258
+
248
259
  with open(auth_config_path, "w") as f:
249
260
  yaml.dump(auth_config, f)
250
261
  with open(env_config_path, "w") as f:
251
262
  yaml.dump(env_config, f)
252
263
 
253
264
  token = (
254
- auth_config.get("auth", {})
255
- .get(tenant_name, {})
256
- .get("wxo_mcsp_token")
265
+ auth_config.get("auth", {}).get(tenant_name, {}).get("wxo_mcsp_token")
257
266
  )
258
267
 
259
268
  env_merged = get_env_settings(tenant_name, env_config_path=env_config_path)
260
269
 
261
- return token, resolved_service_url, env_merged
270
+ return token, resolved_service_url, env_merged
@@ -1,6 +1,13 @@
1
+ import logging
1
2
  import os
2
3
 
4
+ from rich.console import Console
5
+ from rich.logging import RichHandler
6
+
3
7
  from wxo_agentic_evaluation.arg_configs import ProviderConfig
8
+ from wxo_agentic_evaluation.service_provider.gateway_provider import (
9
+ GatewayProvider,
10
+ )
4
11
  from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
5
12
  ModelProxyProvider,
6
13
  )
@@ -8,6 +15,7 @@ from wxo_agentic_evaluation.service_provider.ollama_provider import (
8
15
  OllamaProvider,
9
16
  )
10
17
  from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
18
+ GatewayProviderLLMKitWrapper,
11
19
  ModelProxyProviderLLMKitWrapper,
12
20
  WatsonXLLMKitWrapper,
13
21
  )
@@ -15,24 +23,131 @@ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
15
23
  WatsonXProvider,
16
24
  )
17
25
 
26
+ try:
27
+ from wxo_agentic_evaluation.service_provider.portkey_provider import (
28
+ PortkeyProvider,
29
+ )
30
+ except:
31
+ pass
32
+
33
+
34
+ USE_GATEWAY_MODEL_PROVIDER: bool = (
35
+ os.environ.get("USE_GATEWAY_MODEL_PROVIDER", "FALSE").upper() == "TRUE"
36
+ )
37
+
38
+
39
+ _logging_console = Console(stderr=True)
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ def get_log_level_from_env():
45
+
46
+ level_env = os.getenv("WXO_EVALUATION_LOGLEVEL")
47
+ return level_env
48
+
49
+
50
+ LOGGING_ENABLED = get_log_level_from_env() is not None
51
+
52
+
53
+ def configure_logging_for_package_from_env(
54
+ package_name: str = "wxo_agentic_evaluation",
55
+ ensure_output: bool = True,
56
+ ) -> None:
57
+ """
58
+ Configure logging using the env var WXO_EVALUATION_LOGLEVEL - no logging if that's not set
59
+ """
60
+ try:
61
+ level_env = get_log_level_from_env()
62
+ if not level_env:
63
+ return
64
+
65
+ level = None
66
+ upper = level_env.strip().upper()
67
+ if hasattr(logging, upper):
68
+ level = getattr(logging, upper, None)
69
+
70
+ pkg_logger = logging.getLogger(package_name)
71
+ pkg_logger.setLevel(level)
72
+
73
+ if ensure_output:
74
+ if not pkg_logger.handlers:
75
+ handler = RichHandler(
76
+ console=_logging_console,
77
+ rich_tracebacks=True,
78
+ show_time=False,
79
+ show_level=False,
80
+ show_path=False,
81
+ markup=True,
82
+ enable_link_path=True,
83
+ omit_repeated_times=True,
84
+ tracebacks_theme="github-dark",
85
+ )
86
+ handler.setFormatter(
87
+ logging.Formatter("%(levelname)s %(message)s")
88
+ )
89
+ handler.setLevel(logging.NOTSET)
90
+ pkg_logger.addHandler(handler)
91
+ pkg_logger.propagate = False
92
+
93
+ # Quiet common noisy debug libs
94
+ for name in (
95
+ "urllib3",
96
+ "urllib3.connectionpool",
97
+ "requests.packages.urllib3",
98
+ ):
99
+ logging.getLogger(name).setLevel(logging.WARNING)
100
+ except:
101
+ logger.warning("Input log level %s not valid", level_env)
102
+
103
+
104
+ configure_logging_for_package_from_env()
105
+
18
106
 
19
107
  def _instantiate_provider(
20
108
  config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
21
109
  ):
110
+
22
111
  if config.provider == "watsonx":
112
+ logger.info("Instantiate watsonx provider")
23
113
  if is_referenceless_eval:
24
114
  provider = WatsonXLLMKitWrapper
25
115
  else:
26
116
  provider = WatsonXProvider
27
- return provider(model_id=config.model_id, **kwargs)
117
+ return provider(
118
+ model_id=config.model_id,
119
+ embedding_model_id=config.embedding_model_id,
120
+ **kwargs,
121
+ )
28
122
  elif config.provider == "ollama":
123
+ logger.info("Instantiate Ollama")
29
124
  return OllamaProvider(model_id=config.model_id, **kwargs)
125
+
126
+ elif config.provider == "gateway":
127
+ logger.info("Instantiate gateway inference provider")
128
+ if is_referenceless_eval:
129
+ provider = GatewayProviderLLMKitWrapper
130
+ else:
131
+ provider = GatewayProvider
132
+ return provider(
133
+ model_id=config.model_id,
134
+ embedding_model_id=config.embedding_model_id,
135
+ **kwargs,
136
+ )
137
+
30
138
  elif config.provider == "model_proxy":
139
+ logger.info("Instantiate model proxy provider")
31
140
  if is_referenceless_eval:
32
141
  provider = ModelProxyProviderLLMKitWrapper
33
142
  else:
34
143
  provider = ModelProxyProvider
35
- return provider(model_id=config.model_id, **kwargs)
144
+
145
+ return provider(
146
+ model_id=config.model_id,
147
+ embedding_model_id=config.embedding_model_id,
148
+ **kwargs,
149
+ )
150
+
36
151
  else:
37
152
  raise RuntimeError(
38
153
  f"target provider is not supported {config.provider}"
@@ -42,23 +157,44 @@ def _instantiate_provider(
42
157
  def get_provider(
43
158
  config: ProviderConfig = None,
44
159
  model_id: str = None,
160
+ embedding_model_id: str = None,
45
161
  referenceless_eval: bool = False,
162
+ provider: str = None,
163
+ api_key: str = None,
164
+ use_portkey_provider: bool = False,
46
165
  **kwargs,
47
166
  ):
167
+
168
+ if use_portkey_provider:
169
+ return PortkeyProvider(
170
+ provider=provider, model_id=model_id, api_key=api_key, **kwargs
171
+ )
172
+
173
+ if config:
174
+ return _instantiate_provider(config, **kwargs)
175
+
48
176
  if not model_id:
49
177
  raise ValueError("model_id must be provided if config is not supplied")
50
178
 
179
+ if USE_GATEWAY_MODEL_PROVIDER:
180
+ logger.info("[d b]Using gateway inference provider override")
181
+ config = ProviderConfig(provider="gateway", model_id=model_id)
182
+ return _instantiate_provider(config, referenceless_eval, **kwargs)
183
+
51
184
  if "WATSONX_APIKEY" in os.environ and "WATSONX_SPACE_ID" in os.environ:
52
- config = ProviderConfig(provider="watsonx", model_id=model_id)
185
+ logger.info("[d b]Using watsonx inference provider")
186
+ config = ProviderConfig(
187
+ provider="watsonx",
188
+ model_id=model_id,
189
+ embedding_model_id=embedding_model_id,
190
+ )
53
191
  return _instantiate_provider(config, referenceless_eval, **kwargs)
54
192
 
55
193
  if "WO_INSTANCE" in os.environ:
194
+ logger.info("[d b]Using model_proxy inference provider")
56
195
  config = ProviderConfig(provider="model_proxy", model_id=model_id)
57
196
  return _instantiate_provider(config, referenceless_eval, **kwargs)
58
197
 
59
- if config:
60
- return _instantiate_provider(config, **kwargs)
61
-
62
- raise RuntimeError(
63
- "No provider found. Please either provide a config or set the required environment variables."
64
- )
198
+ logger.info("[d b]Using gateway inference provider default")
199
+ config = ProviderConfig(provider="gateway", model_id=model_id)
200
+ return _instantiate_provider(config, referenceless_eval, **kwargs)