ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (49)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
  3. wxo_agentic_evaluation/analyze_run.py +822 -344
  4. wxo_agentic_evaluation/arg_configs.py +39 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +29 -4
  7. wxo_agentic_evaluation/evaluation_package.py +197 -18
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +105 -108
  11. wxo_agentic_evaluation/llm_matching.py +104 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -0
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  17. wxo_agentic_evaluation/metrics/metrics.py +64 -1
  18. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  19. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  20. wxo_agentic_evaluation/prompt/template_render.py +20 -2
  21. wxo_agentic_evaluation/quick_eval.py +23 -11
  22. wxo_agentic_evaluation/record_chat.py +18 -10
  23. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
  24. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  25. wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
  26. wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  30. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
  31. wxo_agentic_evaluation/resource_map.py +3 -1
  32. wxo_agentic_evaluation/service_instance.py +12 -3
  33. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  34. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  35. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  36. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  37. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  38. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  39. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  40. wxo_agentic_evaluation/type.py +15 -5
  41. wxo_agentic_evaluation/utils/__init__.py +44 -3
  42. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  43. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  44. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  45. wxo_agentic_evaluation/utils/parsers.py +71 -0
  46. wxo_agentic_evaluation/utils/utils.py +140 -20
  47. wxo_agentic_evaluation/wxo_client.py +81 -0
  48. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  49. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
390
390
  )
391
391
 
392
392
  @model_validator(mode="after")
393
- def compute_overall(cls, values: PipelineResult) -> PipelineResult:
393
+ def compute_overall(self) -> Self:
394
394
  """
395
395
  After validation, compute overall_valid as AND of:
396
396
  • all semantic is_correct flags
397
397
  • if transform exists: all execution_success flags
398
398
  """
399
- static: StaticResult = values.static
399
+ static: StaticResult = self.static
400
400
  if static:
401
401
  # static checks
402
402
  ok = static.final_decision
403
403
 
404
- sem: SemanticResult = values.semantic
404
+ sem: SemanticResult = self.semantic
405
405
  if sem:
406
406
  # semantic checks
407
407
  if sem.general and sem.general.metrics:
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
441
441
  if param_avgs:
442
442
  cat_avgs.append(sum(param_avgs) / len(param_avgs))
443
443
 
444
- values.overall_avg_score = (
444
+ self.overall_avg_score = (
445
445
  sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
446
446
  )
447
- values.overall_valid = ok
448
- return values
447
+ self.overall_valid = ok
448
+ return self
449
449
 
450
450
 
451
451
  # ----------------------------------------------------------------------
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
531
531
  )
532
532
 
533
533
  @model_validator(mode="after")
534
- def _parse_arguments(cls, values: ToolFunctionCall) -> ToolFunctionCall:
534
+ def _parse_arguments(self) -> Self:
535
535
  """
536
536
  After model construction, parse the `arguments` JSON string
537
537
  into `parsed_arguments`, or raise a ValidationError.
538
538
  """
539
539
  try:
540
- raw = values.arguments
541
- values.parsed_arguments = json.loads(raw)
540
+ raw = self.arguments
541
+ self.parsed_arguments = json.loads(raw)
542
542
  except json.JSONDecodeError as e:
543
543
  raise ValidationError(f"Invalid JSON in arguments: {e}") from e
544
- return values
544
+ return self
545
545
 
546
546
 
547
547
  class ToolCall(BaseModel):
@@ -1,8 +1,9 @@
1
1
  import json
2
- from typing import Any, List, Mapping
2
+ from typing import Any, List, Mapping, Optional
3
3
 
4
4
  import rich
5
5
 
6
+ from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
6
7
  from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
7
8
  METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
8
9
  METRIC_GENERAL_HALLUCINATION_CHECK,
@@ -16,6 +17,15 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
16
17
  )
17
18
  from wxo_agentic_evaluation.service_provider import get_provider
18
19
  from wxo_agentic_evaluation.type import Message
20
+ from wxo_agentic_evaluation.utils.gateway_provider_utils import (
21
+ get_provider_kwargs,
22
+ )
23
+
24
+ DEFAULT_GENERATION_PARAMS = {
25
+ "min_new_tokens": 0,
26
+ "decoding_method": "greedy",
27
+ "max_new_tokens": 4096,
28
+ }
19
29
 
20
30
 
21
31
  class ReferencelessEvaluation:
@@ -31,20 +41,29 @@ class ReferencelessEvaluation:
31
41
  def __init__(
32
42
  self,
33
43
  api_spec: List[Mapping[str, Any]],
34
- messages: List[Message],
35
44
  model_id: str,
36
45
  task_n: str,
37
46
  dataset_name: str,
47
+ runtime_pipeline: bool = True,
48
+ generation_params=DEFAULT_GENERATION_PARAMS,
49
+ inference_backend: Optional[WXOInferenceBackend] = None,
38
50
  ):
39
51
 
40
- self.metrics_client = get_provider(
52
+ extra_kwargs = {}
53
+ if inference_backend is not None:
54
+ wxo_client = getattr(inference_backend, "wxo_client")
55
+ instance_url = getattr(wxo_client, "service_url", None)
56
+ token = getattr(wxo_client, "api_key", None)
57
+ if instance_url:
58
+ extra_kwargs["instance_url"] = instance_url
59
+ if token:
60
+ extra_kwargs["token"] = token
61
+
62
+ self.metrics_client = ReferencelessEvaluation.get_metrics_client(
41
63
  model_id=model_id,
42
- params={
43
- "min_new_tokens": 0,
44
- "decoding_method": "greedy",
45
- "max_new_tokens": 4096,
46
- },
64
+ params=generation_params,
47
65
  referenceless_eval=True,
66
+ **extra_kwargs,
48
67
  )
49
68
 
50
69
  self.pipeline = ReflectionPipeline(
@@ -52,39 +71,54 @@ class ReferencelessEvaluation:
52
71
  general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
53
72
  function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
54
73
  parameter_metrics=None,
74
+ runtime_pipeline=runtime_pipeline,
55
75
  )
56
76
 
57
77
  self.task_n = task_n
58
78
  self.dataset_name = dataset_name
59
79
 
60
80
  self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
61
- self.messages = messages
62
81
 
63
- def _run_pipeline(self, examples: List[Mapping[str, Any]]):
64
- results = []
65
- for example in examples:
66
- result = self.pipeline.run_sync(
67
- conversation=example["context"],
68
- inventory=self.apis_specs,
69
- call=example["call"],
70
- continue_on_static=False,
71
- retries=2,
72
- )
73
- result_dict = result.model_dump()
74
- results.append(result_dict)
82
+ @staticmethod
83
+ def get_metrics_client(**kwargs):
75
84
 
76
- return results
85
+ provider_kwargs = get_provider_kwargs(**kwargs)
77
86
 
78
- def run(self):
79
- examples = []
87
+ return get_provider(
88
+ **provider_kwargs,
89
+ )
90
+
91
+ @staticmethod
92
+ def fmt_tool_call(tool_id, tool_call_name, arguments, context):
93
+ call = {
94
+ "call": {
95
+ "id": tool_id,
96
+ "type": "function",
97
+ "function": {
98
+ "name": tool_call_name,
99
+ "arguments": arguments,
100
+ },
101
+ },
102
+ "context": context,
103
+ }
80
104
 
105
+ return call
106
+
107
+ @staticmethod
108
+ def fmt_msgs_referenceless(
109
+ messages: List[Message],
110
+ ) -> List[Mapping[str, Any]]:
111
+ """Assume that the last item in the `messages` array is the tool call, and preceding items
112
+ in the messages array is the context.
113
+ """
114
+ examples = []
81
115
  processed_data = [
82
116
  {
83
117
  k: msg.model_dump().get(k)
84
118
  for k in ["role", "content", "type"]
85
119
  if k in msg.model_dump()
86
120
  }
87
- for msg in self.messages
121
+ for msg in messages
88
122
  ]
89
123
 
90
124
  for idx, message in enumerate(processed_data):
@@ -97,22 +131,47 @@ class ReferencelessEvaluation:
97
131
  if tool_call_msg["name"].startswith("transfer_to"):
98
132
  continue
99
133
 
100
- call = {
101
- "call": {
102
- "id": tool_call_msg.get("id", "1"),
103
- "type": "function",
104
- "function": {
105
- "name": tool_call_msg["name"],
106
- "arguments": json.dumps(tool_call_msg["args"]),
107
- },
108
- },
109
- "context": context,
110
- }
134
+ call = ReferencelessEvaluation.fmt_tool_call(
135
+ tool_id=tool_call_msg.get("id", "1"),
136
+ tool_call_name=tool_call_msg["name"],
137
+ arguments=json.dumps(tool_call_msg["args"]),
138
+ context=context,
139
+ )
111
140
  examples.append(call)
112
141
 
113
- rich.print(
114
- f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
115
- )
142
+ return examples
143
+
144
+ def _run_pipeline(self, examples: List[Mapping[str, Any]]):
145
+ results = []
146
+ for example in examples:
147
+ result = self.pipeline.run_sync(
148
+ conversation=example["context"],
149
+ inventory=self.apis_specs,
150
+ call=example["call"],
151
+ continue_on_static=False,
152
+ retries=2,
153
+ )
154
+ result_dict = result.model_dump()
155
+ results.append(result_dict)
156
+
157
+ return results
158
+
159
+ def run(self, examples: List[Mapping[str, str]], verbose=False):
160
+ """`examples` should be an array where each element is formatted:
161
+
162
+ call = {
163
+ "call": {
164
+ "id": tool_call_msg.get("id", "1"),
165
+ "type": "function",
166
+ "function": {
167
+ "name": tool_call_msg["name"],
168
+ "arguments": json.dumps(tool_call_msg["args"]),
169
+ },
170
+ },
171
+ "context": context,
172
+ }
173
+ """
174
+
116
175
  examples = [
117
176
  {
118
177
  "call": ToolCall.model_validate(ex["call"]),
@@ -120,6 +179,11 @@ class ReferencelessEvaluation:
120
179
  }
121
180
  for ex in examples
122
181
  ]
182
+
183
+ if verbose:
184
+ rich.print(
185
+ f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
186
+ )
123
187
  results = self._run_pipeline(examples)
124
188
 
125
189
  return results
@@ -1,6 +1,7 @@
1
1
  from collections import defaultdict
2
2
 
3
- from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
3
+ from wxo_agentic_evaluation.utils.utils import is_saas_url
4
+ from wxo_agentic_evaluation.wxo_client import WXOClient
4
5
 
5
6
 
6
7
  class ResourceMap:
@@ -34,6 +35,7 @@ class ResourceMap:
34
35
 
35
36
  if resp.status_code == 200:
36
37
  agents = resp.json()
38
+ self.all_agent_objs = agents
37
39
  for agent in agents:
38
40
  agent_name = agent["name"]
39
41
  tools = [tool_map[id] for id in agent["tools"]]
@@ -143,7 +143,7 @@ class ServiceInstance:
143
143
  def create_tenant_if_not_exist(self) -> str:
144
144
  if self.is_saas:
145
145
  logger.info(
146
- "SaaS mode: running against Remote Service and skipping tenant creation"
146
+ "[d b]SaaS mode: running against Remote Service and skipping tenant creation"
147
147
  )
148
148
  return None
149
149
 
@@ -151,11 +151,13 @@ class ServiceInstance:
151
151
  default_tenant = self.get_default_tenant(user_auth_token)
152
152
 
153
153
  if not default_tenant:
154
- logger.info("no local tenant found. A default tenant is created")
154
+ logger.info(
155
+ "[d b]no local tenant found. A default tenant is created"
156
+ )
155
157
  self.create_eval_tenant(user_auth_token)
156
158
  default_tenant = self.get_default_tenant(user_auth_token)
157
159
  else:
158
- logger.info("local tenant found")
160
+ logger.info("[d b]local tenant found")
159
161
 
160
162
  return default_tenant["id"]
161
163
 
@@ -247,6 +249,13 @@ def tenant_setup(
247
249
 
248
250
  context["active_environment"] = tenant_name
249
251
 
252
+ # Ensure parent directories exist so tests (which may run in clean envs)
253
+ # can write these files without raising FileNotFoundError.
254
+ auth_dir = os.path.dirname(auth_config_path)
255
+ env_dir = os.path.dirname(env_config_path)
256
+ os.makedirs(auth_dir, exist_ok=True)
257
+ os.makedirs(env_dir, exist_ok=True)
258
+
250
259
  with open(auth_config_path, "w") as f:
251
260
  yaml.dump(auth_config, f)
252
261
  with open(env_config_path, "w") as f:
@@ -1,6 +1,13 @@
1
+ import logging
1
2
  import os
2
3
 
4
+ from rich.console import Console
5
+ from rich.logging import RichHandler
6
+
3
7
  from wxo_agentic_evaluation.arg_configs import ProviderConfig
8
+ from wxo_agentic_evaluation.service_provider.gateway_provider import (
9
+ GatewayProvider,
10
+ )
4
11
  from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
5
12
  ModelProxyProvider,
6
13
  )
@@ -8,6 +15,7 @@ from wxo_agentic_evaluation.service_provider.ollama_provider import (
8
15
  OllamaProvider,
9
16
  )
10
17
  from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
18
+ GatewayProviderLLMKitWrapper,
11
19
  ModelProxyProviderLLMKitWrapper,
12
20
  WatsonXLLMKitWrapper,
13
21
  )
@@ -15,24 +23,123 @@ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
15
23
  WatsonXProvider,
16
24
  )
17
25
 
26
+ USE_GATEWAY_MODEL_PROVIDER: bool = (
27
+ os.environ.get("USE_GATEWAY_MODEL_PROVIDER", "FALSE").upper() == "TRUE"
28
+ )
29
+
30
+
31
+ _logging_console = Console(stderr=True)
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def get_log_level_from_env():
37
+
38
+ level_env = os.getenv("WXO_EVALUATION_LOGLEVEL")
39
+ return level_env
40
+
41
+
42
+ LOGGING_ENABLED = get_log_level_from_env() is not None
43
+
44
+
45
+ def configure_logging_for_package_from_env(
46
+ package_name: str = "wxo_agentic_evaluation",
47
+ ensure_output: bool = True,
48
+ ) -> None:
49
+ """
50
+ Configure logging using the env var WXO_EVALUATION_LOGLEVEL - no logging if that's not set
51
+ """
52
+ try:
53
+ level_env = get_log_level_from_env()
54
+ if not level_env:
55
+ return
56
+
57
+ level = None
58
+ upper = level_env.strip().upper()
59
+ if hasattr(logging, upper):
60
+ level = getattr(logging, upper, None)
61
+
62
+ pkg_logger = logging.getLogger(package_name)
63
+ pkg_logger.setLevel(level)
64
+
65
+ if ensure_output:
66
+ if not pkg_logger.handlers:
67
+ handler = RichHandler(
68
+ console=_logging_console,
69
+ rich_tracebacks=True,
70
+ show_time=False,
71
+ show_level=False,
72
+ show_path=False,
73
+ markup=True,
74
+ enable_link_path=True,
75
+ omit_repeated_times=True,
76
+ tracebacks_theme="github-dark",
77
+ )
78
+ handler.setFormatter(
79
+ logging.Formatter("%(levelname)s %(message)s")
80
+ )
81
+ handler.setLevel(logging.NOTSET)
82
+ pkg_logger.addHandler(handler)
83
+ pkg_logger.propagate = False
84
+
85
+ # Quiet common noisy debug libs
86
+ for name in (
87
+ "urllib3",
88
+ "urllib3.connectionpool",
89
+ "requests.packages.urllib3",
90
+ ):
91
+ logging.getLogger(name).setLevel(logging.WARNING)
92
+ except:
93
+ logger.warning("Input log level %s not valid", level_env)
94
+
95
+
96
+ configure_logging_for_package_from_env()
97
+
18
98
 
19
99
  def _instantiate_provider(
20
100
  config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
21
101
  ):
102
+
22
103
  if config.provider == "watsonx":
104
+ logger.info("Instantiate watsonx provider")
23
105
  if is_referenceless_eval:
24
106
  provider = WatsonXLLMKitWrapper
25
107
  else:
26
108
  provider = WatsonXProvider
27
- return provider(model_id=config.model_id, **kwargs)
109
+ return provider(
110
+ model_id=config.model_id,
111
+ embedding_model_id=config.embedding_model_id,
112
+ **kwargs,
113
+ )
28
114
  elif config.provider == "ollama":
115
+ logger.info("Instantiate Ollama")
29
116
  return OllamaProvider(model_id=config.model_id, **kwargs)
117
+
118
+ elif config.provider == "gateway":
119
+ logger.info("Instantiate gateway inference provider")
120
+ if is_referenceless_eval:
121
+ provider = GatewayProviderLLMKitWrapper
122
+ else:
123
+ provider = GatewayProvider
124
+ return provider(
125
+ model_id=config.model_id,
126
+ embedding_model_id=config.embedding_model_id,
127
+ **kwargs,
128
+ )
129
+
30
130
  elif config.provider == "model_proxy":
131
+ logger.info("Instantiate model proxy provider")
31
132
  if is_referenceless_eval:
32
133
  provider = ModelProxyProviderLLMKitWrapper
33
134
  else:
34
135
  provider = ModelProxyProvider
35
- return provider(model_id=config.model_id, **kwargs)
136
+
137
+ return provider(
138
+ model_id=config.model_id,
139
+ embedding_model_id=config.embedding_model_id,
140
+ **kwargs,
141
+ )
142
+
36
143
  else:
37
144
  raise RuntimeError(
38
145
  f"target provider is not supported {config.provider}"
@@ -42,23 +149,36 @@ def _instantiate_provider(
42
149
  def get_provider(
43
150
  config: ProviderConfig = None,
44
151
  model_id: str = None,
152
+ embedding_model_id: str = None,
45
153
  referenceless_eval: bool = False,
46
154
  **kwargs,
47
155
  ):
156
+
157
+ if config:
158
+ return _instantiate_provider(config, **kwargs)
159
+
48
160
  if not model_id:
49
161
  raise ValueError("model_id must be provided if config is not supplied")
50
162
 
163
+ if USE_GATEWAY_MODEL_PROVIDER:
164
+ logger.info("[d b]Using gateway inference provider override")
165
+ config = ProviderConfig(provider="gateway", model_id=model_id)
166
+ return _instantiate_provider(config, referenceless_eval, **kwargs)
167
+
51
168
  if "WATSONX_APIKEY" in os.environ and "WATSONX_SPACE_ID" in os.environ:
52
- config = ProviderConfig(provider="watsonx", model_id=model_id)
169
+ logger.info("[d b]Using watsonx inference provider")
170
+ config = ProviderConfig(
171
+ provider="watsonx",
172
+ model_id=model_id,
173
+ embedding_model_id=embedding_model_id,
174
+ )
53
175
  return _instantiate_provider(config, referenceless_eval, **kwargs)
54
176
 
55
177
  if "WO_INSTANCE" in os.environ:
178
+ logger.info("[d b]Using model_proxy inference provider")
56
179
  config = ProviderConfig(provider="model_proxy", model_id=model_id)
57
180
  return _instantiate_provider(config, referenceless_eval, **kwargs)
58
181
 
59
- if config:
60
- return _instantiate_provider(config, **kwargs)
61
-
62
- raise RuntimeError(
63
- "No provider found. Please either provide a config or set the required environment variables."
64
- )
182
+ logger.info("[d b]Using gateway inference provider default")
183
+ config = ProviderConfig(provider="gateway", model_id=model_id)
184
+ return _instantiate_provider(config, referenceless_eval, **kwargs)