ibm_watsonx_orchestrate_evaluation_framework-1.1.6-py3-none-any.whl → ibm_watsonx_orchestrate_evaluation_framework-1.1.7-py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. (The source page links to further details at this point; the link target was not preserved in this text.)

Files changed (42)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
  3. wxo_agentic_evaluation/analyze_run.py +49 -32
  4. wxo_agentic_evaluation/arg_configs.py +30 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +20 -4
  7. wxo_agentic_evaluation/evaluation_package.py +189 -15
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +64 -34
  11. wxo_agentic_evaluation/llm_matching.py +92 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -1
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/metrics.py +24 -3
  17. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +16 -0
  19. wxo_agentic_evaluation/quick_eval.py +17 -3
  20. wxo_agentic_evaluation/record_chat.py +17 -6
  21. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
  22. wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
  23. wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
  24. wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
  25. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
  26. wxo_agentic_evaluation/service_instance.py +5 -3
  27. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  28. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  29. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  30. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  31. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  32. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  33. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  34. wxo_agentic_evaluation/type.py +14 -4
  35. wxo_agentic_evaluation/utils/__init__.py +43 -5
  36. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  37. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  38. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  39. wxo_agentic_evaluation/utils/utils.py +14 -9
  40. wxo_agentic_evaluation/wxo_client.py +2 -1
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  42. {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
@@ -1,8 +1,9 @@
1
1
  import json
2
- from typing import Any, List, Mapping
2
+ from typing import Any, List, Mapping, Optional
3
3
 
4
4
  import rich
5
5
 
6
+ from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
6
7
  from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
7
8
  METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
8
9
  METRIC_GENERAL_HALLUCINATION_CHECK,
@@ -16,13 +17,17 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
16
17
  )
17
18
  from wxo_agentic_evaluation.service_provider import get_provider
18
19
  from wxo_agentic_evaluation.type import Message
20
+ from wxo_agentic_evaluation.utils.gateway_provider_utils import (
21
+ get_provider_kwargs,
22
+ )
19
23
 
20
- DEFAULT_GENERATION_PARAMS= {
24
+ DEFAULT_GENERATION_PARAMS = {
21
25
  "min_new_tokens": 0,
22
26
  "decoding_method": "greedy",
23
27
  "max_new_tokens": 4096,
24
28
  }
25
29
 
30
+
26
31
  class ReferencelessEvaluation:
27
32
  """
28
33
  Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
@@ -40,13 +45,25 @@ class ReferencelessEvaluation:
40
45
  task_n: str,
41
46
  dataset_name: str,
42
47
  runtime_pipeline: bool = True,
43
- generation_params = DEFAULT_GENERATION_PARAMS
48
+ generation_params=DEFAULT_GENERATION_PARAMS,
49
+ inference_backend: Optional[WXOInferenceBackend] = None,
44
50
  ):
45
51
 
46
- self.metrics_client = get_provider(
52
+ extra_kwargs = {}
53
+ if inference_backend is not None:
54
+ wxo_client = getattr(inference_backend, "wxo_client")
55
+ instance_url = getattr(wxo_client, "service_url", None)
56
+ token = getattr(wxo_client, "api_key", None)
57
+ if instance_url:
58
+ extra_kwargs["instance_url"] = instance_url
59
+ if token:
60
+ extra_kwargs["token"] = token
61
+
62
+ self.metrics_client = ReferencelessEvaluation.get_metrics_client(
47
63
  model_id=model_id,
48
64
  params=generation_params,
49
65
  referenceless_eval=True,
66
+ **extra_kwargs,
50
67
  )
51
68
 
52
69
  self.pipeline = ReflectionPipeline(
@@ -62,20 +79,29 @@ class ReferencelessEvaluation:
62
79
 
63
80
  self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
64
81
 
82
+ @staticmethod
83
+ def get_metrics_client(**kwargs):
84
+
85
+ provider_kwargs = get_provider_kwargs(**kwargs)
86
+
87
+ return get_provider(
88
+ **provider_kwargs,
89
+ )
90
+
65
91
  @staticmethod
66
92
  def fmt_tool_call(tool_id, tool_call_name, arguments, context):
67
93
  call = {
68
- "call": {
69
- "id": tool_id,
70
- "type": "function",
71
- "function": {
72
- "name": tool_call_name,
73
- "arguments": arguments,
74
- },
94
+ "call": {
95
+ "id": tool_id,
96
+ "type": "function",
97
+ "function": {
98
+ "name": tool_call_name,
99
+ "arguments": arguments,
75
100
  },
76
- "context": context,
77
- }
78
-
101
+ },
102
+ "context": context,
103
+ }
104
+
79
105
  return call
80
106
 
81
107
  @staticmethod
@@ -104,12 +130,12 @@ class ReferencelessEvaluation:
104
130
  tool_call_msg = json.loads(content)
105
131
  if tool_call_msg["name"].startswith("transfer_to"):
106
132
  continue
107
-
133
+
108
134
  call = ReferencelessEvaluation.fmt_tool_call(
109
135
  tool_id=tool_call_msg.get("id", "1"),
110
136
  tool_call_name=tool_call_msg["name"],
111
137
  arguments=json.dumps(tool_call_msg["args"]),
112
- context=context
138
+ context=context,
113
139
  )
114
140
  examples.append(call)
115
141
 
wxo_agentic_evaluation/service_instance.py +5 -3
@@ -143,7 +143,7 @@ class ServiceInstance:
143
143
  def create_tenant_if_not_exist(self) -> str:
144
144
  if self.is_saas:
145
145
  logger.info(
146
- "SaaS mode: running against Remote Service and skipping tenant creation"
146
+ "[d b]SaaS mode: running against Remote Service and skipping tenant creation"
147
147
  )
148
148
  return None
149
149
 
@@ -151,11 +151,13 @@ class ServiceInstance:
151
151
  default_tenant = self.get_default_tenant(user_auth_token)
152
152
 
153
153
  if not default_tenant:
154
- logger.info("no local tenant found. A default tenant is created")
154
+ logger.info(
155
+ "[d b]no local tenant found. A default tenant is created"
156
+ )
155
157
  self.create_eval_tenant(user_auth_token)
156
158
  default_tenant = self.get_default_tenant(user_auth_token)
157
159
  else:
158
- logger.info("local tenant found")
160
+ logger.info("[d b]local tenant found")
159
161
 
160
162
  return default_tenant["id"]
161
163
 
wxo_agentic_evaluation/service_provider/__init__.py +129 -9
@@ -1,6 +1,13 @@
1
+ import logging
1
2
  import os
2
3
 
4
+ from rich.console import Console
5
+ from rich.logging import RichHandler
6
+
3
7
  from wxo_agentic_evaluation.arg_configs import ProviderConfig
8
+ from wxo_agentic_evaluation.service_provider.gateway_provider import (
9
+ GatewayProvider,
10
+ )
4
11
  from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
5
12
  ModelProxyProvider,
6
13
  )
@@ -8,6 +15,7 @@ from wxo_agentic_evaluation.service_provider.ollama_provider import (
8
15
  OllamaProvider,
9
16
  )
10
17
  from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
18
+ GatewayProviderLLMKitWrapper,
11
19
  ModelProxyProviderLLMKitWrapper,
12
20
  WatsonXLLMKitWrapper,
13
21
  )
@@ -15,24 +23,123 @@ from wxo_agentic_evaluation.service_provider.watsonx_provider import (
15
23
  WatsonXProvider,
16
24
  )
17
25
 
26
+ USE_GATEWAY_MODEL_PROVIDER: bool = (
27
+ os.environ.get("USE_GATEWAY_MODEL_PROVIDER", "FALSE").upper() == "TRUE"
28
+ )
29
+
30
+
31
+ _logging_console = Console(stderr=True)
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ def get_log_level_from_env():
37
+
38
+ level_env = os.getenv("WXO_EVALUATION_LOGLEVEL")
39
+ return level_env
40
+
41
+
42
+ LOGGING_ENABLED = get_log_level_from_env() is not None
43
+
44
+
45
+ def configure_logging_for_package_from_env(
46
+ package_name: str = "wxo_agentic_evaluation",
47
+ ensure_output: bool = True,
48
+ ) -> None:
49
+ """
50
+ Configure logging using the env var WXO_EVALUATION_LOGLEVEL - no logging if that's not set
51
+ """
52
+ try:
53
+ level_env = get_log_level_from_env()
54
+ if not level_env:
55
+ return
56
+
57
+ level = None
58
+ upper = level_env.strip().upper()
59
+ if hasattr(logging, upper):
60
+ level = getattr(logging, upper, None)
61
+
62
+ pkg_logger = logging.getLogger(package_name)
63
+ pkg_logger.setLevel(level)
64
+
65
+ if ensure_output:
66
+ if not pkg_logger.handlers:
67
+ handler = RichHandler(
68
+ console=_logging_console,
69
+ rich_tracebacks=True,
70
+ show_time=False,
71
+ show_level=False,
72
+ show_path=False,
73
+ markup=True,
74
+ enable_link_path=True,
75
+ omit_repeated_times=True,
76
+ tracebacks_theme="github-dark",
77
+ )
78
+ handler.setFormatter(
79
+ logging.Formatter("%(levelname)s %(message)s")
80
+ )
81
+ handler.setLevel(logging.NOTSET)
82
+ pkg_logger.addHandler(handler)
83
+ pkg_logger.propagate = False
84
+
85
+ # Quiet common noisy debug libs
86
+ for name in (
87
+ "urllib3",
88
+ "urllib3.connectionpool",
89
+ "requests.packages.urllib3",
90
+ ):
91
+ logging.getLogger(name).setLevel(logging.WARNING)
92
+ except:
93
+ logger.warning("Input log level %s not valid", level_env)
94
+
95
+
96
+ configure_logging_for_package_from_env()
97
+
18
98
 
19
99
  def _instantiate_provider(
20
100
  config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
21
101
  ):
102
+
22
103
  if config.provider == "watsonx":
104
+ logger.info("Instantiate watsonx provider")
23
105
  if is_referenceless_eval:
24
106
  provider = WatsonXLLMKitWrapper
25
107
  else:
26
108
  provider = WatsonXProvider
27
- return provider(model_id=config.model_id, **kwargs)
109
+ return provider(
110
+ model_id=config.model_id,
111
+ embedding_model_id=config.embedding_model_id,
112
+ **kwargs,
113
+ )
28
114
  elif config.provider == "ollama":
115
+ logger.info("Instantiate Ollama")
29
116
  return OllamaProvider(model_id=config.model_id, **kwargs)
117
+
118
+ elif config.provider == "gateway":
119
+ logger.info("Instantiate gateway inference provider")
120
+ if is_referenceless_eval:
121
+ provider = GatewayProviderLLMKitWrapper
122
+ else:
123
+ provider = GatewayProvider
124
+ return provider(
125
+ model_id=config.model_id,
126
+ embedding_model_id=config.embedding_model_id,
127
+ **kwargs,
128
+ )
129
+
30
130
  elif config.provider == "model_proxy":
131
+ logger.info("Instantiate model proxy provider")
31
132
  if is_referenceless_eval:
32
133
  provider = ModelProxyProviderLLMKitWrapper
33
134
  else:
34
135
  provider = ModelProxyProvider
35
- return provider(model_id=config.model_id, **kwargs)
136
+
137
+ return provider(
138
+ model_id=config.model_id,
139
+ embedding_model_id=config.embedding_model_id,
140
+ **kwargs,
141
+ )
142
+
36
143
  else:
37
144
  raise RuntimeError(
38
145
  f"target provider is not supported {config.provider}"
@@ -42,23 +149,36 @@ def _instantiate_provider(
42
149
  def get_provider(
43
150
  config: ProviderConfig = None,
44
151
  model_id: str = None,
152
+ embedding_model_id: str = None,
45
153
  referenceless_eval: bool = False,
46
154
  **kwargs,
47
155
  ):
156
+
157
+ if config:
158
+ return _instantiate_provider(config, **kwargs)
159
+
48
160
  if not model_id:
49
161
  raise ValueError("model_id must be provided if config is not supplied")
50
162
 
163
+ if USE_GATEWAY_MODEL_PROVIDER:
164
+ logger.info("[d b]Using gateway inference provider override")
165
+ config = ProviderConfig(provider="gateway", model_id=model_id)
166
+ return _instantiate_provider(config, referenceless_eval, **kwargs)
167
+
51
168
  if "WATSONX_APIKEY" in os.environ and "WATSONX_SPACE_ID" in os.environ:
52
- config = ProviderConfig(provider="watsonx", model_id=model_id)
169
+ logger.info("[d b]Using watsonx inference provider")
170
+ config = ProviderConfig(
171
+ provider="watsonx",
172
+ model_id=model_id,
173
+ embedding_model_id=embedding_model_id,
174
+ )
53
175
  return _instantiate_provider(config, referenceless_eval, **kwargs)
54
176
 
55
177
  if "WO_INSTANCE" in os.environ:
178
+ logger.info("[d b]Using model_proxy inference provider")
56
179
  config = ProviderConfig(provider="model_proxy", model_id=model_id)
57
180
  return _instantiate_provider(config, referenceless_eval, **kwargs)
58
181
 
59
- if config:
60
- return _instantiate_provider(config, **kwargs)
61
-
62
- raise RuntimeError(
63
- "No provider found. Please either provide a config or set the required environment variables."
64
- )
182
+ logger.info("[d b]Using gateway inference provider default")
183
+ config = ProviderConfig(provider="gateway", model_id=model_id)
184
+ return _instantiate_provider(config, referenceless_eval, **kwargs)