ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,31 @@
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-    get_wxo_client,
     WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
 )
-from wxo_agentic_evaluation.type import AttackData
-
-from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import AttackData
 from wxo_agentic_evaluation.utils import json_dump

-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-from rich.progress import Progress
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI
-

 def process_attack(task_n, attack_path, config, inference_backend, llm_user):
     tc_name = os.path.basename(attack_path).replace(".json", "")
@@ -31,7 +33,9 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         attack: AttackData = AttackData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
     history, _, _ = evaluation_controller.run(
@@ -46,7 +50,8 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     return result
@@ -55,19 +60,26 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
 def run_attacks(config: AttackConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     wxo_client = get_wxo_client(
-        config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
-            config=config.provider_config, model_id=config.llm_user_config.model_id
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
         ),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
     )

-    print(f"Running red teaming attacks with tenant {config.auth_config.tenant_name}")
+    print(
+        f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+    )
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

     results_list = []
@@ -81,7 +93,9 @@ def run_attacks(config: AttackConfig):
     task_n = 0

     for attack_path in attack_paths:
-        if not attack_path.endswith(".json") or attack_path.endswith("agent.json"):
+        if not attack_path.endswith(".json") or attack_path.endswith(
+            "agent.json"
+        ):
             continue

         future = executor.submit(
@@ -118,7 +132,9 @@ def run_attacks(config: AttackConfig):
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-    with open(os.path.join(config.output_dir, "attacks_results.json"), "w") as f:
+    with open(
+        os.path.join(config.output_dir, "attacks_results.json"), "w"
+    ) as f:
         json.dump(attack_results, f, indent=2)

     print(f"Attack results saved to {config.output_dir}")
@@ -1,7 +1,9 @@
 from abc import ABC

 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)


 class FunctionMetricsPrompt(MetricPrompt, ABC):
@@ -1,14 +1,7 @@
 import json
 from enum import Enum
 from pathlib import Path
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterable, List, Tuple, Union

 from pydantic import ValidationError

@@ -18,7 +11,10 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function
 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
     FunctionSelectionPrompt,
 )
-from wxo_agentic_evaluation.referenceless_eval.metrics import Metric, MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics import (
+    Metric,
+    MetricPrompt,
+)

 PromptType = Union[
     GeneralMetricsPrompt,
@@ -70,7 +66,9 @@ def load_prompts_from_jsonl(
         raise LoaderError(f"File not found: {path}")

     prompts: List[PromptType] = []
-    for lineno, raw in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
+    for lineno, raw in enumerate(
+        p.read_text(encoding="utf-8").splitlines(), start=1
+    ):
         if not raw.strip():
             continue
         try:
@@ -96,7 +94,9 @@ def load_prompts_from_jsonl(
         # Instantiate prompt
         prompt: MetricPrompt
         try:
-            prompt = PromptCls(metric=metric, task_description=metric.description)
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -158,7 +158,9 @@ def load_prompts_from_list(
             raise LoaderError(f"Record {idx} invalid schema: {e}") from e

         try:
-            prompt = PromptCls(metric=metric, task_description=rec["task_description"])
+            prompt = PromptCls(
+                metric=metric, task_description=rec["task_description"]
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -167,11 +169,15 @@ def load_prompts_from_list(
                 user_kwargs = ex["user_kwargs"]
                 output = ex["output"]
             except KeyError as e:
-                raise LoaderError(f"Record {idx}, example {ex_idx} missing {e}") from e
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} missing {e}"
+                ) from e
             try:
                 prompt.add_example(user_kwargs, output)
             except (ValidationError, ValueError) as e:
-                raise LoaderError(f"Record {idx}, example {ex_idx} invalid: {e}") from e
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} invalid: {e}"
+                ) from e

         prompts.append(prompt)

@@ -211,7 +217,9 @@ def load_prompts_from_metrics(

         # Instantiate prompt with the metric's description as task_description
         try:
-            prompt = PromptCls(metric=metric, task_description=metric.description)
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             # Fallback if constructor signature differs
             prompt = PromptCls(metric=metric)
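The loader.py hunks are also formatting-only, but they all wrap the same idiom: instantiate the prompt class with a task_description when its constructor accepts one, and fall back to the bare metric-only constructor on TypeError. A minimal sketch of that fallback, using hypothetical Metric, BasicPrompt, and DescribedPrompt stand-ins rather than the package's actual classes:

from dataclasses import dataclass


@dataclass
class Metric:  # hypothetical stand-in for the package's Metric model
    name: str
    description: str


class BasicPrompt:  # constructor accepts only the metric
    def __init__(self, metric: Metric):
        self.metric = metric


class DescribedPrompt(BasicPrompt):  # also accepts a task description
    def __init__(self, metric: Metric, task_description: str):
        super().__init__(metric)
        self.task_description = task_description


def build_prompt(PromptCls, metric: Metric):
    # Same fallback as in loader.py: prefer the richer signature,
    # degrade gracefully when the class does not take task_description.
    try:
        return PromptCls(metric=metric, task_description=metric.description)
    except TypeError:
        return PromptCls(metric=metric)


m = Metric(name="function_selection", description="Judge whether the right tool was chosen.")
assert isinstance(build_prompt(DescribedPrompt, m), DescribedPrompt)
assert isinstance(build_prompt(BasicPrompt, m), BasicPrompt)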
@@ -1,8 +1,4 @@
-from typing import (
-    Any,
-    Dict,
-    List,
-)
+from typing import Any, Dict, List

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     ToolCall,
@@ -65,7 +61,9 @@ class OpenAIAdapter(BaseAdapter):
         ]

     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
-        tool = next((t for t in self.specs if t.function.name == tool_name), None)
+        tool = next(
+            (t for t in self.specs if t.function.name == tool_name), None
+        )
         return tool.function.model_dump() if tool else {}

     def get_call_dict(self) -> Dict[str, Any]:
@@ -87,11 +85,18 @@ class OpenAIAdapter(BaseAdapter):

     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
         spec = next(
-            (s for s in self.specs if s.function.name == self.get_function_name()), None
+            (
+                s
+                for s in self.specs
+                if s.function.name == self.get_function_name()
+            ),
+            None,
         )
         if not spec:
             return {"type": "object", "properties": {}, "required": []}
-        props = spec.function.parameters.get("properties", spec.function.parameters)
+        props = spec.function.parameters.get(
+            "properties", spec.function.parameters
+        )
         if param_name not in props:
             return {"type": "object", "properties": {}, "required": []}
         return {
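adapters.py keeps its next(..., None) lookups, just reflowed. The same lookup and parameter-snippet extraction can be sketched with plain dicts standing in for the package's tool-spec objects; get_tool_spec and get_param_spec_snippet below are illustrative re-implementations over that assumed dict shape, not the adapter's methods:

from typing import Any, Dict, List

# Hypothetical, minimal shape of an OpenAI-style tool spec (plain dicts here,
# where the package works with spec objects exposing a .function attribute).
SPECS: List[Dict[str, Any]] = [
    {
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string", "description": "City name"}},
            "required": ["city"],
        },
    }
]


def get_tool_spec(specs: List[Dict[str, Any]], tool_name: str) -> Dict[str, Any]:
    # Same next(..., None) lookup pattern as in the adapter above.
    tool = next((t for t in specs if t["name"] == tool_name), None)
    return tool if tool else {}


def get_param_spec_snippet(specs, tool_name: str, param_name: str) -> Dict[str, Any]:
    # Return a one-parameter schema snippet, or an empty object schema
    # when the tool or parameter is unknown.
    spec = get_tool_spec(specs, tool_name)
    props = spec.get("parameters", {}).get("properties", {})
    if param_name not in props:
        return {"type": "object", "properties": {}, "required": []}
    return {"type": "object", "properties": {param_name: props[param_name]}, "required": []}


print(get_param_spec_snippet(SPECS, "get_weather", "city"))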
@@ -1,8 +1,9 @@
-import json
 import importlib.resources
+import json
 from pathlib import Path
 from typing import Dict, List, Optional, Union

+from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
     SemanticChecker,
 )
@@ -19,13 +20,16 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
-from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import LLMKitWrapper
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    LLMKitWrapper,
+)
+

 def metrics_dir():
     path = importlib.resources.files(metrics)
     return path

+
 # Default metric JSON paths
 _METRICS_DIR = metrics_dir()
 _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"
@@ -36,10 +40,14 @@ _DEFAULT_FUNCSEL = (
     _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
 )
 _DEFAULT_FUNCSEL_RUNTIME = (
-    _METRICS_DIR / "function_selection" / "function_selection_metrics_runtime.json"
+    _METRICS_DIR
+    / "function_selection"
+    / "function_selection_metrics_runtime.json"
 )
 _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
-_DEFAULT_PARAM_RUNTIME = _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+_DEFAULT_PARAM_RUNTIME = (
+    _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+)


 class ReflectionPipeline:
@@ -88,11 +96,19 @@ class ReflectionPipeline:
         for metrics, default_path in [
             (
                 self.general_metrics,
-                _DEFAULT_GENERAL_RUNTIME if runtime_pipeline else _DEFAULT_GENERAL,
+                (
+                    _DEFAULT_GENERAL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_GENERAL
+                ),
             ),
             (
                 self.function_metrics,
-                _DEFAULT_FUNCSEL_RUNTIME if runtime_pipeline else _DEFAULT_FUNCSEL,
+                (
+                    _DEFAULT_FUNCSEL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_FUNCSEL
+                ),
             ),
             (
                 self.parameter_metrics,
@@ -104,7 +120,9 @@ class ReflectionPipeline:
                 continue

             # Handle metric names list
-            if isinstance(metrics, list) and all(isinstance(x, str) for x in metrics):
+            if isinstance(metrics, list) and all(
+                isinstance(x, str) for x in metrics
+            ):
                 # Load the default JSON file
                 if not default_path.is_file():
                     raise FileNotFoundError(
@@ -116,7 +134,9 @@ class ReflectionPipeline:

                 # Filter metrics by name
                 filtered_metrics = [
-                    metric for metric in all_metrics if metric.get("name") in metrics
+                    metric
+                    for metric in all_metrics
+                    if metric.get("name") in metrics
                 ]

                 # Remove examples from prompts if requested
@@ -125,7 +145,9 @@ class ReflectionPipeline:
                         metric.pop("examples", None)

                 if len(filtered_metrics) != len(metrics):
-                    found_names = {metric.get("name") for metric in filtered_metrics}
+                    found_names = {
+                        metric.get("name") for metric in filtered_metrics
+                    }
                     missing = set(metrics) - found_names
                     raise ValueError(f"Metrics not found: {missing}")

@@ -140,14 +162,20 @@ class ReflectionPipeline:
             if isinstance(metrics, list) and all(
                 isinstance(x, FunctionCallMetric) for x in metrics
             ):
-                metrics_definitions.append([metric.model_dump() for metric in metrics])
+                metrics_definitions.append(
+                    [metric.model_dump() for metric in metrics]
+                )
             else:
                 if not metrics.is_file():
-                    raise FileNotFoundError(f"Metrics file not found: {metrics}")
+                    raise FileNotFoundError(
+                        f"Metrics file not found: {metrics}"
+                    )
                 metrics_definitions.append(
                     [
                         json.loads(json_obj)
-                        for json_obj in metrics.read_text(encoding="utf8").splitlines()
+                        for json_obj in metrics.read_text(
+                            encoding="utf8"
+                        ).splitlines()
                         if json_obj.strip()
                     ]
                 )
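Within pipeline.py, the reflowed block is the metric-selection step of ReflectionPipeline: a metrics file is loaded, the definitions are filtered by the requested names, and any name that cannot be found raises a ValueError instead of being silently dropped. A self-contained sketch of that step; filter_metrics_by_name is a hypothetical helper, and the one-JSON-object-per-line layout is assumed from the read_text().splitlines() loop above:

import json
from pathlib import Path
from typing import Any, Dict, List


def filter_metrics_by_name(metrics_path: Path, wanted: List[str]) -> List[Dict[str, Any]]:
    """Load a JSONL metrics file and keep only the requested metric names.

    Mirrors the selection logic visible in the hunk above: a missing file is
    a FileNotFoundError, and unknown metric names raise a ValueError listing
    exactly which names were not found.
    """
    if not metrics_path.is_file():
        raise FileNotFoundError(f"Metrics file not found: {metrics_path}")
    all_metrics = [
        json.loads(line)
        for line in metrics_path.read_text(encoding="utf8").splitlines()
        if line.strip()
    ]
    filtered = [m for m in all_metrics if m.get("name") in wanted]
    if len(filtered) != len(wanted):
        missing = set(wanted) - {m.get("name") for m in filtered}
        raise ValueError(f"Metrics not found: {missing}")
    return filtered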
@@ -1,14 +1,7 @@
 import json
 import math
 import re
-from typing import (
-    Any,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Tuple, Union

 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
     GeneralMetricsPrompt,
@@ -39,11 +32,13 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolSpec,
     TransformResult,
 )
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
     MetricRunner,
     MetricRunResult,
 )
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)


 class SemanticChecker:
@@ -231,7 +226,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                general_results = SemanticCategoryResult.from_results(sync_results)
+                general_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 general_results = {"error": str(e)}
         else:
@@ -261,7 +258,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                function_results = SemanticCategoryResult.from_results(sync_results)
+                function_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 function_results = {"error": str(e)}
         else:
@@ -272,7 +271,9 @@ class SemanticChecker:
         for pname, pval in params.items():
             # Each parameter has its own prompts
             try:
-                param_entries: List[Tuple[ParameterMetricsPrompt, Dict[str, Any]]] = []
+                param_entries: List[
+                    Tuple[ParameterMetricsPrompt, Dict[str, Any]]
+                ] = []
                 for prompt in self.parameter_prompts:
                     param_entries.append(
                         (
@@ -351,7 +352,10 @@ class SemanticChecker:
         )
         gen_code = self.codegen_client.generate(
             prompt=[
-                {"role": "system", "content": GENERATE_CODE_SYSTEM},
+                {
+                    "role": "system",
+                    "content": GENERATE_CODE_SYSTEM,
+                },
                 {"role": "user", "content": prompt},
             ],
             schema=GENERATE_CODE_SCHEMA,
@@ -386,11 +390,15 @@ class SemanticChecker:
         """
         Strip code fences, install imports, exec code, compare, return TransformResult.
         """
-        clean = re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()
+        clean = re.sub(
+            r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+        ).strip()

         # install imports
         for mod in set(
-            re.findall(r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE)
+            re.findall(
+                r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+            )
         ):
             try:
                 __import__(mod)
@@ -417,7 +425,9 @@ class SemanticChecker:

             out_t = fn_t(user_val)
             out_c = fn_c(api_val)
-            if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
+            if isinstance(out_t, (int, float)) and isinstance(
+                out_c, (int, float)
+            ):
                 success = math.isclose(out_t, out_c, abs_tol=1e-3)
             else:
                 success = str(out_t) == str(out_c)
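The semantic_checker.py hunks wrap, among other things, the transform check: generated code is stripped of Markdown fences, executed, and its outputs are compared with math.isclose for numbers or string equality otherwise. A small sketch of those two pieces, following the logic visible above; strip_code_fences and outputs_match are illustrative names, not the class's methods:

import math
import re


def strip_code_fences(code: str) -> str:
    # Same regex as the transform check: drop leading/trailing ``` fences.
    return re.sub(r"^```(?:python)?|```$", "", code, flags=re.MULTILINE).strip()


def outputs_match(out_t, out_c) -> bool:
    # Numeric outputs are compared with an absolute tolerance; everything
    # else falls back to string equality, as in the hunk above.
    if isinstance(out_t, (int, float)) and isinstance(out_c, (int, float)):
        return math.isclose(out_t, out_c, abs_tol=1e-3)
    return str(out_t) == str(out_c)


fenced = "```python\ndef f(x):\n    return x * 2\n```"
print(strip_code_fences(fenced))       # the bare function body, fences removed
print(outputs_match(0.30000049, 0.3))  # True: within the 1e-3 tolerance
print(outputs_match("NYC", "nyc"))     # False: strings are compared exactly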
@@ -1,8 +1,6 @@
 from typing import Dict, List

-from jsonschema import (
-    Draft7Validator,
-)
+from jsonschema import Draft7Validator

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     StaticMetricResult,
@@ -27,7 +25,9 @@ _STATIC_CHECKS: Dict[str, str] = {
 }


-def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
+def evaluate_static(
+    apis_specs: List[ToolSpec], api_call: ToolCall
+) -> StaticResult:
     """
     Perform static validation on a single tool call.

@@ -97,7 +97,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     errors: Dict[str, str] = {}

     # 1) Function existence
-    spec = next((s for s in specs if s.function.name == call.function.name), None)
+    spec = next(
+        (s for s in specs if s.function.name == call.function.name), None
+    )
     if not spec:
         errors["non_existent_function"] = (
             f"Function '{call.function.name}' does not exist in the provided API specifications:"
@@ -110,7 +112,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     parsed_arguments = call.function.parsed_arguments

     # 2) Parameter existence check
-    if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
+    if non_existent_params := set(parsed_arguments.keys()) - set(
+        properties.keys()
+    ):
         errors["non_existent_parameter"] = (
             f"Parameters not defined in function '{call.function.name}': "
             f"{', '.join(sorted(non_existent_params))}. "
@@ -126,7 +130,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     other_errors = []

     for error in validator.iter_errors(parsed_arguments):
-        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
+        field = (
+            ".".join(str(x) for x in error.path) if error.path else "unknown"
+        )
         if error.validator == "required":
             missing_required.append(error.message)
         elif error.validator == "type":
@@ -145,12 +151,12 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
             "Incorrect parameter type(s): " + "; ".join(incorrect_types)
         )
     if invalid_enum:
-        errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
-            invalid_enum
+        errors["allowed_values_violation"] = (
+            "Invalid parameter value(s): " + "; ".join(invalid_enum)
         )
     if other_errors:
-        errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
-            other_errors
+        errors["json_schema_validation"] = (
+            "Other validation error(s): " + "; ".join(other_errors)
         )

     return errors
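static_checker.py still classifies jsonschema Draft7Validator errors into missing-required, wrong-type, enum, and other buckets; only the assignment wrapping changed. A runnable sketch of that classification follows; classify_schema_errors and the weather schema are hypothetical, while the bucketing mirrors the loop in _check_tool_call above:

from jsonschema import Draft7Validator

# Hypothetical parameter schema for a weather tool.
WEATHER_SCHEMA = {
    "type": "object",
    "properties": {
        "city": {"type": "string"},
        "units": {"type": "string", "enum": ["metric", "imperial"]},
    },
    "required": ["city"],
}


def classify_schema_errors(schema: dict, arguments: dict) -> dict:
    """Bucket Draft7Validator errors the way _check_tool_call does:
    missing required params, wrong types, enum violations, and the rest."""
    buckets = {"missing_required": [], "incorrect_types": [], "invalid_enum": [], "other": []}
    validator = Draft7Validator(schema)
    for error in validator.iter_errors(arguments):
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            buckets["missing_required"].append(error.message)
        elif error.validator == "type":
            buckets["incorrect_types"].append(f"{field}: {error.message}")
        elif error.validator == "enum":
            buckets["invalid_enum"].append(f"{field}: {error.message}")
        else:
            buckets["other"].append(f"{field}: {error.message}")
    return buckets


# A wrong type for "city" and an out-of-enum value for "units".
print(classify_schema_errors(WEATHER_SCHEMA, {"city": 42, "units": "kelvin"}))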