ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +9 -3
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +117 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +183 -79
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +103 -21
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
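
The hunks below walk through the source-level changes file by file. As a rough, illustrative sketch (not the registry's actual tooling), a comparable file-by-file diff can be rebuilt locally with only the Python standard library once both wheels are downloaded (e.g. with `pip download ibm-watsonx-orchestrate-evaluation-framework==1.1.1 --no-deps`); the wheel filenames here are assumptions, adjust them to whatever was fetched:

import difflib
import zipfile

# Assumed local filenames produced by `pip download ... --no-deps`.
OLD = "ibm_watsonx_orchestrate_evaluation_framework-1.1.1-py3-none-any.whl"
NEW = "ibm_watsonx_orchestrate_evaluation_framework-1.1.3-py3-none-any.whl"

def wheel_sources(path):
    # A wheel is a zip archive; collect the text of every .py member.
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace")
            for name in zf.namelist()
            if name.endswith(".py")
        }

old_files, new_files = wheel_sources(OLD), wheel_sources(NEW)
for name in sorted(old_files.keys() | new_files.keys()):
    a = old_files.get(name, "").splitlines(keepends=True)
    b = new_files.get(name, "").splitlines(keepends=True)
    hunks = difflib.unified_diff(a, b, fromfile=name, tofile=name)
    print("".join(hunks), end="")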
wxo_agentic_evaluation/red_teaming/attack_generator.py

@@ -1,19 +1,23 @@
+import ast
 import json
-import random
 import os
-import
+import random
+
 import rich
+from jsonargparse import CLI

-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.red_teaming.attack_list import RED_TEAMING_ATTACKS, print_attacks
-from wxo_agentic_evaluation.type import AttackCategory
+from wxo_agentic_evaluation.arg_configs import AttackGeneratorConfig
 from wxo_agentic_evaluation.prompt.template_render import (
-    OnPolicyAttackGeneratorTemplateRenderer,
     OffPolicyAttackGeneratorTemplateRenderer,
+    OnPolicyAttackGeneratorTemplateRenderer,
+)
+from wxo_agentic_evaluation.red_teaming.attack_list import (
+    RED_TEAMING_ATTACKS,
+    print_attacks,
 )
 from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.
-from
+from wxo_agentic_evaluation.type import AttackCategory
+from wxo_agentic_evaluation.utils.utils import load_agents

 root_dir = os.path.dirname(os.path.dirname(__file__))
 ON_POLICY_ATTACK_GENERATION_PROMPT = os.path.join(

@@ -60,13 +64,17 @@ class AttackGenerator:
                     if f.lower().endswith(".json")
                 ]
                 if not json_files:
-                    rich.print(
+                    rich.print(
+                        f"[yellow]WARNING:[/yellow] No .json files found in directory {path}"
+                    )
                     continue
                 paths_to_read = json_files
             elif os.path.isfile(path):
                 paths_to_read = [path]
             else:
-                rich.print(
+                rich.print(
+                    f"[yellow]WARNING:[/yellow] Path not found, skipping: {path}"
+                )
                 continue

             for file_path in paths_to_read:

@@ -74,7 +82,9 @@ class AttackGenerator:
                     with open(file_path) as f:
                         data = json.load(f)
                 except Exception as e:
-                    rich.print(
+                    rich.print(
+                        f"[red]ERROR:[/red] Failed to load {file_path}: {e}"
+                    )
                     continue

                 info = {

@@ -107,7 +117,7 @@ class AttackGenerator:
             if agent["name"].endswith("_manager"):
                 manager_agent_name = agent["name"]
                 break
-
+
         if manager_agent_name is None:
             manager_agent_name = target_agent_name
             rich.print(

@@ -122,7 +132,9 @@ class AttackGenerator:
             if attack.get("attack_name") == clean_name:
                 return attack
         rich.print(f"[red]ERROR:[/red] No attack found with name: {name}")
-        rich.print(
+        rich.print(
+            '[green]INFO:[/green] See the list of available attacks below under the "Name" column:'
+        )
         print_attacks()

         return None

@@ -171,7 +183,9 @@ class AttackGenerator:
                 tools_list=tools,
                 agent_instructions=policy_instructions,
                 original_story=info.get("story", ""),
-                original_starting_sentence=info.get(
+                original_starting_sentence=info.get(
+                    "starting_sentence", ""
+                ),
             )
             res = self.llm_client.query(on_policy_prompt)
             try:

@@ -221,11 +235,15 @@ class AttackGenerator:
         if attack_category == AttackCategory.off_policy:
             off_policy_prompt = self.off_policy_renderer.render(
                 original_story=info.get("story", ""),
-                original_starting_sentence=info.get(
+                original_starting_sentence=info.get(
+                    "starting_sentence", ""
+                ),
             )
             res = self.llm_client.query(off_policy_prompt)
             try:
-                off_policy_attack_data = ast.literal_eval(res.strip())[
+                off_policy_attack_data = ast.literal_eval(res.strip())[
+                    0
+                ]
             except:
                 off_policy_attack_data = {}

@@ -249,11 +267,13 @@ class AttackGenerator:
                 "modified_starting_sentence", ""
             )

-            results.append(
+            results.append(
+                {"dataset": info.get("dataset"), "attack": out}
+            )

         if output_dir is None:
             output_dir = os.path.join(os.getcwd(), "red_team_attacks")
-
+
         os.makedirs(output_dir, exist_ok=True)
         for idx, res in enumerate(results):
             attack = res.get("attack", {})
wxo_agentic_evaluation/red_teaming/attack_runner.py

@@ -1,29 +1,31 @@
-
-
-
-
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-    get_wxo_client,
     WXOInferenceBackend,
+    get_wxo_client,
+)
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
 )
-from wxo_agentic_evaluation.type import AttackData
-
-from wxo_agentic_evaluation.arg_configs import AttackConfig
 from wxo_agentic_evaluation.red_teaming.attack_evaluator import AttackEvaluator
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import AttackData
 from wxo_agentic_evaluation.utils import json_dump

-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-from rich.progress import Progress
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI
-

 def process_attack(task_n, attack_path, config, inference_backend, llm_user):
     tc_name = os.path.basename(attack_path).replace(".json", "")

@@ -31,7 +33,9 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         attack: AttackData = AttackData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running attack: {tc_name}[/bold magenta]")
     history, _, _ = evaluation_controller.run(

@@ -46,7 +50,8 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     return result

@@ -55,19 +60,26 @@ def process_attack(task_n, attack_path, config, inference_backend, llm_user):
 def run_attacks(config: AttackConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     wxo_client = get_wxo_client(
-        config.auth_config.url,
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
         wai_client=get_provider(
-            config=config.provider_config,
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
         ),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
     )

-    print(
+    print(
+        f"Running red teaming attacks with tenant {config.auth_config.tenant_name}"
+    )
     os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

     results_list = []

@@ -81,7 +93,9 @@ def run_attacks(config: AttackConfig):
     task_n = 0

     for attack_path in attack_paths:
-        if not attack_path.endswith(".json") or attack_path.endswith(
+        if not attack_path.endswith(".json") or attack_path.endswith(
+            "agent.json"
+        ):
             continue

         future = executor.submit(

@@ -118,7 +132,9 @@ def run_attacks(config: AttackConfig):
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-    with open(
+    with open(
+        os.path.join(config.output_dir, "attacks_results.json"), "w"
+    ) as f:
         json.dump(attack_results, f, indent=2)

     print(f"Attack results saved to {config.output_dir}")
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py

@@ -1,7 +1,9 @@
 from abc import ABC

 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)


 class FunctionMetricsPrompt(MetricPrompt, ABC):
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py

@@ -1,14 +1,7 @@
 import json
 from enum import Enum
 from pathlib import Path
-from typing import
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterable, List, Tuple, Union

 from pydantic import ValidationError

@@ -18,7 +11,10 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function
 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
     FunctionSelectionPrompt,
 )
-from wxo_agentic_evaluation.referenceless_eval.metrics import
+from wxo_agentic_evaluation.referenceless_eval.metrics import (
+    Metric,
+    MetricPrompt,
+)

 PromptType = Union[
     GeneralMetricsPrompt,

@@ -70,7 +66,9 @@ def load_prompts_from_jsonl(
         raise LoaderError(f"File not found: {path}")

     prompts: List[PromptType] = []
-    for lineno, raw in enumerate(
+    for lineno, raw in enumerate(
+        p.read_text(encoding="utf-8").splitlines(), start=1
+    ):
         if not raw.strip():
             continue
         try:

@@ -96,7 +94,9 @@ def load_prompts_from_jsonl(
         # Instantiate prompt
         prompt: MetricPrompt
         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -158,7 +158,9 @@ def load_prompts_from_list(
             raise LoaderError(f"Record {idx} invalid schema: {e}") from e

         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=rec["task_description"]
+            )
         except TypeError:
             prompt = PromptCls(metric=metric)

@@ -167,11 +169,15 @@ def load_prompts_from_list(
                 user_kwargs = ex["user_kwargs"]
                 output = ex["output"]
             except KeyError as e:
-                raise LoaderError(
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} missing {e}"
+                ) from e
             try:
                 prompt.add_example(user_kwargs, output)
             except (ValidationError, ValueError) as e:
-                raise LoaderError(
+                raise LoaderError(
+                    f"Record {idx}, example {ex_idx} invalid: {e}"
+                ) from e

         prompts.append(prompt)

@@ -211,7 +217,9 @@ def load_prompts_from_metrics(

         # Instantiate prompt with the metric's description as task_description
         try:
-            prompt = PromptCls(
+            prompt = PromptCls(
+                metric=metric, task_description=metric.description
+            )
         except TypeError:
             # Fallback if constructor signature differs
             prompt = PromptCls(metric=metric)
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py

@@ -1,8 +1,4 @@
-from typing import
-    Any,
-    Dict,
-    List,
-)
+from typing import Any, Dict, List

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     ToolCall,

@@ -65,7 +61,9 @@ class OpenAIAdapter(BaseAdapter):
         ]

     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
-        tool = next(
+        tool = next(
+            (t for t in self.specs if t.function.name == tool_name), None
+        )
         return tool.function.model_dump() if tool else {}

     def get_call_dict(self) -> Dict[str, Any]:

@@ -87,11 +85,18 @@ class OpenAIAdapter(BaseAdapter):

     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
         spec = next(
-            (
+            (
+                s
+                for s in self.specs
+                if s.function.name == self.get_function_name()
+            ),
+            None,
         )
         if not spec:
             return {"type": "object", "properties": {}, "required": []}
-        props = spec.function.parameters.get(
+        props = spec.function.parameters.get(
+            "properties", spec.function.parameters
+        )
         if param_name not in props:
             return {"type": "object", "properties": {}, "required": []}
         return {
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py

@@ -1,8 +1,9 @@
-import json
 import importlib.resources
+import json
 from pathlib import Path
 from typing import Dict, List, Optional, Union

+from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
     SemanticChecker,
 )

@@ -19,13 +20,16 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.
-
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    LLMKitWrapper,
+)
+

 def metrics_dir():
     path = importlib.resources.files(metrics)
     return path

+
 # Default metric JSON paths
 _METRICS_DIR = metrics_dir()
 _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"

@@ -36,10 +40,14 @@ _DEFAULT_FUNCSEL = (
     _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
 )
 _DEFAULT_FUNCSEL_RUNTIME = (
-    _METRICS_DIR
+    _METRICS_DIR
+    / "function_selection"
+    / "function_selection_metrics_runtime.json"
 )
 _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
-_DEFAULT_PARAM_RUNTIME =
+_DEFAULT_PARAM_RUNTIME = (
+    _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+)


 class ReflectionPipeline:

@@ -88,11 +96,19 @@ class ReflectionPipeline:
         for metrics, default_path in [
             (
                 self.general_metrics,
-
+                (
+                    _DEFAULT_GENERAL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_GENERAL
+                ),
             ),
             (
                 self.function_metrics,
-
+                (
+                    _DEFAULT_FUNCSEL_RUNTIME
+                    if runtime_pipeline
+                    else _DEFAULT_FUNCSEL
+                ),
             ),
             (
                 self.parameter_metrics,

@@ -104,7 +120,9 @@ class ReflectionPipeline:
                 continue

             # Handle metric names list
-            if isinstance(metrics, list) and all(
+            if isinstance(metrics, list) and all(
+                isinstance(x, str) for x in metrics
+            ):
                 # Load the default JSON file
                 if not default_path.is_file():
                     raise FileNotFoundError(

@@ -116,7 +134,9 @@ class ReflectionPipeline:

                 # Filter metrics by name
                 filtered_metrics = [
-                    metric
+                    metric
+                    for metric in all_metrics
+                    if metric.get("name") in metrics
                 ]

                 # Remove examples from prompts if requested

@@ -125,7 +145,9 @@ class ReflectionPipeline:
                         metric.pop("examples", None)

                 if len(filtered_metrics) != len(metrics):
-                    found_names = {
+                    found_names = {
+                        metric.get("name") for metric in filtered_metrics
+                    }
                     missing = set(metrics) - found_names
                     raise ValueError(f"Metrics not found: {missing}")

@@ -140,14 +162,20 @@ class ReflectionPipeline:
            if isinstance(metrics, list) and all(
                isinstance(x, FunctionCallMetric) for x in metrics
            ):
-                metrics_definitions.append(
+                metrics_definitions.append(
+                    [metric.model_dump() for metric in metrics]
+                )
            else:
                if not metrics.is_file():
-                    raise FileNotFoundError(
+                    raise FileNotFoundError(
+                        f"Metrics file not found: {metrics}"
+                    )
                metrics_definitions.append(
                    [
                        json.loads(json_obj)
-                        for json_obj in metrics.read_text(
+                        for json_obj in metrics.read_text(
+                            encoding="utf8"
+                        ).splitlines()
                        if json_obj.strip()
                    ]
                )
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py

@@ -1,14 +1,7 @@
 import json
 import math
 import re
-from typing import
-    Any,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, List, Optional, Tuple, Union

 from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
     GeneralMetricsPrompt,

@@ -39,11 +32,13 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolSpec,
     TransformResult,
 )
-from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import (
     MetricRunner,
     MetricRunResult,
 )
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)


 class SemanticChecker:

@@ -231,7 +226,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                general_results = SemanticCategoryResult.from_results(
+                general_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 general_results = {"error": str(e)}
         else:

@@ -261,7 +258,9 @@ class SemanticChecker:
                     schema_param_name="schema",
                     retries=retries,
                 )
-                function_results = SemanticCategoryResult.from_results(
+                function_results = SemanticCategoryResult.from_results(
+                    sync_results
+                )
             except Exception as e:
                 function_results = {"error": str(e)}
         else:

@@ -272,7 +271,9 @@ class SemanticChecker:
         for pname, pval in params.items():
             # Each parameter has its own prompts
             try:
-                param_entries: List[
+                param_entries: List[
+                    Tuple[ParameterMetricsPrompt, Dict[str, Any]]
+                ] = []
                 for prompt in self.parameter_prompts:
                     param_entries.append(
                         (

@@ -351,7 +352,10 @@ class SemanticChecker:
         )
         gen_code = self.codegen_client.generate(
             prompt=[
-                {
+                {
+                    "role": "system",
+                    "content": GENERATE_CODE_SYSTEM,
+                },
                 {"role": "user", "content": prompt},
             ],
             schema=GENERATE_CODE_SCHEMA,

@@ -386,11 +390,15 @@ class SemanticChecker:
         """
         Strip code fences, install imports, exec code, compare, return TransformResult.
         """
-        clean = re.sub(
+        clean = re.sub(
+            r"^```(?:python)?|```$", "", code, flags=re.MULTILINE
+        ).strip()

         # install imports
         for mod in set(
-            re.findall(
+            re.findall(
+                r"^(?:import|from)\s+([A-Za-z0-9_]+)", clean, flags=re.MULTILINE
+            )
         ):
             try:
                 __import__(mod)

@@ -417,7 +425,9 @@ class SemanticChecker:

         out_t = fn_t(user_val)
         out_c = fn_c(api_val)
-        if isinstance(out_t, (int, float)) and isinstance(
+        if isinstance(out_t, (int, float)) and isinstance(
+            out_c, (int, float)
+        ):
             success = math.isclose(out_t, out_c, abs_tol=1e-3)
         else:
             success = str(out_t) == str(out_c)