ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Note: this release of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/tool_planner.py
CHANGED

@@ -1,26 +1,35 @@
-import json
 import ast
 import csv
-from pathlib import Path
 import importlib.util
-import
-from jsonargparse import CLI
+import json
 import os
+import re
 import sys
 import textwrap
-from dataclasses import
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+
+from jsonargparse import CLI
 
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
-from wxo_agentic_evaluation.prompt.template_render import ToolPlannerTemplateRenderer, ArgsExtractorTemplateRenderer
 from wxo_agentic_evaluation import __file__
+from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
+from wxo_agentic_evaluation.prompt.template_render import (
+    ArgsExtractorTemplateRenderer,
+    ToolPlannerTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
 
 root_dir = os.path.dirname(__file__)
-TOOL_PLANNER_PROMPT_PATH = os.path.join(
-
+TOOL_PLANNER_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "tool_planner.jinja2"
+)
+ARGS_EXTRACTOR_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "args_extractor_prompt.jinja2"
+)
 
 MISSING_DOCSTRING_PROMPT = "No description available"
 
+
 class UniversalEncoder(json.JSONEncoder):
     def default(self, obj):
         if is_dataclass(obj):
@@ -29,12 +38,15 @@ class UniversalEncoder(json.JSONEncoder):
             return obj.__dict__
         return super().default(obj)
 
+
 def extract_first_json_list(raw: str) -> list:
     matches = re.findall(r"\[\s*{.*?}\s*]", raw, re.DOTALL)
     for match in matches:
         try:
             parsed = json.loads(match)
-            if isinstance(parsed, list) and all(
+            if isinstance(parsed, list) and all(
+                "tool_name" in step for step in parsed
+            ):
                 return parsed
         except Exception:
             continue
@@ -42,6 +54,7 @@ def extract_first_json_list(raw: str) -> list:
     print(raw)
     return []
 
+
 def parse_json_string(input_string):
     json_char_count = 0
     json_objects = []
@@ -79,12 +92,16 @@ def load_tools_module(tools_path: Path) -> dict:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
             module_name = file_path.stem
-            spec = importlib.util.spec_from_file_location(
+            spec = importlib.util.spec_from_file_location(
+                module_name, file_path
+            )
             module = importlib.util.module_from_spec(spec)
             parent_dir = str(file_path.parent)
             sys_path_modified = False
@@ -99,7 +116,7 @@ def load_tools_module(tools_path: Path) -> dict:
             # Add all module's non-private functions to tools_dict
             for attr_name in dir(module):
                 attr = getattr(module, attr_name)
-                if callable(attr) and not attr_name.startswith(
+                if callable(attr) and not attr_name.startswith("_"):
                     tools_dict[attr_name] = attr
         except Exception as e:
             print(f"Warning: Failed to load {file_path}: {str(e)}")
@@ -117,7 +134,9 @@ def extract_tool_signatures(tools_path: Path) -> list:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -128,19 +147,24 @@ def extract_tool_signatures(tools_path: Path) -> list:
             for node in parsed_code.body:
                 if isinstance(node, ast.FunctionDef):
                     name = node.name
-                    args = [
+                    args = [
+                        arg.arg for arg in node.args.args if arg.arg != "self"
+                    ]
                     docstring = ast.get_docstring(node)
-                    tool_data.append(
-
-
-
-
+                    tool_data.append(
+                        {
+                            "Function Name": name,
+                            "Arguments": args,
+                            "Docstring": docstring or MISSING_DOCSTRING_PROMPT,
+                        }
+                    )
         except Exception as e:
             print(f"Warning: Failed to parse {file_path}: {str(e)}")
             continue
 
     return tool_data
 
+
 def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
     functions = {}
     files_to_parse = []
@@ -151,7 +175,9 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
     elif tools_path.is_dir():
         files_to_parse.extend(tools_path.glob("**/*.py"))
     else:
-        raise ValueError(
+        raise ValueError(
+            f"Tools path {tools_path} is neither a file nor directory"
+        )
 
     for file_path in files_to_parse:
         try:
@@ -168,23 +194,35 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
                     for arg in node.args.args:
                         if arg.arg == "self":
                             continue
-                        annotation =
+                        annotation = (
+                            ast.unparse(arg.annotation)
+                            if arg.annotation
+                            else "Any"
+                        )
                         args.append((arg.arg, annotation))
 
                     # Get return type
-                    returns =
+                    returns = (
+                        ast.unparse(node.returns) if node.returns else "None"
+                    )
 
                     # Get docstring
                     docstring = ast.get_docstring(node)
-                    docstring =
+                    docstring = (
+                        textwrap.dedent(docstring).strip() if docstring else ""
+                    )
 
                     # Format parameter descriptions if available in docstring
                    doc_lines = docstring.splitlines()
                     doc_summary = doc_lines[0] if doc_lines else ""
-                    param_descriptions = "\n".join(
+                    param_descriptions = "\n".join(
+                        [line for line in doc_lines[1:] if ":param" in line]
+                    )
 
                     # Compose the final string
-                    args_str = ", ".join(
+                    args_str = ", ".join(
+                        f"{arg}: {type_}" for arg, type_ in args
+                    )
                     function_str = f"""def {name}({args_str}) -> {returns}:
     {doc_summary}"""
                     if param_descriptions:
@@ -197,9 +235,18 @@ def extract_tool_signatures_for_prompt(tools_path: Path) -> dict[str, str]:
 
     return functions
 
-
+
+def ensure_data_available(
+    step: dict,
+    inputs: dict,
+    snapshot: dict,
+    tools_module: dict,
+    tool_signatures_for_prompt,
+) -> dict:
     tool_name = step["tool_name"]
-    cache = snapshot.setdefault("input_output_examples", {}).setdefault(
+    cache = snapshot.setdefault("input_output_examples", {}).setdefault(
+        tool_name, []
+    )
     for entry in cache:
         if entry["inputs"] == inputs:
             return entry["output"]
@@ -212,7 +259,11 @@ def ensure_data_available(step: dict, inputs: dict, snapshot: dict, tools_module
     except:
         provider = get_provider(
             model_id="meta-llama/llama-3-405b-instruct",
-            params={
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 500,
+            },
         )
         renderer = ArgsExtractorTemplateRenderer(ARGS_EXTRACTOR_PROMPT_PATH)
 
@@ -226,14 +277,19 @@ def ensure_data_available(step: dict, inputs: dict, snapshot: dict, tools_module
     try:
         output = tools_module[json_obj["tool_name"]](**json_obj["inputs"])
     except:
-        raise ValueError(
+        raise ValueError(
+            f"Failed to execute tool '{tool_name}' with inputs {inputs}"
+        )
 
     cache.append({"inputs": inputs, "output": output})
     if not isinstance(output, dict):
         print(f" Tool {tool_name} returned non-dict output: {output}")
     return output
 
-
+
+def plan_tool_calls_with_llm(
+    story: str, agent_name: str, tool_signatures_str: str, provider
+) -> list:
 
     renderer = ToolPlannerTemplateRenderer(TOOL_PLANNER_PROMPT_PATH)
 
@@ -250,7 +306,9 @@ def plan_tool_calls_with_llm(story: str, agent_name: str, tool_signatures_str: s
 
 
 # --- Tool Execution Logic ---
-def run_tool_chain(
+def run_tool_chain(
+    tool_plan: list, snapshot: dict, tools_module, tool_signatures_for_prompt
+) -> None:
     memory = {}
 
     for step in tool_plan:
@@ -280,7 +338,9 @@ def run_tool_chain(tool_plan: list, snapshot: dict, tools_module, tool_signature
 
         if list_keys:
             if len(list_keys) > 1:
-                raise ValueError(
+                raise ValueError(
+                    f"Tool '{name}' received multiple list inputs. Only one supported for now."
+                )
             list_key = list_keys[0]
             value_list = resolved_inputs[list_key]
 
@@ -289,20 +349,36 @@ def run_tool_chain(tool_plan: list, snapshot: dict, tools_module, tool_signature
                 item_inputs = resolved_inputs.copy()
                 item_inputs[list_key] = val
                 print(f" ⚙️ Running {name} with {list_key} = {val}")
-                output = ensure_data_available(
+                output = ensure_data_available(
+                    step,
+                    item_inputs,
+                    snapshot,
+                    tools_module,
+                    tool_signatures_for_prompt,
+                )
                 results.append(output)
                 memory[f"{name}_{idx}"] = output
 
             memory[name] = results
-            print(
+            print(
+                f"Stored {len(results)} outputs under '{name}' and indexed as '{name}_i'"
+            )
         else:
-            output = ensure_data_available(
+            output = ensure_data_available(
+                step,
+                resolved_inputs,
+                snapshot,
+                tools_module,
+                tool_signatures_for_prompt,
+            )
             memory[name] = output
             print(f"Stored output under tool name: {name} = {output}")
 
 
 # --- Main Snapshot Builder ---
-def build_snapshot(
+def build_snapshot(
+    agent_name: str, tools_path: Path, stories: list, output_path: Path
+):
     agent = {"name": agent_name}
     tools_module = load_tools_module(tools_path)
     tool_signatures = extract_tool_signatures(tools_path)
@@ -310,20 +386,28 @@ def build_snapshot(agent_name: str, tools_path: Path, stories: list, output_path
 
     provider = get_provider(
         model_id="meta-llama/llama-3-405b-instruct",
-        params={
+        params={
+            "min_new_tokens": 1,
+            "decoding_method": "greedy",
+            "max_new_tokens": 2048,
+        },
     )
 
     snapshot = {
         "agent": agent,
         "tools": tool_signatures,
-        "input_output_examples": {}
+        "input_output_examples": {},
     }
 
     for story in stories:
         print(f"\n📘 Planning tool calls for story: {story}")
-        tool_plan = plan_tool_calls_with_llm(
+        tool_plan = plan_tool_calls_with_llm(
+            story, agent["name"], tool_signatures, provider
+        )
         try:
-            run_tool_chain(
+            run_tool_chain(
+                tool_plan, snapshot, tools_module, tool_signatures_for_prompt
+            )
         except ValueError as e:
             print(f"❌ Error running tool chain for story '{story}': {e}")
             continue
@@ -340,7 +424,7 @@ if __name__ == "__main__":
 
     stories = []
     agent_name = None
-    with stories_path.open("r", encoding="utf-8", newline=
+    with stories_path.open("r", encoding="utf-8", newline="") as f:
        csv_reader = csv.DictReader(f)
         for row in csv_reader:
             stories.append(row["story"])
@@ -349,4 +433,4 @@ if __name__ == "__main__":
 
     snapshot_path = stories_path.parent / f"{agent_name}_snapshot_llm.json"
 
-    build_snapshot(agent_name, tools_path, stories, snapshot_path)
+    build_snapshot(agent_name, tools_path, stories, snapshot_path)
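Most of the tool_planner.py changes above are formatting reflow, but the planner's JSON-recovery path deserves a concrete illustration: extract_first_json_list regex-scans raw LLM output for the first bracketed list whose every object carries a "tool_name" key, skipping candidates that fail to parse. A minimal standalone sketch of that behavior (the raw_reply sample below is invented for illustration, not taken from the package):

    import json
    import re


    def extract_first_json_list(raw: str) -> list:
        # Non-greedy scan for any "[ { ... } ]" span, across newlines.
        matches = re.findall(r"\[\s*{.*?}\s*]", raw, re.DOTALL)
        for match in matches:
            try:
                parsed = json.loads(match)
                # Accept only a list of plan steps that each name a tool.
                if isinstance(parsed, list) and all(
                    "tool_name" in step for step in parsed
                ):
                    return parsed
            except Exception:
                continue
        return []


    # Invented sample of a chatty LLM reply that wraps a JSON plan.
    raw_reply = 'Plan: [{"tool_name": "get_weather", "inputs": {"city": "Austin"}}] Done.'
    print(extract_first_json_list(raw_reply))
    # [{'tool_name': 'get_weather', 'inputs': {'city': 'Austin'}}]

The non-greedy match matters: it lets the function pull the first well-formed plan out of a reply that mixes prose and JSON instead of failing on the surrounding text.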
wxo_agentic_evaluation/type.py
CHANGED

@@ -1,10 +1,7 @@
-from typing import Dict, List, Union, Any, Optional
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field
-)
 from enum import StrEnum
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel, ConfigDict, Field
 from rich.text import Text
 
 
@@ -61,9 +58,13 @@ class ConversationalConfidenceThresholdScore(BaseModel):
     def table(self):
         return {
             "response_confidence": str(self.response_confidence),
-            "response_confidence_threshold": str(
+            "response_confidence_threshold": str(
+                self.response_confidence_threshold
+            ),
             "retrieval_confidence": str(self.retrieval_confidence),
-            "retrieval_confidence_threshold": str(
+            "retrieval_confidence_threshold": str(
+                self.retrieval_confidence_threshold
+            ),
         }
 
 
@@ -120,12 +121,14 @@ class GoalDetail(BaseModel):
     keywords: List = None
     knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
 
+
 class AttackData(BaseModel):
     attack_category: AttackCategory
     attack_type: str
     attack_name: str
     attack_instructions: str
 
+
 class AttackData(BaseModel):
     agent: str
     agents_path: str
@@ -143,8 +146,8 @@ class EvaluationData(BaseModel):
     goal_details: List[GoalDetail]
     starting_sentence: str = None
 
+
 class ToolDefinition(BaseModel):
     tool_description: Optional[str]
     tool_name: str
     tool_params: List[str]
-
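Two things in the type.py hunks are worth noting. First, the context lines show two consecutive AttackData(BaseModel) definitions; in Python the second binding shadows the first at import time, so this looks like a pre-existing naming collision rather than something introduced in 1.1.2. Second, the only reflowed method, table(), still stringifies every confidence field. A minimal sketch of that pattern (the field declarations sit outside the hunk, so the float types and zero defaults below are assumed for illustration):

    from pydantic import BaseModel


    class ConversationalConfidenceThresholdScore(BaseModel):
        # Assumed field types/defaults; only the table() body comes from the diff.
        response_confidence: float = 0.0
        response_confidence_threshold: float = 0.0
        retrieval_confidence: float = 0.0
        retrieval_confidence_threshold: float = 0.0

        def table(self):
            # Stringify every field so a table renderer can treat them uniformly.
            return {
                "response_confidence": str(self.response_confidence),
                "response_confidence_threshold": str(
                    self.response_confidence_threshold
                ),
                "retrieval_confidence": str(self.retrieval_confidence),
                "retrieval_confidence_threshold": str(
                    self.retrieval_confidence_threshold
                ),
            }


    print(ConversationalConfidenceThresholdScore(response_confidence=0.9).table())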
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py
CHANGED

@@ -1,11 +1,12 @@
 import ast
 import re
 from pathlib import Path
-from typing import
+from typing import Any, List, Mapping, Union
+
 
 class PythonTypeToJsonType:
     OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
-
+
     @staticmethod
     def python_to_json_type(python_annotation: str):
         if not python_annotation:
@@ -25,30 +26,33 @@ class PythonTypeToJsonType:
             return "object"
         if python_annotation.startswith("optional"):
             # extract the type within Optional[T]
-            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(
+                python_annotation
+            ).group(1)
             return PythonTypeToJsonType.python_to_json_type(inner_type)
 
         return "string"
 
+
 class ToolExtractionOpenAIFormat:
     @staticmethod
     def get_default_arguments(node):
-        """
+        """Returns the default arguments (if any)
 
         The default arguments are stored in args.default array.
         Since, in Python, the default arguments only come after positional arguments,
         we can index the argument array starting from the last `n` arguments, where n is
         the length of the default arguments.
 
-        ex.
+        ex.
         def add(a, b=5):
             pass
-
+
         Then we have,
         args = [a, b]
         defaults = [Constant(value=5)]
 
-        args[-len(defaults):] = [b]
+        args[-len(defaults):] = [b]
 
         (
         "FunctionDef(
@@ -70,12 +74,12 @@ class ToolExtractionOpenAIFormat:
         if num_defaults > 0:
             for arg in node.args.args[-num_defaults:]:
                 default_arguments.add(arg)
-
+
         return default_arguments
 
     @staticmethod
     def from_file(tools_path: Union[str, Path]) -> Mapping[str, Any]:
-        """
+        """Uses `extract_tool_signatures` function, but converts the response
         to open-ai format
 
         ```
@@ -100,7 +104,11 @@ class ToolExtractionOpenAIFormat:
         parsed_code = ast.parse(code)
         for node in parsed_code.body:
             if isinstance(node, ast.FunctionDef):
-                parameters = {
+                parameters = {
+                    "type": "object",
+                    "properties": {},
+                    "required": [],
+                }
                 function_name = node.name
                 for arg in node.args.args:
                     type_annotation = None
@@ -109,16 +117,25 @@ class ToolExtractionOpenAIFormat:
                    if arg.annotation:
                         type_annotation = ast.unparse(arg.annotation)
 
-                    parameter_type =
+                    parameter_type = (
+                        PythonTypeToJsonType.python_to_json_type(
+                            type_annotation
+                        )
+                    )
                     parameters["properties"][arg.arg] = {
                         "type": parameter_type,
-                        "description": "",
+                        "description": "",  # todo
                     }
 
-                    if
+                    if (
+                        type_annotation
+                        and "Optional" not in type_annotation
+                    ):
                         parameters["required"].append(arg.arg)
 
-                default_arguments =
+                default_arguments = (
+                    ToolExtractionOpenAIFormat.get_default_arguments(node)
+                )
                 for arg_name in parameters["required"]:
                     if arg_name in default_arguments:
                         parameters.remove(arg_name)
@@ -128,8 +145,10 @@ class ToolExtractionOpenAIFormat:
                 "function": {
                     "name": function_name,
                     "parameters": parameters,
-                    "description": ast.get_docstring(
-
+                    "description": ast.get_docstring(
+                        node
+                    ),  # fix (does not do :params)
+                },
             }
             tool_data.append(open_ai_format_fn)
 
@@ -149,9 +168,11 @@ class ToolExtractionOpenAIFormat:
         elif tools_path.is_dir():
             files_to_parse.extend(tools_path.glob("**/*.py"))
         else:
-            raise ValueError(
-
+            raise ValueError(
+                f"Tools path {tools_path} is neither a file nor directory"
+            )
+
         for file_path in files_to_parse:
             all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
-
-        return all_tools
+
+        return all_tools
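The conversion this module performs can be sketched end to end: parse a source file with ast, map each parameter annotation to a JSON-schema type, and emit an OpenAI-style function entry, treating any argument without a default as required. The sketch below is a simplified standalone version, not the package's full logic; the sample function and the trimmed type map are illustrative. It also shows the tail-indexing trick described in the get_default_arguments docstring, where defaults occupy the last len(node.args.defaults) slots of node.args.args:

    import ast

    # Trimmed stand-in for PythonTypeToJsonType.python_to_json_type.
    TYPE_MAP = {"str": "string", "int": "integer", "float": "number", "bool": "boolean"}

    source = '''
    def get_weather(city: str, units: str = "metric"):
        """Look up the current weather for a city."""
    '''

    tools = []
    for node in ast.parse(source).body:
        if isinstance(node, ast.FunctionDef):
            # Defaults occupy the tail of node.args.args, so everything
            # before that tail is a required argument.
            num_defaults = len(node.args.defaults)
            required = node.args.args[: len(node.args.args) - num_defaults]
            properties = {
                arg.arg: {"type": TYPE_MAP.get(ast.unparse(arg.annotation), "string")}
                for arg in node.args.args
                if arg.annotation
            }
            tools.append(
                {
                    "type": "function",
                    "function": {
                        "name": node.name,
                        "description": ast.get_docstring(node),
                        "parameters": {
                            "type": "object",
                            "properties": properties,
                            "required": [a.arg for a in required],
                        },
                    },
                }
            )

    print(tools[0]["function"]["parameters"]["required"])  # ['city']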
wxo_agentic_evaluation/utils/rich_utils.py
CHANGED

@@ -1,6 +1,7 @@
-from
-
+from typing import Any, List, Optional
+
 import rich
+from rich.text import Text
 
 
 def pretty_print(content: Any, style: Optional[str] = None):
@@ -33,13 +34,17 @@ def warn(
 
 
 def is_ok(
-    message: str,
+    message: str,
+    style: Optional[str] = "bold green",
+    prompt: Optional[str] = "OK ✅ :",
 ) -> Text:
     """Utility function for formatting an OK message."""
     return Text(f"{prompt}{message}\n\n", style=style)
 
 
-def print_done(
+def print_done(
+    prompt: Optional[str] = "Done ✅", style: Optional[str] = "bold cyan"
+):
     """
     Prints a prompt indicating completion of a process/routine.
     :param prompt: default is `"Done ✅"`
@@ -63,7 +68,9 @@ def print_success(
 
 
 def print_failure(
-    message: str,
+    message: str,
+    style: Optional[str] = "bold red",
+    prompt: Optional[str] = "❌ FAILED",
 ):
     """
     Prints a failure message.
@@ -108,7 +115,9 @@ class IncorrectParameterUtils:
     ]
 
     @staticmethod
-    def format_bad_description_message(
+    def format_bad_description_message(
+        tool_name: str, tool_desc: str
+    ) -> List[Text]:
 
         return [
             warn(
@@ -139,12 +148,15 @@ class TestingUtils:
         For example, this can be read as: `"{\n⚙️ Testing} {20} {good tool descriptions}"`.
         """
         pretty_print(
-            content=f"{prompt} {test_case_count} {test_description}",
+            content=f"{prompt} {test_case_count} {test_description}",
+            style=style,
         )
 
     @staticmethod
     def print_error_details(
-        expected: List[str],
+        expected: List[str],
+        detected: List[str],
+        style: Optional[str] = "bold red",
     ):
         """
         Print detailed error information.
@@ -169,6 +181,8 @@ class TestingUtils:
         :param style: The style for the text (default is bold red).
         """
         if failed_cases:
-            pretty_print(
+            pretty_print(
+                content=f"{prompt} ({len(failed_cases)}):", style=style
+            )
             for case in failed_cases:
                 pretty_print(content=f" - {case}", style=style)
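The rich_utils.py changes only spell the keyword defaults out across lines, but the call pattern those helpers support is easy to demonstrate. A hypothetical caller follows; the messages are invented, and the pretty_print body is not in the hunk, so the one-line version here is assumed:

    from typing import Any, Optional

    import rich
    from rich.text import Text


    def pretty_print(content: Any, style: Optional[str] = None):
        # Assumed body: funnel every helper through one rich call.
        rich.print(Text(str(content), style=style) if style else content)


    def is_ok(
        message: str,
        style: Optional[str] = "bold green",
        prompt: Optional[str] = "OK ✅ :",
    ) -> Text:
        """Utility function for formatting an OK message."""
        return Text(f"{prompt}{message}\n\n", style=style)


    rich.print(is_ok("all tool descriptions passed"))
    pretty_print("3 cases failed", style="bold red")

Because is_ok returns a Text rather than printing, callers can compose it into larger renderables; the print_* helpers are the fire-and-forget variants.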