ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +65 -20
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
- wxo_agentic_evaluation/inference_backend.py +117 -14
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/main.py +5 -0
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py ADDED
@@ -0,0 +1,145 @@
+import asyncio
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, TypeVar, Union
+
+from pydantic import BaseModel
+
+Prompt = Union[str, List[Dict[str, Any]]]
+PromptAndSchema = Tuple[Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]]
+SyncGen = Callable[[Prompt], Union[str, Any]]
+BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
+AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
+AsyncBatchGen = Callable[[List[Prompt]], Awaitable[List[Union[str, Any]]]]
+
+T = TypeVar("T")
+
+
+class PromptResult(BaseModel):
+    """
+    Holds the prompt sent and the response (or error).
+    """
+
+    prompt: Prompt
+    response: Optional[Any] = None
+    error: Optional[str] = None
+
+
+class PromptRunner:
+    """
+    Runs a collection of prompts through various generation strategies.
+
+    Attributes:
+        prompts: the list of prompts to run.
+    """
+
+    def __init__(
+        self, prompts: Optional[List[Union[Prompt, PromptAndSchema]]] = None
+    ) -> None:
+        """
+        Args:
+            prompts: initial list of prompts (strings or chat messages).
+        """
+        self.prompts: List[Union[Prompt, PromptAndSchema]] = prompts or []
+
+    def add_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Append a prompt to the runner."""
+        self.prompts.append(prompt)
+
+    def remove_prompt(self, prompt: Union[Prompt, PromptAndSchema]) -> None:
+        """Remove a prompt (first occurrence)."""
+        self.prompts.remove(prompt)
+
+    def clear_prompts(self) -> None:
+        """Remove all prompts."""
+        self.prompts.clear()
+
+    def get_prompt_and_schema(
+        self, prompt: Union[Prompt, PromptAndSchema]
+    ) -> Tuple[Prompt, Optional[Dict[str, Any]]]:
+        """
+        Extract the prompt and schema from a Prompt object.
+
+        Args:
+            prompt: The prompt to extract from.
+
+        Returns:
+            Tuple of (prompt, schema).
+        """
+        if isinstance(prompt, tuple):
+            return prompt[0], prompt[1]
+        return prompt, None
+
+    def run_all(
+        self,
+        gen_fn: SyncGen,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through a synchronous single-prompt generator.
+
+        Args:
+            gen_fn: Callable taking one Prompt, returning str or Any.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the function.
+
+        Returns:
+            List of PromptResult.
+        """
+        results: List[PromptResult] = []
+        for p in self.prompts:
+            try:
+                prompt, schema = self.get_prompt_and_schema(p)
+                args = {prompt_param_name: prompt, **kwargs}
+                if schema_param_name and schema:
+                    args[schema_param_name] = schema
+                resp = gen_fn(**args)
+                results.append(PromptResult(prompt=prompt, response=resp))
+            except Exception as e:
+                results.append(PromptResult(prompt=prompt, error=str(e)))
+        return results
+
+    async def run_async(
+        self,
+        async_fn: AsyncGen,
+        max_parallel: int = 10,
+        prompt_param_name: str = "prompt",
+        schema_param_name: Optional[str] = None,
+        **kwargs: Any,
+    ) -> List[PromptResult]:
+        """
+        Run each prompt through an async single-prompt generator with concurrency limit.
+        Results are returned in the same order as self.prompts.
+
+        Args:
+            async_fn: Async callable taking one Prompt, returning str or Any.
+            max_parallel: Max concurrent tasks.
+            prompt_param_name: Name of the parameter for the prompt.
+            schema_param_name: Name of the parameter for the schema.
+            kwargs: Additional arguments to pass to the async function.
+
+        Returns:
+            List of PromptResult.
+        """
+        semaphore = asyncio.Semaphore(max_parallel)
+
+        async def _run_one(index: int, p: Prompt) -> Tuple[int, PromptResult]:
+            async with semaphore:
+                try:
+                    prompt, schema = self.get_prompt_and_schema(p)
+                    args = {prompt_param_name: prompt, **kwargs}
+                    if schema_param_name and schema:
+                        args[schema_param_name] = schema
+                    resp = await async_fn(**args)
+                    return index, PromptResult(prompt=prompt, response=resp)
+                except Exception as e:
+                    return index, PromptResult(prompt=prompt, error=str(e))
+
+        tasks = [
+            asyncio.create_task(_run_one(i, p)) for i, p in enumerate(self.prompts)
+        ]
+        indexed_results = await asyncio.gather(*tasks)
+        # Sort results to match original order
+        indexed_results.sort(key=lambda x: x[0])
+        return [res for _, res in indexed_results]
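For orientation, a minimal usage sketch for the new PromptRunner (not part of the diff; the stub generator and prompt values are illustrative):

    from wxo_agentic_evaluation.referenceless_eval.prompt.runner import PromptRunner

    def echo_gen(prompt):
        # Stand-in for a real LLM call; run_all passes the prompt as a keyword argument.
        return f"echo: {prompt}"

    # Prompts may be plain strings or chat-style message lists.
    runner = PromptRunner(prompts=["Hello", [{"role": "user", "content": "Hi"}]])
    for result in runner.run_all(echo_gen):
        print(result.response or result.error)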
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, List, Mapping
|
|
4
|
+
|
|
5
|
+
import rich
|
|
6
|
+
|
|
7
|
+
from wxo_agentic_evaluation.referenceless_eval.function_calling.consts import (
|
|
8
|
+
METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
|
|
9
|
+
METRIC_GENERAL_HALLUCINATION_CHECK,
|
|
10
|
+
)
|
|
11
|
+
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.pipeline import (
|
|
12
|
+
ReflectionPipeline,
|
|
13
|
+
)
|
|
14
|
+
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
|
|
15
|
+
ToolCall,
|
|
16
|
+
ToolSpec,
|
|
17
|
+
)
|
|
18
|
+
from wxo_agentic_evaluation.type import Message
|
|
19
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXLLMKitWrapper
|
|
20
|
+
|
|
21
|
+
class ReferencelessEvaluation:
|
|
22
|
+
"""
|
|
23
|
+
Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
|
|
24
|
+
Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
|
|
25
|
+
---
|
|
26
|
+
Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
|
|
27
|
+
Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
|
|
28
|
+
Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
|
|
29
|
+
"""
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
api_spec: List[Mapping[str, Any]],
|
|
33
|
+
messages: List[Message],
|
|
34
|
+
model_id: str,
|
|
35
|
+
task_n: str,
|
|
36
|
+
dataset_name: str,
|
|
37
|
+
):
|
|
38
|
+
self.metrics_client = WatsonXLLMKitWrapper(
|
|
39
|
+
model_id=model_id,
|
|
40
|
+
api_key=os.getenv("WATSONX_APIKEY", ""),
|
|
41
|
+
space_id=os.getenv("WATSONX_SPACE_ID")
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
self.pipeline = ReflectionPipeline(
|
|
45
|
+
metrics_client=self.metrics_client,
|
|
46
|
+
general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
|
|
47
|
+
function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
|
|
48
|
+
parameter_metrics=None,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
self.task_n = task_n
|
|
52
|
+
self.dataset_name = dataset_name
|
|
53
|
+
|
|
54
|
+
self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
|
|
55
|
+
self.messages = messages
|
|
56
|
+
|
|
57
|
+
def _run_pipeline(self, examples: List[Mapping[str, Any]]):
|
|
58
|
+
results = []
|
|
59
|
+
for example in examples:
|
|
60
|
+
# self.pipeline.sy
|
|
61
|
+
result = self.pipeline.run_sync(
|
|
62
|
+
conversation=example["context"],
|
|
63
|
+
inventory=self.apis_specs,
|
|
64
|
+
call=example["call"],
|
|
65
|
+
continue_on_static=False,
|
|
66
|
+
retries=2,
|
|
67
|
+
)
|
|
68
|
+
result_dict = result.model_dump()
|
|
69
|
+
results.append(result_dict)
|
|
70
|
+
|
|
71
|
+
return results
|
|
72
|
+
|
|
73
|
+
def run(self):
|
|
74
|
+
examples = []
|
|
75
|
+
|
|
76
|
+
processed_data = [
|
|
77
|
+
{k: msg.model_dump().get(k) for k in ["role", "content", "type"] if k in msg.model_dump()}
|
|
78
|
+
for msg in self.messages
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
for idx, message in enumerate(processed_data):
|
|
82
|
+
role = message["role"]
|
|
83
|
+
content = message["content"]
|
|
84
|
+
context = processed_data[:idx]
|
|
85
|
+
|
|
86
|
+
if role == "assistant" and message["type"] == "tool_call":
|
|
87
|
+
tool_call_msg = json.loads(content)
|
|
88
|
+
if tool_call_msg["name"].startswith("transfer_to"):
|
|
89
|
+
continue
|
|
90
|
+
|
|
91
|
+
call = {
|
|
92
|
+
"call": {
|
|
93
|
+
"id": tool_call_msg.get("id", "1"),
|
|
94
|
+
"type": "function",
|
|
95
|
+
"function": {
|
|
96
|
+
"name": tool_call_msg["name"],
|
|
97
|
+
"arguments": json.dumps(tool_call_msg["args"]),
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
"context": context,
|
|
101
|
+
}
|
|
102
|
+
examples.append(call)
|
|
103
|
+
|
|
104
|
+
rich.print(
|
|
105
|
+
f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
|
|
106
|
+
)
|
|
107
|
+
examples = [
|
|
108
|
+
{
|
|
109
|
+
"call": ToolCall.model_validate(ex["call"]),
|
|
110
|
+
"context": ex["context"],
|
|
111
|
+
}
|
|
112
|
+
for ex in examples
|
|
113
|
+
]
|
|
114
|
+
results = self._run_pipeline(examples)
|
|
115
|
+
|
|
116
|
+
return results
|
|
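The class docstring above encodes the intended reading order of a pipeline result. A minimal triage sketch under that assumed result shape (field names taken from the docstring notes, not from a published schema):

    def triage(result: dict) -> str:
        # Static checks gate everything: if any failed, semantic metrics were not run.
        static = result.get("static", {})
        if not static.get("final_decision"):
            return f"static check failed: {static.get('explanation')}"
        # Otherwise inspect each semantic metric's is_correct verdict.
        for name, metric in (result.get("semantic") or {}).items():
            if metric.get("is_correct") is False:
                return f"semantic issue flagged by {name}"
        return "tool call looks correct"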
wxo_agentic_evaluation/service_instance.py CHANGED
@@ -49,10 +49,10 @@ class ServiceInstance:
     def get_user_token(self):
         try:
             if self.is_saas:
-                apikey = os.environ.get("
+                apikey = os.environ.get("WO_API_KEY")
                 if not apikey:
                     raise RuntimeError(
-                        "
+                        "WO_API_KEY not set in environment for SaaS mode"
                     )
             if self.is_ibm_cloud:
                 data = {
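A small usage note (illustrative, not from the diff): in SaaS mode the token path now depends on this environment variable being set before the call, e.g.

    import os
    # WO_API_KEY must be present before ServiceInstance.get_user_token() runs
    # in SaaS mode; the value below is a placeholder.
    os.environ.setdefault("WO_API_KEY", "<api-key>")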
wxo_agentic_evaluation/service_provider/watsonx_provider.py CHANGED
@@ -2,10 +2,12 @@ import os
 import requests
 import json
 from types import MappingProxyType
-from typing import List
+from typing import List, Mapping, Union, Optional, Any
+from functools import singledispatchmethod
 import dataclasses
 from threading import Lock
 import time
+import rich
 from wxo_agentic_evaluation.service_provider.provider import Provider
 
 ACCESS_URL = "https://iam.cloud.ibm.com/identity/token"
@@ -88,7 +90,12 @@ class WatsonXProvider(Provider):
                    "Content-Type": "application/json"}
         return headers
 
-
+    @singledispatchmethod
+    def generate(self, sentence):
+        raise ValueError("Input must either be a string or a list of dictionaries")
+
+    @generate.register
+    def _(self, sentence: str):
         headers = self.prepare_header()
 
         data = {"model_id": self.model_id, "input": sentence,
@@ -100,6 +107,22 @@ class WatsonXProvider(Provider):
         else:
             resp.raise_for_status()
 
+    @generate.register
+    def _(self, sentence: list):
+        chat_url = f"{self.api_endpoint}/ml/v1/text/chat?version=2023-05-02"
+        headers = self.prepare_header()
+        data = {
+            "model_id": self.model_id,
+            "messages": sentence,
+            "parameters": self.params,
+            "space_id": self.space_id
+        }
+        resp = requests.post(url=chat_url, headers=headers, json=data)
+        if resp.status_code == 200:
+            return resp.json()
+        else:
+            resp.raise_for_status()
+
     def _refresh_token(self):
         # if we do not have a token or the current timestamp is 9 minutes away from expire.
         if not self.access_token or time.time() > self.refresh_time:
@@ -107,11 +130,18 @@ class WatsonXProvider(Provider):
         if not self.access_token or time.time() > self.refresh_time:
             self.access_token, self.refresh_time = self._get_access_token()
 
-    def query(self, sentence: str) -> str:
+    def query(self, sentence: Union[str, Mapping[str, str]]) -> str:
         if self.model_id is None:
             raise Exception("model id must be specified for text generation")
         try:
-
+            response = self.generate(sentence)
+            if (generated_text := response.get("generated_text")):
+                return generated_text
+            elif (message := response.get("message")):
+                return message
+            else:
+                raise ValueError(f"Unexpected response from WatsonX: {response}")
+
         except Exception as e:
             with self.lock:
                 if "authentication_token_expired" in str(e):
@@ -135,6 +165,90 @@ class WatsonXProvider(Provider):
         else:
             resp.raise_for_status()
 
+class LLMResponse:
+    """
+    NOTE: Taken from LLM-Eval-Kit
+    Response object that can contain both content and tool calls
+    """
+
+    def __init__(self, content: str, tool_calls: Optional[List[Mapping[str, Any]]] = None):
+        self.content = content
+        self.tool_calls = tool_calls or []
+
+    def __str__(self) -> str:
+        """Return the content of the response as a string."""
+        return self.content
+
+    def __repr__(self) -> str:
+        """Return a string representation of the LLMResponse object."""
+        return f"LLMResponse(content='{self.content}', tool_calls={self.tool_calls})"
+
+class WatsonXLLMKitWrapper(WatsonXProvider):
+    def generate(
+        self,
+        prompt: Union[str, List[Mapping[str, str]]],
+        *,
+        schema,
+        retries: int = 3,
+        generation_args: Optional[Any] = None,
+        **kwargs: Any
+    ):
+        """
+        In future, implement validation of the response as in llmevalkit.
+        """
+        for attempt in range(1, retries + 1):
+            try:
+                raw_response = super().generate(prompt)
+                response = self._parse_llm_response(raw_response)
+                return response
+            except Exception as e:
+                rich.print(f"[b][r] WatsonX generation failed with error '{str(e)}' during `quick-eval` ... Attempt ({attempt} / {retries})")
+
+    def _parse_llm_response(self, raw: Any) -> Union[str, LLMResponse]:
+        """
+        Extract the generated text and tool calls from a watsonx response.
+
+        - For text generation: raw['results'][0]['generated_text']
+        - For chat: raw['choices'][0]['message']['content']
+        """
+        content = ""
+        tool_calls = []
+
+        if isinstance(raw, dict) and "choices" in raw:
+            choices = raw["choices"]
+            if isinstance(choices, list) and choices:
+                first = choices[0]
+                msg = first.get("message")
+                if isinstance(msg, dict):
+                    content = msg.get("content", "")
+                    # Extract tool calls if present
+                    if "tool_calls" in msg and msg["tool_calls"]:
+                        tool_calls = []
+                        for tool_call in msg["tool_calls"]:
+                            tool_call_dict = {
+                                "id": tool_call.get("id"),
+                                "type": tool_call.get("type", "function"),
+                                "function": {
+                                    "name": tool_call.get("function", {}).get("name"),
+                                    "arguments": tool_call.get("function", {}).get(
+                                        "arguments"
+                                    ),
+                                },
+                            }
+                            tool_calls.append(tool_call_dict)
+                elif "text" in first:
+                    content = first["text"]
+
+        if not content and not tool_calls:
+            raise ValueError(f"Unexpected watsonx response format: {raw!r}")
+
+        # Return LLMResponse if tool calls exist, otherwise just content
+        if tool_calls:
+            return LLMResponse(content=content, tool_calls=tool_calls)
+
+        return content
 
 if __name__ == "__main__":
     provider = WatsonXProvider(model_id="meta-llama/llama-3-2-90b-vision-instruct")
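For orientation, a minimal sketch of the new dispatch behavior (assumes credentials and endpoint are already configured; not part of the diff):

    provider = WatsonXProvider(model_id="meta-llama/llama-3-2-90b-vision-instruct")
    # A plain string dispatches to the text-generation endpoint...
    raw_text = provider.generate("Say hello in one word.")
    # ...while a chat-style message list dispatches to /ml/v1/text/chat.
    raw_chat = provider.generate([{"role": "user", "content": "Say hello in one word."}])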
wxo_agentic_evaluation/utils/utils.py CHANGED
@@ -19,6 +19,8 @@ root_dir = os.path.dirname(__file__)
 TOOL_PLANNER_PROMPT_PATH = os.path.join(root_dir, "prompt", "tool_planner.jinja2")
 ARGS_EXTRACTOR_PROMPT_PATH = os.path.join(root_dir, "prompt", "args_extractor_prompt.jinja2")
 
+MISSING_DOCSTRING_PROMPT = "No description available"
+
 class UniversalEncoder(json.JSONEncoder):
     def default(self, obj):
         if is_dataclass(obj):
@@ -131,7 +133,7 @@ def extract_tool_signatures(tools_path: Path) -> list:
             tool_data.append({
                 "Function Name": name,
                 "Arguments": args,
-                "Docstring": docstring or
+                "Docstring": docstring or MISSING_DOCSTRING_PROMPT
             })
         except Exception as e:
             print(f"Warning: Failed to parse {file_path}: {str(e)}")
wxo_agentic_evaluation/type.py CHANGED
@@ -1,6 +1,11 @@
 from typing import Dict, List, Union, Any, Optional
-from pydantic import
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field
+)
 from enum import StrEnum
+from rich.text import Text
 
 
 class EventTypes(StrEnum):
@@ -20,6 +25,11 @@ class ContentType(StrEnum):
     conversational_search = "conversational_search"
 
 
+class AttackCategory(StrEnum):
+    on_policy = "on_policy"
+    off_policy = "off_policy"
+
+
 class ConversationalSearchCitations(BaseModel):
     url: str
     body: str
@@ -93,7 +103,7 @@ class Message(BaseModel):
 
 class ExtendedMessage(BaseModel):
     message: Message
-    reason: dict | None = None
+    reason: dict | list | None = None
 
 
 class KnowledgeBaseGoalDetail(BaseModel):
@@ -110,6 +120,21 @@ class GoalDetail(BaseModel):
     keywords: List = None
     knowledge_base: KnowledgeBaseGoalDetail = KnowledgeBaseGoalDetail()
 
+class AttackData(BaseModel):
+    attack_category: AttackCategory
+    attack_type: str
+    attack_name: str
+    attack_instructions: str
+
+class AttackData(BaseModel):
+    agent: str
+    agents_path: str
+    attack_data: AttackData
+    story: str
+    starting_sentence: str
+    goals: Dict = None
+    goal_details: List[GoalDetail] = None
+
 
 class EvaluationData(BaseModel):
     agent: str
@@ -117,3 +142,9 @@ class EvaluationData(BaseModel):
     story: str
     goal_details: List[GoalDetail]
     starting_sentence: str = None
+
+class ToolDefinition(BaseModel):
+    tool_description: Optional[str]
+    tool_name: str
+    tool_params: List[str]
+
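A small illustrative note (not from the diff): because AttackCategory is a StrEnum, its members behave as plain strings, which is convenient when serializing attack metadata:

    # StrEnum members compare and format as their string values.
    assert AttackCategory.on_policy == "on_policy"
    assert f"category={AttackCategory.off_policy}" == "category=off_policy"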
wxo_agentic_evaluation/utils/open_ai_tool_extractor.py ADDED
@@ -0,0 +1,157 @@
+import ast
+import re
+from pathlib import Path
+from typing import Union, Mapping, Any, List
+
+class PythonTypeToJsonType:
+    OPTIONAL_PARAM_EXTRACT = re.compile(r"[Oo]ptional\[(\w+)\]")
+
+    @staticmethod
+    def python_to_json_type(python_annotation: str):
+        if not python_annotation:
+            return "string"
+        python_annotation = python_annotation.lower().strip()
+        if "str" == python_annotation:
+            return "string"
+        if "int" == python_annotation:
+            return "integer"
+        if "float" == python_annotation:
+            return "number"
+        if "bool" == python_annotation:
+            return "boolean"
+        if python_annotation.startswith("list"):
+            return "array"
+        if python_annotation.startswith("dict"):
+            return "object"
+        if python_annotation.startswith("optional"):
+            # extract the type within Optional[T]
+            inner_type = PythonTypeToJsonType.OPTIONAL_PARAM_EXTRACT.search(python_annotation).group(1)
+            return PythonTypeToJsonType.python_to_json_type(inner_type)
+
+        return "string"
+
+class ToolExtractionOpenAIFormat:
+    @staticmethod
+    def get_default_arguments(node):
+        """Returns the names of the defaulted arguments (if any).
+
+        The default values are stored in node.args.defaults.
+        Since, in Python, defaulted arguments only come after positional
+        arguments, we can index the argument array starting from the last
+        `n` arguments, where n is the length of the defaults list.
+
+        ex.
+            def add(a, b=5):
+                pass
+
+        Then we have:
+            args = [a, b]
+            defaults = [Constant(value=5)]
+            args[-len(defaults):] = [b]
+
+        FunctionDef(
+            name='add',
+            args=arguments(
+                posonlyargs=[],
+                args=[arg(arg='a'), arg(arg='b')],
+                kwonlyargs=[],
+                kw_defaults=[],
+                defaults=[Constant(value=5)]),
+            body=[Return(value=BinOp(left=Name(id='a', ctx=Load()), op=Add(),
+            right=Name(id='b', ctx=Load())))], decorator_list=[], type_params=[])
+        """
+        default_arguments = set()
+        num_defaults = len(node.args.defaults)
+        if num_defaults > 0:
+            for arg in node.args.args[-num_defaults:]:
+                default_arguments.add(arg.arg)  # store the name, not the ast.arg node
+
+        return default_arguments
+
+    @staticmethod
+    def from_file(tools_path: Union[str, Path]) -> Mapping[str, Any]:
+        """Uses the `extract_tool_signatures` logic, but converts the response
+        to OpenAI format:
+
+            function_spec = {
+                "type": "function",
+                "function": {
+                    "name": func_name,
+                    "description": description,
+                    "parameters": parameters,
+                },
+            }
+        """
+        tool_data = []
+        tools_path = Path(tools_path)
+
+        with tools_path.open("r", encoding="utf-8") as f:
+            code = f.read()
+
+        try:
+            parsed_code = ast.parse(code)
+            for node in parsed_code.body:
+                if isinstance(node, ast.FunctionDef):
+                    parameters = {"type": "object", "properties": {}, "required": []}
+                    function_name = node.name
+                    for arg in node.args.args:
+                        type_annotation = None
+                        if arg.arg == "self":
+                            continue
+                        if arg.annotation:
+                            type_annotation = ast.unparse(arg.annotation)
+
+                        parameter_type = PythonTypeToJsonType.python_to_json_type(type_annotation)
+                        parameters["properties"][arg.arg] = {
+                            "type": parameter_type,
+                            "description": "",  # todo
+                        }
+
+                        if type_annotation and "Optional" not in type_annotation:
+                            parameters["required"].append(arg.arg)
+
+                    default_arguments = ToolExtractionOpenAIFormat.get_default_arguments(node)
+                    # drop defaulted arguments from the required list
+                    parameters["required"] = [
+                        arg_name for arg_name in parameters["required"]
+                        if arg_name not in default_arguments
+                    ]
+
+                    open_ai_format_fn = {
+                        "type": "function",
+                        "function": {
+                            "name": function_name,
+                            "parameters": parameters,
+                            "description": ast.get_docstring(node)  # fix (does not do :params)
+                        }
+                    }
+                    tool_data.append(open_ai_format_fn)
+
+        except Exception as e:
+            print(f"Warning: Failed to parse {tools_path}: {str(e)}")
+
+        return tool_data
+
+    @staticmethod
+    def from_path(tools_path: Union[str, Path]) -> List[Mapping[str, Any]]:
+        tools_path = Path(tools_path)
+        files_to_parse = []
+        all_tools = []
+
+        if tools_path.is_file():
+            files_to_parse.append(tools_path)
+        elif tools_path.is_dir():
+            files_to_parse.extend(tools_path.glob("**/*.py"))
+        else:
+            raise ValueError(f"Tools path {tools_path} is neither a file nor directory")
+
+        for file_path in files_to_parse:
+            all_tools.extend(ToolExtractionOpenAIFormat.from_file(file_path))
+
+        return all_tools
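A quick usage sketch for the extractor (the "tools/" path is a placeholder):

    # Extract OpenAI-format tool specs from every .py file under a tools directory.
    specs = ToolExtractionOpenAIFormat.from_path("tools/")
    for spec in specs:
        fn = spec["function"]
        print(fn["name"], fn["parameters"]["required"])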