PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.0.0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46) hide show

ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
wxo_agentic_evaluation/__init__.py +0 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
wxo_agentic_evaluation/analytics/tools/main.py +163 -0
wxo_agentic_evaluation/analytics/tools/types.py +130 -0
wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
wxo_agentic_evaluation/analyze_run.py +123 -0
wxo_agentic_evaluation/annotate.py +40 -0
wxo_agentic_evaluation/arg_configs.py +78 -0
wxo_agentic_evaluation/batch_annotate.py +181 -0
wxo_agentic_evaluation/data_annotator.py +253 -0
wxo_agentic_evaluation/evaluation_package.py +518 -0
wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
wxo_agentic_evaluation/external_agent/types.py +65 -0
wxo_agentic_evaluation/inference_backend.py +601 -0
wxo_agentic_evaluation/llm_matching.py +39 -0
wxo_agentic_evaluation/llm_rag_eval.py +47 -0
wxo_agentic_evaluation/llm_user.py +38 -0
wxo_agentic_evaluation/main.py +231 -0
wxo_agentic_evaluation/metrics/__init__.py +0 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
wxo_agentic_evaluation/metrics/metrics.py +101 -0
wxo_agentic_evaluation/prompt/__init__.py +0 -0
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
wxo_agentic_evaluation/prompt/template_render.py +90 -0
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
wxo_agentic_evaluation/record_chat.py +165 -0
wxo_agentic_evaluation/service_instance.py +179 -0
wxo_agentic_evaluation/tool_planner.py +228 -0
wxo_agentic_evaluation/type.py +176 -0
wxo_agentic_evaluation/utils/__init__.py +6 -0
wxo_agentic_evaluation/utils/utils.py +233 -0
wxo_agentic_evaluation/watsonx_provider.py +175 -0

wxo_agentic_evaluation/utils/utils.py ADDED Viewed

@@ -0,0 +1,233 @@
+from urllib.parse import urlparse
+from rich.console import Console, Group
+from rich.table import Table
+from rich.panel import Panel
+from rich.rule import Rule
+from rich import box
+from rich import print
+from typing import List
+from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
+from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
+console = Console()
+class AgentMetricsTable:
+    def __init__(self, data):
+        self.table = Table(
+            title="Agent Metrics",
+            box=box.ROUNDED,
+            show_lines=True,
+        )
+        if not data:
+            return
+        # Add columns with styling
+        headers = list(data[0].keys())
+        for header in headers:
+            self.table.add_column(header, style="cyan")
+        # Add rows
+        for row in data:
+            self.table.add_row(*[str(row.get(col, "")) for col in headers])
+    def print(self):
+        console.print(self.table)
+def create_table(data: List[dict]) -> AgentMetricsTable:
+    """
+    Generate a Rich table from a list of dictionaries.
+    Returns the AgentMetricsTable instance.
+    """
+    if isinstance(data, dict):
+        data = [data]
+    if not data:
+        print("create_table() received an empty dataset. No table generated.")
+        return None
+    return AgentMetricsTable(data)
+def create_average_row(results_list, include_response_time=True):
+    # TO-DO: we are hiding some the columns to allow the table to display properly
+    # need a better solution
+    columns = {
+        "average_over_test_cases": [
+            "Total Step",
+            "Agent Step",
+            # "Tool Call Accuracy",
+            # "Tool Call Relevancy",
+            # "Agent Routing Accuracy"
+        ],
+        "average_over_ground_truth_calls": [
+            "Wrong Function Calls",
+            # "Bad Calls",
+            "Wrong Parameters",
+            "Wrong Routing Calls",
+        ],
+    }
+    if include_response_time:
+        columns["average_over_test_cases"].append("Avg Resp Time (Secs)")
+    journey_success_values = [
+        1 if item["Journey Success"] else 0 for item in results_list
+    ]
+    journey_success_avg = round(
+        sum(journey_success_values) / len(journey_success_values), 2
+    )
+    text_match_values = [
+        1 if item["Text Match"] == "Summary Matched" else 0 for item in results_list
+    ]
+    text_match_avg = round(sum(text_match_values) / len(text_match_values), 2)
+    ground_truth_calls = sum(item["Ground Truth Calls"] for item in results_list)
+    ground_truth_calls_avg = round(ground_truth_calls / len(results_list), 2)
+    avg_row = {
+        col: round(sum(item[col] for item in results_list) / len(results_list), 2)
+        for col in columns["average_over_test_cases"]
+    }
+    avg_row.update(
+        {
+            col: round(sum(item[col] for item in results_list) / ground_truth_calls, 2)
+            for col in columns["average_over_ground_truth_calls"]
+        }
+    )
+    # TODO: FIX as part of PR
+    # avg_row["WXO Average Response Time (Secs)"] = round(sum(item["WXO Average Response Time (Secs)"] for item in results_list) / len(results_list), 2)
+    avg_row["Journey Success"] = journey_success_avg
+    avg_row["Text Match"] = text_match_avg
+    avg_row["Ground Truth Calls"] = ground_truth_calls_avg
+    avg_row["Dataset"] = "Summary (Average)"
+    return avg_row
+def is_saas_url(service_url: str) -> bool:
+    hostname = urlparse(service_url).hostname
+    return hostname not in ("localhost", "127.0.0.1", "0.0.0.0", "::1")
+def is_ibm_cloud_url(service_url: str) -> bool:
+    hostname = urlparse(service_url).hostname
+    return ".cloud.ibm.com" in hostname
+def add_line_seperator():
+    console.print(Rule(style="grey42"))
+class FaithfulnessTable:
+    def __init__(
+        self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
+    ):
+        self.table = Table(title="Faithfulness", box=box.ROUNDED, show_lines=True)
+        self.table.add_column("Tool Call Id", style="blue")
+        self.table.add_column("Faithfulness Score", style="blue3")
+        self.table.add_column("Evidence", style="cyan")
+        self.table.add_column("Reasoning", style="yellow3")
+        for tool_call_id, faithfulness in zip(tool_call_ids, faithfulness_metrics):
+            faithfulness = faithfulness.table()
+            self.table.add_row(
+                tool_call_id,
+                faithfulness["faithfulness_score"],
+                faithfulness["evidence"],
+                faithfulness["reason"],
+            )
+    def print(self):
+        console.print(self.table)
+class ConversationalSearchTable:
+    def __init__(
+        self,
+        confidence_scores_list: List[ConversationalConfidenceThresholdScore],
+        tool_call_ids: List[str],
+    ):
+        self.table = Table(
+            title="Conversational Search", box=box.ROUNDED, show_lines=True
+        )
+        self.table.add_column("Tool Call Id", style="blue")
+        self.table.add_column("Response Confidence", style="blue3")
+        self.table.add_column("Response Confidence Threshold", style="cyan")
+        self.table.add_column("Retrieval Confidence", style="blue3")
+        self.table.add_column("Retrieval Confidence Threshold", style="cyan")
+        for tool_call_id, confidence_scores in zip(
+            tool_call_ids, confidence_scores_list
+        ):
+            confidence_scores = confidence_scores.table()
+            self.table.add_row(
+                tool_call_id,
+                confidence_scores["response_confidence"],
+                confidence_scores["response_confidence_threshold"],
+                confidence_scores["retrieval_confidence"],
+                confidence_scores["retrieval_confidence_threshold"],
+            )
+class KnowledgePanel:
+    def __init__(
+        self,
+        dataset_name: str,
+        tool_call_id: List[str],
+        faithfulness: List[Faithfulness] = None,
+        confidence_scores: List[ConversationalConfidenceThresholdScore] = None,
+    ):
+        self.faithfulness = FaithfulnessTable(faithfulness, tool_call_id)
+        self.confidence_scores = ConversationalSearchTable(
+            confidence_scores, tool_call_id
+        )
+        self.group = Group(self.faithfulness.table, self.confidence_scores.table)
+        # Panel acts as a section
+        self.section = Panel(
+            self.group,
+            title=f"Agent with Knowledge Metrics for {dataset_name}",
+            border_style="grey37",
+            title_align="left",
+        )
+    def print(self):
+        console.print(self.section)
+class SummaryPanel:
+    def __init__(self, summary_metrics: KnowledgeBaseMetricSummary):
+        self.table = Table(
+            title="Agent with Knowledge Summary Metrics",
+            box=box.ROUNDED,
+            show_lines=True,
+        )
+        self.table.add_column("Dataset", style="blue3")
+        self.table.add_column("Average Response Confidence", style="cyan")
+        self.table.add_column("Average Retrieval Confidence", style="blue3")
+        self.table.add_column("Average Faithfulness", style="cyan")
+        self.table.add_column("Average Answer Relevancy", style="blue3")
+        self.table.add_column("Number Calls to Knowledge Bases", style="cyan")
+        self.table.add_column("Knowledge Bases Called", style="blue3")
+        average_metrics = summary_metrics.average
+        for dataset, metrics in average_metrics.items():
+            self.table.add_row(
+                dataset,
+                str(round(metrics["average_response_confidence"], 4)),
+                str(round(metrics["average_retrieval_confidence"], 4)),
+                str(metrics["average_faithfulness"]),
+                str(metrics["average_answer_relevancy"]),
+                str(metrics["number_of_calls"]),
+                metrics["knowledge_bases_called"],
+            )
+    def print(self):
+        console.print(self.table)

wxo_agentic_evaluation/watsonx_provider.py ADDED Viewed

@@ -0,0 +1,175 @@
+import os
+import requests
+import json
+from types import MappingProxyType
+from typing import List
+import dataclasses
+from ibm_watsonx_ai.foundation_models import ModelInference, Embeddings
+from ibm_watsonx_ai.credentials import Credentials
+from threading import Lock
+# IAM token endpoint; WatsonXProvider posts the API key here to obtain an
+# access token (default for its `url` argument).
+ACCESS_URL = "https://iam.cloud.ibm.com/identity/token"
+# Headers for the token request: form-encoded body, JSON response.
+ACCESS_HEADER = {
+    "content-type": "application/x-www-form-urlencoded",
+    "accept": "application/json",
+}
+# NOTE(review): presumably a QA/staging watsonx endpoint; it is not
+# referenced anywhere in the visible code - confirm before relying on it.
+YPQA_URL = "https://yp-qa.ml.cloud.ibm.com"
+# Default `api_endpoint` for WatsonXProvider.
+PROD_URL = "https://us-south.ml.cloud.ibm.com"
+# Default decoding parameters. Wrapped in a read-only mapping so the shared
+# default cannot be mutated; WatsonXProvider copies it into a plain dict.
+DEFAULT_PARAM = MappingProxyType(
+    {"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 100}
+)
+class WatsonXProvider:
+    def __init__(
+        self,
+        model_id=None,
+        api_key=None,
+        space_id=None,
+        api_endpoint=PROD_URL,
+        url=ACCESS_URL,
+        timeout=60,
+        llm_decode_parameter=DEFAULT_PARAM,
+        embedding_model_id=None,
+    ):
+        super().__init__()
+        self.url = url
+        if (embedding_model_id is None) and (model_id is None):
+            raise Exception("either model_id or embedding_model_id must be specified")
+        self.model_id = model_id
+        api_key = os.environ.get("WATSONX_APIKEY", api_key)
+        if not api_key:
+            raise Exception("apikey must be specified")
+        self.api_key = api_key
+        self.access_data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": self.api_key,
+        }
+        self.api_endpoint = api_endpoint
+        space_id = os.environ.get("WATSONX_SPACE_ID", space_id)
+        if not space_id:
+            raise Exception("space id must be specified")
+        self.space_id = space_id
+        self.timeout = timeout
+        self.embedding_model_id = embedding_model_id
+        self.lock = Lock()
+        if isinstance(llm_decode_parameter, MappingProxyType):
+            llm_decode_parameter = dict(llm_decode_parameter)
+        if dataclasses.is_dataclass(llm_decode_parameter):
+            llm_decode_parameter = dataclasses.asdict(llm_decode_parameter)
+        self.decode_param = llm_decode_parameter
+        self._refresh_token()
+    def _get_access_token(self):
+        response = requests.post(
+            self.url, headers=ACCESS_HEADER, data=self.access_data, timeout=self.timeout
+        )
+        if response.status_code == 200:
+            token_data = json.loads(response.text)
+            token = token_data["access_token"]
+            return token
+        raise RuntimeError(
+            f"try to acquire access token and get {response.status_code}"
+        )
+    def _refresh_token(self):
+        self.access_token = self._get_access_token()
+        if self.embedding_model_id is not None:
+            self.embedding_client = Embeddings(
+                model_id=self.embedding_model_id,
+                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
+                space_id=self.space_id,
+            )
+        else:
+            self.embedding_client = None
+        if self.model_id is not None:
+            self.client = ModelInference(
+                model_id=self.model_id,
+                params=self.decode_param,
+                credentials=Credentials(token=self.access_token, url=self.api_endpoint),
+                space_id=self.space_id,
+            )
+        else:
+            self.client = None
+    def query(self, sentence: str) -> dict:
+        if self.model_id is None:
+            raise Exception("model id must be specified for text generation")
+        try:
+            return self.client.generate([sentence])[0][
+                "results"
+            ][  # pylint: disable=E1136
+                0
+            ]
+        except Exception as e:
+            with self.lock:
+                if "authentication_token_expired" in str(e):
+                    self._refresh_token()
+                raise e
+    def batch_query(self, sentences: List[str]) -> List[dict]:
+        if self.model_id is None:
+            raise Exception("model id must be specified for text generation")
+        try:
+            outputs = self.client.generate(sentences)
+            outputs = [output["results"][0] for output in outputs]
+            return outputs
+        except Exception as e:
+            with self.lock:
+                if "authentication_token_expired" in str(e):
+                    self._refresh_token()
+                raise e
+        # pylint: disable=E1133
+        return []
+    def encode(self, sentences: List[str]) -> List[list]:
+        if self.embedding_model_id is None:
+            raise Exception("embedding model id must be specified for text encoding")
+        output = self.embedding_client.generate(sentences)
+        return [entry["embedding"] for entry in output["results"]]
+if __name__ == "__main__":
+    # Manual smoke test: sends one hard-coded prompt to the service and
+    # prints the result. No api_key/space_id is passed, so WATSONX_APIKEY and
+    # WATSONX_SPACE_ID must be set in the environment.
+    # `os` is already imported at module level; this import is redundant.
+    import os
+    provider = WatsonXProvider(model_id="meta-llama/llama-3-2-90b-vision-instruct")
+    # NOTE(review): the literal below contains a "{% endfor -%}" line that
+    # looks like a leftover Jinja tag from a template - confirm whether it is
+    # intended to be sent to the model verbatim.
+    prompt = """
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+Your username is nwaters and you want to find out timeoff schedule for yourself for 20250101 to 20250303
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+Reminder:
+- try to respond only once per input
+- if you get everything you need. respond with END
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+my username is nwaters. what's my timeoff schedule?<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+we need to call get_assignment_id for username nwaters. do you want to make the function call? yes/no<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+yes<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+ok, i have your assignment id. what's the start and end date?<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+start and end is 20250101 to 20250303<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
+Usernwaters did not take anytime off during the period<|eot_id|>
+{% endfor -%}
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+"""
+    print(provider.query(prompt))