PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/utils/parsers.py ADDED Viewed

@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
class ReferencelessEvalParser:
    """Turn raw referenceless-evaluation results into failed-test-case models.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def static_parser(
        static_metrics: Mapping[str, Mapping[str, Any]],
    ) -> List[FailedStaticTestCases]:
        """Collect every static metric that is not marked valid.

        Args:
            static_metrics: Mapping of metric name to that metric's result
                mapping. Only the ``valid``, ``description`` and
                ``explanation`` keys are read.

        Returns:
            One ``FailedStaticTestCases`` per metric whose ``valid`` entry is
            missing or falsy, in the mapping's iteration order.
        """
        return [
            FailedStaticTestCases(
                metric_name=metric,
                description=metric_data.get("description"),
                explanation=metric_data.get("explanation"),
            )
            for metric, metric_data in static_metrics.items()
            if not metric_data.get("valid", False)
        ]

    @staticmethod
    def parse_annotations(
        actionable_reccomendations, filters: List[str]
    ) -> Optional[List[Annotation]]:
        """Build annotations for the recommendations selected by ``filters``.

        Args:
            actionable_reccomendations: Iterable of mappings, each read for
                the keys ``parameter_name``, ``recommendation``, ``details``
                and ``quote``. May be ``None`` or empty.
            filters: Recommendation values to keep; an entry is kept when its
                ``recommendation`` value is contained in this list.

        Returns:
            The matching annotations, or ``None`` when nothing matches.
        """
        # semantic_parser forwards ``data.get(...)`` unchanged, so a payload
        # without recommendations reaches this method as None.
        if not actionable_reccomendations:
            return None
        annotations = [
            Annotation(
                parameter_name=recc.get("parameter_name"),
                recommendation=recc.get("recommendation"),
                details=recc.get("details"),
                quote=recc.get("quote"),
            )
            for recc in actionable_reccomendations
            if recc.get("recommendation") in filters
        ]
        # Callers distinguish "no annotations" by None, not by an empty list.
        return annotations or None

    @staticmethod
    def semantic_parser(
        metric_name, data, annotation_filters: Optional[List[str]]
    ):
        """Build a ``FailedSemanticTestCases`` from one semantic metric result.

        Args:
            metric_name: Name stored on the resulting test case.
            data: Mapping read for ``evidence``, ``explanation``, ``output``,
                ``confidence`` and ``actionable_recommendations``.
            annotation_filters: Recommendation values to keep as annotations.
                When ``None`` or empty, annotations are not parsed at all.

        Returns:
            The populated ``FailedSemanticTestCases``; its ``annotations``
            attribute is assigned only when at least one annotation matched.
        """
        semantic_metric = FailedSemanticTestCases(
            metric_name=metric_name,
            evidence=data.get("evidence"),
            explanation=data.get("explanation"),
            output=data.get("output"),
            confidence=data.get("confidence"),
        )
        if annotation_filters and (
            annotations := ReferencelessEvalParser.parse_annotations(
                data.get("actionable_recommendations"), annotation_filters
            )
        ):
            semantic_metric.annotations = annotations
        return semantic_metric

wxo_agentic_evaluation/utils/utils.py CHANGED Viewed

@@ -1,10 +1,15 @@
+import csv
 import glob
 import json
+import math
 import os
 import re
-from typing import List, Optional, Union
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
 from urllib.parse import urlparse
+import rich
 import yaml
 from rich import box, print
 from rich.console import Console, Group
@@ -17,14 +22,25 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
 from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
     ReferenceLessEvalMetrics,
+    ToolCallAndRoutingMetrics,
 )
 from wxo_agentic_evaluation.type import (
     ConversationalConfidenceThresholdScore,
+    ExtendedMessage,
     Message,
 )
 console = Console()
+RUN_FILE_RE = re.compile(
+    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+)
+N_A = "N/A"
+# File name constants
+REFERENCE_FILE_NAME = "reference"
+EXPERIMENT_FILE_NAME = "experiment"
 class AttackResultsTable:
     def __init__(self, attack_results: dict):
@@ -64,10 +80,100 @@ class AttackResultsTable:
         console.print(self.table)
class TestCaseResources:
    """Read the artifacts that an evaluation run saved under ``output_dir``.

    ``summary_metrics.csv`` is read from the directory itself; per-test-case
    JSON files are read from its ``messages`` sub-directory.
    """

    def __init__(self, output_dir: str):
        """Todo flesh out for all resources that are saved"""
        self.output_dir = Path(output_dir)

    def _resolve_path(self, test_case_name, path, suffix: str, label: str):
        """Return the file to load, raising when it does not exist.

        ``test_case_name`` takes precedence over ``path``: when given, the
        path is ``<output_dir>/messages/<test_case_name>.<suffix>``.
        """
        if test_case_name:
            path = os.path.join(
                self.output_dir,
                "messages",
                f"{test_case_name}.{suffix}",
            )
        # str() keeps the check well-defined when neither argument was given
        # (path is None), which then reports "... found at None".
        if not Path(str(path)).is_file():
            rich.print(f"[r]No {label} file found at {path}")
            raise FileNotFoundError(f"No {label} file found at {path}")
        return path

    @property
    def get_summary(self):
        """Rows of ``summary_metrics.csv`` as dicts keyed by the header row.

        This is a property, so it is accessed without parentheses. All values
        are returned as strings. An empty file yields an empty list.
        """
        summary_path = self.output_dir / "summary_metrics.csv"
        # newline="" is what the csv module expects, so that quoted fields
        # containing line breaks are parsed correctly.
        with open(summary_path, "r", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return []
            return [dict(zip(header, row)) for row in reader]

    def get_analyze_messages(
        self, test_case_name=None, path=None
    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
        """Load ``<test_case_name>.messages.analyze.json`` (or ``path``).

        Returns:
            ``(messages, meta)``. ``meta`` is taken from a trailing list entry
            that is a dict containing the key ``"meta"``; that entry is not
            turned into a message. ``meta`` is ``None`` when absent.

        Raises:
            FileNotFoundError: If the resolved file does not exist.
        """
        path = self._resolve_path(
            test_case_name, path, "messages.analyze.json", "analyze"
        )
        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)
        meta = None
        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
            meta = entries[-1]["meta"]
            entries = entries[:-1]
        return [ExtendedMessage(**entry) for entry in entries], meta

    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
        """Load ``<test_case_name>.messages.json`` (or ``path``) as messages.

        Raises:
            FileNotFoundError: If the resolved file does not exist.
        """
        path = self._resolve_path(
            test_case_name, path, "messages.json", "messages"
        )
        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)
        return [Message(**entry) for entry in entries]

    def get_test_metrics(
        self, test_case_name=None, path=None
    ) -> ToolCallAndRoutingMetrics:
        """Load ``<test_case_name>.metrics.json`` (or ``path``).

        Raises:
            FileNotFoundError: If the resolved file does not exist.
        """
        path = self._resolve_path(
            test_case_name, path, "metrics.json", "metrics"
        )
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)
        return ToolCallAndRoutingMetrics(**payload)
 class AgentMetricsTable:
-    def __init__(self, data):
+    def __init__(self, data, title: Optional[str] = None):
+        if title is None:
+            title = "Agent Metrics"
         self.table = Table(
-            title="Agent Metrics",
+            title=title,
             box=box.ROUNDED,
             show_lines=True,
         )
@@ -88,7 +194,9 @@ class AgentMetricsTable:
         console.print(self.table)
-def create_table(data: List[dict]) -> AgentMetricsTable:
+def create_table(
+    data: List[dict], title: Optional[str] = None
+) -> AgentMetricsTable:
     """
     Generate a Rich table from a list of dictionaries.
     Returns the AgentMetricsTable instance.
@@ -100,7 +208,47 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
         print("create_table() received an empty dataset. No table generated.")
         return None
-    return AgentMetricsTable(data)
+    return AgentMetricsTable(data, title=title)
def mean(vals: List[float]) -> float:
    """
    Return the arithmetic mean of ``vals`` rounded to two decimal places.
    Args:
        vals: Values to average
    Returns:
        The rounded mean, or 0.0 when ``vals`` is empty
    """
    if not vals:
        return 0.0
    total = sum(vals)
    return round(total / len(vals), 2)
def to_pct(value: Optional[float], decimals: int = 0) -> str:
    """
    Convert a ratio to a percentage string.
    The value is multiplied by 100 and rounded with ``round``; because the
    result stays a float, ``to_pct(0.5)`` gives ``"50.0%"``.
    Args:
        value: Ratio to convert (e.g. 0.25 for 25%), or None
        decimals: Number of decimal places passed to ``round``
    Returns:
        Percentage string, or "NA" when the value is None or not numeric
    """
    if value is None:
        return "NA"
    try:
        percentage = round(float(value) * 100, decimals)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric input is reported as "NA" rather than raised.
        return "NA"
    return f"{percentage}%"
def average(array) -> float:
    """Return the unrounded mean of ``array``, or NaN when it is empty."""
    count = len(array)
    return sum(array) / count if count else math.nan
 def safe_divide(nom, denom):
@@ -122,20 +270,114 @@ def is_ibm_cloud_url(service_url: str) -> bool:
 def add_line_seperator(
     style_config: Optional[Union[str, Style]] = None,
+    print=True,
 ):
+    """
+    Adds a lined seperator provided the style config.
+    `print` is a boolean to indicate if the lined seperator should go to stdout immeadiatly or returned as an object.
+    Set `print` to False, the lined seperator is printed later as part of the pager view for example.
+    """
     if not style_config:
         style = "grey42"
     else:
         style = style_config
-    console.print(
-        Rule(
-            style=style,
+    if print:
+        console.print(
+            Rule(
+                style=style,
+            )
         )
+    else:
+        return Rule(style=style, characters="==")
def get_reference_column(base_name: str) -> str:
    """Return ``base_name`` followed by ``_`` and ``REFERENCE_FILE_NAME``."""
    return "{}_{}".format(base_name, REFERENCE_FILE_NAME)
def get_experiment_column(base_name: str) -> str:
    """Return ``base_name`` followed by ``_`` and ``EXPERIMENT_FILE_NAME``."""
    return "{}_{}".format(base_name, EXPERIMENT_FILE_NAME)
def get_diff_column(base_name: str) -> str:
    """Return the name of the diff column for ``base_name``."""
    return "{}_diff".format(base_name)
def get_column_value(
    row: Dict[str, Any], base_name: str, file_type: str
) -> Any:
    """Look up the reference- or experiment-suffixed column in ``row``.
    Args:
        row: The data row
        base_name: Column name without suffix
        file_type: 'reference' or 'experiment' (case-insensitive)
    Returns:
        The stored value, or None when the suffixed column is absent
    Raises:
        ValueError: If ``file_type`` is neither of the two accepted values
    """
    kind = file_type.lower()
    if kind not in ("reference", "experiment"):
        raise ValueError(f"Invalid file_type: {file_type}")
    make_key = (
        get_reference_column if kind == "reference" else get_experiment_column
    )
    return row.get(make_key(base_name))
def has_column_in_both(row: Dict[str, Any], base_name: str) -> bool:
    """Return True when ``row`` has both the reference and experiment column."""
    required = (
        get_reference_column(base_name),
        get_experiment_column(base_name),
    )
    return all(key in row for key in required)
def format_ratio(ratio: Optional[float]) -> str:
    """Render ``ratio`` as a one-decimal percentage, or "N/A" for None."""
    return "N/A" if ratio is None else "{:.1f}%".format(ratio * 100)
def read_file(path: str, type: str = "csv") -> List[Dict[str, Any]]:
    """Load ``path`` according to ``type`` and return its rows.

    Only "csv" is implemented; "json" is recognised but raises
    NotImplementedError, and any other value raises ValueError.
    """
    if type not in ("csv", "json"):
        raise ValueError(f"Unsupported file type: {type}")
    if type == "json":
        # Add JSON reading logic if needed
        raise NotImplementedError("JSON reading not yet implemented")
    return read_csv_file(path)
def read_csv_file(file_path: str) -> List[Dict[str, Any]]:
    """Read a CSV file into a list of per-row dictionaries.

    Values are converted as follows:
      * ``dataset_name`` and ``text_match`` are left as read;
      * ``is_success`` becomes True only for the text "true" (any case);
      * every other text value becomes a float when it parses as one and is
        left as text otherwise.
    Cells missing from a short row stay None, and surplus cells of a long row
    stay as the list ``csv.DictReader`` stores under the key None.

    Args:
        file_path: Path of the CSV file; the first line is the header.
    Returns:
        One dictionary per data row, in file order.
    """
    data = []
    # newline="" is what the csv module expects for correct handling of
    # line breaks inside quoted fields.
    with open(file_path, "r", newline="") as f:
        for row in csv.DictReader(f):
            # Only existing keys are reassigned, so iterating the row while
            # updating it is safe.
            for key, value in row.items():
                if key in ("dataset_name", "text_match"):
                    continue
                if not isinstance(value, str):
                    # None (short row) or a list (long row): nothing to parse.
                    continue
                if key == "is_success":
                    row[key] = value.lower() == "true"
                    continue
                try:
                    row[key] = float(value)
                except ValueError:
                    # Not numeric; keep the original text.
                    pass
            data.append(row)
    return data
 class FaithfulnessTable:
     def __init__(
         self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
@@ -346,6 +588,7 @@ class ReferencelessEvalPanel:
 # Function to load messages from JSON file
 def load_messages(file_path):
+    """TODO: replace in favor of TestCaseResources.get_messages(...)"""
     with open(file_path, "r") as f:
         try:
             message_data = json.load(f)
@@ -361,7 +604,7 @@ def load_messages(file_path):
             return None
-def load_agents(agents_path: str):
+def load_agents_from_disk(agents_path: str):
     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
@@ -376,3 +619,64 @@ def load_agents(agents_path: str):
             agents.append(yaml.safe_load(f))
     return agents
def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
    """
    Group the per-run files of one dataset found in ``messages_dir``.
    File names are matched against ``RUN_FILE_RE``; only files whose base
    name equals ``dataset_base`` are considered.
    Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None, "messages": path|None}
    `filter_run` restricts the result to that run id. If it is -1, files of all runs are returned.
    For example, with `data3.run1.messages.json` and `data3.run2.messages.json` present and filter_run=2,
    only the files of the second run are returned.
    """
    slot_for_kind = {
        "messages.analyze": "analyze",
        "metrics": "metrics",
        "messages": "messages",
    }
    runs = defaultdict(
        lambda: {"analyze": None, "metrics": None, "messages": None}
    )
    for file_name in os.listdir(messages_dir):
        match = RUN_FILE_RE.match(file_name)
        if match is None or match.group("base") != dataset_base:
            continue
        run_id = int(match.group("run"))
        if filter_run not in (-1, run_id):
            continue
        slot = slot_for_kind[match.group("kind")]
        runs[run_id][slot] = os.path.join(messages_dir, file_name)
    return runs
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
    """Build a ``ToolCallAndRoutingMetrics`` from a JSON file.

    Todo remove in a later PR
    """
    with open(metrics_path, "r", encoding="utf-8") as f:
        payload = json.load(f)
    return ToolCallAndRoutingMetrics(**payload)
def csv_dump(file_path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
    """
    Write rows to a CSV file, creating parent directories as needed.
    The header is the union of the keys of all rows in first-seen order, so
    rows may carry different key sets; cells a row lacks are written empty.
    Nothing is written (and no file is created) when ``rows`` is empty.
    Args:
        file_path: Path to the output CSV file
        rows: List of dictionaries representing CSV rows
    """
    if not rows:
        return
    file_path = Path(file_path)
    # Ensure the parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys de-duplicates while keeping first-seen order. Taking the
    # header from rows[0] alone makes DictWriter raise ValueError as soon as
    # a later row has an extra key.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with open(file_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

wxo_agentic_evaluation/wxo_client.py ADDED Viewed

@@ -0,0 +1,81 @@
+import os
+from typing import Any, Dict, Optional
+import requests
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from wxo_agentic_evaluation.service_instance import tenant_setup
class WXOClient:
    """Minimal HTTP client that sends bearer-authenticated requests.

    Requests are issued with ``requests`` against ``service_url``; whether
    TLS certificates are verified is decided once, at construction time.
    """

    def __init__(
        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
    ):
        """Store the connection settings and resolve SSL verification.

        Args:
            service_url: Base URL that request paths are appended to.
            api_key: Value sent as the bearer token; when falsy no
                ``Authorization`` header is sent.
            env: Optional settings mapping, read for ``verify`` and
                ``bypass_ssl``.
        """
        self.service_url = service_url
        self.api_key = api_key
        self._verify_ssl = self._resolve_verify_ssl(env)
        if not self._verify_ssl:
            # Unverified requests would otherwise emit a warning per call.
            urllib3.disable_warnings(InsecureRequestWarning)

    @staticmethod
    def _resolve_verify_ssl(env: Optional[Dict[str, Any]]) -> bool:
        """Decide whether TLS certificates are verified.

        Precedence:
          1. ``WO_SSL_VERIFY`` environment variable, when it is "true" or
             "false" (case-insensitive, surrounding whitespace ignored).
          2. ``env["bypass_ssl"]`` being True or the text "true" -> False.
          3. ``env["verify"]`` missing, None, or the text "none"/"null"
             -> False.
          4. ``env["verify"]`` as given when it is a bool; any other value
             -> True.

        NOTE(review): with no override and no ``verify`` setting the result
        is False, i.e. verification is off by default - confirm this is
        intended.
        """
        override = os.getenv("WO_SSL_VERIFY")
        if override and override.strip().lower() in ("true", "false"):
            return override.strip().lower() == "true"

        verify = env.get("verify") if env else None
        bypass = env.get("bypass_ssl") if env else None

        if bypass is True:
            return False
        if isinstance(bypass, str) and bypass.strip().lower() == "true":
            return False
        if verify is None:
            return False
        if isinstance(verify, str) and verify.strip().lower() in {
            "none",
            "null",
        }:
            return False
        return verify if isinstance(verify, bool) else True

    def _url(self, path: str) -> str:
        """Join ``service_url`` and ``path`` with exactly one slash."""
        return f"{str(self.service_url).rstrip('/')}/{path.lstrip('/')}"

    def _get_headers(self) -> dict:
        """Return the request headers; empty when no API key is set."""
        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def post(self, payload: dict, path: str, stream=False):
        """POST ``payload`` as JSON to ``path`` and return the response.

        Args:
            payload: Object sent as the JSON request body.
            path: Path appended to ``service_url``.
            stream: Forwarded to ``requests.post`` as ``stream``.
        """
        return requests.post(
            url=self._url(path),
            headers=self._get_headers(),
            json=payload,
            stream=stream,
            verify=self._verify_ssl,
        )

    def get(self, path: str, params: dict = None):
        """GET ``path`` with optional query ``params``; return the response."""
        return requests.get(
            self._url(path),
            params=params,
            headers=self._get_headers(),
            verify=self._verify_ssl,
        )
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> WXOClient:
    """Create a ``WXOClient`` for ``tenant_name``.

    ``tenant_setup`` supplies a token, a service URL and an env mapping.
    Explicitly passed ``service_url`` and ``token`` take precedence over the
    values it resolves.

    Args:
        service_url: Base URL of the service; when falsy the URL resolved by
            ``tenant_setup`` is used.
        tenant_name: Tenant passed to ``tenant_setup``.
        token: Bearer token to use; when falsy the token resolved by
            ``tenant_setup`` is used.

    Raises:
        ValueError: If no non-blank service URL is available.
    """
    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)
    # Previously the resolved token always replaced the argument, so a
    # caller-supplied token was silently ignored.
    token = token or resolved_token
    service_url = service_url or resolved_url
    if not (service_url and str(service_url).strip()):
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )
    return WXOClient(service_url=service_url, api_key=token, env=env)

ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD DELETED Viewed

@@ -1,102 +0,0 @@
-wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/analyze_run.py,sha256=Ji3aVrEJoF47nkFHdJWp_j3JSqzYAmnLJAg_H2Y-Qgs,13295
-wxo_agentic_evaluation/annotate.py,sha256=PwgRBAIVBW_yEoOLYNHg9-XVo78zzTXb68kzR2JbtCM,1230
-wxo_agentic_evaluation/arg_configs.py,sha256=KttX3LFPXjg4qRlbeQ-fQ4Qp5-9_Uz5tt4TCx93KRAY,3028
-wxo_agentic_evaluation/batch_annotate.py,sha256=cZWhDt4k1Tap-e7Mw48tOXKwlHk-JxUvDT6N6n_A7PA,6694
-wxo_agentic_evaluation/data_annotator.py,sha256=ovrymEn2Jlivffg9_mtW7sT73_iOBLU5mX9n5hQQWPo,8398
-wxo_agentic_evaluation/description_quality_checker.py,sha256=Skmt_X-z5rJ9-rBXu5acp0sxq_LyjL0sOOYQVcn25K4,6163
-wxo_agentic_evaluation/evaluation.py,sha256=ZeMmxSbJyA86CVjX8EUdMsVrn25MMqMYO91DZqbe7f0,1090
-wxo_agentic_evaluation/evaluation_package.py,sha256=Ud1h7HDr47Gs4XPUoPagm6oS54Iqb_UWGlcyKoCLnfE,24319
-wxo_agentic_evaluation/inference_backend.py,sha256=mG7Z-Hi63znfJ7vzwCCYNPMc6AHgu7Codnw4puoAM3U,33004
-wxo_agentic_evaluation/llm_matching.py,sha256=HY_4T_4-JXr08Z8o0XWcZfyrzxM0hBpCYGbwh7uSOkw,1479
-wxo_agentic_evaluation/llm_rag_eval.py,sha256=cMUutCpmR1Zg5MM7KbHjHkF42FRBBACFNc-MzwxAw9M,1655
-wxo_agentic_evaluation/llm_user.py,sha256=-DezpQKYoWuN5kQBAx5zwE3Qd_OaxfizZQU7MeqScoM,1519
-wxo_agentic_evaluation/main.py,sha256=5yfynZkzYl52by-7xNMuNdN2FKGEamM-6k-w6fkg6ew,13574
-wxo_agentic_evaluation/quick_eval.py,sha256=7JWBB4Q5psZi2O26wZkQgPudu3uNZZBFrZSYou2ivgw,12876
-wxo_agentic_evaluation/record_chat.py,sha256=uDnc0r5rZdy-KZv36ntBMZMzTiv2pcbHayakg_seZGg,8660
-wxo_agentic_evaluation/resource_map.py,sha256=fmkLcQEx4_tpc46rkSoEOsvmd5WMkxaJpIfgqaR31Ms,1646
-wxo_agentic_evaluation/service_instance.py,sha256=lAwfIRJD20vOZFsmtqBt7z4-AmIWE-Fu5VGjmVeyoso,8506
-wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
-wxo_agentic_evaluation/tool_planner.py,sha256=RohospVdfYURyFVETgjm1EukmgpNBvBJopUs6obdhn0,14111
-wxo_agentic_evaluation/type.py,sha256=wAqE7sHEOuAD6s-GxLzdPdMyyjNqh-jOuV-KJR5zH5U,4047
-wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=mI2fyYzbLpSjSr2iwSwpjrOAenxvfA-6h9z2oky0uMs,18349
-wxo_agentic_evaluation/analytics/tools/main.py,sha256=tkDirlsRvLWQCSXcX6BlQFg24HY31K400M1a-O5xKfU,6250
-wxo_agentic_evaluation/analytics/tools/types.py,sha256=FxEHvL2i_4qMIhZBGbhMNd6Ics3nLbl5_vyLYJF5Qp0,4568
-wxo_agentic_evaluation/analytics/tools/ux.py,sha256=VwDc_d74HoI2sYMURciRzJzrUBiPzmCFx57C4JhGqGM,18974
-wxo_agentic_evaluation/external_agent/__init__.py,sha256=P1T0JYPIZeVyEYRqpEMKqGORQ1h_fVRvm9_lra9U0Q4,1570
-wxo_agentic_evaluation/external_agent/external_validate.py,sha256=gBnizwTIYRHjkVvomgY0hlS44N_n_7ld3YAQ5PFZdfU,4200
-wxo_agentic_evaluation/external_agent/performance_test.py,sha256=mLiUsgZlpj6sKZl2lOjb04ts6UOTf7PoOmvLMZrTN1M,2494
-wxo_agentic_evaluation/external_agent/types.py,sha256=Fu_hPBk-vhJ5kAAi6nwVRTrnr0oaxcoV7aXHsJwxYlg,1653
-wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=2GvvenWwWn-PV6HAwqL6-L-Wt6jCE8AthQTrtFAh8f4,1218
-wxo_agentic_evaluation/metrics/metrics.py,sha256=X2Bapjc7aGwU8--evEYOr2x-CNGsMMBTmMP1dXnURUo,6336
-wxo_agentic_evaluation/otel_support/evaluate_tau.py,sha256=RGfaM0jx5WmF4y1EnVsE2FWpfDQEoL3_sIMNcMUbZuQ,2023
-wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py,sha256=gY5m5INv0IQrA4Xi2wigAUI1cnxzGPYtMLWCIo9pubQ,5602
-wxo_agentic_evaluation/otel_support/otel_message_conversion.py,sha256=6fU2nXtMZUiQuhp2w2ByYigeo7wlOmUSo1CEHcb1iqE,649
-wxo_agentic_evaluation/otel_support/tasks_test.py,sha256=Z7NglyL-uI4vzb1Lum9aEdPZ7x_J2g1A-MKEABRX3nU,67543
-wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
-wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
-wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
-wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
-wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
-wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
-wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
-wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
-wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
-wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
-wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
-wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
-wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
-wxo_agentic_evaluation/prompt/template_render.py,sha256=xVy7NOeGk5_XxzTT-YIY4HVAseQFU2SbRMSdvQGa-FE,4829
-wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
-wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
-wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
-wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=pfhMUjddv32pIRewea7o1vn_xrV_LuyC8vRlJ7qVyO8,5267
-wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=Sz9zB5O1ct7EoZCog8GNdwj8yWFZo7HJLPbA9HvelZc,11886
-wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
-wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=XXNP43mEneuDBo_zGPdCVNRdUNy-KGd7kbIKYwKhKJQ,4477
-wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
-wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=G2b7rwN0VTLBVGwU1VXKUl4eqT8Ya8zCcOorwkZwrZA,4354
-wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=IEyo5H_TTrzMLPD9y2eFDCSTB80G5QetZRiUhRlCx-A,852
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=3JDWWjYuYfGwa2uYLXaxGETMuppGld5c901h_-YkFO4,7645
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
-wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=QP43RjUfozozXBtYEzPHv7EC3pdwIWLdNRsJ8xzvcjU,3701
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=f4GmTXNTBeH171GGRWaDCIRuFPRyuVMy62evWV8TEl8,9713
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=Fm0unqhpFBxeofTQjQaLl_SZFSFke7K7S56t46812-E,17589
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=0m4iHqb68psvLMNQasFaaxgQP5XmmGjBkuID8aw5Kv8,6069
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
-wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=mm7eOx6a_2ExDgck29IkgAzjeQkICpMDXecuxa6ZULo,17182
-wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
-wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=ki6ZqLfg9f6il7Pk7FxqwZLeZDuZFKwON_hKPNH5jkg,8446
-wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=bDRYG-HObwFvi4-CS7am4F_9WPXqh6T4UzNIrxqynsY,12331
-wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=pt-XIVTzJn5c3_lM1H6r82ag5c_uxdA5PPCyCwBV1O8,6012
-wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=Y2baaQ4IaS-oPqvweJd8cPYj2pgyJwS-2_HwvE2PP-s,15112
-wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=O3axxDTD2e7lFk5m7amz5713rom9hHKDvwWlrspSK3k,1466
-wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=CLJgDoUp80ug0lDpfYJFEiLnmXej_6R-YloJjdX6I1Y,5111
-wxo_agentic_evaluation/service_provider/__init__.py,sha256=Xu-Wdo7vZI6iNKFp4cNGo7rXv-OQ4BkgLaKeCfALCrk,2162
-wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=VN1DFF1woJcjijwj3lMA0JS-9pxJ6fXSYu91Ah7nTNE,9866
-wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=OCpnqd8E9WUqPGc7Q01L5HWVIZsZ5V5-XvjhcwvqRA4,1097
-wxo_agentic_evaluation/service_provider/provider.py,sha256=OkMjZ_xHPXy-YqkBbKXC4K67VWJrCQb1nSZxMRt-a4g,416
-wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=hM085FbKEBM_LC2O-rURtGx-RMBtulbm1FAZa73k1gg,5321
-wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=LYSpxOI2oMQSysasb8WT_nn5SdDy-dsLFyJDJHXFtn0,6876
-wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
-wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=kJUuprXfY5IfCRIvLT4oSvMf-Ly7RYNo4if-Lb6yGiA,6080
-wxo_agentic_evaluation/utils/rich_utils.py,sha256=pJ43hwvSZRJwckPOeUhqGdw4_RwxLsYO02dDt6PLqxA,6351
-wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
-wxo_agentic_evaluation/utils/utils.py,sha256=8PUpmOoPrEG5xBDOWMsaKanYsnZV5-UZWQa7x8P-J2g,11634
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA,sha256=SRO-KH4zJYQhHMhyhDIqrkeoELwrDnTvYbwcIZT9i9w,1435
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
-ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD,,

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl