ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/metrics/llm_as_judge.py
@@ -1,8 +1,8 @@
-from pydantic import BaseModel, computed_field
-
 from abc import abstractmethod
 from functools import cached_property
 
+from pydantic import BaseModel, computed_field
+
 
 class BaseLLMJudgeMetric(BaseModel):
     @abstractmethod
@@ -44,3 +44,29 @@ class AnswerRelevancy(BaseLLMJudgeMetric):
             "answer_relevancy": self.answer_relevancy,
             "answer_relevancy_score": self.answer_relevancy_score,
         }
+
+
+class AnswerDerailment(BaseLLMJudgeMetric):
+    in_scope: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
+        }
+
+
+class AnswerUnsafeTopic(BaseLLMJudgeMetric):
+    is_safe: str | float
+    statement: str
+    reason: str
+
+    def table(self):
+        return {
+            "statement": self.statement,
+            "reason": self.reason,
+            "safe_topic_score": str(self.is_safe),
+        }
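
Illustrative sketch, not part of the diff: the new judge metrics report through table(). The snippet assumes table() is the abstract hook declared on BaseLLMJudgeMetric and that the base class adds no other required members; the statement and scores are made up. AnswerUnsafeTopic mirrors this with a "safe_topic_score" key.

# Illustrative only; assumes BaseLLMJudgeMetric adds no extra required fields.
from wxo_agentic_evaluation.metrics.llm_as_judge import AnswerDerailment

verdict = AnswerDerailment(
    in_scope=0.9,
    statement="What is my remaining vacation balance?",
    reason="The question stays within the HR agent's scope.",
)
print(verdict.table())
# {'statement': '...', 'reason': '...', 'on_topic_score': '0.9'}
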

wxo_agentic_evaluation/metrics/metrics.py
@@ -1,19 +1,45 @@
-import
-from
-from
+from collections import defaultdict
+from enum import Enum, StrEnum
+from typing import Any, Dict, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
-from wxo_agentic_evaluation.metrics.llm_as_judge import
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
 
 
-
-
-
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
 
-
-
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 
 class KnowledgeBaseMetrics(BaseModel):
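
Illustrative sketch, not part of the diff: how the DescriptionQualityMetric computed fields resolve, assuming the class ships in wxo_agentic_evaluation.metrics.metrics as the file list indicates; the tool name and scores are made up.

# Illustrative only; not part of the released code.
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

scored = DescriptionQualityMetric(tool_name="get_weather", description_score=0.82, threshold=0.7)
unscored = DescriptionQualityMetric(tool_name="get_weather")

print(scored.is_bad_description, scored.description_quality)      # True BAD
print(unscored.is_bad_description, unscored.description_quality)  # None MISSING
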
@@ -54,7 +80,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
                 }
             else:
                 values = groupby[name]
-                values.get("knowledge_base_name").append(knowledge_base_name)
+                values.get("knowledge_base_name").append(
+                    knowledge_base_name
+                )
                 values.get("faithfulness").append(faithfulness)
                 values.get("answer_relevancy").append(answer_relevancy)
                 values.get("confidence_scores").append(confidence_scores)
@@ -67,6 +95,8 @@ class KnowledgeBaseMetricSummary(BaseModel):
     @computed_field(alias="summary")
     @property
     def average(self) -> Mapping[str, Any]:
+        from wxo_agentic_evaluation.utils.utils import average
+
         summary = {}
         for dataset, metric in self.groupby_dataset.items():
             average_metric = {}
@@ -109,6 +139,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str
 
+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +148,14 @@ class TextMatchType(Enum):
 
 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int = 0  # calls with the same function but different args
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +168,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +180,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
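
Illustrative arithmetic, not part of the diff: the two computed fields above reduce to the expressions below, shown with made-up counts rather than a full model instance (not all fields of ToolCallAndRoutingMetrics appear in these hunks).

correct_tool_calls, expected_tool_calls, total_tool_calls = 3, 4, 5

tool_call_recall = round(correct_tool_calls / expected_tool_calls if expected_tool_calls > 0 else 0.0, 2)
tool_call_precision = round(correct_tool_calls / total_tool_calls if total_tool_calls > 0 else 0.0, 2)
print(tool_call_recall, tool_call_precision)  # 0.75 0.6
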
@@ -166,3 +198,274 @@ class ToolCallAndRoutingMetrics(BaseModel):
             ),
             2,
         )
+
+
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
+class FailedStaticTestCases(BaseModel):
+    metric_name: str
+    description: str
+    explanation: str
+
+
+class FailedSemanticTestCases(BaseModel):
+    metric_name: str
+    evidence: str
+    explanation: str
+    output: int
+    confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
+
+
+class ReferenceLessEvalMetrics(BaseModel):
+    dataset_name: str
+    number_of_tool_calls: int
+    number_of_successful_tool_calls: int
+    number_of_static_failed_tool_calls: int
+    number_of_semantic_failed_tool_calls: int
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class LangfuseMetric(Metric):
+    comment: Optional[str] = ""
+    data_type: Optional[str] = ""
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
+
+
+def create_avg_row(metrics: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Create an average row from a list of metric dictionaries.
+
+    Args:
+        metrics: List of metric dictionaries
+
+    Returns:
+        Dictionary with averaged metrics
+    """
+    from wxo_agentic_evaluation.utils.utils import safe_divide
+
+    avg_row = {
+        "Dataset": "Summary (Average)",
+        "Runs": 0,
+        "Total Steps": 0,
+        "LLM Steps": 0,
+        "Total Tool Calls": 0,
+        "Tool Call Precision": 0,
+        "Tool Call Recall": 0,
+        "Agent Routing Accuracy": 0,
+        "Text Match": 0,
+        "Journey Success": 0,
+        "Avg Resp Time (sec)": 0,
+    }
+
+    if metrics:
+        for row in metrics:
+            avg_row["Runs"] += row.get("Runs", 0)
+            avg_row["Total Steps"] += row["Total Steps"]
+            avg_row["LLM Steps"] += row["LLM Steps"]
+            avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+            avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+            avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+            avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+            avg_row["Text Match"] += row["Text Match"]
+            avg_row["Journey Success"] += row["Journey Success"]
+            avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+        n = len(metrics)
+        # Average over datasets
+        avg_row["Runs"] = round(safe_divide(avg_row["Runs"], n), 2)
+        avg_row["Total Steps"] = round(
+            safe_divide(avg_row["Total Steps"], n), 2
+        )
+        avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], n), 2)
+        avg_row["Total Tool Calls"] = round(
+            safe_divide(avg_row["Total Tool Calls"], n), 2
+        )
+        avg_row["Tool Call Precision"] = round(
+            safe_divide(avg_row["Tool Call Precision"], n), 2
+        )
+        avg_row["Tool Call Recall"] = round(
+            safe_divide(avg_row["Tool Call Recall"], n), 2
+        )
+        avg_row["Agent Routing Accuracy"] = round(
+            safe_divide(avg_row["Agent Routing Accuracy"], n), 2
+        )
+        avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], n), 2)
+        avg_row["Journey Success"] = round(
+            safe_divide(avg_row["Journey Success"], n), 2
+        )
+        avg_row["Avg Resp Time (sec)"] = round(
+            safe_divide(avg_row["Avg Resp Time (sec)"], n), 2
+        )
+
+    return avg_row
+
+
+def format_metrics_for_display(
+    tool_call_metrics: list[ToolCallAndRoutingMetrics],
+) -> list[dict[str, Any]]:
+    from wxo_agentic_evaluation.utils.utils import mean, safe_divide, to_pct
+
+    # Group metrics by dataset name
+    grouped = defaultdict(list)
+    for m in tool_call_metrics:
+        grouped[m.dataset_name].append(
+            {
+                "Dataset": m.dataset_name,
+                "Total Steps": m.total_steps,
+                "LLM Steps": m.llm_step,
+                "Total Tool Calls": m.total_tool_calls,
+                "Tool Call Precision": m.tool_call_precision,
+                "Tool Call Recall": m.tool_call_recall,
+                "Agent Routing Accuracy": m.agent_routing_accuracy,
+                "Text Match": m.text_match,
+                "Journey Success": m.is_success,
+                "Avg Resp Time (sec)": m.avg_resp_time,
+            }
+        )
+
+    # Create per-test rows with averages over runs
+    per_test_rows = []
+    for ds, rows in grouped.items():
+        out = {"Dataset": ds}
+
+        # Average numeric columns over runs
+        numeric_keys = [
+            "Total Steps",
+            "LLM Steps",
+            "Total Tool Calls",
+            "Tool Call Precision",
+            "Tool Call Recall",
+            "Agent Routing Accuracy",
+            "Avg Resp Time (sec)",
+        ]
+
+        for k in numeric_keys:
+            out[k] = mean(
+                [r[k] for r in rows if isinstance(r.get(k), (int, float))]
+            )
+
+        # Add total runs per dataset
+        out["Runs"] = round(float(len(rows)), 2)
+
+        # Journey Success -> numeric fraction in [0,1]
+        js_vals = [1 if bool(r.get("Journey Success")) else 0 for r in rows]
+        out["Journey Success"] = round(
+            safe_divide(sum(js_vals), len(js_vals)), 2
+        )
+
+        # Text Match -> numeric fraction in [0,1]
+        tm_hits = 0
+        tm_den = len(rows)
+        for r in rows:
+            val = r.get("Text Match")
+            if str(val).strip() == TextMatchType.text_match.value:
+                tm_hits += 1
+        out["Text Match"] = round(safe_divide(tm_hits, tm_den), 2)
+
+        per_test_rows.append(out)
+
+    # Create overall average row
+    overall_row = create_avg_row(per_test_rows)
+
+    # Format percentages
+    tool_call_metrics_for_display = per_test_rows + [overall_row]
+    for row in tool_call_metrics_for_display:
+        row["Text Match"] = to_pct(row.get("Text Match"), decimals=0)
+        row["Journey Success"] = to_pct(row.get("Journey Success"), decimals=0)
+
+    column_order = [
+        "Dataset",
+        "Runs",
+        "Total Steps",
+        "LLM Steps",
+        "Total Tool Calls",
+        "Tool Call Precision",
+        "Tool Call Recall",
+        "Agent Routing Accuracy",
+        "Text Match",
+        "Journey Success",
+        "Avg Resp Time (sec)",
+    ]
+
+    tool_call_metrics_for_display = [
+        {col: row.get(col, "") for col in column_order}
+        for row in tool_call_metrics_for_display
+    ]
+
+    return tool_call_metrics_for_display
+
+
+def extract_metrics(
+    results: List[
+        Tuple[
+            ToolCallAndRoutingMetrics,
+            KnowledgeBaseMetricSummary,
+            CustomEvalMetrics,
+        ]
+    ],
+) -> tuple[
+    list[ToolCallAndRoutingMetrics],
+    KnowledgeBaseMetricSummary,
+    List[CustomEvalMetrics],
+]:
+    """
+    Aggregate metrics from test results.
+
+    Args:
+        results: List of tuples (metrics, knowledge_base_metrics, custom_metrics)
+
+    Returns:
+        Tuple of (knowledge_base_summary, tool_rows, custom_metrics)
+    """
+
+    tool_call_metrics = [metric[0] for metric in results]
+    knowledge_base_metrics = [metric[1] for metric in results]
+    custom_metrics: List[CustomEvalMetrics] = [metric[2] for metric in results]
+
+    kb_summary = KnowledgeBaseMetricSummary(
+        knowledge_base_metrics=knowledge_base_metrics
+    )
+
+    if len(tool_call_metrics) > 0:
+        # Remove the average row if it exists
+        tool_call_metrics = [
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
+        ]
+
+    return tool_call_metrics, kb_summary, custom_metrics
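
Illustrative sketch, not part of the diff: create_avg_row averages per-dataset display rows into the "Summary (Average)" row. The two rows below are made up, and the snippet assumes the in-function import of safe_divide resolves in the installed package.

from wxo_agentic_evaluation.metrics.metrics import create_avg_row

rows = [
    {"Dataset": "hr_flow", "Runs": 2, "Total Steps": 12, "LLM Steps": 6,
     "Total Tool Calls": 4, "Tool Call Precision": 1.0, "Tool Call Recall": 0.75,
     "Agent Routing Accuracy": 1.0, "Text Match": 1.0, "Journey Success": 1.0,
     "Avg Resp Time (sec)": 2.4},
    {"Dataset": "it_flow", "Runs": 1, "Total Steps": 8, "LLM Steps": 4,
     "Total Tool Calls": 2, "Tool Call Precision": 0.5, "Tool Call Recall": 0.5,
     "Agent Routing Accuracy": 1.0, "Text Match": 0.0, "Journey Success": 0.0,
     "Avg Resp Time (sec)": 1.8},
]

summary = create_avg_row(rows)
print(summary["Dataset"], summary["Tool Call Precision"], summary["Journey Success"])
# Summary (Average) 0.75 0.5
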

wxo_agentic_evaluation/metrics/tool_calling.py (new file)
@@ -0,0 +1,93 @@
+import json
+from typing import List, Union
+
+from wxo_agentic_evaluation.metrics import Evaluation, argument_matching
+from wxo_agentic_evaluation.metrics.metrics import (
+    LangfuseMetric,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.type import ContentType
+
+
+class ToolCalling(Evaluation):
+    @property
+    def name(self):
+        return "Tool Calling Metrics"
+
+    def evaluate(
+        self, messages, ground_truth, extracted_context, metadata, **kwargs
+    ) -> Union[LangfuseMetric, List[LangfuseMetric]]:
+        dataset_name = kwargs.get("dataset", "")
+
+        total_tool_calls = 0
+        relevant_tool_calls = 0
+        tool_calls_with_incorrect_parameter = 0
+        correct_tool_calls = set()
+
+        tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        labeled_messages = extracted_context.get("labeled_messages")
+        total_tool_calls = len(
+            [
+                message
+                for message in messages
+                if message.type == ContentType.tool_call
+            ]
+        )
+        relevant_tool_calls = len(labeled_messages)
+
+        for message_idx, matching_goal_details in labeled_messages.items():
+            msg_tool_call = messages[message_idx]
+            msg_tool_call = msg_tool_call.tool_calls[0].function
+            for goal_detail in matching_goal_details:
+                # TODO flesh out to match ADK EVAL
+                args_match = argument_matching(
+                    expected=goal_detail.args,
+                    actual=None if len(msg_tool_call.arguments) == 0 else json.loads(msg_tool_call.arguments),
+                )
+
+                if args_match:
+                    correct_tool_calls.add(goal_detail.name)
+                else:
+                    tool_calls_with_incorrect_parameter += 1
+
+        # TODO: think about the dataset name
+        # TODO: total_steps
+        tool_call_metrics = ToolCallAndRoutingMetrics(
+            dataset_name=dataset_name,
+            total_tool_calls=total_tool_calls,
+            expected_tool_calls=len(tool_dictionary),
+            correct_tool_calls=len(correct_tool_calls),
+            relevant_tool_calls=relevant_tool_calls,
+            tool_calls_with_incorrect_parameter=tool_calls_with_incorrect_parameter,
+        )
+
+        tool_call_metrics = tool_call_metrics.model_dump()
+
+        metrics = []
+
+        for tool in [
+            "total_tool_calls",
+            "correct_tool_calls",
+            "expected_tool_calls",
+            "tool_calls_with_incorrect_parameter",
+            "tool_call_recall",
+            "tool_call_precision",
+        ]:
+            metric = LangfuseMetric(
+                eval_name=tool,
+                value=tool_call_metrics.get(tool),
+                metadata=metadata,
+                data_type="NUMERIC",
+            )
+            metrics.append(metric)
+
+        return metrics
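
Illustrative sketch, not part of the diff: ToolCalling.evaluate returns one LangfuseMetric per counter or score; a single entry looks like this (the metadata payload is a made-up example).

from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric

m = LangfuseMetric(
    eval_name="tool_call_recall",
    value=0.75,
    metadata={"trace_id": "example-trace"},  # hypothetical metadata payload
    data_type="NUMERIC",
)
print(m.model_dump())
# {'eval_name': 'tool_call_recall', 'value': 0.75, 'metadata': {...}, 'comment': '', 'data_type': 'NUMERIC'}
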

wxo_agentic_evaluation/otel_parser/__init__.py (new file)
@@ -0,0 +1 @@
+from wxo_agentic_evaluation.otel_parser import parser as otel_parser

wxo_agentic_evaluation/otel_parser/langflow_parser.py (new file)
@@ -0,0 +1,86 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations, dfs_callable: callable):
+    messages = []
+    for node in dfs_observations:
+        # assume there will only be one AgentExecutor in the trace!
+        if node.obs.name == 'AgentExecutor': return _parse_agent_executor(node.children, dfs_callable(node.children))
+    return messages
+
+
+def _parse_agent_executor(observation_tree, dfs_observations):
+    messages = []
+    for node in dfs_observations:
+        if node.obs.type == 'GENERATION':
+            print(node.obs.id)
+            messages.extend(_get_messages(node.obs.input))
+            # get intemediate steps from parent
+            messages.extend(_get_intermediate_steps(node.parent))
+            messages.extend(_get_messages([node.obs.output]))
+    return messages
+
+
+def _get_messages(data):
+    messages = []
+    for msg in data:
+        if msg['role'] == 'system': messages.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+        elif msg['role'] == 'user':
+            content = ''
+            if isinstance(msg['content'], list):
+                content = []
+                for item in msg['content']:
+                    if item['type'] == ['text']: content.append(item['text'])
+                content = ' '.join(content)
+            elif isinstance(msg['content'], str):
+                content = msg['content']
+
+            messages.append(OTelParserMessage(role='user', content=content, type=ContentType.text))
+        elif msg['role'] == 'assistant':
+            content = msg['content'] or ''
+            additional_kwargs = msg.get('additional_kwargs', {})
+            tool_calls = None
+            if 'tool_calls' in additional_kwargs:
+                tool_calls = []
+                for tc in additional_kwargs['tool_calls']:
+                    id_ = tc['id']
+                    function = OTelParserFunction(name=tc['function']['name'], arguments=tc['function']['arguments'])
+                    tool_calls.append(OTelParserToolCall(id=id_, function=function))
+            messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=ContentType.tool_call))
+    return messages
+
+def _get_intermediate_steps(node):
+    messages = []
+    tool_calls_n_responses = node.obs.input['intermediate_steps']
+    for tc, tr in tool_calls_n_responses:
+        if 'tool' in tc and 'tool_input' in tc and 'tool_call_id' in tc:
+            tool_call_id = tc['tool_call_id']
+            if isinstance(tr, str):
+                messages.append(OTelParserMessage(role='tool', content=tr, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif (isinstance(tr, dict) and 'content' not in tr):
+                messages.append(OTelParserMessage(role='tool', content=json.dumps(tr), tool_call_id=tool_call_id, type=ContentType.tool_response))
+                continue
+            elif isinstance(tr, dict) and 'content' in tr:
+                content = tr['content']
+                if isinstance(content, str):
+                    messages.append(OTelParserMessage(role='tool', content=content, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                    continue
+                elif isinstance(content, list):
+                    for part in content:
+                        if isinstance(part, dict) and part['type'] == 'text':
+                            text = part['text']
+                            if isinstance(text, dict): text = json.dumps(text)
+                            messages.append(OTelParserMessage(role='tool', content=text, tool_call_id=tool_call_id, type=ContentType.tool_response))
+                            continue
+                        else:
+                            raise ValueError(f"Unexpected part type: {type(part)} or part[type] '{part['type']}' != 'text'")
+                else:
+                    raise ValueError(f"Unexpected content type: {type(content)}")
+            else:
+                raise ValueError(f"Unexpected tool response: Type: {type(tr)}, Value: {tr}")
+
+        else:
+            print('Tool Call:', tc)
+            print('Tool Response:', tr)
+    return messages

wxo_agentic_evaluation/otel_parser/langgraph_parser.py (new file)
@@ -0,0 +1,61 @@
+import json
+from wxo_agentic_evaluation.type import OTelParserMessage, OTelParserToolCall, OTelParserFunction, ContentType
+
+def parse_observations(observation_tree, dfs_observations):
+    messages = []
+    is_first_generation = True
+    for obs in dfs_observations:
+        if obs.obs.type == 'GENERATION':
+            if is_first_generation:
+                messages.extend(_get_input_message(obs))
+                is_first_generation = False
+            parent = obs.parent
+            if parent.obs.type == 'CHAIN':
+                # TODO: messages is a list. confirm, we will only see one message in the list.
+                msg = parent.obs.output['messages'][0]
+                content = msg['content'] or ''
+                msg_type = ContentType.text
+                tool_calls = msg['tool_calls'] or None
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                messages.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+        elif obs.obs.type == 'TOOL':
+            parent_node = obs.parent
+            if parent_node.obs.type == 'CHAIN':
+                for tool_response in parent_node.obs.output['messages']:
+                    messages.append(OTelParserMessage(role='tool', content=tool_response['content'], tool_call_id=tool_response['tool_call_id'], type=ContentType.tool_response))
+    return messages
+
+
+def _get_input_message(obs_node):
+    ret = []
+    parent = obs_node.parent
+    if parent.obs.type == 'CHAIN':
+        for msg in parent.obs.input['messages']:
+            if msg['type'] == 'system': ret.append(OTelParserMessage(role='system', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'human': ret.append(OTelParserMessage(role='user', content=msg['content'], type=ContentType.text))
+            elif msg['type'] == 'tool': ret.append(OTelParserMessage(role='tool', content=msg['content'], tool_call_id=msg['tool_call_id'], type=ContentType.tool_response))
+            elif msg['type'] == 'ai':
+                content = msg['content'] or ''
+                tool_calls = msg['tool_calls'] or None
+                msg_type = ContentType.text
+                if tool_calls is not None:
+                    msg_type = ContentType.tool_call
+                    tool_calls = [_to_tool_call(tc) for tc in tool_calls]
+                ret.append(OTelParserMessage(role='assistant', content=content, tool_calls=tool_calls, type=msg_type))
+    return ret
+
+
+def _to_tool_call(tool_call):
+    return OTelParserToolCall(
+        id=tool_call['id'],
+        type='function',  # OTelParserToolCall expects literal 'function'
+        function=_to_function(tool_call)
+    )
+
+def _to_function(func):
+    return OTelParserFunction(
+        name=func['name'],
+        arguments=json.dumps(func['args'])
+    )
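
Illustrative sketch, not part of the diff: a minimal stand-in for the observation-tree nodes the LangGraph parser walks. Each node exposes .obs (with type, input, output) and a .parent link, which is all parse_observations reads; real trees come from the Langfuse collection code added elsewhere in this release, and the message payloads below are made up.

from types import SimpleNamespace

from wxo_agentic_evaluation.otel_parser import langgraph_parser

# Hypothetical CHAIN node wrapping one GENERATION, mimicking only the attributes the parser reads.
chain = SimpleNamespace(
    obs=SimpleNamespace(
        type="CHAIN",
        input={"messages": [{"type": "human", "content": "What is my vacation balance?"}]},
        output={"messages": [{"content": "You have 12 days left.", "tool_calls": []}]},
    ),
    parent=None,
)
generation = SimpleNamespace(obs=SimpleNamespace(type="GENERATION"), parent=chain)

messages = langgraph_parser.parse_observations(observation_tree=None, dfs_observations=[generation])
for m in messages:
    print(m.role, m.content)
# user What is my vacation balance?
# assistant You have 12 days left.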