ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (49)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
  3. wxo_agentic_evaluation/analyze_run.py +822 -344
  4. wxo_agentic_evaluation/arg_configs.py +39 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +29 -4
  7. wxo_agentic_evaluation/evaluation_package.py +197 -18
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +105 -108
  11. wxo_agentic_evaluation/llm_matching.py +104 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -0
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  17. wxo_agentic_evaluation/metrics/metrics.py +64 -1
  18. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  19. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  20. wxo_agentic_evaluation/prompt/template_render.py +20 -2
  21. wxo_agentic_evaluation/quick_eval.py +23 -11
  22. wxo_agentic_evaluation/record_chat.py +18 -10
  23. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
  24. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  25. wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
  26. wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  30. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
  31. wxo_agentic_evaluation/resource_map.py +3 -1
  32. wxo_agentic_evaluation/service_instance.py +12 -3
  33. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  34. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  35. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  36. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  37. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  38. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  39. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  40. wxo_agentic_evaluation/type.py +15 -5
  41. wxo_agentic_evaluation/utils/__init__.py +44 -3
  42. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  43. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  44. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  45. wxo_agentic_evaluation/utils/parsers.py +71 -0
  46. wxo_agentic_evaluation/utils/utils.py +140 -20
  47. wxo_agentic_evaluation/wxo_client.py +81 -0
  48. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  49. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py

@@ -1,3 +1,4 @@
+import copy
 import csv
 import dataclasses
 import glob
@@ -7,6 +8,7 @@ import re
 import traceback
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import List
@@ -16,15 +18,16 @@ import yaml
 from jsonargparse import CLI
 from rich.progress import Progress
 
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
 from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.evaluations import Extractor
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KnowledgeBaseMetricSummary,
     TextMatchType,
     ToolCallAndRoutingMetrics,
@@ -33,46 +36,61 @@ from wxo_agentic_evaluation.prompt.template_render import (
     LlamaUserTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import (
+    LOGGING_ENABLED,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import EvaluationData
 from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
 from wxo_agentic_evaluation.utils.utils import (
     SummaryPanel,
     create_table,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 
 
 def process_test_case(
-    task_n,
-    test_case,
-    config,
-    inference_backend,
-    resource_map,
-    llm_user,
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    inference_backend: WXOInferenceBackend,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
     run_idx: int = 0,
 ):
     summary_results_for_path = []
-    tc_name = os.path.basename(test_case).replace(".json", "")
-    run_tag = f".run{run_idx+1}" if getattr(config, "n_runs", 1) > 1 else ""
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
     with open(test_case, "r") as f:
-        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+        evaluation_data = EvaluationData.model_validate(json.load(f))
 
     evaluation_controller = EvaluationController(
         wxo_inference_backend=inference_backend,
         llm_user=llm_user,
         config=config,
     )
-    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
     (
         history,
         call_tracker,
         conversational_search_data,
     ) = evaluation_controller.run(
         task_n,
-        test_case.story,
-        agent_name=test_case.agent,
-        starting_user_input=test_case.starting_sentence,
+        evaluation_data.story,
+        agent_name=evaluation_data.agent,
+        starting_user_input=evaluation_data.starting_sentence,
+        max_user_turns=evaluation_data.max_user_turns,
     )
     result = list()
     for message in history:
@@ -80,13 +98,15 @@ def process_test_case(
 
     json_dump(
         os.path.join(
-            config.output_dir, "messages", tc_name + run_tag + ".messages.json"
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.messages.json",
         ),
         result,
     )
 
     if len(conversational_search_data) > 0:
-        fn = tc_name + run_tag + ".retrieval_context.json"
+        fn = f"{test_case_name}{run_tag}.retrieval_context.json"
         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
         out_folder.mkdir(exist_ok=True)
         rc = [context.model_dump() for context in conversational_search_data]
@@ -96,25 +116,51 @@
     if config.data_annotation_run:
         return summary_results_for_path  # empty result set, skip summary
 
+    # Handle custom extractions
+    all_extractors = []
+    if config.extrators_config.paths is not None:
+        for path in config.extrators_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                extractor: Extractor = extractor_class()
+                all_extractors.append(extractor)
+
+    # Handle custom evaluations
+    all_custom_evals = []
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                custom_eval = _class(llm_client=llmaaj_provider)
+                all_custom_evals.append(custom_eval)
+
     evaluation_package = EvaluationPackage(
-        test_case_name=tc_name,
+        test_case_name=test_case_name,
         messages=history,
-        ground_truth=test_case,
+        ground_truth=evaluation_data,
         conversational_search_data=conversational_search_data,
         resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
     )
     (
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
         metrics,
+        custom_metrics,
     ) = evaluation_package.generate_summary()
     temp = []
     for message in messages_with_reason:
         temp.append(message.model_dump())
     expected_tools = [
         gd.tool_name
-        for gd in test_case.goal_details
+        for gd in evaluation_data.goal_details
         if getattr(gd, "type", None) == "tool_call"
     ]
 
@@ -157,25 +203,29 @@
         os.path.join(
             config.output_dir,
             "messages",
-            tc_name + run_tag + ".messages.analyze.json",
+            f"{test_case_name}{run_tag}.messages.analyze.json",
         ),
         temp,
     )
 
     json_dump(
         os.path.join(
-            config.output_dir, "messages", tc_name + run_tag + ".metrics.json"
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.metrics.json",
         ),
         metrics.model_dump(),
     )
 
-    metrics.dataset_name = tc_name
+    metrics.dataset_name = test_case_name
     metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append((metrics, knowledge_base_metrics))
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
 
     return summary_results_for_path
 
@@ -199,19 +249,49 @@ def main(config: TestConfig):
         config.auth_config.tenant_name,
         config.auth_config.token,
     )
+
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
     llm_user = LLMUser(
-        wai_client=get_provider(
-            config=config.provider_config,
-            model_id=config.llm_user_config.model_id,
-        ),
+        wai_client=get_provider(**provider_kwargs),
         template=LlamaUserTemplateRenderer(
             config.llm_user_config.prompt_config
         ),
         user_response_style=config.llm_user_config.user_response_style,
     )
 
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
 
     results_list = []
@@ -247,7 +327,7 @@
             run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
             available_runs[stem].add(run_num)
 
-    test_cases = []
+    test_cases: list[str] = []
     for test_path in config.test_paths:
         if os.path.isdir(test_path):
            test_path = os.path.join(test_path, "*.json")
@@ -256,9 +336,11 @@
     futures = []
     task_n = 0
     n_runs = getattr(config, "n_runs", 1)
+
     for test_case in test_cases:
         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
             continue
+
         stem = Path(test_case).stem
 
         for run_idx in range(n_runs):
@@ -272,6 +354,7 @@
                     f"Skipping {stem} run {run_number} as results already exist."
                 )
                 continue
+
             future = executor.submit(
                 process_test_case,
                 task_n,
@@ -280,28 +363,42 @@
                 inference_backend,
                 resource_map,
                 llm_user,
+                llmaaj_provider,
                 run_idx,  # 👈 pass run index
             )
             futures.append(((test_case, run_idx), future))
             task_n += 1
 
     if futures:
-        with Progress() as progress:
-            task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
-                total=len(futures),
-            )
+
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
             for (test_case, run_idx), future in futures:
                 try:
                     results_list.extend(future.result())
                 except Exception as e:
                     rich.print(f"test case {test_case} fails with {e}")
                     traceback.print_exc()
-                finally:
-                    progress.update(task1, advance=1)
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results_list.extend(future.result())
+                    except Exception as e:
+                        rich.print(f"test case {test_case} fails with {e}")
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
 
     tool_call_metrics = [metric[0] for metric in results_list]
     knowledge_base_metrics = [metric[1] for metric in results_list]
+    custom_metrics: List[CustomEvalMetrics] = [
+        metric[2] for metric in results_list
+    ]
 
     rag_metric_summary = KnowledgeBaseMetricSummary(
         knowledge_base_metrics=knowledge_base_metrics
@@ -502,11 +599,23 @@
     output_file = os.path.join(config.output_dir, "summary_metrics.csv")
     header = list(tool_call_metrics[0].keys())
 
-    with open(output_file, "w") as file:
+    with open(output_file, "w", newline="") as file:
         csv_writer = csv.writer(file)
         csv_writer.writerow(header)
         for entry in tool_call_metrics:
             csv_writer.writerow([entry[name] for name in header])
+    # Check if any custom metrics have been calculated
+    if any([m.custom_metrics for m in custom_metrics]):
+        custom_metrics_display_data = []
+        for metric in custom_metrics:
+            row = {}
+            row["dataset_name"] = metric.dataset_name
+            for metric in metric.custom_metrics:
+                row[metric.eval_name] = metric.value
+            custom_metrics_display_data.append(row)
+        create_table(
+            custom_metrics_display_data, title="Custom Metrics"
+        ).print()
 
     with open(
         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
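
Note: the custom-metric hooks above are driven entirely by drop-in classes. process_test_case scans the directories listed under config.custom_metrics_config.paths and config.extrators_config.paths with find_evaluation_subclasses, instantiates each class it finds (passing llmaaj_provider as llm_client), and hands the instances to EvaluationPackage. Below is a minimal sketch of what such a drop-in evaluation module might look like; the file name, class name, and the Message "role" attribute are illustrative assumptions, and the Evaluation and Metric base classes it relies on appear later in this diff.

# Hypothetical drop-in module, e.g. my_evals/turn_count.py, placed in a
# directory listed under config.custom_metrics_config.paths.
from typing import Any, Dict, Optional

from wxo_agentic_evaluation.metrics.evaluations import Evaluation
from wxo_agentic_evaluation.metrics.metrics import Metric
from wxo_agentic_evaluation.type import EvaluationData, Message


class AssistantTurnCount(Evaluation):
    """Counts assistant turns; needs no LLM client, so the injected one is unused."""

    @property
    def name(self) -> str:
        return "assistant_turn_count"

    def evaluate(
        self,
        messages: list[Message],
        ground_truth: EvaluationData,
        extracted_context: Dict[str, Any],
    ) -> Optional[Metric]:
        # Message is assumed to expose a `role` field; hedge with getattr.
        n_turns = sum(
            1 for m in messages if getattr(m, "role", None) == "assistant"
        )
        return Metric(eval_name=self.name, value=n_turns)

With such a module in place, main.py would surface the returned value as an "assistant_turn_count" column in the Custom Metrics table written alongside summary_metrics.csv.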
wxo_agentic_evaluation/metrics/__init__.py

@@ -0,0 +1,5 @@
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
wxo_agentic_evaluation/metrics/evaluations.py

@@ -0,0 +1,124 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import Metric
+from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+root_dir: str = os.path.dirname(os.path.dirname(__file__))
+LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
+
+
+class Evaluation(ABC):
+    """Abstract base class for all evaluations."""
+
+    def __init__(self, llm_client: Optional[Provider] = None) -> None:
+        self._llm_client = llm_client
+
+    @property
+    def llm_client(self) -> Any:
+        """Access client, require it if used."""
+        if self._llm_client is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} requires a client, but none was provided"
+            )
+        return self._llm_client
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        """
+        Evaluation method.
+
+        Args:
+            messages: agent and user conversational messages (includes tool calls)
+            ground_truth: ground truth data
+            extracted_context: dictionary containing data derived from the messages
+
+        Returns:
+            Metic
+        """
+        raise NotImplementedError
+
+
+class LLMaaJEvaluation(Evaluation, ABC):
+    """Evaluation metric for LLMaaJ."""
+
+    @property
+    @abstractmethod
+    def llmaaj_instructions(self) -> str:
+        """LLMaaJ instructions for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_llm_output(self, string: str) -> int | float | bool | str:
+        """Format the output of the LLMaaJ query."""
+        raise NotImplementedError
+
+    @property
+    def selected_context_keys(self) -> set[str]:
+        """Override to implement context keys to pass to the prompt."""
+        return set()
+
+    def select_context(
+        self, extracted_context: Dict[str, Any]
+    ) -> dict[str, Any]:
+        """Additional context to be added to the prompt."""
+        selected_context = {
+            key: value
+            for key, value in extracted_context.items()
+            if key in self.selected_context_keys
+        }
+
+        return selected_context
+
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+        parsed = ParsedMessages(messages=messages)
+        if parsed.user_input is None or parsed.agent_response is None:
+            return None
+        context = str(self.select_context(extracted_context))
+        prompt = renderer.render(
+            user_input=parsed.user_input,
+            agent_answer=parsed.agent_response,
+            llmaaj_instructions=self.llmaaj_instructions,
+            context=context,
+        )
+        score_str = self.llm_client.query(prompt)
+        value = self.format_llm_output(score_str)
+        return Metric(eval_name=self.name, value=value)
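
To make the contract concrete, here is a hedged sketch of an LLMaaJEvaluation subclass. Only name, llmaaj_instructions, and format_llm_output are required by the base class above; selected_context_keys is optional and controls which extractor outputs reach the {{context}} slot of llmaaj_prompt.jinja2 (shown at the end of this diff). The class name, instructions, and context key below are hypothetical.

from wxo_agentic_evaluation.metrics.evaluations import LLMaaJEvaluation


class PolitenessJudge(LLMaaJEvaluation):  # hypothetical example
    @property
    def name(self) -> str:
        return "politeness"

    @property
    def llmaaj_instructions(self) -> str:
        return (
            "Rate how polite the agent's answer is on a scale of 1 to 5. "
            "Respond with the number only."
        )

    @property
    def selected_context_keys(self) -> set[str]:
        # Assumes an extractor registered under the name "tool_calls";
        # purely illustrative.
        return {"tool_calls"}

    def format_llm_output(self, string: str) -> int | float | bool | str:
        # The judge is asked for a bare number; fall back to the raw text
        # if parsing fails.
        try:
            return int(string.strip())
        except ValueError:
            return string.strip()

The inherited evaluate() then renders the prompt, queries the injected provider, and wraps the parsed score in a Metric.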
wxo_agentic_evaluation/metrics/llm_as_judge.py

@@ -53,8 +53,9 @@ class AnswerDerailment(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
         }
 
 
@@ -65,7 +66,7 @@ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
-            "unsafe_topic_score": str(self.is_safe),
+            "safe_topic_score": str(self.is_safe),
         }
wxo_agentic_evaluation/metrics/metrics.py

@@ -1,8 +1,9 @@
 import math
-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Any, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerRelevancy,
@@ -19,6 +20,36 @@ def average(array):
     return sum(array) / len(array)
 
 
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
+
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
+
+
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
     knowledge_base_name: str = (
@@ -175,6 +206,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )
 
 
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
@@ -187,6 +225,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
 
 
 class ReferenceLessEvalMetrics(BaseModel):
@@ -201,3 +248,19 @@ class ReferenceLessEvalMetrics(BaseModel):
     failed_semantic_tool_calls: Optional[
         List[Tuple[int, List[FailedSemanticTestCases]]]
     ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
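
A short sketch of the new result models in use, mirroring how main.py builds its Custom Metrics table; the eval names and values below are made up.

from wxo_agentic_evaluation.metrics.metrics import CustomEvalMetrics, Metric

per_test_case = CustomEvalMetrics(
    dataset_name="example_test_case",  # hypothetical test case name
    custom_metrics=[
        Metric(eval_name="politeness", value=4, metadata={"reason": "..."}),
        Metric(eval_name="assistant_turn_count", value=3),
    ],
)

# One table row per test case, one column per eval, as in main.py:
row = {"dataset_name": per_test_case.dataset_name}
for metric in per_test_case.custom_metrics:
    row[metric.eval_name] = metric.value
# row == {"dataset_name": "example_test_case", "politeness": 4,
#         "assistant_turn_count": 3}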
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2

@@ -0,0 +1,15 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{{llmaaj_instructions}}
+
+<|start_header_id|>user<|end_header_id|>
+
+User question: {{user_input}}
+
+Answer: {{agent_answer}}
+
+Additional Conversationl Context: {{context}}
+
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
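
For reference, this template is what LLMaaJEvaluation.evaluate renders before querying the judge model. A minimal sketch of that call, with made-up conversation text:

from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer

renderer = LLMaaJTemplateRenderer(
    "wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2"
)
prompt = renderer.render(
    user_input="What is my PTO balance?",        # hypothetical user turn
    agent_answer="You have 12 days remaining.",  # hypothetical agent answer
    llmaaj_instructions="Answer 1 if the response is grounded, else 0.",
    context="{}",
)
# `prompt` is the filled-in Llama-style chat template above, ready to be
# passed to the provider's query() method.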