ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/llm_user_v2.py
ADDED
@@ -0,0 +1,114 @@
+from typing import List
+
+from wxo_agentic_evaluation.base_user import BaseUserSimulator
+from wxo_agentic_evaluation.prompt.template_render import UserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.type import ContentType, Message
+
+
+class LLMUserV2(BaseUserSimulator):
+    def __init__(
+        self,
+        llm_client: Provider,
+        user_prompt_path: str,
+    ):
+        self.llm_client = llm_client
+        self.user_prompt_path = user_prompt_path
+        self.prompt_template = UserTemplateRenderer(
+            template_path=user_prompt_path
+        )
+
+    def _get_system_prompt(
+        self, user_story: str, user_response_style: List[str] = None
+    ) -> Message:
+        # Get the user system prompt
+        prompt_messages = self.prompt_template.render(
+            user_story=user_story,
+            user_response_style=user_response_style,
+        )
+        return Message(**prompt_messages[0], type=ContentType.text)
+
+    def _get_message_dicts(self, messages: List[Message]) -> List[dict]:
+        # Convert messages to dictionary format for the llm client
+        return [message.model_dump() for message in messages]
+
+    def _filter_conversation_history(
+        self, conversation_history: List[Message]
+    ) -> List[Message]:
+        # Filter out the agent system prompt
+        return [
+            message
+            for message in conversation_history
+            if message.role != "system"
+        ]
+
+    def flip_message_roles(self, messages: List[Message]) -> List[Message]:
+        # We flip the roles of messages in conversation history to basically prompt the
+        # user simulator with the assistant message as the user input message
+        # This helps to get the llm to respond as a natural user with the given story.
+        new_messages = []
+        for message in messages:
+            if message.role == "user":
+                new_messages.append(
+                    Message(
+                        role="assistant",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+            else:
+                new_messages.append(
+                    Message(
+                        role="user",
+                        content=message.content,
+                        type=ContentType.text,
+                    )
+                )
+        return new_messages
+
+    def generate_user_input(
+        self,
+        user_story: str,
+        conversation_history: List[Message],
+        user_response_style: List[str] = None,
+        starting_user_input: Message = None,
+        **kwargs,
+    ) -> Message:
+        # Get the user system prompt
+        system_prompt = self._get_system_prompt(user_story, user_response_style)
+
+        conversation_history = self._filter_conversation_history(
+            conversation_history
+        )
+
+        ## Adding dummy message if not provided from the simulation side.
+        if len(conversation_history) == 0:
+            conversation_history.append(
+                Message(
+                    role="assistant",
+                    content="Hi! How can I help you today?",
+                    type=ContentType.text,
+                )
+            )
+
+        conversation_history = self.flip_message_roles(conversation_history)
+
+        # build the conversation history with the system prompt
+        messages = [system_prompt] + conversation_history
+
+        if starting_user_input is not None:
+            # If starting user input is provided, return it as is for the initial turn
+            return starting_user_input
+        else:
+
+            # Get response from LLM for simulation
+            response = self.llm_client.chat(
+                messages=self._get_message_dicts(messages)
+            )
+            response_message = Message(
+                role="user",
+                content=response.choices[0].message.content,
+                type=ContentType.text,
+            )
+
+            return response_message
wxo_agentic_evaluation/main.py
CHANGED
@@ -1,383 +1,152 @@
-import csv
 import dataclasses
-import glob
 import json
 import os
-import
-from
+import pathlib
+from datetime import datetime
 from pathlib import Path
-from typing import List

-import rich
 import yaml
 from jsonargparse import CLI
-from rich.progress import Progress

 from wxo_agentic_evaluation.arg_configs import TestConfig
-from wxo_agentic_evaluation.
-from wxo_agentic_evaluation.inference_backend import (
-    EvaluationController,
-    WXOInferenceBackend,
-    get_wxo_client,
-)
-from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.clients import bootstrap_clients
 from wxo_agentic_evaluation.metrics.metrics import (
-
-
-    ToolCallAndRoutingMetrics,
+    extract_metrics,
+    format_metrics_for_display,
 )
-from wxo_agentic_evaluation.
-
+from wxo_agentic_evaluation.runner import process_test_case
+from wxo_agentic_evaluation.scheduler import (
+    discover_tests,
+    enumerate_jobs,
+    run_jobs,
 )
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.type import EvaluationData
-from wxo_agentic_evaluation.utils import json_dump
 from wxo_agentic_evaluation.utils.utils import (
     SummaryPanel,
     create_table,
-
+    csv_dump,
 )
-
-
-
-
-):
-    summary_results_for_path = []
-    tc_name = os.path.basename(test_case).replace(".json", "")
-    with open(test_case, "r") as f:
-        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
-
-    evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend,
-        llm_user=llm_user,
-        config=config,
-    )
-    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-    (
-        history,
-        call_tracker,
-        conversational_search_data,
-    ) = evaluation_controller.run(
-        task_n,
-        test_case.story,
-        agent_name=test_case.agent,
-        starting_user_input=test_case.starting_sentence,
-    )
-    result = list()
-    for message in history:
-        result.append(message.model_dump())
-
-    json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
-        result,
-    )
-
-    if len(conversational_search_data) > 0:
-        fn = tc_name + ".retrieval_context.json"
-        out_folder = Path(config.output_dir) / "knowledge_base_metrics"
-        out_folder.mkdir(exist_ok=True)
-        rc = [context.model_dump() for context in conversational_search_data]
-        json_dump(out_folder / fn, rc)
-
-    # If data annotation run, skip summary generation
-    if config.data_annotation_run:
-        return summary_results_for_path  # empty result set, skip summary
-
-    evaluation_package = EvaluationPackage(
-        test_case_name=tc_name,
-        messages=history,
-        ground_truth=test_case,
-        conversational_search_data=conversational_search_data,
-        resource_map=resource_map,
-    )
-    (
-        keyword_semantic_matches,
-        knowledge_base_metrics,
-        messages_with_reason,
-        metrics,
-    ) = evaluation_package.generate_summary()
-    temp = []
-    for message in messages_with_reason:
-        temp.append(message.model_dump())
-    json_dump(
-        os.path.join(
-            config.output_dir, "messages", tc_name + ".messages.analyze.json"
-        ),
-        temp,
-    )
-
-    json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".metrics.json"),
-        metrics.model_dump(),
-    )
-
-    metrics.dataset_name = tc_name
-    metrics.avg_resp_time = (
-        sum(call_tracker.generic) + sum(call_tracker.tool_call)
-    ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
-
-    summary_results_for_path.append((metrics, knowledge_base_metrics))
-
-    return summary_results_for_path
+from wxo_agentic_evaluation.langfuse_collection import LangfuseCollection
+from wxo_agentic_evaluation.metrics.journey_success import JourneySuccessMetric
+from wxo_agentic_evaluation.metrics.tool_calling import ToolCalling
+from wxo_agentic_evaluation.langfuse_evaluation_package import EvaluationRunner, sample_aggregator


 def main(config: TestConfig):
-
-
-
-
+    # setup
+    clients = bootstrap_clients(config)
+    if not getattr(config, "skip_available_results", False):
+        ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        config.output_dir = os.path.join(config.output_dir, ts)
+
+    if not config.skip_legacy_evaluation:
+        knowledge_base_output_folder = (
+            Path(config.output_dir) / "knowledge_base_metrics"
         )
-
-
+        knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
+        detailed_rag_output_file = (
+            knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
         )
-
-
-
-        config.
-        config.auth_config.tenant_name,
-        config.auth_config.token,
-    )
-    resource_map = ResourceMap(wxo_client)
-    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-    llm_user = LLMUser(
-        wai_client=get_provider(
-            config=config.provider_config,
-            model_id=config.llm_user_config.model_id,
-        ),
-        template=LlamaUserTemplateRenderer(
-            config.llm_user_config.prompt_config
-        ),
-        user_response_style=config.llm_user_config.user_response_style,
-    )
-
-    print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
-
-    results_list = []
+        summary_rag_output_file = (
+            Path(config.output_dir) / "knowledge_base_summary_metrics.json"
+        )
+    os.makedirs(os.path.join(config.output_dir, "messages"), exist_ok=True)

-
-
+    # discover & schedule tests
+    test_cases = discover_tests(
+        config.test_paths, config.enable_recursive_search
     )
-
-
-
+    jobs = enumerate_jobs(
+        test_cases,
+        config.n_runs,
+        config.skip_available_results,
+        config.output_dir,
     )
-
-
+    results = run_jobs(
+        jobs, config, clients, process_test_case, config.num_workers
     )

-
-
-    if config.skip_available_results:
-        available_res = set(
-            [
-                os.path.basename(f).replace(".messages", "")
-                for f in glob.glob(
-                    os.path.join(
-                        config.output_dir, "messages", "*.messages.json"
-                    )
-                )
-            ]
-        )
-
-    test_cases = []
-    for test_path in config.test_paths:
-        if os.path.isdir(test_path):
-            test_path = os.path.join(test_path, "*.json")
-        test_cases.extend(sorted(glob.glob(test_path)))
-
-    futures = []
-    task_n = 0
-    for test_case in test_cases:
-        if not test_case.endswith(".json") or test_case.endswith("agent.json"):
-            continue
-        if config.skip_available_results:
-            if test_case in available_res:
-                print(
-                    f"Skipping test case {test_case} as results already exist."
-                )
-                continue
-
-        future = executor.submit(
-            process_test_case,
-            task_n,
-            test_case,
-            config,
-            inference_backend,
-            resource_map,
-            llm_user,
-        )
-
-        futures.append((test_case, future))
-        task_n += 1
-
-    if futures:
-        with Progress() as progress:
-            task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
-                total=len(futures),
-            )
-            for test_case, future in futures:
-                try:
-                    results_list.extend(future.result())
-                except Exception as e:
-                    rich.print(f"test case {test_case} fails with {e}")
-                    traceback.print_exc()
-                finally:
-                    progress.update(task1, advance=1)
-
-    tool_call_metrics = [metric[0] for metric in results_list]
-    knowledge_base_metrics = [metric[1] for metric in results_list]
+    # extract
+    tool_metrics, kb_summary, custom_metrics = extract_metrics(results)

-
-
-
-
-
-    with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(
-            rag_metric_summary.model_dump(by_alias=True)["detailed"],
-            f,
-            indent=4,
+    if not config.skip_legacy_evaluation:
+        # write results
+        csv_dump(
+            pathlib.Path(config.output_dir) / "summary_metrics.csv",
+            rows=[metric.model_dump() for metric in tool_metrics],
         )
-
-
-
-
+        for file_path, key in [
+            (detailed_rag_output_file, "detailed"),
+            (summary_rag_output_file, "summary"),
+        ]:
+            with open(file_path, "w+", encoding="utf-8") as f:
+                json.dump(kb_summary.model_dump(by_alias=True)[key], f, indent=4)
+
+        # print results
+        SummaryPanel(kb_summary).print()
+        tool_table = create_table(
+            format_metrics_for_display(tool_metrics), title="Agent Metrics"
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
-            "Text Match": tool_call_metric.text_match,
-            "Journey Success": tool_call_metric.is_success,
-            "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
-        }
-        return row
-
-    def create_avg_row(metrics: List[dict]):
-        avg_row = {
-            "Dataset": "Summary (Average)",
-            "Total Steps": 0,
-            "LLM Steps": 0,
-            "Total Tool Calls": 0,
-            "Tool Call Precision": 0,
-            "Tool Call Recall": 0,
-            "Agent Routing Accuracy": 0,
-            "Text Match": 0,
-            "Journey Success": 0,
-            "Avg Resp Time (sec)": 0,
-        }
-        if metrics:
-            for row in metrics:
-                avg_row["Total Steps"] += row["Total Steps"]
-                avg_row["LLM Steps"] += row["LLM Steps"]
-                avg_row["Total Tool Calls"] += row["Total Tool Calls"]
-                avg_row["Tool Call Precision"] += row["Tool Call Precision"]
-                avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                avg_row["Agent Routing Accuracy"] += row[
-                    "Agent Routing Accuracy"
-                ]
-                avg_row["Text Match"] += (
-                    row["Text Match"] == TextMatchType.text_match.value
-                )
-                avg_row["Journey Success"] += row["Journey Success"]
-                avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
-
-            avg_row["Total Steps"] = round(
-                safe_divide(avg_row["Total Steps"], len(metrics)), 2
-            )
-            avg_row["LLM Steps"] = round(
-                safe_divide(avg_row["LLM Steps"], len(metrics)), 2
-            )
-            avg_row["Total Tool Calls"] = round(
-                safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
-            )
-            avg_row["Tool Call Precision"] = round(
-                safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
-            )
-            avg_row["Tool Call Recall"] = round(
-                safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
-            )
-            avg_row["Agent Routing Accuracy"] = round(
-                safe_divide(
-                    avg_row["Agent Routing Accuracy"], len(metrics)
-                ),
-                2,
-            )
-            avg_row["Text Match"] = round(
-                safe_divide(
-                    avg_row["Text Match"],
-                    len(
-                        [
-                            row
-                            for row in metrics
-                            if row["Text Match"]
-                            != TextMatchType.text_match.na
-                        ]
-                    ),
-                ),
-                2,
-            )
-            avg_row["Journey Success"] = round(
-                safe_divide(avg_row["Journey Success"], len(metrics)), 2
-            )
-            avg_row["Avg Resp Time (sec)"] = round(
-                safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
-            )
-        return avg_row
-
-        tool_call_metrics_for_display = []
-        for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(
-                filter_display_only_values(row)
-            )
-        tool_call_metrics_for_display.append(
-            create_avg_row(tool_call_metrics_for_display)
+        if tool_table:
+            tool_table.print()
+        if any(cm.custom_metrics for cm in custom_metrics):
+            rows = []
+            for cm in custom_metrics:
+                row = {"dataset_name": cm.dataset_name}
+                for m in cm.custom_metrics:
+                    row[m.eval_name] = str(
+                        m.value
+                    ) # Convert to string to avoid type issues
+                rows.append(row)
+            custom_metrics_table = create_table(rows, title="Custom Metrics")
+            if custom_metrics_table:
+                custom_metrics_table.print()
+    else:
+        collection_name = os.path.basename(config.output_dir) + "_collection"
+        collection = LangfuseCollection(
+            name=collection_name,
+            description="",
         )
-
-
+        dataset_paths = []
+        session_ids = []
+        for test_case in test_cases:
+            name = os.path.basename(test_case).replace(".json", "")
+            with open(os.path.join(config.output_dir, f"{name}.metadata.json"), "r") as f:
+                metadata = json.load(f)
+            session_id = metadata["thread_id"]
+            dataset_paths.append(test_case)
+            session_ids.append(session_id)
+
+        collection.upload(paths=dataset_paths)
+
+        langfuse_collection = LangfuseCollection(name=collection_name)
+
+        journey_sucess_metric = JourneySuccessMetric()
+        tool_calling = ToolCalling()
+
+        run = EvaluationRunner(
+            evaluation_name=os.path.basename(config.output_dir) + "_evaluation",
+            run_name=os.path.basename(config.output_dir) + "_run",
+            session_ids=session_ids,
+            collection=langfuse_collection,
+            metrics=[journey_sucess_metric, tool_calling],
+            aggregator=sample_aggregator
         )

-
-    tool_call_table_for_display.print()
-
-    if len(tool_call_metrics) > 0:
-        tool_call_metrics = [
-            metric.model_dump() for metric in tool_call_metrics
-        ]
-        output_file = os.path.join(config.output_dir, "summary_metrics.csv")
-        header = list(tool_call_metrics[0].keys())
-
-        with open(output_file, "w") as file:
-            csv_writer = csv.writer(file)
-            csv_writer.writerow(header)
-            for entry in tool_call_metrics:
-                csv_writer.writerow([entry[name] for name in header])
+        run.evaluate()

+    # persist config
     with open(
-
+        pathlib.Path(config.output_dir) / "config.yml", "w", encoding="utf-8"
     ) as f:
         yaml.safe_dump(dataclasses.asdict(config), f)

-
+    if not config.skip_legacy_evaluation:
+        print(f"Results saved to {config.output_dir}")
+    else:
+        print(f"Config and metadata saved to {config.output_dir}")
+        print(f"Langfuse Evaluation run completed for collection {collection_name}:")
+        for session_id in session_ids:
+            print(f" - http://localhost:3010/project/orchestrate-lite/sessions/{session_id}")


 if __name__ == "__main__":
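With this change, main() delegates its orchestration to the new modules: clients.bootstrap_clients wires the runtime and LLM clients, scheduler discovers test cases and enumerates and executes jobs around runner.process_test_case, and metrics.extract_metrics splits the results into tool, knowledge-base, and custom metrics before either the legacy tables or the Langfuse run is produced. A rough sketch of that pipeline driven programmatically follows; the TestConfig construction is a placeholder (its full field list is not part of this diff), and only functions named in the hunks above are used.

from wxo_agentic_evaluation.arg_configs import TestConfig
from wxo_agentic_evaluation.clients import bootstrap_clients
from wxo_agentic_evaluation.metrics.metrics import extract_metrics, format_metrics_for_display
from wxo_agentic_evaluation.runner import process_test_case
from wxo_agentic_evaluation.scheduler import discover_tests, enumerate_jobs, run_jobs
from wxo_agentic_evaluation.utils.utils import create_table

# Placeholder config: the field names come from the diff above, but TestConfig
# likely requires more fields (auth, provider, user simulator) than shown here.
config = TestConfig(test_paths=["tests/hr_agent"], output_dir="results")

clients = bootstrap_clients(config)  # replaces the old get_wxo_client / LLMUser wiring
test_cases = discover_tests(config.test_paths, config.enable_recursive_search)
jobs = enumerate_jobs(test_cases, config.n_runs, config.skip_available_results, config.output_dir)
results = run_jobs(jobs, config, clients, process_test_case, config.num_workers)

# Split results into tool-call metrics, knowledge-base summary, and custom metrics.
tool_metrics, kb_summary, custom_metrics = extract_metrics(results)
table = create_table(format_metrics_for_display(tool_metrics), title="Agent Metrics")
if table:
    table.print()
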
wxo_agentic_evaluation/metrics/__init__.py
CHANGED
@@ -0,0 +1,15 @@
+from wxo_agentic_evaluation.metrics.evaluations import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+def argument_matching(expected, actual):
+    if actual is None:
+        return False
+    for field in actual:
+        if field not in expected:
+            return False
+
+    return True
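As written, argument_matching only checks that every field name present in the actual tool-call arguments also appears in the expected arguments, and it rejects a missing argument dict; it does not compare argument values. A small illustration, assuming the helper is importable from the metrics package as shown above:

from wxo_agentic_evaluation.metrics import argument_matching

expected = {"employee_id": "E123", "start_date": "2024-07-01"}

print(argument_matching(expected, {"employee_id": "E123"}))             # True: field names are a subset
print(argument_matching(expected, {"employee_id": "E999"}))             # True: values are not compared
print(argument_matching(expected, {"employee_id": "E123", "note": 1}))  # False: "note" is not expected
print(argument_matching(expected, None))                                # False: no arguments captured
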
wxo_agentic_evaluation/metrics/dummy_metric.py
ADDED
@@ -0,0 +1,16 @@
+from langfuse.api.resources.commons.types.score_data_type import ScoreDataType
+
+from wxo_agentic_evaluation.metrics import Evaluation
+from wxo_agentic_evaluation.metrics.metrics import LangfuseMetric
+
+class DummyMetric(Evaluation):
+    def __init__(self, llm_client = None):
+        super().__init__(llm_client)
+
+    def evaluate(self, messages, ground_truth, extracted_context, metadata = ..., **kwargs):
+        return LangfuseMetric(
+            eval_name="dummy_metric",
+            value=True,
+            metadata=metadata,
+            data_type="BOOLEAN",
+        )