ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/annotate.py (+2 -2)

@@ -6,7 +6,7 @@ from jsonargparse import CLI
 
 from wxo_agentic_evaluation.arg_configs import TestCaseGenerationConfig
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset
 
 
 def main(config: TestCaseGenerationConfig):
@@ -17,7 +17,7 @@ def main(config: TestCaseGenerationConfig):
         messages.append(Message.model_validate(entry))
 
     with open(config.seed_data_path, "r") as f:
-        evaluation_data = EvaluationData(**json.load(f))
+        evaluation_data = OrchestrateDataset(**json.load(f))
 
     # Generate annonated dataset
     annotator = DataAnnotator(
wxo_agentic_evaluation/arg_configs.py (+60 -2)

@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import List, Optional, Union
 
 from wxo_agentic_evaluation import __file__
@@ -30,7 +31,33 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(default="watsonx")
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+
+
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)
+
+
+
+class ControllerConfig:
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
 
 
 @dataclass
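The provider default is now resolved through default_factory, so the USE_GATEWAY_MODEL_PROVIDER environment variable is consulted each time a ProviderConfig is instantiated. A minimal sketch of the behaviour this implies (not taken from the package documentation):

    import os

    from wxo_agentic_evaluation.arg_configs import ProviderConfig

    # Flag unset (or anything other than "true"): the default stays "watsonx".
    print(ProviderConfig().provider)   # "watsonx"

    # The factory re-reads the environment on every instantiation,
    # so newly created configs pick up the gateway provider.
    os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "true"
    print(ProviderConfig().provider)   # "gateway"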
@@ -41,11 +68,21 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extractors_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
+    n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False
+    strict_topological_matching: bool = True
+    enable_recursive_search: bool = False
+    skip_legacy_evaluation: bool = False  # Skip legacy evaluation and only run user/agent simulations
 
 
 @dataclass
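TestConfig now carries custom_metrics_config and extractors_config sections alongside the new matching and multi-run knobs. A hedged sketch of how the new sub-configs could be populated; the paths and directory names are illustrative placeholders, not values shipped with the package:

    from wxo_agentic_evaluation.arg_configs import (
        CustomMetricsConfig,
        ExtractorsConfig,
        ProviderConfig,
    )

    # Hypothetical locations of user-supplied metric and extractor modules.
    custom_metrics = CustomMetricsConfig(
        paths=["./my_custom_metrics"],
        llmaaj_config=ProviderConfig(model_id="meta-llama/llama-3-405b-instruct"),
    )
    extractors = ExtractorsConfig(paths=["./my_extractors"])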
@@ -58,22 +95,32 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True
 
 
 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None
 
 
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
+
+
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
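AnalyzeConfig picks up a mode switch backed by the new AnalyzeMode enum plus worker and run selectors. A small sketch; the data path is illustrative, and the reading of run=-1 as "the most recent run" is an assumption:

    from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode

    config = AnalyzeConfig(
        data_path="./test_output",     # illustrative results location
        mode=AnalyzeMode.enhanced,     # AnalyzeMode.default keeps the previous behaviour
        num_workers=10,
        run=-1,                        # assumed to mean "latest run"
    )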
@@ -117,3 +164,14 @@ class BatchAnnotateConfig:
     stories_path: str
     output_dir: str
     num_variants: int = 2
+
+
+@dataclass
+class CompareRunsConfig:
+    reference_file_location: str
+    experiment_file_location: str
+    csv_output: Optional[str] = None
+    column_stats_csv: Optional[str] = (
+        "column_by_column_summary_stats_comparison.csv"
+    )
+    verbose: bool = False
wxo_agentic_evaluation/base_user.py (+25 -0)

@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from wxo_agentic_evaluation.type import Message
+
+
+class BaseUserSimulator(ABC):
+    """Abstract base class for user simulators."""
+
+    @abstractmethod
+    def generate_user_input(
+        self, user_story: str, conversation_history: List[Message], **kwargs
+    ) -> Message:
+        """
+        Generate user input based on the user story and conversation history.
+
+        Args:
+            user_story: The user's story or goal
+            conversation_history: List of previous messages in the conversation
+            **kwargs: Additional parameters specific to the simulator implementation
+
+        Returns:
+            Message: The generated user input message
+        """
+        pass
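A minimal concrete simulator built on this base class, shown only to illustrate the contract. The class name is hypothetical, and it assumes Message is a pydantic model that accepts role and content fields (the real schema lives in wxo_agentic_evaluation/type.py):

    from typing import List

    from wxo_agentic_evaluation.base_user import BaseUserSimulator
    from wxo_agentic_evaluation.type import Message


    class ScriptedUserSimulator(BaseUserSimulator):
        """Hypothetical simulator that replays a fixed script instead of calling an LLM."""

        def __init__(self, utterances: List[str]):
            self._utterances = utterances

        def generate_user_input(
            self, user_story: str, conversation_history: List[Message], **kwargs
        ) -> Message:
            # Advance one scripted line per user turn, repeating the last line when the script runs out.
            turn = min(len(conversation_history) // 2, len(self._utterances) - 1)
            # Assumes Message(role=..., content=...); adjust to the actual Message schema.
            return Message.model_validate(
                {"role": "user", "content": self._utterances[turn]}
            )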
wxo_agentic_evaluation/batch_annotate.py (+19 -2)

@@ -125,9 +125,11 @@ def generate_multiple_in_one(
     output_dir,
     starting_index,
     model_id="meta-llama/llama-3-405b-instruct",
+    # model_id="gpt-4o",
 ):
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    # Legacy provider (e.g., watsonx)
     provider = get_provider(
         model_id=model_id,
         params={
@@ -135,9 +137,24 @@ def generate_multiple_in_one(
             "decoding_method": "greedy",
             "max_new_tokens": 3000,
         },
+        use_portkey_provider=False,
     )
-
-    response = provider.query(prompt)
+    response = provider.chat(prompt).text
+
+    # # OpenAI provider
+    # provider = get_provider(provider="openai", model_id=model_id, api_key=os.getenv("OPENAI_API_KEY"))
+    # response = provider.chat(prompt).choices[0].message.content
+
+    # # Azure OpenAI provider
+    # provider = get_provider(
+    # provider = "azure-openai",
+    # azure_model_name = model_id,
+    # azure_deployment_id = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
+    # azure_resource_name = os.getenv("AZURE_OPENAI_RESOURCE_NAME"),
+    # azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+    # api_key = f"Bearer {os.getenv('AZURE_OPENAI_API_KEY')}"
+    # )
+    # response = provider.chat(prompt).choices[0].message.content
 
     try:
         raw_text = response
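The substantive change in this hunk is a provider API migration on the default path: get_provider gains a use_portkey_provider flag and the old provider.query(prompt) call becomes provider.chat(prompt).text, while the commented-out blocks sketch OpenAI and Azure OpenAI alternatives that read choices[0].message.content instead. A condensed view of the active path, with a placeholder prompt and the params abridged to what the hunk shows:

    from wxo_agentic_evaluation.service_provider import get_provider

    prompt = "..."  # placeholder prompt text
    provider = get_provider(
        model_id="meta-llama/llama-3-405b-instruct",
        params={"decoding_method": "greedy", "max_new_tokens": 3000},
        use_portkey_provider=False,
    )
    response = provider.chat(prompt).text  # previously: provider.query(prompt)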
wxo_agentic_evaluation/clients.py (+103 -0)

@@ -0,0 +1,103 @@
+import copy
+from dataclasses import asdict, dataclass
+
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.runtime_adapter.wxo_runtime_adapter import (
+    WXORuntimeAdapter,
+)
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
+
+
+@dataclass
+class Clients:
+    wxo_client: WXOClient
+    llmaaj_provider: Provider
+    resource_map: ResourceMap
+    inference_backend: WXORuntimeAdapter
+    llm_user: LLMUser
+
+
+def bootstrap_clients(config: TestConfig) -> Clients:
+    """
+    Bootstrap all clients needed for the evaluation.
+
+    Args:
+        config: The test configuration
+
+    Returns:
+        A tuple containing:
+        - wxo_client: The WXO client
+        - llmaaj_provider: The provider for custom metrics
+        - resource_map: The resource map
+        - inference_backend: The inference backend
+        - llm_user: The LLM user
+    """
+    # Initialize WXO client
+    wxo_client = get_wxo_client(
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
+    )
+
+    # Initialize provider for custom metrics
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
+    # Initialize resource map
+    resource_map = ResourceMap(wxo_client)
+
+    # Initialize inference backend
+    inference_backend = WXORuntimeAdapter(wxo_client=wxo_client)
+
+    # Initialize LLM user
+    llm_user = LLMUser(
+        wai_client=get_provider(**provider_kwargs),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
+        user_response_style=config.llm_user_config.user_response_style,
+    )
+
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
+    return Clients(
+        wxo_client=wxo_client,
+        llmaaj_provider=llmaaj_provider,
+        resource_map=resource_map,
+        inference_backend=inference_backend,
+        llm_user=llm_user,
+    )
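The new clients.py appears to centralize client construction that main.py previously performed inline (main.py shrinks by roughly 230 lines in this release). A sketch of the intended call pattern, based only on the signatures above; the wrapper function is illustrative:

    from wxo_agentic_evaluation.arg_configs import TestConfig
    from wxo_agentic_evaluation.clients import Clients, bootstrap_clients


    def start_run(config: TestConfig) -> Clients:
        # One call wires up every dependency the evaluation loop needs.
        clients = bootstrap_clients(config)
        # Handles exposed by the Clients dataclass:
        #   clients.wxo_client         - WXO client built from config.auth_config
        #   clients.resource_map       - ResourceMap over that client
        #   clients.inference_backend  - WXORuntimeAdapter used to run conversations
        #   clients.llm_user           - LLMUser driving the simulated user
        #   clients.llmaaj_provider    - judge provider for custom metrics
        return clients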
wxo_agentic_evaluation/compare_runs/__init__.py (+0 -0)

File added with no content changes (an empty package __init__.py).
wxo_agentic_evaluation/compare_runs/compare_2_runs.py (+74 -0)

@@ -0,0 +1,74 @@
+import csv
+import os
+import statistics
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from jsonargparse import CLI
+
+from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
+from wxo_agentic_evaluation.compare_runs.diff import DiffResults
+from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+from wxo_agentic_evaluation.utils.utils import create_table, read_file
+
+
+def main(config: CompareRunsConfig):
+    """Main function to compare two run result files."""
+    # Extract values from config
+    reference_file = config.reference_file_location
+    experiment_file = config.experiment_file_location
+    csv_output = config.csv_output
+    column_stats_csv = config.column_stats_csv
+    verbose = config.verbose
+
+    try:
+        # Read the files
+        obj1 = read_file(reference_file)
+        obj2 = read_file(experiment_file)
+
+        # Create evaluation results
+        result1 = EvaluationResult.from_csv(obj1)
+        result2 = EvaluationResult.from_csv(obj2)
+
+        # Create diff results
+        diff_results = DiffResults(result1, result2)
+
+        # Display summary statistics
+        summary_stats = diff_results.summary_statistics()
+        summary_table = create_table(summary_stats, title="Summary Statistics")
+        print(
+            "\nALL metrics are computed on OVERLAPPING test cases, ie cases that exist in both the Reference and Experiment runs\n"
+        )
+        print(
+            "If Experiment - Reference is Positive, that's an increase in the metric. If Experiment - Reference is Negative, that's a decrease in the metric.\n"
+        )
+        summary_table.print()
+
+        # Display exclusive tests
+        if verbose:
+            diff_results.display_exclusive_tests()
+
+        # Display test cases with differing summary match and success status
+        diff_results.display_differing_summary_matches()
+
+        # Display tabular diff
+        diff_results.compute_tabular_diff(verbose=verbose)
+
+        # Write results to CSV if specified
+        if csv_output:
+            diff_results.to_csv(csv_output)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    args = CLI(CompareRunsConfig, as_positional=False)
+    sys.exit(main(args))
+
+# Made with Bob
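Because the entry point wraps CompareRunsConfig with jsonargparse's CLI (as_positional=False), the config fields should surface as flags such as --reference_file_location when the module is run directly. The programmatic equivalent below is grounded in the signatures above; the file names are illustrative:

    from wxo_agentic_evaluation.arg_configs import CompareRunsConfig
    from wxo_agentic_evaluation.compare_runs.compare_2_runs import main

    exit_code = main(
        CompareRunsConfig(
            reference_file_location="baseline_results.csv",    # illustrative
            experiment_file_location="candidate_results.csv",  # illustrative
            csv_output="run_diff.csv",
            verbose=True,
        )
    )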