PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/service_provider/provider.py CHANGED Viewed

@@ -1,18 +1,138 @@
-from abc import ABC, abstractmethod
-from typing import List
+from __future__ import annotations
+import logging
+import os
+from abc import ABC, ABCMeta, abstractmethod
+from dataclasses import dataclass
+from threading import Lock
+from typing import Any, Dict, List, Optional, Sequence, Tuple
-class Provider(ABC):
-    def __init__(self):
-        pass
+from wxo_agentic_evaluation.type import ProviderInstancesCacheKey
+class SingletonProviderMeta(type):
+    _provider_instances: Dict[str, "Provider"] = {}
+    _instantiation_lock = Lock()
+    def __call__(cls, *args, **kwargs):
+        key_str: str = str(cls._get_key(cls.__name__, args, kwargs))
+        if key_str not in cls._provider_instances:
+            with cls._instantiation_lock:
+                if key_str not in cls._provider_instances:
+                    cls._provider_instances[key_str] = super().__call__(
+                        *args, **kwargs
+                    )
+        return cls._provider_instances[key_str]
+    @staticmethod
+    def _get_key(
+        provider: str, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> ProviderInstancesCacheKey:
+        args_str = str(args) if args else "noargs"
+        kwargs_str = str(sorted(kwargs.items())) if kwargs else "nokwargs"
+        return ProviderInstancesCacheKey(
+            provider=provider,
+            hashed_args=args_str,
+            hashed_kwargs=kwargs_str,
+        )
+class SingletonProviderABCMeta(ABCMeta, SingletonProviderMeta):
+    pass
+@dataclass
+class ChatResult:
+    text: str
+    usage: Optional[Dict[str, Any]] = None
+    finish_reason: Optional[str] = None
+    raw: Optional[Any] = None
+class Provider(ABC, metaclass=SingletonProviderABCMeta):
+    def __init__(
+        self,
+        use_legacy_query: Optional[bool] = None,
+        logger: Optional[logging.Logger] = None,
+    ) -> None:
+        self.logger = logger or logging.getLogger(self.__class__.__name__)
+        env_use_legacy = os.environ.get("USE_LEGACY_QUERY")
+        if env_use_legacy is not None:
+            self.use_legacy_query: bool = env_use_legacy.strip().lower() in (
+                "1",
+                "true",
+                "yes",
+                "on",
+            )
+        else:
+            self.use_legacy_query = (
+                bool(use_legacy_query) if use_legacy_query is not None else True
+            )
+        if self.use_legacy_query:
+            self.logger.debug("[d][b]Using legacy /text/generation queries")
+        else:
+            self.logger.debug("[d][b]Using new /chat/completions queries")
     @abstractmethod
-    def query(self, sentence: str) -> str:
-        pass
+    def old_query(self, sentence: str) -> str:
+        raise NotImplementedError
-    def batch_query(self, sentences: List[str]) -> List[str]:
-        return [self.query(sentence) for sentence in sentences]
+    @abstractmethod
+    def new_query(self, sentence: str) -> str:
+        raise NotImplementedError
     @abstractmethod
     def encode(self, sentences: List[str]) -> List[list]:
-        pass
+        raise NotImplementedError
+    def query(self, sentence: str) -> str:
+        if self.use_legacy_query:
+            return self.old_query(sentence)
+        return self.new_query(sentence)
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+    ) -> ChatResult:
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement chat()."
+        )
+    def batch_query(
+        self,
+        sentences: List[str],
+        max_workers: Optional[int] = None,
+    ) -> List[str]:
+        if not sentences:
+            return []
+        if not max_workers or max_workers <= 1:
+            return [self.query(sentence) for sentence in sentences]
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+        results: List[Optional[str]] = [None] * len(sentences)
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            future_to_idx = {
+                pool.submit(self.query, s): i for i, s in enumerate(sentences)
+            }
+            for fut in as_completed(future_to_idx):
+                idx = future_to_idx[fut]
+                results[idx] = fut.result()
+        return [r if r is not None else "" for r in results]
+    def set_routing(self, use_legacy_query: Optional[bool] = None) -> None:
+        if use_legacy_query is not None:
+            self.use_legacy_query = bool(use_legacy_query)
+    def close(self) -> None:
+        return

wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py CHANGED Viewed

@@ -1,9 +1,14 @@
+import uuid
 from abc import ABC, abstractmethod
 from typing import Any, List, Mapping, Optional, Union
 import requests
 import rich
+from wxo_agentic_evaluation.service_provider.gateway_provider import (
+    GatewayProvider,
+    _translate_params_to_chat,
+)
 from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
     ModelProxyProvider,
 )
@@ -149,3 +154,50 @@ class WatsonXLLMKitWrapper(WatsonXProvider, LLMKitWrapper):
             return resp.json()
         else:
             resp.raise_for_status()
+class GatewayProviderLLMKitWrapper(GatewayProvider, LLMKitWrapper):
+    def chat(self, sentence: Union[str, List[Mapping[str, str]]]):
+        if isinstance(sentence, str):
+            messages = []
+            if self.system_prompt:
+                messages.append(
+                    {"role": "system", "content": self.system_prompt}
+                )
+            messages.append({"role": "user", "content": sentence})
+        else:
+            messages = sentence
+        if self.model_id is None:
+            raise Exception("model id must be specified for text generation")
+        self.refresh_token_if_expires()
+        merged_params = dict(self.params or {})
+        chat_params = _translate_params_to_chat(merged_params)
+        chat_params.pop("stream", None)
+        override_params = dict(merged_params)
+        override_params["model"] = self.model_id
+        payload = {
+            "model": self._payload_model_str(self.model_id),
+            "messages": list(messages),
+            **chat_params,
+        }
+        request_id = str(uuid.uuid4())
+        headers = self._headers(request_id, override_params)
+        resp = requests.post(
+            self.chat_url,
+            json=payload,
+            headers=headers,
+            verify=self._wo_ssl_verify,
+            timeout=self.timeout,
+        )
+        if resp.status_code == 200:
+            return resp.json()
+        else:
+            resp.raise_for_status()

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl