PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/service_provider/watsonx_provider.py CHANGED Viewed

@@ -1,15 +1,23 @@
 import dataclasses
 import json
+import logging
 import os
 import time
+import uuid
 from threading import Lock
 from types import MappingProxyType
-from typing import List, Mapping, Union
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 import requests
-from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.service_provider.provider import (
+    ChatResult,
+    Provider,
+)
+logger = logging.getLogger(__name__)
+# IAM
 ACCESS_URL = "https://iam.cloud.ibm.com/identity/token"
 ACCESS_HEADER = {
     "content-type": "application/x-www-form-urlencoded",
@@ -18,30 +26,83 @@ ACCESS_HEADER = {
 YPQA_URL = "https://yp-qa.ml.cloud.ibm.com"
 PROD_URL = "https://us-south.ml.cloud.ibm.com"
 DEFAULT_PARAM = MappingProxyType(
     {"min_new_tokens": 1, "decoding_method": "greedy", "max_new_tokens": 400}
 )
+def _truncate(value: Any, max_len: int = 1000) -> str:
+    if value is None:
+        return ""
+    s = str(value)
+    return (
+        s
+        if len(s) <= max_len
+        else s[:max_len] + f"... [truncated {len(s) - max_len} chars]"
+    )
+def _translate_params_to_chat(
+    params: Dict[str, Any] = {},
+) -> Dict[str, Any]:
+    """
+    Translate legacy generation params to chat.completions params.
+    """
+    translated_params: Dict[str, Any] = {}
+    if "max_new_tokens" in params:
+        translated_params["max_tokens"] = params["max_new_tokens"]
+    if params.get("decoding_method") == "greedy":
+        translated_params.setdefault("temperature", 0)
+        translated_params.setdefault("top_p", 1)
+    passthrough = {
+        "temperature",
+        "top_p",
+        "n",
+        "stream",
+        "stop",
+        "presence_penalty",
+        "frequency_penalty",
+        "logit_bias",
+        "user",
+        "seed",
+        "response_format",
+    }
+    for k in passthrough:
+        if k in params:
+            translated_params[k] = params[k]
+    return translated_params
 class WatsonXProvider(Provider):
     def __init__(
         self,
-        model_id=None,
-        api_key=None,
-        space_id=None,
-        api_endpoint=PROD_URL,
-        url=ACCESS_URL,
-        timeout=60,
-        params=None,
-        embedding_model_id=None,
+        model_id: Optional[str] = None,
+        api_key: Optional[str] = None,
+        space_id: Optional[str] = None,
+        api_endpoint: str = PROD_URL,
+        url: str = ACCESS_URL,
+        timeout: int = 60,
+        params: Optional[Any] = None,
+        embedding_model_id: Optional[str] = None,
+        use_legacy_query: Optional[bool] = None,
+        system_prompt: Optional[str] = None,
+        token: Optional[str] = None,
+        instance_url: Optional[str] = None,
     ):
-        super().__init__()
+        super().__init__(use_legacy_query=use_legacy_query)
         self.url = url
         if (embedding_model_id is None) and (model_id is None):
             raise Exception(
                 "either model_id or embedding_model_id must be specified"
             )
         self.model_id = model_id
+        logger.info("[d b]Using inference model %s", self.model_id)
         api_key = os.environ.get("WATSONX_APIKEY", api_key)
         if not api_key:
             raise Exception("apikey must be specified")
@@ -50,7 +111,7 @@ class WatsonXProvider(Provider):
             "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
             "apikey": self.api_key,
         }
-        self.api_endpoint = api_endpoint
+        self.api_endpoint = (api_endpoint or PROD_URL).rstrip("/")
         space_id = os.environ.get("WATSONX_SPACE_ID", space_id)
         if not space_id:
             raise Exception("space id must be specified")
@@ -59,17 +120,26 @@ class WatsonXProvider(Provider):
         self.embedding_model_id = embedding_model_id
         self.lock = Lock()
-        self.params = params if params else DEFAULT_PARAM
+        self.params = params if params is not None else DEFAULT_PARAM
         if isinstance(self.params, MappingProxyType):
             self.params = dict(self.params)
         if dataclasses.is_dataclass(self.params):
             self.params = dataclasses.asdict(self.params)
+        self.system_prompt = system_prompt
         self.refresh_time = None
         self.access_token = None
         self._refresh_token()
+        self.LEGACY_GEN_URL = (
+            f"{self.api_endpoint}/ml/v1/text/generation?version=2023-05-02"
+        )
+        self.CHAT_COMPLETIONS_URL = f"{self.api_endpoint}/ml/v1/text/chat"
+        self.EMBEDDINGS_URL = (
+            f"{self.api_endpoint}/ml/v1/text/embeddings?version=2023-10-25"
+        )
     def _get_access_token(self):
         response = requests.post(
             self.url,
@@ -87,7 +157,7 @@ class WatsonXProvider(Provider):
             return token, refresh_time
         raise RuntimeError(
-            f"try to acquire access token and get {response.status_code}"
+            f"Try to acquire access token and get {response.status_code}. Reason: {response.text} "
         )
     def prepare_header(self):
@@ -97,24 +167,6 @@ class WatsonXProvider(Provider):
         }
         return headers
-    def _query(self, sentence: str):
-        headers = self.prepare_header()
-        data = {
-            "model_id": self.model_id,
-            "input": sentence,
-            "parameters": self.params,
-            "space_id": self.space_id,
-        }
-        generation_url = (
-            f"{self.api_endpoint}/ml/v1/text/generation?version=2023-05-02"
-        )
-        resp = requests.post(url=generation_url, headers=headers, json=data)
-        if resp.status_code == 200:
-            return resp.json()["results"][0]
-        else:
-            resp.raise_for_status()
     def _refresh_token(self):
         # if we do not have a token or the current timestamp is 9 minutes away from expire.
         if not self.access_token or time.time() > self.refresh_time:
@@ -125,28 +177,365 @@ class WatsonXProvider(Provider):
                         self.refresh_time,
                     ) = self._get_access_token()
-    def query(self, sentence: Union[str, Mapping[str, str]]) -> str:
+    def old_query(self, sentence: Union[str, Mapping[str, str]]) -> str:
+        """
+        Legacy /ml/v1/text/generation
+        """
         if self.model_id is None:
             raise Exception("model id must be specified for text generation")
+        self._refresh_token()
+        headers = self.prepare_header()
+        payload: Dict[str, Any] = {
+            "model_id": self.model_id,
+            "input": sentence,
+            "parameters": self.params or {},
+            "space_id": self.space_id,
+        }
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+        logger.debug(
+            "[d][b]Sending text.generation request | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            self.LEGACY_GEN_URL,
+            self.model_id,
+            self.space_id,
+            json.dumps(
+                payload.get("parameters", {}),
+                sort_keys=True,
+                ensure_ascii=False,
+            ),
+            _truncate(sentence, 200),
+        )
+        resp = None
         try:
-            response = self._query(sentence)
-            if generated_text := response.get("generated_text"):
-                return generated_text
-            elif message := response.get("message"):
-                return message
+            resp = requests.post(
+                url=self.LEGACY_GEN_URL,
+                headers=headers,
+                json=payload,
+                timeout=self.timeout,
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+            if isinstance(data, dict) and "results" in data and data["results"]:
+                result = data["results"][0]
+            elif isinstance(data, dict):
+                result = data
             else:
                 raise ValueError(
-                    f"Unexpected response from WatsonX: {response}"
+                    f"Unexpected response type from WatsonX: {type(data)}"
+                )
+            output_text = ""
+            if isinstance(result, dict):
+                output_text = (
+                    result.get("generated_text") or result.get("message") or ""
+                )
+            usage = data.get("usage") or {}
+            if not usage and isinstance(result, dict):
+                in_tok = result.get("input_token_count")
+                out_tok = result.get("generated_token_count") or result.get(
+                    "output_token_count"
                 )
+                if in_tok is not None or out_tok is not None:
+                    usage = {
+                        "prompt_tokens": in_tok,
+                        "completion_tokens": out_tok,
+                        "total_tokens": (in_tok or 0) + (out_tok or 0),
+                    }
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+            logger.debug(
+                "[d][b]text.generation response received | request_id=%s status_code=%s duration_ms=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(output_text, 2000),
+                api_request_id,
+            )
+            if output_text:
+                return output_text
+            raise ValueError(
+                f"Unexpected response from legacy endpoint: {data}"
+            )
+        except Exception as e:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+            logger.exception(
+                "text.generation request failed | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            with self.lock:
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    try:
+                        self.access_token, self.refresh_time = (
+                            self._get_access_token()
+                        )
+                    except Exception:
+                        pass
+            raise
+    def new_query(self, sentence: str) -> str:
+        """
+        /ml/v1/text/chat
+        Returns assistant content as a plain string.
+        """
+        if self.model_id is None:
+            raise Exception("model id must be specified for text generation")
+        self._refresh_token()
+        headers = self.prepare_header()
+        messages: List[Dict[str, Any]] = []
+        if getattr(self, "system_prompt", None):
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": sentence,
+                    }
+                ],
+            }
+        )
+        chat_params = _translate_params_to_chat(self.params)
+        if "time_limit" in self.params:
+            chat_params["time_limit"] = self.params["time_limit"]
+        payload: Dict[str, Any] = {
+            "model_id": self.model_id,
+            "space_id": self.space_id,
+            "messages": messages,
+            **chat_params,
+        }
+        url = f"{self.CHAT_COMPLETIONS_URL}?version=2024-10-08"
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+        logger.debug(
+            "[d][b]Sending chat.completions request | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            url,
+            self.model_id,
+            self.space_id,
+            json.dumps(chat_params, sort_keys=True, ensure_ascii=False),
+            _truncate(sentence, 200),
+        )
+        resp = None
+        try:
+            resp = requests.post(
+                url=url, headers=headers, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+            choice = data["choices"][0]
+            content = choice["message"]["content"]
+            finish_reason = choice.get("finish_reason")
+            usage = data.get("usage", {})
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+            logger.debug(
+                "[d][b]chat.completions response received | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+                api_request_id,
+            )
+            return content
         except Exception as e:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+            logger.exception(
+                "chat.completions request failed | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
             with self.lock:
-                if "authentication_token_expired" in str(e):
-                    self._refresh_token()
-                raise e
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    try:
+                        self.access_token, self.refresh_time = (
+                            self._get_access_token()
+                        )
+                    except Exception:
+                        pass
+            raise
+    def chat(
+        self,
+        messages: Sequence[Dict[str, str]],
+        params: Optional[Dict[str, Any]] = None,
+    ) -> ChatResult:
+        """
+        Sends a multi-message chat request to /ml/v1/text/chat
+        Returns ChatResult with text, usage, finish_reason, and raw response.
+        """
+        if self.model_id is None:
+            raise Exception("model id must be specified for chat")
+        self._refresh_token()
+        headers = self.prepare_header()
+        wx_messages: List[Dict[str, Any]] = []
+        for m in messages:
+            role = m.get("role")
+            content = m.get("content", "")
+            if role == "user" and isinstance(content, str):
+                wx_messages.append(
+                    {
+                        "role": "user",
+                        "content": [{"type": "text", "text": content}],
+                    }
+                )
+            else:
+                wx_messages.append({"role": role, "content": content})
+        merged_params = dict(self.params or {})
+        if params:
+            merged_params.update(params)
+        chat_params = _translate_params_to_chat(merged_params)
+        chat_params.pop("stream", None)
+        if "time_limit" in merged_params:
+            chat_params["time_limit"] = merged_params["time_limit"]
+        payload: Dict[str, Any] = {
+            "model_id": self.model_id,
+            "space_id": self.space_id,
+            "messages": wx_messages,
+            **chat_params,
+        }
+        url = f"{self.CHAT_COMPLETIONS_URL}?version=2024-10-08"
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
-    def batch_query(self, sentences: List[str]) -> List[dict]:
-        return [self.query(sentence) for sentence in sentences]
+        last_user = next(
+            (
+                m.get("content", "")
+                for m in reversed(messages)
+                if m.get("role") == "user"
+            ),
+            "",
+        )
+        logger.debug(
+            "[d][b]Sending chat.completions request (non-streaming) | request_id=%s url=%s model=%s space_id=%s params=%s input_preview=%s",
+            request_id,
+            url,
+            self.model_id,
+            self.space_id,
+            json.dumps(chat_params, sort_keys=True, ensure_ascii=False),
+            _truncate(last_user, 200),
+        )
+        resp = None
+        try:
+            resp = requests.post(
+                url=url, headers=headers, json=payload, timeout=self.timeout
+            )
+            duration_ms = int((time.time() - t0) * 1000)
+            resp.raise_for_status()
+            data = resp.json()
+            choice = data["choices"][0]
+            content = choice["message"]["content"]
+            finish_reason = choice.get("finish_reason")
+            usage = data.get("usage", {})
+            api_request_id = resp.headers.get(
+                "x-request-id"
+            ) or resp.headers.get("request-id")
+            logger.debug(
+                "[d][b]chat.completions response received (non-streaming) | request_id=%s status_code=%s duration_ms=%s finish_reason=%s usage=%s output_preview=%s api_request_id=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                finish_reason,
+                json.dumps(usage, sort_keys=True, ensure_ascii=False),
+                _truncate(content, 2000),
+                api_request_id,
+            )
+            return ChatResult(
+                text=content, usage=usage, finish_reason=finish_reason, raw=data
+            )
+        except Exception as e:
+            duration_ms = int((time.time() - t0) * 1000)
+            status_code = getattr(resp, "status_code", None)
+            resp_text_preview = (
+                _truncate(getattr(resp, "text", None), 2000)
+                if resp is not None
+                else None
+            )
+            logger.exception(
+                "chat.completions request failed (non-streaming) | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+                request_id,
+                status_code,
+                duration_ms,
+                resp_text_preview,
+            )
+            with self.lock:
+                if (
+                    "authentication_token_expired" in str(e)
+                    or status_code == 401
+                ):
+                    try:
+                        self.access_token, self.refresh_time = (
+                            self._get_access_token()
+                        )
+                    except Exception:
+                        pass
+            raise
     def encode(self, sentences: List[str]) -> List[list]:
         if self.embedding_model_id is None:
@@ -154,24 +543,63 @@ class WatsonXProvider(Provider):
                 "embedding model id must be specified for text encoding"
             )
+        self._refresh_token()
         headers = self.prepare_header()
-        url = f"{self.api_endpoint}/ml/v1/text/embeddings?version=2023-10-25"
-        data = {
+        # Minimal logging for embeddings
+        request_id = str(uuid.uuid4())
+        t0 = time.time()
+        logger.debug(
+            "[d][b]Sending embeddings request | request_id=%s url=%s model=%s space_id=%s num_inputs=%s",
+            request_id,
+            self.EMBEDDINGS_URL,
+            self.embedding_model_id,
+            self.space_id,
+            len(sentences),
+        )
+        payload = {
             "inputs": sentences,
-            "model_id": self.model_id,
+            "model_id": self.embedding_model_id,
             "space_id": self.space_id,
         }
-        resp = requests.post(url=url, headers=headers, json=data)
+        resp = requests.post(
+            url=self.EMBEDDINGS_URL,
+            headers=headers,
+            json=payload,
+            timeout=self.timeout,
+        )
+        duration_ms = int((time.time() - t0) * 1000)
         if resp.status_code == 200:
-            return [entry["embedding"] for entry in resp.json()["results"]]
-        else:
-            resp.raise_for_status()
+            data = resp.json()
+            vectors = [entry["embedding"] for entry in data["results"]]
+            logger.debug(
+                "[d][b]Embeddings response received | request_id=%s status_code=%s duration_ms=%s num_vectors=%s",
+                request_id,
+                resp.status_code,
+                duration_ms,
+                len(vectors),
+            )
+            return vectors
+        logger.error(
+            "[d b red]Embeddings request failed | request_id=%s status_code=%s duration_ms=%s response_text_preview=%s",
+            request_id,
+            resp.status_code,
+            duration_ms,
+            _truncate(resp.text, 2000),
+        )
+        resp.raise_for_status()
 if __name__ == "__main__":
     provider = WatsonXProvider(
-        model_id="meta-llama/llama-3-2-90b-vision-instruct"
+        model_id="meta-llama/llama-3-2-90b-vision-instruct",
+        use_legacy_query=False,  # set True to use legacy endpoint
+        system_prompt="You are a helpful assistant.",
     )
     prompt = """

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl