PyPI - ibm-watsonx-gov - Versions diffs - 1.3.3__cp313-cp313-macosx_11_0_arm64.whl - Mend

ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (353) hide show

ibm_watsonx_gov/providers/eval_assist_provider.py ADDED Viewed

@@ -0,0 +1,266 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025  All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+import asyncio
+import functools
+import re
+import pandas as pd
+from lazy_imports import LazyModule, load
+from ibm_watsonx_gov.clients.usage_client import validate_usage_client
+from ibm_watsonx_gov.entities.credentials import WxAICredentials
+from ibm_watsonx_gov.entities.criteria import Option
+from ibm_watsonx_gov.entities.enums import EvaluationProvider, MetricGroup
+from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
+                                                        RecordMetricResult)
+from ibm_watsonx_gov.entities.foundation_model import (
+    AzureOpenAIFoundationModel, OpenAIFoundationModel, PortKeyGateway,
+    WxAIFoundationModel)
+from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold
+from ibm_watsonx_gov.providers.inference_engines.portkey_inference_engine import \
+    PortKeyInferenceEngine
+from ibm_watsonx_gov.utils.async_util import start_event_loop_run_func
+# Declare the evalassist / unitxt imports through lazy_imports' LazyModule and call load() on it
+# straight away (presumably this resolves the declared imports - confirm against lazy_imports docs).
+ea_imports = LazyModule(
+    "from evalassist.judges import Criteria as EACriteria",
+    "from evalassist.judges import CriteriaOption as EACriteriaOption",
+    "from evalassist.judges import Instance, DirectJudge",
+    "from unitxt.inference import CrossProviderInferenceEngine",
+    name="lazy_ea"
+)
+load(ea_imports)
+# Rebind the names as plain module-level aliases; EvalAssistProvider refers to them directly.
+EACriteria = ea_imports.EACriteria
+EACriteriaOption = ea_imports.EACriteriaOption
+Instance = ea_imports.Instance
+DirectJudge = ea_imports.DirectJudge
+CrossProviderInferenceEngine = ea_imports.CrossProviderInferenceEngine
+# Matches a "{name}" placeholder whose name is an identifier (letter or underscore, then
+# letters, digits or underscores); group 1 captures the bare name without the braces.
+VARIABLES_PATTERN = r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}"
+class EvalAssistProvider():
+    """
+    The class to invoke eval assist library for computing the LLMAJ metrics.
+    """
+    def __init__(self, metric_name: str,
+                 display_name: str,
+                 value_type: str,
+                 llm_judge: LLMJudge,
+                 options: list[Option],
+                 criteria_description: str | None = None,
+                 prompt_template: str | None = None,
+                 context_fields: list[str] = [],
+                 prediction_field: str | None = None,
+                 metric_group: MetricGroup = None,
+                 metric_method: str | None = None,
+                 thresholds: list[MetricThreshold] = [],
+                 **kwargs):
+        self.metric_name = metric_name
+        self.display_name = display_name
+        self.value_type = value_type
+        self.llm_judge = llm_judge
+        self.criteria_description = criteria_description
+        self.prompt_template = prompt_template
+        self.options = options
+        self.context_fields = context_fields
+        self.prediction_field = prediction_field
+        self.metric_group = metric_group
+        self.metric_method = metric_method
+        self.thresholds = thresholds
+        self.record_id_field = kwargs.get("record_id_field", "record_id")
+        validate_usage_client(kwargs.get("usage_client"))
+    async def evaluate_async(self, data: pd.DataFrame) -> AggregateMetricResult:
+        loop = asyncio.get_event_loop()
+        # If called as async, run it in a separate thread
+        return await loop.run_in_executor(
+            None,
+            functools.partial(
+                start_event_loop_run_func,
+                func=self.evaluate,
+                data=data
+            )
+        )
+    def evaluate(self, data: pd.DataFrame) -> AggregateMetricResult:
+        try:
+            judge = self.__get_judge()
+            if self.criteria_description:
+                criteria = self.__get_criteria(
+                    self.prediction_field, self.context_fields)
+                instances = self.__get_instances(data=data,
+                                                 prediction_field=self.prediction_field,
+                                                 context_fields=self.context_fields)
+                results = judge(instances=instances, criteria=criteria)
+            elif self.prompt_template:
+                # Get judge prompts with filled in values
+                judge_prompts = data.apply(
+                    lambda row: self.prompt_template.format(**row), axis=1).to_list()
+                # Get the list of valid outputs from the judge prompt
+                valid_outputs = [o.name for o in self.options]
+                results = judge.evaluate_with_custom_prompt(
+                    judge_prompts=judge_prompts,
+                    valid_outputs=valid_outputs)
+            aggregated_result = self.__post_process(
+                results=results, data=data)
+            return aggregated_result
+        except Exception as e:
+            raise Exception(
+                f"Error while computing metrics: {self.metric_name}. Reason: {str(e)}") from e
+    def __get_judge(self):
+        if self.llm_judge and isinstance(self.llm_judge.model, PortKeyGateway):
+            judge = DirectJudge(
+                inference_engine=PortKeyInferenceEngine(
+                    **self.__get_inference_engine_params()),
+                generate_feedback=True,
+            )
+        else:
+            judge = DirectJudge(
+                inference_engine=CrossProviderInferenceEngine(
+                    **self.__get_inference_engine_params()),
+                generate_feedback=True,
+            )
+        return judge
+    def __get_instances(self, data, prediction_field, context_fields):
+        instances = []
+        context_data = data[context_fields].to_dict(orient="records")
+        predictions = data[prediction_field].tolist()
+        if context_data:
+            for c, p in zip(context_data, predictions):
+                fields = {prediction_field: p}
+                fields.update(c)
+                instances.append(Instance(
+                    fields=fields))
+        else:
+            for p in predictions:
+                instances.append(Instance(
+                    fields={prediction_field: p}))
+        return instances
+    def __get_inference_engine_params(self):
+        params = {"seed": 36,
+                  "data_classification_policy": ["public"]}
+        if isinstance(self.llm_judge.model, WxAIFoundationModel):
+            wxai_credentials: WxAICredentials = self.llm_judge.model.provider.credentials
+            wml_credentials = {}
+            wml_credentials["api_base"] = wxai_credentials.url
+            if wxai_credentials.api_key:
+                wml_credentials["api_key"] = wxai_credentials.api_key
+            if wxai_credentials.version:  # using cpd
+                wml_credentials["username"] = wxai_credentials.username
+                wml_credentials["instance_id"] = wxai_credentials.instance_id
+                if wxai_credentials.password:
+                    wml_credentials["password"] = wxai_credentials.password
+            if self.llm_judge.model.project_id:
+                wml_credentials["project_id"] = self.llm_judge.model.project_id
+            elif self.llm_judge.model.space_id:
+                wml_credentials["space_id"] = self.llm_judge.model.space_id
+            else:
+                raise Exception("Either project or space id is required")
+            params.update({
+                "credentials": wml_credentials,
+                "provider": "watsonx",
+                "model": self.llm_judge.model.model_id,
+                "provider_specific_args": {
+                    "watsonx": {
+                        "max_requests_per_second": 1
+                    }
+                }
+            })
+        elif isinstance(self.llm_judge.model, OpenAIFoundationModel):
+            params.update({
+                "credentials": {
+                    "api_key": self.llm_judge.model.provider.credentials.api_key
+                },
+                "provider": "open-ai",
+                "model": self.llm_judge.model.model_id,
+                "provider_specific_args": {"temperature": 0}
+            })
+        elif isinstance(self.llm_judge.model, PortKeyGateway):
+            params.update({
+                "credentials": self.llm_judge.model.provider.credentials.model_dump(),
+                "model": self.llm_judge.model.model_id
+            })
+        elif isinstance(self.llm_judge.model, AzureOpenAIFoundationModel):
+            raise Exception("Azure OpenAI Model provider is not supported.")
+        else:
+            raise Exception("LLM Model provider is not supported.")
+        return params
+    def __get_criteria(self, prediction_field, context_fields):
+        options = []
+        for op in self.options:
+            op_desc = op.description.replace(
+                "{"+prediction_field+"}", prediction_field)
+            op_desc = re.sub(VARIABLES_PATTERN, r"\1", op_desc)
+            options.append(EACriteriaOption(
+                name=op.name,
+                description=op_desc,
+                score=op.value
+            ))
+        desc = self.criteria_description.replace(
+            "{"+prediction_field+"}", prediction_field)
+        desc = re.sub(VARIABLES_PATTERN, r"\1", desc)
+        criteria_with_options = EACriteria(name=self.metric_name,
+                                           description=desc,
+                                           to_evaluate_field=prediction_field,
+                                           context_fields=context_fields,
+                                           options=options)
+        return criteria_with_options
+    def __post_process(self, results, data: pd.DataFrame) -> AggregateMetricResult:
+        record_level_metrics: list[RecordMetricResult] = []
+        score_map = {o.name: o.value for o in self.options}
+        for record_id, result in zip(data[self.record_id_field].tolist(), results):
+            record_level_metrics.append(
+                RecordMetricResult(
+                    name=self.metric_name,
+                    display_name=self.display_name,
+                    method=self.metric_method,
+                    group=self.metric_group,
+                    provider=EvaluationProvider.UNITXT.value,
+                    value=score_map.get(result.selected_option),
+                    label=result.selected_option,
+                    record_id=record_id,
+                    thresholds=self.thresholds,
+                    explanation=result.explanation,
+                    additional_info={
+                        "feedback": result.feedback} if result.feedback else {}
+                )
+            )
+        aggregated_result = AggregateMetricResult.create(
+            record_level_metrics)
+        # return the aggregated result
+        return aggregated_result

ibm_watsonx_gov/providers/inference_engines/__init__.py ADDED Viewed

File without changes

ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py ADDED Viewed

@@ -0,0 +1,165 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025  All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+from multiprocessing.pool import ThreadPool
+from typing import Annotated, Any, Callable, Dict, List, Optional, Union
+from datasets import Dataset
+from lazy_imports import LazyModule, load
+from pydantic import Field
+from tqdm import tqdm
+# Declare the unitxt imports through lazy_imports' LazyModule and call load() on it straight away
+# (presumably this resolves the declared imports - confirm against lazy_imports docs).
+unitxt_imports = LazyModule(
+    "from unitxt.artifact import Artifact",
+    "from unitxt.inference import InferenceEngine, TextGenerationInferenceOutput, get_model_and_label_id",
+    name="lazy_unitxt",
+)
+load(unitxt_imports)
+# Rebind the names as plain module-level aliases; the classes below use them as base classes,
+# in isinstance checks and as helpers.
+Artifact = unitxt_imports.Artifact
+InferenceEngine = unitxt_imports.InferenceEngine
+TextGenerationInferenceOutput = unitxt_imports.TextGenerationInferenceOutput
+get_model_and_label_id = unitxt_imports.get_model_and_label_id
+def run_with_imap(func):
+    """
+    Decorator to adapt a function for use with multiprocessing's imap.
+    Ensures arguments are unpacked properly when parallelizing inference.
+    """
+    def inner(self, args):
+        return func(self, *args)
+    return inner
+class CustomFnEngineParamsMixin(Artifact):
+    """
+    Mixin class that provides configurable parameters for the custom engine.
+    - batch_size: number of instances per batch (unused, but reserved for extension).
+    - timeout: optional timeout in seconds for inference requests.
+    - num_parallel_requests: max number of threads used for parallel inference.
+    """
+    # Not read anywhere in this file.
+    batch_size: Optional[int] = None
+    # Not read anywhere in this file.
+    # NOTE(review): confirm whether scoring_fn is expected to honour it.
+    timeout: Optional[float] = None
+    # Used as the ThreadPool size in CustomFunctionInferenceEngine._parallel_infer, which
+    # redeclares this field as a plain int with the same default.
+    num_parallel_requests: Optional[int] = 20
+class CustomFunctionInferenceEngine(InferenceEngine, CustomFnEngineParamsMixin):
+    """
+    A custom inference engine that delegates prediction to a user-provided function (`scoring_fn`).
+    Supports parallel execution across multiple threads and integrates seamlessly with Unitxt.
+    """
+    label: str = "custom_fn"
+    model_name: str = "custom_fn"
+    num_parallel_requests: int = 20
+    scoring_fn: Callable
+    context: Optional[Dict[str, Any]] = None
+    def get_engine_id(self) -> str:
+        """
+        Return a unique engine identifier based on model_name and label.
+        Used internally by Unitxt to differentiate inference engines.
+        """
+        return get_model_and_label_id(self.model_name, self.label)
+    def prepare_engine(self):
+        """
+        Hook for initializing resources before inference.
+        No-op here since the custom engine delegates everything to scoring_fn.
+        """
+        pass
+    def get_return_object(self, predict_result, response, return_meta_data):
+        """
+        Return the prediction object in the format expected by Unitxt.
+        In this implementation, the prediction is returned as-is.
+        """
+        return predict_result
+    def _parallel_infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], Dataset],
+        infer_func,
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List["TextGenerationInferenceOutput"]]:
+        """
+        Run inference on a dataset in parallel using a thread pool.
+        Args:
+            dataset: list of instances or HuggingFace Dataset.
+            infer_func: function applied to each instance.
+            return_meta_data: if True, expects TextGenerationInferenceOutput.
+        Returns:
+            A list of predictions or metadata objects.
+        """
+        inputs = [(instance, return_meta_data) for instance in dataset]
+        outputs: List[Union[str, "TextGenerationInferenceOutput"]] = []
+        with ThreadPool(processes=self.num_parallel_requests) as pool:
+            for output in tqdm(
+                pool.imap(infer_func, inputs),
+                total=len(inputs),
+                desc=f"Inferring with {self.__class__.__name__}",
+            ):
+                outputs.append(output)
+        return outputs
+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], Dataset],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List["TextGenerationInferenceOutput"]]:
+        """
+        Core inference method called by Unitxt.
+        Delegates to `_parallel_infer` for concurrent execution.
+        """
+        return self._parallel_infer(
+            dataset=dataset,
+            return_meta_data=return_meta_data,
+            infer_func=self._score_instance,
+        )
+    @run_with_imap
+    def _score_instance(self, instance, return_meta_data):
+        """
+        Run inference on a single instance using the user-provided scoring_fn.
+        Handles type validation and returns a fallback object if scoring fails.
+        """
+        try:
+            pred = self.scoring_fn(
+                instance, return_meta_data, context=self.context)
+            if return_meta_data and not isinstance(pred, TextGenerationInferenceOutput):
+                raise TypeError(
+                    "With return_meta_data=True, scoring_fn must return TextGenerationInferenceOutput."
+                )
+            if not return_meta_data and not isinstance(pred, str):
+                raise TypeError(
+                    "With return_meta_data=False, scoring_fn must return str."
+                )
+            return self.get_return_object(pred, response=None, return_meta_data=return_meta_data)
+        except Exception:
+            if return_meta_data:
+                return TextGenerationInferenceOutput(
+                    prediction="-", generated_text="-", input_tokens=0, output_tokens=0,
+                    model_name=self.model_name, inference_type=self.label,
+                )
+            return "-"
+    def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
+        """
+        Convert the engine configuration to a dictionary.
+        Excludes unserializable fields like `scoring_fn` and `context` to ensure cache safety.
+        """
+        d = super().to_dict(*args, **kwargs)
+        d.pop("scoring_fn", None)
+        d.pop("context", None)
+        return d

ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py ADDED Viewed

@@ -0,0 +1,57 @@
+# ----------------------------------------------------------------------------------------------------
+# IBM Confidential
+# Licensed Materials - Property of IBM
+# 5737-H76, 5900-A3Q
+# © Copyright IBM Corp. 2025  All Rights Reserved.
+# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
+# GSA ADPSchedule Contract with IBM Corp.
+# ----------------------------------------------------------------------------------------------------
+from typing import Any, Dict, List, Optional, Union
+from datasets import Dataset
+from ibm_watsonx_gov.entities.llm_judge import LLMJudge
+from unitxt.inference import (InferenceEngine, PackageRequirementsMixin,
+                              StandardAPIParamsMixin,
+                              TextGenerationInferenceOutput,
+                              get_model_and_label_id)
+class PortKeyInferenceEngine(
+    InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin
+):
+    label: str = "portkey"
+    _requirements_list = {
+        "portkey-ai": "Install portkey-ai package using 'pip install --upgrade portkey-ai"
+    }
+    model: str = None
+    credentials: Dict[str, str] = {}
+    def get_engine_id(self):
+        return get_model_and_label_id(self.model, self.label)
+    def prepare_engine(self):
+        from portkey_ai import Portkey
+        self.client = Portkey(
+            api_key=self.credentials["api_key"],
+            base_url=self.credentials.get("base_url", None),
+            provider=self.credentials.get("provider"),
+            Authorization="Bearer " + self.credentials["provider_api_key"],
+        )
+    def _infer(
+        self,
+        dataset: Union[List[Dict[str, Any]], Dataset],
+        return_meta_data: bool = False,
+    ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
+        args = self.to_dict([StandardAPIParamsMixin])
+        results = []
+        for instance in dataset:
+            messages = self.to_messages(instance)
+            response = self.client.chat.completions.create(
+                messages=messages,
+                model=self.model
+            )
+            results.append(response.choices[0].message.content)
+        return results

ibm_watsonx_gov/providers/llmevalkit/__init__.py ADDED Viewed

File without changes