ibm-watsonx-gov 1.3.3__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_gov/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/clients/__init__.py +14 -0
- ibm_watsonx_gov/agent_catalog/clients/ai_agent_client.py +333 -0
- ibm_watsonx_gov/agent_catalog/core/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/core/agent_loader.py +202 -0
- ibm_watsonx_gov/agent_catalog/core/agents.py +134 -0
- ibm_watsonx_gov/agent_catalog/entities/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/entities/ai_agent.py +599 -0
- ibm_watsonx_gov/agent_catalog/utils/__init__.py +8 -0
- ibm_watsonx_gov/agent_catalog/utils/constants.py +36 -0
- ibm_watsonx_gov/agent_catalog/utils/notebook_utils.py +70 -0
- ibm_watsonx_gov/ai_experiments/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/ai_experiments_client.py +980 -0
- ibm_watsonx_gov/ai_experiments/utils/__init__.py +8 -0
- ibm_watsonx_gov/ai_experiments/utils/ai_experiment_utils.py +139 -0
- ibm_watsonx_gov/clients/__init__.py +0 -0
- ibm_watsonx_gov/clients/api_client.py +99 -0
- ibm_watsonx_gov/clients/segment_client.py +46 -0
- ibm_watsonx_gov/clients/usage_client.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/clients/wx_ai_client.py +87 -0
- ibm_watsonx_gov/config/__init__.py +14 -0
- ibm_watsonx_gov/config/agentic_ai_configuration.py +225 -0
- ibm_watsonx_gov/config/gen_ai_configuration.py +129 -0
- ibm_watsonx_gov/config/model_risk_configuration.py +173 -0
- ibm_watsonx_gov/config/predictive_ai_configuration.py +20 -0
- ibm_watsonx_gov/entities/__init__.py +8 -0
- ibm_watsonx_gov/entities/agentic_app.py +209 -0
- ibm_watsonx_gov/entities/agentic_evaluation_result.py +185 -0
- ibm_watsonx_gov/entities/ai_evaluation.py +290 -0
- ibm_watsonx_gov/entities/ai_experiment.py +419 -0
- ibm_watsonx_gov/entities/base_classes.py +134 -0
- ibm_watsonx_gov/entities/container.py +54 -0
- ibm_watsonx_gov/entities/credentials.py +633 -0
- ibm_watsonx_gov/entities/criteria.py +508 -0
- ibm_watsonx_gov/entities/enums.py +274 -0
- ibm_watsonx_gov/entities/evaluation_result.py +444 -0
- ibm_watsonx_gov/entities/foundation_model.py +490 -0
- ibm_watsonx_gov/entities/llm_judge.py +44 -0
- ibm_watsonx_gov/entities/locale.py +17 -0
- ibm_watsonx_gov/entities/mapping.py +49 -0
- ibm_watsonx_gov/entities/metric.py +211 -0
- ibm_watsonx_gov/entities/metric_threshold.py +36 -0
- ibm_watsonx_gov/entities/model_provider.py +329 -0
- ibm_watsonx_gov/entities/model_risk_result.py +43 -0
- ibm_watsonx_gov/entities/monitor.py +71 -0
- ibm_watsonx_gov/entities/prompt_setup.py +40 -0
- ibm_watsonx_gov/entities/state.py +22 -0
- ibm_watsonx_gov/entities/utils.py +99 -0
- ibm_watsonx_gov/evaluators/__init__.py +26 -0
- ibm_watsonx_gov/evaluators/agentic_evaluator.py +2725 -0
- ibm_watsonx_gov/evaluators/agentic_traces_evaluator.py +115 -0
- ibm_watsonx_gov/evaluators/base_evaluator.py +22 -0
- ibm_watsonx_gov/evaluators/impl/__init__.py +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_metrics_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/impl/evaluate_model_risk_impl.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/evaluators/metrics_evaluator.py +187 -0
- ibm_watsonx_gov/evaluators/model_risk_evaluator.py +89 -0
- ibm_watsonx_gov/evaluators/traces_evaluator.py +93 -0
- ibm_watsonx_gov/metric_groups/answer_quality/answer_quality_decorator.py +66 -0
- ibm_watsonx_gov/metric_groups/content_safety/content_safety_decorator.py +76 -0
- ibm_watsonx_gov/metric_groups/readability/readability_decorator.py +59 -0
- ibm_watsonx_gov/metric_groups/retrieval_quality/retrieval_quality_decorator.py +63 -0
- ibm_watsonx_gov/metric_groups/usage/usage_decorator.py +58 -0
- ibm_watsonx_gov/metrics/__init__.py +74 -0
- ibm_watsonx_gov/metrics/answer_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_decorator.py +63 -0
- ibm_watsonx_gov/metrics/answer_relevance/answer_relevance_metric.py +260 -0
- ibm_watsonx_gov/metrics/answer_similarity/__init__.py +0 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_decorator.py +66 -0
- ibm_watsonx_gov/metrics/answer_similarity/answer_similarity_metric.py +219 -0
- ibm_watsonx_gov/metrics/average_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/average_precision/average_precision_metric.py +174 -0
- ibm_watsonx_gov/metrics/base_metric_decorator.py +193 -0
- ibm_watsonx_gov/metrics/context_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_decorator.py +60 -0
- ibm_watsonx_gov/metrics/context_relevance/context_relevance_metric.py +414 -0
- ibm_watsonx_gov/metrics/cost/__init__.py +8 -0
- ibm_watsonx_gov/metrics/cost/cost_decorator.py +58 -0
- ibm_watsonx_gov/metrics/cost/cost_metric.py +155 -0
- ibm_watsonx_gov/metrics/duration/__init__.py +8 -0
- ibm_watsonx_gov/metrics/duration/duration_decorator.py +59 -0
- ibm_watsonx_gov/metrics/duration/duration_metric.py +111 -0
- ibm_watsonx_gov/metrics/evasiveness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_decorator.py +61 -0
- ibm_watsonx_gov/metrics/evasiveness/evasiveness_metric.py +103 -0
- ibm_watsonx_gov/metrics/faithfulness/__init__.py +8 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_decorator.py +65 -0
- ibm_watsonx_gov/metrics/faithfulness/faithfulness_metric.py +254 -0
- ibm_watsonx_gov/metrics/hap/__init__.py +16 -0
- ibm_watsonx_gov/metrics/hap/hap_decorator.py +58 -0
- ibm_watsonx_gov/metrics/hap/hap_metric.py +98 -0
- ibm_watsonx_gov/metrics/hap/input_hap_metric.py +104 -0
- ibm_watsonx_gov/metrics/hap/output_hap_metric.py +110 -0
- ibm_watsonx_gov/metrics/harm/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm/harm_decorator.py +60 -0
- ibm_watsonx_gov/metrics/harm/harm_metric.py +103 -0
- ibm_watsonx_gov/metrics/harm_engagement/__init__.py +8 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_decorator.py +61 -0
- ibm_watsonx_gov/metrics/harm_engagement/harm_engagement_metric.py +103 -0
- ibm_watsonx_gov/metrics/hit_rate/__init__.py +0 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_decorator.py +59 -0
- ibm_watsonx_gov/metrics/hit_rate/hit_rate_metric.py +167 -0
- ibm_watsonx_gov/metrics/input_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/input_token_count/input_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/jailbreak/__init__.py +8 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_decorator.py +60 -0
- ibm_watsonx_gov/metrics/jailbreak/jailbreak_metric.py +103 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/keyword_detection/keyword_detection_metric.py +111 -0
- ibm_watsonx_gov/metrics/llm_validation/__init__.py +8 -0
- ibm_watsonx_gov/metrics/llm_validation/evaluation_criteria.py +84 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_constants.py +24 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_decorator.py +54 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_impl.py +525 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_metric.py +258 -0
- ibm_watsonx_gov/metrics/llm_validation/llm_validation_prompts.py +106 -0
- ibm_watsonx_gov/metrics/llmaj/__init__.py +0 -0
- ibm_watsonx_gov/metrics/llmaj/llmaj_metric.py +298 -0
- ibm_watsonx_gov/metrics/ndcg/__init__.py +0 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_decorator.py +61 -0
- ibm_watsonx_gov/metrics/ndcg/ndcg_metric.py +166 -0
- ibm_watsonx_gov/metrics/output_token_count/__init__.py +8 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_decorator.py +58 -0
- ibm_watsonx_gov/metrics/output_token_count/output_token_count_metric.py +112 -0
- ibm_watsonx_gov/metrics/pii/__init__.py +16 -0
- ibm_watsonx_gov/metrics/pii/input_pii_metric.py +102 -0
- ibm_watsonx_gov/metrics/pii/output_pii_metric.py +107 -0
- ibm_watsonx_gov/metrics/pii/pii_decorator.py +59 -0
- ibm_watsonx_gov/metrics/pii/pii_metric.py +96 -0
- ibm_watsonx_gov/metrics/profanity/__init__.py +8 -0
- ibm_watsonx_gov/metrics/profanity/profanity_decorator.py +60 -0
- ibm_watsonx_gov/metrics/profanity/profanity_metric.py +103 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/__init__.py +8 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_decorator.py +57 -0
- ibm_watsonx_gov/metrics/prompt_safety_risk/prompt_safety_risk_metric.py +128 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/__init__.py +0 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_decorator.py +62 -0
- ibm_watsonx_gov/metrics/reciprocal_rank/reciprocal_rank_metric.py +162 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_decorator.py +58 -0
- ibm_watsonx_gov/metrics/regex_detection/regex_detection_metric.py +106 -0
- ibm_watsonx_gov/metrics/retrieval_precision/__init__.py +0 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_decorator.py +62 -0
- ibm_watsonx_gov/metrics/retrieval_precision/retrieval_precision_metric.py +170 -0
- ibm_watsonx_gov/metrics/sexual_content/__init__.py +8 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_decorator.py +61 -0
- ibm_watsonx_gov/metrics/sexual_content/sexual_content_metric.py +103 -0
- ibm_watsonx_gov/metrics/social_bias/__init__.py +8 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_decorator.py +62 -0
- ibm_watsonx_gov/metrics/social_bias/social_bias_metric.py +103 -0
- ibm_watsonx_gov/metrics/status/__init__.py +0 -0
- ibm_watsonx_gov/metrics/status/status_metric.py +113 -0
- ibm_watsonx_gov/metrics/text_grade_level/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_grade_level/text_grade_level_metric.py +127 -0
- ibm_watsonx_gov/metrics/text_reading_ease/__init__.py +8 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_decorator.py +59 -0
- ibm_watsonx_gov/metrics/text_reading_ease/text_reading_ease_metric.py +123 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_decorator.py +67 -0
- ibm_watsonx_gov/metrics/tool_call_accuracy/tool_call_accuracy_metric.py +162 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_decorator.py +68 -0
- ibm_watsonx_gov/metrics/tool_call_parameter_accuracy/tool_call_parameter_accuracy_metric.py +151 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_decorator.py +71 -0
- ibm_watsonx_gov/metrics/tool_call_relevance/tool_call_relevance_metric.py +166 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/__init__.py +0 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_decorator.py +66 -0
- ibm_watsonx_gov/metrics/tool_call_syntactic_accuracy/tool_call_syntactic_accuracy_metric.py +121 -0
- ibm_watsonx_gov/metrics/topic_relevance/__init__.py +8 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_decorator.py +57 -0
- ibm_watsonx_gov/metrics/topic_relevance/topic_relevance_metric.py +106 -0
- ibm_watsonx_gov/metrics/unethical_behavior/__init__.py +8 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_decorator.py +61 -0
- ibm_watsonx_gov/metrics/unethical_behavior/unethical_behavior_metric.py +103 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/__init__.py +0 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_decorator.py +66 -0
- ibm_watsonx_gov/metrics/unsuccessful_requests/unsuccessful_requests_metric.py +128 -0
- ibm_watsonx_gov/metrics/user_id/__init__.py +0 -0
- ibm_watsonx_gov/metrics/user_id/user_id_metric.py +111 -0
- ibm_watsonx_gov/metrics/utils.py +440 -0
- ibm_watsonx_gov/metrics/violence/__init__.py +8 -0
- ibm_watsonx_gov/metrics/violence/violence_decorator.py +60 -0
- ibm_watsonx_gov/metrics/violence/violence_metric.py +103 -0
- ibm_watsonx_gov/prompt_evaluator/__init__.py +9 -0
- ibm_watsonx_gov/prompt_evaluator/impl/__init__.py +8 -0
- ibm_watsonx_gov/prompt_evaluator/impl/prompt_evaluator_impl.py +554 -0
- ibm_watsonx_gov/prompt_evaluator/impl/pta_lifecycle_evaluator.py +2332 -0
- ibm_watsonx_gov/prompt_evaluator/prompt_evaluator.py +262 -0
- ibm_watsonx_gov/providers/__init__.py +8 -0
- ibm_watsonx_gov/providers/detectors_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/detectors_provider.py +415 -0
- ibm_watsonx_gov/providers/eval_assist_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/providers/eval_assist_provider.py +266 -0
- ibm_watsonx_gov/providers/inference_engines/__init__.py +0 -0
- ibm_watsonx_gov/providers/inference_engines/custom_inference_engine.py +165 -0
- ibm_watsonx_gov/providers/inference_engines/portkey_inference_engine.py +57 -0
- ibm_watsonx_gov/providers/llmevalkit/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/main.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/preprocess_log.py +111 -0
- ibm_watsonx_gov/providers/llmevalkit/ciso_agent/utils.py +186 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/README.md +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/__init__.py +27 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/README.md +306 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/__init__.py +89 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/__init__.py +30 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/base.py +411 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/code_agent.py +1254 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/exact_match.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/fuzzy_string.py +104 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/hybrid.py +516 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/comparators/llm_judge.py +1882 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/pipeline.py +387 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/types.py +178 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/comparison/utils.py +298 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/consts.py +33 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/__init__.py +31 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/base.py +26 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics.json +783 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/__init__.py +6 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection.py +28 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics.json +599 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/loader.py +259 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter.py +52 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics.json +613 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/parameter/parameter_metrics_runtime.json +489 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/__init__.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory.py +43 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/metrics/trajectory/trajectory_metrics.json +161 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/adapters.py +102 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/pipeline.py +355 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/semantic_checker.py +816 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/static_checker.py +297 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/transformation_prompts.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/function_calling/pipeline/types.py +596 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/README.md +375 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/__init__.py +137 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/base.py +426 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/output_parser.py +364 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/consts.py +7 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/ibm_watsonx_ai/ibm_watsonx_ai.py +656 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/litellm.py +509 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/rits.py +224 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/litellm/watsonx.py +60 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/mock_llm_client.py +75 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/openai/openai.py +639 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway.py +134 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/providers/wxo_ai_gateway/wxo_ai_gateway_inference.py +214 -0
- ibm_watsonx_gov/providers/llmevalkit/llm/types.py +136 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/__init__.py +4 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/field.py +255 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metric.py +332 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/metrics_runner.py +188 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/prompt.py +403 -0
- ibm_watsonx_gov/providers/llmevalkit/metrics/utils.py +46 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/__init__.py +0 -0
- ibm_watsonx_gov/providers/llmevalkit/prompt/runner.py +144 -0
- ibm_watsonx_gov/providers/tool_call_metric_provider.py +455 -0
- ibm_watsonx_gov/providers/unitxt_provider.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/__init__.py +10 -0
- ibm_watsonx_gov/tools/clients/__init__.py +11 -0
- ibm_watsonx_gov/tools/clients/ai_tool_client.py +405 -0
- ibm_watsonx_gov/tools/clients/detector_client.py +82 -0
- ibm_watsonx_gov/tools/core/__init__.py +8 -0
- ibm_watsonx_gov/tools/core/tool_loader.py +237 -0
- ibm_watsonx_gov/tools/entities/__init__.py +8 -0
- ibm_watsonx_gov/tools/entities/ai_tools.py +435 -0
- ibm_watsonx_gov/tools/onboarding/create/answer_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/chromadb_retrieval_tool.json +63 -0
- ibm_watsonx_gov/tools/onboarding/create/context_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/duduckgo_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/create/google_search_tool.json +62 -0
- ibm_watsonx_gov/tools/onboarding/create/hap_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/jailbreak_detector.json +70 -0
- ibm_watsonx_gov/tools/onboarding/create/pii_detector.json +36 -0
- ibm_watsonx_gov/tools/onboarding/create/prompt_safety_risk_detector.json +69 -0
- ibm_watsonx_gov/tools/onboarding/create/topic_relevance_detector.json +57 -0
- ibm_watsonx_gov/tools/onboarding/create/weather_tool.json +39 -0
- ibm_watsonx_gov/tools/onboarding/create/webcrawler_tool.json +34 -0
- ibm_watsonx_gov/tools/onboarding/create/wikipedia_search_tool.json +53 -0
- ibm_watsonx_gov/tools/onboarding/delete/delete_tools.json +4 -0
- ibm_watsonx_gov/tools/onboarding/update/google_search_tool.json +38 -0
- ibm_watsonx_gov/tools/ootb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/detectors/hap_detector_tool.py +109 -0
- ibm_watsonx_gov/tools/ootb/detectors/jailbreak_detector_tool.py +104 -0
- ibm_watsonx_gov/tools/ootb/detectors/pii_detector_tool.py +83 -0
- ibm_watsonx_gov/tools/ootb/detectors/prompt_safety_risk_detector_tool.py +111 -0
- ibm_watsonx_gov/tools/ootb/detectors/topic_relevance_detector_tool.py +101 -0
- ibm_watsonx_gov/tools/ootb/rag/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/rag/answer_relevance_detector_tool.py +119 -0
- ibm_watsonx_gov/tools/ootb/rag/context_relevance_detector_tool.py +118 -0
- ibm_watsonx_gov/tools/ootb/search/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/search/duckduckgo_search_tool.py +62 -0
- ibm_watsonx_gov/tools/ootb/search/google_search_tool.py +105 -0
- ibm_watsonx_gov/tools/ootb/search/weather_tool.py +95 -0
- ibm_watsonx_gov/tools/ootb/search/web_crawler_tool.py +69 -0
- ibm_watsonx_gov/tools/ootb/search/wikipedia_search_tool.py +63 -0
- ibm_watsonx_gov/tools/ootb/vectordb/__init__.py +8 -0
- ibm_watsonx_gov/tools/ootb/vectordb/chromadb_retriever_tool.py +111 -0
- ibm_watsonx_gov/tools/rest_api/__init__.py +10 -0
- ibm_watsonx_gov/tools/rest_api/restapi_tool.py +72 -0
- ibm_watsonx_gov/tools/schemas/__init__.py +10 -0
- ibm_watsonx_gov/tools/schemas/search_tool_schema.py +46 -0
- ibm_watsonx_gov/tools/schemas/vectordb_retrieval_schema.py +55 -0
- ibm_watsonx_gov/tools/utils/__init__.py +14 -0
- ibm_watsonx_gov/tools/utils/constants.py +69 -0
- ibm_watsonx_gov/tools/utils/display_utils.py +38 -0
- ibm_watsonx_gov/tools/utils/environment.py +108 -0
- ibm_watsonx_gov/tools/utils/package_utils.py +40 -0
- ibm_watsonx_gov/tools/utils/platform_url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/tools/utils/python_utils.py +68 -0
- ibm_watsonx_gov/tools/utils/tool_utils.py +206 -0
- ibm_watsonx_gov/traces/__init__.py +8 -0
- ibm_watsonx_gov/traces/span_exporter.py +195 -0
- ibm_watsonx_gov/traces/span_node.py +251 -0
- ibm_watsonx_gov/traces/span_util.py +153 -0
- ibm_watsonx_gov/traces/trace_utils.py +1074 -0
- ibm_watsonx_gov/utils/__init__.py +8 -0
- ibm_watsonx_gov/utils/aggregation_util.py +346 -0
- ibm_watsonx_gov/utils/async_util.py +62 -0
- ibm_watsonx_gov/utils/authenticator.py +144 -0
- ibm_watsonx_gov/utils/constants.py +15 -0
- ibm_watsonx_gov/utils/errors.py +40 -0
- ibm_watsonx_gov/utils/gov_sdk_logger.py +39 -0
- ibm_watsonx_gov/utils/insights_generator.py +1285 -0
- ibm_watsonx_gov/utils/python_utils.py +425 -0
- ibm_watsonx_gov/utils/rest_util.py +73 -0
- ibm_watsonx_gov/utils/segment_batch_manager.py +162 -0
- ibm_watsonx_gov/utils/singleton_meta.py +25 -0
- ibm_watsonx_gov/utils/url_mapping.cpython-313-darwin.so +0 -0
- ibm_watsonx_gov/utils/validation_util.py +126 -0
- ibm_watsonx_gov/visualizations/__init__.py +13 -0
- ibm_watsonx_gov/visualizations/metric_descriptions.py +57 -0
- ibm_watsonx_gov/visualizations/model_insights.py +1304 -0
- ibm_watsonx_gov/visualizations/visualization_utils.py +75 -0
- ibm_watsonx_gov-1.3.3.dist-info/METADATA +93 -0
- ibm_watsonx_gov-1.3.3.dist-info/RECORD +353 -0
- ibm_watsonx_gov-1.3.3.dist-info/WHEEL +6 -0
@@ -0,0 +1,258 @@
# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

import copy
from typing import Annotated, List, Literal

import pandas as pd

from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold

from lazy_imports import LazyModule, load
try:
    # Create lazy module for LangChain imports
    langchain_imports = LazyModule(
        "from langchain_ibm import ChatWatsonx",
        "from langchain_openai import AzureChatOpenAI",
        "from langchain_openai import ChatOpenAI",
        name="lazy_langchain_imports"
    )
    load(langchain_imports)

    # Create aliases
    ChatWatsonx = langchain_imports.ChatWatsonx
    AzureChatOpenAI = langchain_imports.AzureChatOpenAI
    ChatOpenAI = langchain_imports.ChatOpenAI
except ImportError:
    ChatWatsonx = None
    AzureChatOpenAI = None
    ChatOpenAI = None
    import warnings
    warnings.warn("LangChain dependencies not available")

from pydantic import Field

from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
from ibm_watsonx_gov.entities.enums import ModelProviderType, TaskType
from ibm_watsonx_gov.entities.evaluation_result import (AggregateMetricResult,
                                                        RecordMetricResult)
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.metrics.llm_validation.evaluation_criteria import (
    EvaluationCriteria, get_default_evaluation_criteria)
from ibm_watsonx_gov.metrics.llm_validation.llm_validation_constants import (
    LLMValidation, LLMValidationFields)
from ibm_watsonx_gov.metrics.llm_validation.llm_validation_impl import (
    generate_issues_and_map_to_records, llm_validation_per_record,
    reverse_mapping)


def get_prompt_field(configuration: GenAIConfiguration, available_fields=None):
    if available_fields is None:
        available_fields = []
    prompt_field = configuration.prompt_field
    if not prompt_field:
        prompt_field = LLMValidationFields.INPUT_FIELD.value
    if not prompt_field:
        raise ValueError("Model input not found in data")
    if available_fields and prompt_field not in available_fields:
        raise ValueError(
            f"prompt_field {prompt_field} not found in data. available fields: {available_fields}")
    return prompt_field


class LLMValidationMetric(GenAIMetric):
    """Defines the implementation for computing the LLMValidation metric.

    .. code-block:: python

        from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
        llm_judge = LLMJudge(model=WxAIFoundationModel(model_id="model_id"))

    .. code-block:: python

        metric = LLMValidationMetric(llm_judge=llm_judge)
    """
    name: Annotated[Literal["llm_validation"],
                    Field(default=LLMValidation)]
    tasks: Annotated[list[TaskType], Field(
        default=[TaskType.RAG, TaskType.SUMMARIZATION])]
    thresholds: Annotated[list[MetricThreshold], Field(default=[MetricThreshold(
        type="lower_limit", value=0.7)])]
    method: Annotated[Literal["llm_as_judge"],
                      Field(description="The method used to compute the metric.",
                            default="llm_as_judge")]
    llm_judge: Annotated[LLMJudge | None, Field(
        description="The LLM judge used to compute the metric.")]
    evaluation_criteria: Annotated[EvaluationCriteria | None, Field(
        description="The evaluation criteria for the metric computation.",
        default_factory=get_default_evaluation_criteria)]

    def evaluate(self, data: pd.DataFrame,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs) -> AggregateMetricResult:
        record_level_metrics = self.get_record_level_metrics(
            data, configuration)
        aggregated_results = self.get_aggregated_results_from_individual_results(
            record_level_metrics)
        return aggregated_results

    def get_record_level_metrics(self, data: pd.DataFrame | dict,
                                 configuration: GenAIConfiguration | AgenticAIConfiguration) \
            -> List[RecordMetricResult]:
        # Generate the evaluator (judge) LLM
        llm = self.generate_evaluating_model()

        # Prepare the data
        eval_df = copy.deepcopy(data)
        prompt_field = get_prompt_field(
            configuration, available_fields=list(eval_df.columns))
        eval_df[LLMValidationFields.INPUT_FIELD.value] = eval_df.apply(
            lambda r: r[prompt_field], axis=1)
        eval_df = eval_df.fillna("")
        eval_df[LLMValidationFields.OUTPUT_FIELD.value] = eval_df.apply(
            lambda r: "\n".join([r[output_field]
                                 for output_field in configuration.output_fields]),
            axis=1)

        # Call the per-record evaluating function
        eval_df = llm_validation_per_record(
            df=eval_df,
            llm=llm,
            input_col=LLMValidationFields.INPUT_FIELD.value,
            output_col=LLMValidationFields.OUTPUT_FIELD.value,
            text_col=LLMValidationFields.TEXT_FIELD.value,
            score_col=LLMValidationFields.SCORE_FIELD.value,
            summary_col=LLMValidationFields.SUMMARY_FIELD.value,
            evaluation_criteria=self.evaluation_criteria
        )

        record_level_metrics = []
        for _, row in eval_df.iterrows():
            record_level_metrics.append(
                RecordMetricResult(
                    name=self.name,
                    method=self.method,
                    provider="",
                    value=row[LLMValidationFields.SCORE_FIELD.value],
                    record_id=row[configuration.record_id_field],
                    additional_info={
                        LLMValidationFields.TEXT_FIELD.value: row[LLMValidationFields.TEXT_FIELD.value],
                        LLMValidationFields.SUMMARY_FIELD.value: row[LLMValidationFields.SUMMARY_FIELD.value],
                        LLMValidationFields.RECURRING_ISSUE_FIELD.value: "",
                        LLMValidationFields.RECURRING_ISSUE_IDS_FIELD.value: ""
                    },
                    thresholds=self.thresholds,
                )
            )

        return record_level_metrics

    def get_aggregated_results_from_individual_results(self, record_level_metrics: List[RecordMetricResult]) \
            -> AggregateMetricResult:
        # TODO: use and map only records with score < 1
        summaries_list = [r.additional_info[LLMValidationFields.SUMMARY_FIELD.value]
                          if r.value is not None and r.value < 1 else ""
                          for r in record_level_metrics]
        llm = self.generate_evaluating_model()
        recurring_issues_to_record_ids = generate_issues_and_map_to_records(
            summaries_list=summaries_list,
            llm=llm,
        )
        recurring_issues = list(recurring_issues_to_record_ids.keys())
        record_to_matching_issues_ids = reverse_mapping(
            recurring_issues_to_record_ids)

        for i, r in enumerate(record_level_metrics):
            matching_issues_ids = record_to_matching_issues_ids.get(i, [])
            matching_issues = [recurring_issues[i]
                               for i in matching_issues_ids]
            r.additional_info[LLMValidationFields.RECURRING_ISSUE_IDS_FIELD.value] = matching_issues_ids
            r.additional_info[LLMValidationFields.RECURRING_ISSUE_FIELD.value] = matching_issues

        values = [
            record.value for record in record_level_metrics if record.value is not None]
        mean = sum(values) / len(values)
        evaluation_criteria = self.evaluation_criteria.to_dict(
        ) if self.evaluation_criteria else {}
        recurring_issues_count = {
            k: len(v) for k, v in recurring_issues_to_record_ids.items()}

        aggregate_result = AggregateMetricResult(
            name=self.name,
            method=self.method,
            provider="",
            value=mean,
            total_records=len(record_level_metrics),
            record_level_metrics=record_level_metrics,
            min=min(values),
            max=max(values),
            mean=mean,
            thresholds=self.thresholds,
            additional_info={"recurring_issues": recurring_issues_to_record_ids,
                             "evaluation_criteria": evaluation_criteria,
                             "recurring_issues_count": recurring_issues_count,
                             }
        )

        return aggregate_result

    def generate_evaluating_model(self):
        provider_type = self.llm_judge.model.provider.type
        if provider_type == ModelProviderType.IBM_WATSONX_AI:
            parameters = {
                "decoding_method": "greedy",
                "max_new_tokens": 512,
                "min_new_tokens": 1,
                "stop_sequences": [".", "<|eom_id|>"],
                "enable-auto-tool-choice": False,
                "tool-call-parser": False
            }
            return ChatWatsonx(
                model_id=self.llm_judge.model.model_id,
                url=self.llm_judge.model.provider.credentials.url,
                apikey=self.llm_judge.model.provider.credentials.api_key,
                project_id=self.llm_judge.model.project_id,
                params=parameters,
            )
        elif provider_type == ModelProviderType.AZURE_OPENAI:
            credentials = self.llm_judge.model.provider.credentials
            model_id = self.llm_judge.model.model_name
            azure_openapi_host = credentials.url
            api_version = credentials.api_version
            model_base = model_id.split("/")[-1].replace(".", "-")
            azure_endpoint = \
                f'{azure_openapi_host}/openai/deployments/{model_base}/chat/completions?api-version={api_version}'
            parameters = {"temperature": 0}
            return AzureChatOpenAI(api_key=credentials.api_key,
                                   azure_endpoint=azure_endpoint,
                                   api_version=api_version,
                                   max_retries=2,
                                   **parameters
                                   )
        elif provider_type == ModelProviderType.RITS:
            credentials = self.llm_judge.model.provider.credentials
            judge_model_id = self.llm_judge.model.model_name
            model_base = judge_model_id.split("/")[-1].replace(".", "-")
            rits_base_url = f'{credentials.hostname}/{model_base}/v1'
            return ChatOpenAI(
                model=judge_model_id,
                api_key='/',
                base_url=rits_base_url,
                default_headers={'RITS_API_KEY': credentials.api_key},
                max_retries=2,
                temperature=0.0
            )
        elif provider_type == ModelProviderType.OPENAI:
            model_name = self.llm_judge.model.model_name
            return ChatOpenAI(
                model=model_name,
                max_retries=2,
                temperature=0.0
            )
        raise Exception(f"Unknown provider type {provider_type}.")
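Taken together, generate_evaluating_model routes the configured LLMJudge to a LangChain chat model, and evaluate turns each row of the input frame into a RecordMetricResult before aggregating. The following is a minimal editor-added sketch, not part of the package: the model id, project id, column names, and the GenAIConfiguration keyword arguments are assumptions inferred from the attributes the metric reads above.

# Illustrative sketch (not from the package): wiring LLMValidationMetric to a judge model.
import pandas as pd

from ibm_watsonx_gov.config import GenAIConfiguration
from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.metrics.llm_validation.llm_validation_metric import LLMValidationMetric

# Assumed judge model and project id.
llm_judge = LLMJudge(model=WxAIFoundationModel(model_id="llama-3-3-70b-instruct",
                                               project_id="<PROJECT_ID>"))
metric = LLMValidationMetric(llm_judge=llm_judge)

data = pd.DataFrame({
    "question": ["What is the capital of France?"],
    "generated_text": ["Paris is the capital of France."],
    "record_id": ["r1"],
})

# The keyword names mirror the attributes read above (prompt_field, output_fields,
# record_id_field); the exact GenAIConfiguration constructor signature is an assumption.
config = GenAIConfiguration(prompt_field="question",
                            output_fields=["generated_text"],
                            record_id_field="record_id")

result = metric.evaluate(data=data, configuration=config)
print(result.mean, result.additional_info["recurring_issues_count"])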
@@ -0,0 +1,106 @@
# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

# --- Prompt Templates ---
full_response_eval_system_prompt = """You are an impartial judge evaluating the quality of an AI model's response. You will receive:

Input: The text the model was asked to process or respond to.
Output: The model's response text.
Your task is to score the model's response on a scale of 0 to 1, considering the following criteria.
You may also consider other relevant factors that contribute to the overall quality of the response.

Evaluation Criteria:
{evaluation_criteria}

Provide a score from 0 to 1 and explain your reasoning clearly and concisely. End the response with 'Final Score: <score>' (e.g., 'Final Score: 0.7').
"""

full_response_eval_human_prompt = """The response to analyze:
Input: '{model_input}'
Output: '{model_output}'

--- Begin Evaluation ---
Textual Evaluation: [Your textual evaluation here]
Evaluation score: [Your score here]
"""

summarization_system_prompt = """
You are given an evaluation text produced by a judge model. Summarize the text in a few sentences.
Focus on the core reasoning for the score and the final score itself.
Remove redundancies and make it concise while keeping the essential information.
Disregard the score given by the model and focus on the textual feedback.
"""

summarization_human_prompt = """
Evaluation Text to Summarize:
{evaluation_text}
"""

map_shortcomings_system_prompt = """You are an expert analyst reviewing evaluation feedback for AI model responses.
Your task is to determine which of the following common shortcomings are mentioned or implied in the evaluation text.
The shortcoming must be included with negative sentiment. For instance, if the shortcoming is
"The answer lacks factual accuracy" and the evaluation text mentions "The answer is factually accurate", then the shortcoming is NOT mentioned in the evaluation text.

Analyze the evaluation text and determine which shortcomings are present.
For each shortcoming, respond with a 1 if it is mentioned or implied, or 0 if it is not mentioned.
Your response should be a Python list of {num_shortcomings} binary values (0 or 1).
For example: [1,0,0,1,0,0,0] would mean shortcomings 1 and 4 are present, and the others are not.
Respond ONLY with the list in the format [0,1,0,...] with no additional text.
"""

map_shortcomings_human_prompt = """
The shortcomings list to analyze:
{shortcomings_list}

Evaluation text to analyze:
{eval_text}\n

Which shortcomings (1-{num_shortcomings}) are mentioned or implied in this evaluation? Respond with a Python list of {num_shortcomings} binary values (0 or 1) in the format [0,1,0,...].
"""

recurrent_issues_synthesis_system_prompt = """You are an expert analyst tasked with identifying common themes in evaluation feedback for an AI model's answers. Below is a collection of evaluation texts assessing the quality of different answers.
Your goal is to identify the most significant and frequent types of shortcomings or negative feedback mentioned in these evaluations. Please provide a list of concise phrases describing these common issues. Focus on actionable feedback points that could help improve the model's responses.

Guidelines for identifying shortcomings:
1. Look for patterns across multiple evaluations
2. Focus on specific, actionable issues rather than general complaints
3. Consider both content-related issues (accuracy, completeness) and presentation issues (clarity, structure)
4. Prioritize issues that appear frequently or have significant impact
5. Be specific but concise in your descriptions
6. Ensure the issues are distinct and not overlapping.

Do NOT list positive feedback. Focus only on areas for improvement or reasons for lower scores.
Present the output ONLY as a Python list of strings. Your response MUST start with '[' and end with ']'.
"""

recurrent_issues_synthesis_human_prompt = """--- Begin Evaluation Texts ---
{concatenated_evaluation_text}
--- End Evaluation Texts ---

Synthesized List of Common Shortcomings (Python List format ONLY):
"""

shortcomings_clustering_system_prompt = """You are given a list of short action items that describe recurring issues found in responses generated by a language model. These items may contain duplicates or very similar entries phrased differently.
Your task is to analyze the list, remove duplicates and consolidate redundant items into a smaller set of distinct, clearly described issues.
Instructions:
- Group nearly identical feedback items that refer to the same concerns.
- If there are two issues assessing different aspects of the same topic, do not merge them.
- Do not merge issues with the same topic but opposite concerns (e.g. overly verbose / not verbose enough).
- For each group, write a single and clear issue that captures the common idea.
- Ensure that each issue addresses only a single concern or aspect. Do not merge distinct issues with related topics.
- Ensure that the final list avoids redundancy and represents the full variety of distinct concerns from the original list.
- Ensure no important information is lost from the original list — all key concerns must be preserved.
- Explain your reasoning for each consolidation decision (e.g., which items were grouped together and why).
- Finish your response by outputting 'Final list:' and then a properly formatted Python list of strings (with each element in double quotes) containing the consolidated issues.
"""

shortcomings_clustering_human_prompt = """Now process the following list:
{recurring_issues_list}
"""
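These templates are plain module-level format strings; the evaluation code fills their placeholders before sending them to the judge model. A small editor-added rendering sketch follows (not from the package; the criteria text and the input/output values are made up, and the judge call in the final comment is only indicative).

# Illustrative sketch: rendering two of the templates with str.format().
from ibm_watsonx_gov.metrics.llm_validation.llm_validation_prompts import (
    full_response_eval_human_prompt, full_response_eval_system_prompt)

system_msg = full_response_eval_system_prompt.format(
    evaluation_criteria="1. The answer is factually accurate.\n2. The answer is complete.")
human_msg = full_response_eval_human_prompt.format(
    model_input="Summarize the support ticket.",
    model_output="The customer cannot log in after the password reset.")

# The rendered messages would then be passed to the judge chat model, for example
# judge.invoke([("system", system_msg), ("human", human_msg)]) with a LangChain model.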
File without changes
@@ -0,0 +1,298 @@
# ----------------------------------------------------------------------------------------------------
# IBM Confidential
# Licensed Materials - Property of IBM
# 5737-H76, 5900-A3Q
# © Copyright IBM Corp. 2025 All Rights Reserved.
# US Government Users Restricted Rights - Use, duplication or disclosure restricted by
# GSA ADPSchedule Contract with IBM Corp.
# ----------------------------------------------------------------------------------------------------

import re
from typing import Annotated, Literal, Optional, Self

import pandas as pd
from pydantic import Field, field_validator, model_validator

from ibm_watsonx_gov.config import AgenticAIConfiguration, GenAIConfiguration
from ibm_watsonx_gov.entities.criteria import CriteriaCatalog, Option
from ibm_watsonx_gov.entities.enums import (MetricGroup, MetricType,
                                            MetricValueType, TaskType)
from ibm_watsonx_gov.entities.evaluation_result import AggregateMetricResult
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.entities.metric import GenAIMetric
from ibm_watsonx_gov.entities.metric_threshold import MetricThreshold

try:
    from ibm_watsonx_gov.providers.eval_assist_provider import (
        VARIABLES_PATTERN, EvalAssistProvider)
except:
    pass

from ibm_watsonx_gov.utils.async_util import run_in_event_loop
from ibm_watsonx_gov.utils.constants import CUSTOM_TYPE


class LLMAsJudgeMetric(GenAIMetric):
    """
    Defines the LLMAsJudge metric class.

    The LLMAsJudge metric evaluates the model input and output text against the provided criteria or grader prompt, using a judge LLM.

    Examples:
        1. Create LLMAsJudge metric with a user-defined grader prompt.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="llama-3-3-70b-instruct",
                    project_id="<PROJECT_ID>"))
                prompt_template = "You are presented with a response generated subject to a context.\\nContext: \\n {context} \\n Response: {response} \\n. Is the response faithful according to context?\\nChoose an option:\\n- 'Yes' if the response is faithful according to context.\\n- 'No' if the response is not faithful according to context."
                options = ["Yes", "No"]
                # Optionally the numeric mapping for the string options can be specified as below
                # options = {"Yes": 1, "No": 0}
                metric = LLMAsJudgeMetric(llm_judge=llm_judge,
                                          prompt_template=prompt_template,
                                          options=options)
                evaluator = MetricsEvaluator()
                evaluation_result = evaluator.evaluate(data=data,
                                                       metrics=[metric])

        2. Create an LLMAsJudge metric using the predefined criteria provided in the IBM watsonx.governance SDK's criteria catalog.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="llama-3-3-70b-instruct",
                    project_id="<PROJECT_ID>"))

                # Display the catalog
                CriteriaCatalog.display_criteria_catalog(CriteriaCatalog.get_criteria())

                # Initialize the LLMAsJudgeMetric with any of the available criteria.
                metric = LLMAsJudgeMetric(name="conciseness",
                                          output_field="generated_text",
                                          llm_judge=llm_judge)
                evaluator = MetricsEvaluator()
                evaluation_result = evaluator.evaluate(data=data,
                                                       metrics=[metric])

        3. Create LLMAsJudge metric with user-defined criteria and default options. It is recommended to provide the options along with their descriptions, as shown in the next example, for better accuracy.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="llama-3-3-70b-instruct",
                    project_id="<PROJECT_ID>"))
                criteria_description = "Is the {generated_text} faithful according to {context}?"
                # When using the criteria description, it is required to specify the output field if it is other than generated_text.
                metric = LLMAsJudgeMetric(name="factuality",
                                          llm_judge=llm_judge,
                                          criteria_description=criteria_description,
                                          # output_field="generated_text"
                                          )
                evaluator = MetricsEvaluator()
                evaluation_result = evaluator.evaluate(data=data,
                                                       metrics=[metric])

        4. Create LLMAsJudge metric with user-defined criteria and options.
            .. code-block:: python

                # Define LLM Judge using watsonx.ai
                # To use other frameworks and models as llm_judge, see :module:`ibm_watsonx_gov.entities.foundation_model`
                llm_judge = LLMJudge(model=WxAIFoundationModel(
                    model_id="llama-3-3-70b-instruct",
                    project_id="<PROJECT_ID>"))
                criteria_description = "Is the {response} faithful according to {context}?"
                options = [Option(name="Yes",
                                  description="The {response} is faithful according to {context}.",
                                  value=1.0),
                           Option(name="No",
                                  description="The {response} is not faithful according to {context}.",
                                  value=0.0)]
                # When using the criteria description, it is required to specify the output field if it is other than generated_text.
                metric = LLMAsJudgeMetric(name="factuality",
                                          llm_judge=llm_judge,
                                          criteria_description=criteria_description,
                                          options=options,
                                          output_field="response")
                evaluator = MetricsEvaluator()
                evaluation_result = evaluator.evaluate(data=data,
                                                       metrics=[metric])
    """
    name: Annotated[str,
                    Field(title="Name",
                          description="The llm as judge metric name. The name should be in lower snake case format.")]
    display_name: Annotated[Optional[str],
                            Field(title="Display Name",
                                  description="The llm as judge metric display name. If not specified, it is derived from the name.",
                                  default=None)]
    type_: Annotated[CUSTOM_TYPE, Field(title="Metric type",
                                        description="The type of the metric.",
                                        serialization_alias="type",
                                        default=MetricType.CUSTOM.value,
                                        frozen=True,
                                        examples=[MetricType.CUSTOM.value])]
    value_type: Annotated[str, Field(title="Metric value type",
                                     description="The type of the metric value. Indicates whether the metric value is numeric or categorical. The default value is categorical.",
                                     serialization_alias="type", default=MetricValueType.CATEGORICAL.value,
                                     examples=MetricValueType.values())]
    llm_judge: Annotated[LLMJudge,
                         Field(title="LLM Judge",
                               description="The LLM judge to be used for evaluation.")]
    criteria_description: Annotated[Optional[str],
                                    Field(title="Criteria Description",
                                          description="The description of the evaluation criteria used to compute the metric.",
                                          examples=[
                                              "Is the {response} concise and to the point?"],
                                          default=None)]
    prompt_template: Annotated[Optional[str],
                               Field(title="Prompt Template",
                                     description="The grader prompt template used to compute the metric.",
                                     default=None,
                                     examples=["You are an expert grader. Your job is to evaluate how factually grounded an AI-generated answer is based on a given context. \n ## Grading Scale: \n Rate the answer either Yes or No:"])]
    options: Annotated[list[Option] | list[dict] | list[str] | dict,
                       Field(title="Options",
                             description="The list of options of the judge response.",
                             default=[Option(name="Yes",
                                             value=1.0),
                                      Option(name="No",
                                             value=0.0)],
                             examples=[["Yes", "No"], [{"name": "Yes", "value": 1}, {"name": "No", "value": 0}], [{"name": "Yes", "value": 1, "description": ""}, {"name": "No", "value": 0, "description": ""}]]),
                       ]
    output_field: Annotated[Optional[str], Field(title="Output Field",
                                                 description="The model generated output field in the data. This is required when providing the criteria description. Default value is 'generated_text'.",
                                                 default="generated_text",
                                                 examples=["output"])]
    group: Annotated[str,
                     Field(title="Group",
                           description="The metric group. The default group name is custom.",
                           default=MetricGroup.CUSTOM.value)]
    thresholds: Annotated[list[MetricThreshold],
                          Field(title="Thresholds",
                                description="The metric thresholds.",
                                default=[MetricThreshold(type="lower_limit", value=0.7)])]
    tasks: Annotated[list[TaskType],
                     Field(title="Tasks",
                           description="The list of supported tasks.",
                           default=[])]
    method: Annotated[Literal["llm_as_judge"],
                      Field(title="Method",
                            description="The method used to compute the metric.",
                            default="llm_as_judge", frozen=True)]

    @field_validator("options", mode="before")
    def parse_options(cls, value):
        if isinstance(value, list):
            if isinstance(value[0], str):
                return [Option(name=v) for v in value]
        elif isinstance(value, dict):
            return [Option(name=k, value=v) for k, v in value.items()]

        return value

    @model_validator(mode="after")
    def validate(self) -> Self:

        # Set criteria description and options based on the criteria name.
        if not self.criteria_description and not self.prompt_template:
            try:
                criteria_obj = CriteriaCatalog.get_criteria([self.name])
            except Exception:
                raise ValueError(
                    "The provided criteria name is unavailable in the catalog. Choose a criteria from the catalog or provide criteria_description or prompt_template to proceed.")
            self.criteria_description = criteria_obj[0].description
            self.options = criteria_obj[0].options

        if self.criteria_description and not self.output_field:
            raise ValueError(
                "The `output_field` value is invalid. Please provide a valid value for the `output_field` attribute.")

        if self.value_type == MetricValueType.NUMERIC.value:
            for o in self.options:
                if o.value is None:
                    raise ValueError(
                        f"The option is invalid. The metric value type is numeric, but the criteria option '{o.name}' does not have a valid value. Please provide a valid option.")

        if not bool(re.fullmatch(r'[a-z][a-z0-9]*(?:_[a-z0-9]+)*', self.name)):
            raise ValueError(
                "The metric name should be in lower snake case format.")

        if not self.display_name:
            words = self.name.split('_')
            self.display_name = ' '.join(word.capitalize() for word in words)

        return self

    def evaluate(self,
                 data: pd.DataFrame,
                 configuration: GenAIConfiguration | AgenticAIConfiguration,
                 **kwargs) -> AggregateMetricResult:
        # If run in sync mode, block until the async evaluation is done
        return run_in_event_loop(
            self.evaluate_async,
            data=data,
            configuration=configuration,
            **kwargs,
        )

    async def evaluate_async(self, data: pd.DataFrame,
                             configuration: GenAIConfiguration | AgenticAIConfiguration,
                             **kwargs) -> AggregateMetricResult:

        data_cols = data.columns.to_list()
        self.__validate_fields(data_cols)

        context_fields = []
        if self.criteria_description:
            if self.output_field not in data_cols:
                raise ValueError(
                    f"The output field {self.output_field} is not present in the data.")

            ctx_fields = list(self.__criteria_fields)
            ctx_fields.remove(self.output_field)
            context_fields = ctx_fields
        provider = EvalAssistProvider(metric_name=self.name,
                                      display_name=self.display_name,
                                      value_type=self.value_type,
                                      criteria_description=self.criteria_description,
                                      llm_judge=self.llm_judge,
                                      metric_group=self.group,
                                      metric_method=self.method,
                                      thresholds=self.thresholds,
                                      prompt_template=self.prompt_template,
                                      options=self.options,
                                      prediction_field=self.output_field,
                                      context_fields=context_fields,
                                      record_id_field=configuration.record_id_field,
                                      **kwargs)

        return await provider.evaluate_async(data)

    def __validate_fields(self, data_cols):
        if self.criteria_description:
            fields_from_criteria = set()
            fields_from_options = set()
            fields_from_criteria.update(re.findall(
                VARIABLES_PATTERN, self.criteria_description))
            for option in self.options:
                fields_from_options.update(re.findall(
                    VARIABLES_PATTERN, option.description))

            if not all(field in data_cols for field in fields_from_criteria):
                raise ValueError(
                    f"The fields provided in the criteria description {fields_from_criteria} are not present in the data.")
            if not all(field in data_cols for field in fields_from_options):
                raise ValueError(
                    f"The fields provided in the options description {fields_from_options} are not present in the data.")
            self.__criteria_fields = fields_from_criteria | fields_from_options
        elif self.prompt_template:
            fields_from_prompt = set()
            fields_from_prompt.update(re.findall(
                VARIABLES_PATTERN, self.prompt_template))

            if not all(field in data_cols for field in fields_from_prompt):
                raise ValueError(
                    f"The fields provided in the prompt template {fields_from_prompt} are not present in the data.")
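Because parse_options normalizes plain strings and dicts into Option objects and validate enforces the lower-snake-case name rule, callers can pass the lightweight forms shown in the field examples. The editor-added sketch below is a hedged illustration: the judge model, data columns, and MetricsEvaluator usage follow the docstring examples above, and the evaluator import location and data shape are assumptions.

# Illustrative sketch: a custom "groundedness" criterion with a numeric option mapping.
import pandas as pd

from ibm_watsonx_gov.entities.foundation_model import WxAIFoundationModel
from ibm_watsonx_gov.entities.llm_judge import LLMJudge
from ibm_watsonx_gov.evaluators import MetricsEvaluator  # assumed re-export location
from ibm_watsonx_gov.metrics.llmaj.llmaj_metric import LLMAsJudgeMetric

llm_judge = LLMJudge(model=WxAIFoundationModel(model_id="llama-3-3-70b-instruct",
                                               project_id="<PROJECT_ID>"))

# Dict options are converted to Option objects by the "options" field validator.
metric = LLMAsJudgeMetric(name="groundedness",          # lower snake case, as validated
                          llm_judge=llm_judge,
                          criteria_description="Is the {response} grounded in the {context}?",
                          options={"Yes": 1, "No": 0},
                          output_field="response")

# Columns must cover every placeholder used in the criteria and options.
data = pd.DataFrame({
    "context": ["The store closes at 9 pm on weekdays."],
    "response": ["The store closes at 9 pm on weekdays."],
})

evaluator = MetricsEvaluator()
evaluation_result = evaluator.evaluate(data=data, metrics=[metric])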
File without changes