validmind 2.5.19__py3-none-any.whl → 2.5.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +7 -46
- validmind/__version__.py +1 -1
- validmind/ai/test_result_description/context.py +2 -2
- validmind/api_client.py +131 -266
- validmind/client_config.py +1 -3
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/nlp/__init__.py +1 -1
- validmind/errors.py +3 -30
- validmind/tests/load.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +0 -1
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +5 -2
- validmind/tests/run.py +219 -116
- validmind/vm_models/test/result_wrapper.py +4 -4
- {validmind-2.5.19.dist-info → validmind-2.5.23.dist-info}/METADATA +12 -12
- {validmind-2.5.19.dist-info → validmind-2.5.23.dist-info}/RECORD +18 -18
- {validmind-2.5.19.dist-info → validmind-2.5.23.dist-info}/LICENSE +0 -0
- {validmind-2.5.19.dist-info → validmind-2.5.23.dist-info}/WHEEL +0 -0
- {validmind-2.5.19.dist-info → validmind-2.5.23.dist-info}/entry_points.txt +0 -0
validmind/errors.py
CHANGED
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

 """
-This module contains all the custom errors that are used in the
+This module contains all the custom errors that are used in the library.

 The following base errors are defined for others:
 - BaseError
@@ -236,14 +236,6 @@ class MissingRExtrasError(BaseError):
         )


-class MissingRunCUIDError(APIRequestError):
-    """
-    When data is being sent to the API but the run_cuid is missing.
-    """
-
-    pass
-
-
 class MissingTextContentIdError(APIRequestError):
     """
     When a Text object is sent to the API without a content_id.
@@ -260,30 +252,14 @@ class MissingTextContentsError(APIRequestError):
     pass


-class
+class MissingModelIdError(BaseError):
     def description(self, *args, **kwargs):
         return (
             self.message
-            or "
+            or "Model ID must be provided either as an environment variable or as an argument to init."
         )


-class StartTestRunFailedError(APIRequestError):
-    """
-    When the API was not able to start a test run.
-    """
-
-    pass
-
-
-class TestRunNotFoundError(APIRequestError):
-    """
-    When a test run is not found in the API.
-    """
-
-    pass
-
-
 class TestInputInvalidDatasetError(BaseError):
     """
     When an invalid dataset is used in a test context.
@@ -369,11 +345,8 @@ def raise_api_error(error_string):
         "missing_text": MissingTextContentsError,
         "invalid_text_object": InvalidTextObjectError,
         "invalid_content_id_prefix": InvalidContentIdPrefixError,
-        "missing_run_cuid": MissingRunCUIDError,
-        "test_run_not_found": TestRunNotFoundError,
         "invalid_metric_results": InvalidMetricResultsError,
         "invalid_test_results": InvalidTestResultsError,
-        "start_test_run_failed": StartTestRunFailedError,
     }

     error_class = error_map.get(api_code, APIRequestError)
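For context, the surviving `error_map` entries in `raise_api_error` keep the same dispatch pattern: an API error code is looked up in a dict of exception classes, with `APIRequestError` as the fallback for unknown codes. Below is a minimal, self-contained sketch of that pattern; the class definitions here are stand-ins for illustration only (the real ones live in `validmind/errors.py`), not the library's actual implementation.

```python
# Hypothetical, standalone sketch of the code-to-exception dispatch shown in
# the raise_api_error hunk above.

class APIRequestError(Exception):
    """Generic API request error (fallback for unknown codes)."""


class MissingTextContentsError(APIRequestError):
    """When a Text object is sent to the API without contents."""


ERROR_MAP = {
    "missing_text": MissingTextContentsError,
    # ...the other codes kept in the diff map to their own subclasses
}


def raise_api_error_sketch(api_code: str, message: str) -> None:
    # Unknown codes fall back to the generic APIRequestError, as in the diff
    error_class = ERROR_MAP.get(api_code, APIRequestError)
    raise error_class(message)


try:
    raise_api_error_sketch("missing_text", "text contents are required")
except APIRequestError as exc:
    print(type(exc).__name__, exc)
```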
validmind/tests/load.py
CHANGED
@@ -88,6 +88,10 @@ def list_tests(
     Returns:
         list or pandas.DataFrame: A list of all tests or a formatted table.
     """
+    # tests = {
+    #     test_id: load_test(test_id, reload=True)
+    #     for test_id in test_store.get_test_ids()
+    # }
     tests = {}
     for test_id in test_store.get_test_ids():
         try:
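For reference, `list_tests` is the public entry point this hunk touches. A typical call, assuming the defaults in its signature are usable as-is and the package is installed, would look like:

```python
import validmind as vm

# Returns either a list of test IDs or a formatted table, per the docstring
# in the hunk above.
all_tests = vm.tests.list_tests()
```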
validmind/tests/model_validation/sklearn/ClassifierPerformance.py
CHANGED
@@ -67,6 +67,7 @@ class ClassifierPerformance(Metric):
         "multiclass_classification",
         "model_performance",
     ]
+    default_params = {"average": "macro"}

     def summary(self, metric_value: dict):
         """
@@ -134,11 +135,13 @@ class ClassifierPerformance(Metric):
         if len(np.unique(y_true)) > 2:
             y_pred = self.inputs.dataset.y_pred(self.inputs.model)
             y_true = y_true.astype(y_pred.dtype)
-            roc_auc = multiclass_roc_auc_score(
+            roc_auc = multiclass_roc_auc_score(
+                y_true, y_pred, average=self.params["average"]
+            )
         else:
             y_prob = self.inputs.dataset.y_prob(self.inputs.model)
             y_true = y_true.astype(y_prob.dtype).flatten()
-            roc_auc = roc_auc_score(y_true, y_prob)
+            roc_auc = roc_auc_score(y_true, y_prob, average=self.params["average"])

         report["roc_auc"] = roc_auc

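With `default_params` in place, the averaging strategy used for the ROC AUC can now be overridden through `params`. A hedged sketch of how that might be invoked via `run_test` follows; the test ID is assumed to follow the usual `validmind.<module path>` convention, and the input IDs are placeholders for a dataset and model registered with `vm.init_dataset()` / `vm.init_model()`.

```python
import validmind as vm

# Hypothetical invocation: override the default "macro" averaging with "micro".
# "test_ds" and "clf_model" are assumed input_ids registered beforehand.
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ClassifierPerformance",
    inputs={"dataset": "test_ds", "model": "clf_model"},
    params={"average": "micro"},
)
```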
validmind/tests/run.py
CHANGED
@@ -2,6 +2,7 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

+import itertools
 from itertools import product
 from typing import Any, Dict, List, Union
 from uuid import uuid4
@@ -137,7 +138,7 @@ def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str,
     title_template = "{current_title}({input_description})"

     for idx, figures in enumerate(figure_lists):
-        input_group = input_groups[idx]
+        input_group = input_groups[idx]["inputs"]
         if is_plotly_figure(figures[0].figure):
             _update_plotly_titles(figures, input_group, title_template)
         elif is_matplotlib_figure(figures[0].figure):
@@ -171,63 +172,55 @@ def _combine_unit_metrics(results: List[MetricResultWrapper]):
 def metric_comparison(
     results: List[MetricResultWrapper],
     test_id: TestID,
-
+    input_params_groups: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
     output_template: str = None,
     generate_description: bool = True,
 ):
     """Build a comparison result for multiple metric results"""
     ref_id = str(uuid4())

+    # Treat param_groups and input_groups as empty lists if they are None or empty
+    input_params_groups = input_params_groups or [{}]
+
     input_group_strings = []

-    for
+    for input_params in input_params_groups:
         new_group = {}
-        for
-
-
-
-
-
-
+        for param_k, param_v in input_params["params"].items():
+            new_group[param_k] = param_v
+        for metric_k, metric_v in input_params["inputs"].items():
+            # Process values in the input group
+            if isinstance(metric_v, str):
+                new_group[metric_k] = metric_v
+            elif hasattr(metric_v, "input_id"):
+                new_group[metric_k] = metric_v.input_id
+            elif isinstance(metric_v, list) and all(
+                hasattr(item, "input_id") for item in metric_v
+            ):
+                new_group[metric_k] = ", ".join([item.input_id for item in metric_v])
             else:
-                raise ValueError(f"Unsupported type for value: {
+                raise ValueError(f"Unsupported type for value: {metric_v}")
         input_group_strings.append(new_group)

     # handle unit metrics (scalar values) by adding it to the summary
     _combine_unit_metrics(results)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Check if the results list contains a result object with figures
-    if any(hasattr(result, "figures") and result.figures for result in results):
-        # Compute merged figures only if there is at least one result with figures
-        merged_figures = _combine_figures(
-            [result.figures for result in results],
-            input_groups,
-        )
-        # Patch figure metadata so they are connected to the comparison result
-        if merged_figures and len(merged_figures):
-            for i, figure in enumerate(merged_figures):
-                figure.key = f"{figure.key}-{i}"
-                figure.metadata["_name"] = test_id
-                figure.metadata["_ref_id"] = ref_id
-    else:
-        merged_figures = None
+    merged_summary = _combine_summaries(
+        [
+            {"inputs": input_group_strings[i], "summary": result.metric.summary}
+            for i, result in enumerate(results)
+        ]
+    )
+    merged_figures = _combine_figures(
+        [result.figures for result in results], input_params_groups
+    )
+
+    # Patch figure metadata so they are connected to the comparison result
+    if merged_figures and len(merged_figures):
+        for i, figure in enumerate(merged_figures):
+            figure.key = f"{figure.key}-{i}"
+            figure.metadata["_name"] = test_id
+            figure.metadata["_ref_id"] = ref_id

     return MetricResultWrapper(
         result_id=test_id,
@@ -236,14 +229,14 @@ def metric_comparison(
             test_id=test_id,
             default_description=f"Comparison test result for {test_id}",
             summary=merged_summary.serialize() if merged_summary else None,
-            figures=merged_figures
+            figures=merged_figures,
             should_generate=generate_description,
         ),
     ],
     inputs=[
         item.input_id if hasattr(item, "input_id") else item
-        for group in
-        for input in group.values()
+        for group in input_params_groups
+        for input in group["inputs"].values()
         for item in (input if isinstance(input, list) else [input])
         if hasattr(item, "input_id") or isinstance(item, str)
     ],
@@ -333,39 +326,63 @@ def threshold_test_comparison(

 def run_comparison_test(
     test_id: TestID,
-    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]],
+    input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
+    inputs: Dict[str, Any] = None,
     name: str = None,
     unit_metrics: List[TestID] = None,
+    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
     params: Dict[str, Any] = None,
     show: bool = True,
     output_template: str = None,
     generate_description: bool = True,
 ):
     """Run a comparison test"""
-    if
-
+    if input_grid:
+        if isinstance(input_grid, dict):
+            input_groups = _cartesian_product(input_grid)
+        else:
+            input_groups = input_grid
     else:
-        input_groups =
+        input_groups = list(inputs) if inputs else []

+    if param_grid:
+        if isinstance(param_grid, dict):
+            param_groups = _cartesian_product(param_grid)
+        else:
+            param_groups = param_grid
+    else:
+        param_groups = list(params) if inputs else []
+
+    input_groups = input_groups or [{}]
+    param_groups = param_groups or [{}]
+    # Use itertools.product to compute the Cartesian product
+    inputs_params_product = [
+        {
+            "inputs": item1,
+            "params": item2,
+        }  # Merge dictionaries from input_groups and param_groups
+        for item1, item2 in itertools.product(input_groups, param_groups)
+    ]
     results = [
         run_test(
             test_id,
             name=name,
             unit_metrics=unit_metrics,
-            inputs=inputs,
+            inputs=inputs_params["inputs"],
             show=False,
-            params=params,
+            params=inputs_params["params"],
             __generate_description=False,
         )
-        for
+        for inputs_params in (inputs_params_product or [{}])
     ]
-
     if isinstance(results[0], MetricResultWrapper):
         func = metric_comparison
     else:
         func = threshold_test_comparison

-    result = func(
+    result = func(
+        results, test_id, inputs_params_product, output_template, generate_description
+    )

     if show:
         result.show()
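The grid expansion added here reduces to an `itertools.product` over the input and parameter groups. A small standalone illustration of the structure `inputs_params_product` ends up holding (the group values below are placeholders):

```python
import itertools

# Placeholder groups; in run_comparison_test these come from input_grid/param_grid
input_groups = [{"dataset": "train_ds"}, {"dataset": "test_ds"}]
param_groups = [{"average": "macro"}, {"average": "micro"}]

# Mirrors inputs_params_product above: one dict per (inputs, params) combination
inputs_params_product = [
    {"inputs": inputs, "params": params}
    for inputs, params in itertools.product(input_groups, param_groups)
]
print(len(inputs_params_product))  # 4 combinations -> 4 run_test calls
```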
@@ -376,6 +393,7 @@ def run_comparison_test(
 def run_test(
     test_id: TestID = None,
     params: Dict[str, Any] = None,
+    param_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
     inputs: Dict[str, Any] = None,
     input_grid: Union[Dict[str, List[Any]], List[Dict[str, Any]]] = None,
     name: str = None,
@@ -385,83 +403,81 @@ def run_test(
     __generate_description: bool = True,
     **kwargs,
 ) -> Union[MetricResultWrapper, ThresholdTestResultWrapper]:
-    """Run a test by test ID
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Run a test by test ID.
+    test_id (TestID, optional): The test ID to run. Not required if `unit_metrics` is provided.
+    params (dict, optional): A dictionary of parameters to pass into the test. Params
+        are used to customize the test behavior and are specific to each test. See the
+        test details for more information on the available parameters. Defaults to None.
+    param_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
+        a comparison test, provide either a dictionary of parameters where the keys are
+        the parameter names and the values are lists of different parameters, or a list of
+        dictionaries where each dictionary is a set of parameters to run the test with.
+        This will run the test multiple times with different sets of parameters and then
+        combine the results into a single output. When passing a dictionary, the grid
+        will be created by taking the Cartesian product of the parameter lists. Its simply
+        a more convenient way of forming the param grid as opposed to passing a list of
+        all possible combinations. Defaults to None.
+    inputs (Dict[str, Any], optional): A dictionary of test inputs to pass into the
+        test. Inputs are either models or datasets that have been initialized using
+        vm.init_model() or vm.init_dataset(). Defaults to None.
+    input_grid (Union[Dict[str, List[Any]], List[Dict[str, Any]]], optional): To run
+        a comparison test, provide either a dictionary of inputs where the keys are
+        the input names and the values are lists of different inputs, or a list of
+        dictionaries where each dictionary is a set of inputs to run the test with.
+        This will run the test multiple times with different sets of inputs and then
+        combine the results into a single output. When passing a dictionary, the grid
+        will be created by taking the Cartesian product of the input lists. Its simply
+        a more convenient way of forming the input grid as opposed to passing a list of
+        all possible combinations. Defaults to None.
+    name (str, optional): The name of the test (used to create a composite metric
+        out of multiple unit metrics) - required when running multiple unit metrics
+    unit_metrics (list, optional): A list of unit metric IDs to run as a composite
+        metric - required when running multiple unit metrics
+    output_template (str, optional): A jinja2 html template to customize the output
+        of the test. Defaults to None.
+    show (bool, optional): Whether to display the results. Defaults to True.
+    **kwargs: Keyword inputs to pass into the test (same as `inputs` but as keyword
+        args instead of a dictionary):
+        - dataset: A validmind Dataset object or a Pandas DataFrame
+        - model: A model to use for the test
+        - models: A list of models to use for the test
+        - dataset: A validmind Dataset object or a Pandas DataFrame
     """
-    if not test_id and not name and not unit_metrics:
-        raise ValueError(
-            "`test_id` or `name` and `unit_metrics` must be provided to run a test"
-        )

-
-
-
-    if (input_grid and kwargs) or (input_grid and inputs):
-        raise ValueError(
-            "When providing an `input_grid`, you cannot also provide `inputs` or `kwargs`"
-        )
+    # Validate input arguments with helper functions
+    validate_test_inputs(test_id, name, unit_metrics)
+    validate_grid_inputs(input_grid, kwargs, inputs, param_grid, params)

+    # Handle composite metric creation
     if unit_metrics:
-
-        test_id = f"validmind.composite_metric.{metric_id_name}" or test_id
+        test_id = generate_composite_test_id(name, test_id)

-    if
-
+    # Run comparison tests if applicable
+    if input_grid or param_grid:
+        return run_comparison_test_with_grids(
            test_id,
+            inputs,
            input_grid,
-
-
-
-
-
-
+            param_grid,
+            name,
+            unit_metrics,
+            params,
+            output_template,
+            show,
+            __generate_description,
        )

+    # Run unit metric tests
     if test_id.startswith("validmind.unit_metrics"):
         # TODO: as we move towards a more unified approach to metrics
         # we will want to make everything functional and remove the
         # separation between unit metrics and "normal" metrics
         return run_metric(test_id, inputs=inputs, params=params, show=show)

-
-
-        unit_metrics=unit_metrics, metric_name=metric_id_name
-    )
-    if error:
-        raise LoadTestError(error)
-    else:
-        TestClass = load_test(test_id, reload=True)
+    # Load the appropriate test class
+    TestClass = load_test_class(test_id, unit_metrics, name)

+    # Create and run the test
     test = TestClass(
         test_id=test_id,
         context=TestContext(),
@@ -477,3 +493,90 @@ def run_test(
     test.result.show()

     return test.result
+
+
+def validate_test_inputs(test_id, name, unit_metrics):
+    """Validate the main test inputs for `test_id`, `name`, and `unit_metrics`."""
+    if not test_id and not (name and unit_metrics):
+        raise ValueError(
+            "`test_id` or both `name` and `unit_metrics` must be provided to run a test"
+        )
+
+    if bool(unit_metrics) != bool(name):
+        raise ValueError("`name` and `unit_metrics` must be provided together")
+
+
+def validate_grid_inputs(input_grid, kwargs, inputs, param_grid, params):
+    """Validate the grid inputs to avoid conflicting parameters."""
+    if input_grid and (kwargs or inputs):
+        raise ValueError("Cannot provide `input_grid` along with `inputs` or `kwargs`")
+
+    if param_grid and (kwargs or params):
+        raise ValueError("Cannot provide `param_grid` along with `params` or `kwargs`")
+
+
+def generate_composite_test_id(name, test_id):
+    """Generate a composite test ID if unit metrics are provided."""
+    metric_id_name = "".join(word.capitalize() for word in name.split())
+    return f"validmind.composite_metric.{metric_id_name}" or test_id
+
+
+def run_comparison_test_with_grids(
+    test_id,
+    inputs,
+    input_grid,
+    param_grid,
+    name,
+    unit_metrics,
+    params,
+    output_template,
+    show,
+    generate_description,
+):
+    """Run a comparison test based on the presence of input and param grids."""
+    if input_grid and param_grid:
+        return run_comparison_test(
+            test_id,
+            input_grid,
+            name=name,
+            unit_metrics=unit_metrics,
+            param_grid=param_grid,
+            output_template=output_template,
+            show=show,
+            generate_description=generate_description,
+        )
+    if input_grid:
+        return run_comparison_test(
+            test_id,
+            input_grid,
+            name=name,
+            unit_metrics=unit_metrics,
+            params=params,
+            output_template=output_template,
+            show=show,
+            generate_description=generate_description,
+        )
+    if param_grid:
+        return run_comparison_test(
+            test_id,
+            inputs=inputs,
+            name=name,
+            unit_metrics=unit_metrics,
+            param_grid=param_grid,
+            output_template=output_template,
+            show=show,
+            generate_description=generate_description,
+        )
+
+
+def load_test_class(test_id, unit_metrics, name):
+    """Load the appropriate test class based on `test_id` and unit metrics."""
+    if unit_metrics:
+        metric_id_name = "".join(word.capitalize() for word in name.split())
+        error, TestClass = load_composite_metric(
+            unit_metrics=unit_metrics, metric_name=metric_id_name
+        )
+        if error:
+            raise LoadTestError(error)
+        return TestClass
+    return load_test(test_id, reload=True)
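Putting the pieces together, the new `param_grid` and `input_grid` plumbing lets a single `run_test` call fan out over the Cartesian product of inputs and parameters and merge everything into one comparison result. A hedged usage sketch based on the docstring above; the test ID follows the usual `validmind.<module path>` convention and the input IDs are placeholders for registered datasets and a model.

```python
import validmind as vm

# Hypothetical comparison run: 2 datasets x 2 averaging strategies = 4 runs,
# combined into a single comparison result (show=True displays it by default).
result = vm.tests.run_test(
    "validmind.model_validation.sklearn.ClassifierPerformance",
    input_grid={
        "dataset": ["train_ds", "test_ds"],  # placeholder input IDs
        "model": ["clf_model"],
    },
    param_grid={"average": ["macro", "micro"]},
)
```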
validmind/vm_models/test/result_wrapper.py
CHANGED
@@ -378,8 +378,8 @@ class MetricResultWrapper(ResultWrapper):
             self.metric.summary = self._get_filtered_summary()

         tasks.append(
-            api_client.
-
+            api_client.log_metric_result(
+                metric=self.metric,
                 inputs=self.inputs,
                 output_template=self.output_template,
                 section_id=section_id,
@@ -388,7 +388,7 @@ class MetricResultWrapper(ResultWrapper):
         )

         if self.figures:
-            tasks.
+            tasks.extend([api_client.log_figure(figure) for figure in self.figures])

         if hasattr(self, "result_metadata") and self.result_metadata:
             description = self.result_metadata[0].get("text", "")
@@ -474,7 +474,7 @@ class ThresholdTestResultWrapper(ResultWrapper):
         ]

         if self.figures:
-            tasks.
+            tasks.extend([api_client.log_figure(figure) for figure in self.figures])

         if hasattr(self, "result_metadata") and self.result_metadata:
             description = self.result_metadata[0].get("text", "")
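Both wrappers now queue one `log_figure` call per figure through a single `tasks.extend(...)`. A trivial standalone illustration of that batching pattern; `log_figure` below is a stub, since the real `api_client.log_figure` is not shown in this diff.

```python
# Stand-in for api_client.log_figure; only illustrates the extend(...) pattern
def log_figure(figure):
    return f"queued {figure}"

figures = ["fig-1", "fig-2", "fig-3"]
tasks = []
tasks.extend([log_figure(figure) for figure in figures])  # one task per figure
print(tasks)
```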