desdeo 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- desdeo/adm/ADMAfsar.py +551 -0
- desdeo/adm/ADMChen.py +414 -0
- desdeo/adm/BaseADM.py +119 -0
- desdeo/adm/__init__.py +11 -0
- desdeo/api/__init__.py +6 -6
- desdeo/api/app.py +38 -28
- desdeo/api/config.py +65 -44
- desdeo/api/config.toml +23 -12
- desdeo/api/db.py +10 -8
- desdeo/api/db_init.py +12 -6
- desdeo/api/models/__init__.py +220 -20
- desdeo/api/models/archive.py +16 -27
- desdeo/api/models/emo.py +128 -0
- desdeo/api/models/enautilus.py +69 -0
- desdeo/api/models/gdm/gdm_aggregate.py +139 -0
- desdeo/api/models/gdm/gdm_base.py +69 -0
- desdeo/api/models/gdm/gdm_score_bands.py +114 -0
- desdeo/api/models/gdm/gnimbus.py +138 -0
- desdeo/api/models/generic.py +104 -0
- desdeo/api/models/generic_states.py +401 -0
- desdeo/api/models/nimbus.py +158 -0
- desdeo/api/models/preference.py +44 -6
- desdeo/api/models/problem.py +274 -64
- desdeo/api/models/session.py +4 -1
- desdeo/api/models/state.py +419 -52
- desdeo/api/models/user.py +7 -6
- desdeo/api/models/utopia.py +25 -0
- desdeo/api/routers/_EMO.backup +309 -0
- desdeo/api/routers/_NIMBUS.py +6 -3
- desdeo/api/routers/emo.py +497 -0
- desdeo/api/routers/enautilus.py +237 -0
- desdeo/api/routers/gdm/gdm_aggregate.py +234 -0
- desdeo/api/routers/gdm/gdm_base.py +420 -0
- desdeo/api/routers/gdm/gdm_score_bands/gdm_score_bands_manager.py +398 -0
- desdeo/api/routers/gdm/gdm_score_bands/gdm_score_bands_routers.py +377 -0
- desdeo/api/routers/gdm/gnimbus/gnimbus_manager.py +698 -0
- desdeo/api/routers/gdm/gnimbus/gnimbus_routers.py +591 -0
- desdeo/api/routers/generic.py +233 -0
- desdeo/api/routers/nimbus.py +705 -0
- desdeo/api/routers/problem.py +201 -4
- desdeo/api/routers/reference_point_method.py +20 -44
- desdeo/api/routers/session.py +50 -26
- desdeo/api/routers/user_authentication.py +180 -26
- desdeo/api/routers/utils.py +187 -0
- desdeo/api/routers/utopia.py +230 -0
- desdeo/api/schema.py +10 -4
- desdeo/api/tests/conftest.py +94 -2
- desdeo/api/tests/test_enautilus.py +330 -0
- desdeo/api/tests/test_models.py +550 -72
- desdeo/api/tests/test_routes.py +902 -43
- desdeo/api/utils/_database.py +263 -0
- desdeo/api/utils/database.py +28 -266
- desdeo/api/utils/emo_database.py +40 -0
- desdeo/core.py +7 -0
- desdeo/emo/__init__.py +154 -24
- desdeo/emo/hooks/archivers.py +18 -2
- desdeo/emo/methods/EAs.py +128 -5
- desdeo/emo/methods/bases.py +9 -56
- desdeo/emo/methods/templates.py +111 -0
- desdeo/emo/operators/crossover.py +544 -42
- desdeo/emo/operators/evaluator.py +10 -14
- desdeo/emo/operators/generator.py +127 -24
- desdeo/emo/operators/mutation.py +212 -41
- desdeo/emo/operators/scalar_selection.py +202 -0
- desdeo/emo/operators/selection.py +956 -214
- desdeo/emo/operators/termination.py +124 -16
- desdeo/emo/options/__init__.py +108 -0
- desdeo/emo/options/algorithms.py +435 -0
- desdeo/emo/options/crossover.py +164 -0
- desdeo/emo/options/generator.py +131 -0
- desdeo/emo/options/mutation.py +260 -0
- desdeo/emo/options/repair.py +61 -0
- desdeo/emo/options/scalar_selection.py +66 -0
- desdeo/emo/options/selection.py +127 -0
- desdeo/emo/options/templates.py +383 -0
- desdeo/emo/options/termination.py +143 -0
- desdeo/gdm/__init__.py +22 -0
- desdeo/gdm/gdmtools.py +45 -0
- desdeo/gdm/score_bands.py +114 -0
- desdeo/gdm/voting_rules.py +50 -0
- desdeo/mcdm/__init__.py +23 -1
- desdeo/mcdm/enautilus.py +338 -0
- desdeo/mcdm/gnimbus.py +484 -0
- desdeo/mcdm/nautilus_navigator.py +7 -6
- desdeo/mcdm/reference_point_method.py +70 -0
- desdeo/problem/__init__.py +5 -1
- desdeo/problem/external/__init__.py +18 -0
- desdeo/problem/external/core.py +356 -0
- desdeo/problem/external/pymoo_provider.py +266 -0
- desdeo/problem/external/runtime.py +44 -0
- desdeo/problem/infix_parser.py +2 -2
- desdeo/problem/pyomo_evaluator.py +25 -6
- desdeo/problem/schema.py +69 -48
- desdeo/problem/simulator_evaluator.py +65 -15
- desdeo/problem/testproblems/__init__.py +26 -11
- desdeo/problem/testproblems/benchmarks_server.py +120 -0
- desdeo/problem/testproblems/cake_problem.py +185 -0
- desdeo/problem/testproblems/dmitry_forest_problem_discrete.py +71 -0
- desdeo/problem/testproblems/forest_problem.py +77 -69
- desdeo/problem/testproblems/multi_valued_constraints.py +119 -0
- desdeo/problem/testproblems/{river_pollution_problem.py → river_pollution_problems.py} +28 -22
- desdeo/problem/testproblems/single_objective.py +289 -0
- desdeo/problem/testproblems/zdt_problem.py +4 -1
- desdeo/tools/__init__.py +39 -21
- desdeo/tools/desc_gen.py +22 -0
- desdeo/tools/generics.py +22 -2
- desdeo/tools/group_scalarization.py +3090 -0
- desdeo/tools/indicators_binary.py +107 -1
- desdeo/tools/indicators_unary.py +3 -16
- desdeo/tools/message.py +33 -2
- desdeo/tools/non_dominated_sorting.py +4 -3
- desdeo/tools/patterns.py +9 -7
- desdeo/tools/pyomo_solver_interfaces.py +48 -35
- desdeo/tools/reference_vectors.py +118 -351
- desdeo/tools/scalarization.py +340 -1413
- desdeo/tools/score_bands.py +491 -328
- desdeo/tools/utils.py +117 -49
- desdeo/tools/visualizations.py +67 -0
- desdeo/utopia_stuff/utopia_problem.py +1 -1
- desdeo/utopia_stuff/utopia_problem_old.py +1 -1
- {desdeo-2.0.0.dist-info → desdeo-2.1.0.dist-info}/METADATA +46 -28
- desdeo-2.1.0.dist-info/RECORD +180 -0
- {desdeo-2.0.0.dist-info → desdeo-2.1.0.dist-info}/WHEEL +1 -1
- desdeo-2.0.0.dist-info/RECORD +0 -120
- /desdeo/api/utils/{logger.py → _logger.py} +0 -0
- {desdeo-2.0.0.dist-info → desdeo-2.1.0.dist-info/licenses}/LICENSE +0 -0
desdeo/tools/score_bands.py
CHANGED
|
@@ -3,15 +3,22 @@
|
|
|
3
3
|
This module contains the functions which generate SCORE bands visualizations. It also contains functions to calculate
|
|
4
4
|
the order and positions of the objective axes, as well as a heatmap of correlation matrix.
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
To run the SCORE bands visualization, use the `score_json` function to generate the data for the visualization, and then
|
|
7
|
+
use the `plot_score` function to generate the figure. You can also pass the result of `score_json` to other frontends
|
|
8
|
+
for visualization.
|
|
8
9
|
"""
|
|
9
10
|
|
|
11
|
+
from copy import deepcopy
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Literal
|
|
14
|
+
from warnings import warn
|
|
15
|
+
|
|
10
16
|
import numpy as np
|
|
11
|
-
import pandas as pd
|
|
12
17
|
import plotly.figure_factory as ff
|
|
13
18
|
import plotly.graph_objects as go
|
|
19
|
+
import polars as pl
|
|
14
20
|
from matplotlib import cm
|
|
21
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
15
22
|
from scipy.stats import pearsonr
|
|
16
23
|
from sklearn.cluster import DBSCAN
|
|
17
24
|
from sklearn.metrics import silhouette_score
|
|
@@ -20,32 +27,179 @@ from sklearn.preprocessing import StandardScaler
|
|
|
20
27
|
from tsp_solver.greedy import solve_tsp
|
|
21
28
|
|
|
22
29
|
|
|
23
|
-
|
|
24
|
-
|
|
30
|
+
class GMMOptions(BaseModel):
|
|
31
|
+
"""Options for Gaussian Mixture Model clustering algorithm."""
|
|
32
|
+
|
|
33
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
34
|
+
|
|
35
|
+
name: str = Field(default="GMM")
|
|
36
|
+
"""Gaussian Mixture Model clustering algorithm."""
|
|
37
|
+
scoring_method: Literal["BIC", "silhouette"] = Field(default="silhouette")
|
|
38
|
+
"""Scoring method to use for GMM. Either "BIC" or "silhouette". Defaults to "silhouette".
|
|
39
|
+
This option determines how the number of clusters is chosen."""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DBSCANOptions(BaseModel):
|
|
43
|
+
"""Options for DBSCAN clustering algorithm."""
|
|
44
|
+
|
|
45
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
46
|
+
|
|
47
|
+
name: str = Field(default="DBSCAN")
|
|
48
|
+
"""DBSCAN clustering algorithm."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class KMeansOptions(BaseModel):
|
|
52
|
+
"""Options for KMeans clustering algorithm."""
|
|
53
|
+
|
|
54
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
55
|
+
|
|
56
|
+
name: str = Field(default="KMeans")
|
|
57
|
+
"""KMeans clustering algorithm."""
|
|
58
|
+
n_clusters: int = Field(default=5)
|
|
59
|
+
"""Number of clusters to use. Defaults to 5."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DimensionClusterOptions(BaseModel):
|
|
63
|
+
"""Options for clustering by one of the objectives/decision variables."""
|
|
64
|
+
|
|
65
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
66
|
+
|
|
67
|
+
name: str = Field(default="DimensionCluster")
|
|
68
|
+
"""Clustering by one of the dimensions."""
|
|
69
|
+
dimension_name: str
|
|
70
|
+
"""Dimension to use for clustering."""
|
|
71
|
+
n_clusters: int = Field(default=5)
|
|
72
|
+
"""Number of clusters to use. Defaults to 5."""
|
|
73
|
+
kind: Literal["EqualWidth", "EqualFrequency"] = Field(default="EqualWidth")
|
|
74
|
+
"""Kind of clustering to use. Either "EqualWidth", which divides the dimension range into equal width intervals,
|
|
75
|
+
or "EqualFrequency", which divides the dimension values into intervals with equal number of solutions.
|
|
76
|
+
Defaults to "EqualWidth"."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class CustomClusterOptions(BaseModel):
|
|
80
|
+
"""Options for custom clustering provided by the user."""
|
|
81
|
+
|
|
82
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
83
|
+
|
|
84
|
+
name: str = Field(default="Custom")
|
|
85
|
+
"""Custom user-provided clusters."""
|
|
86
|
+
clusters: list[int]
|
|
87
|
+
"""List of cluster IDs (one for each solution) indicating the cluster to which each solution belongs."""
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
ClusteringOptions = GMMOptions | DBSCANOptions | KMeansOptions | DimensionClusterOptions | CustomClusterOptions
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class DistanceFormula(int, Enum):
|
|
94
|
+
"""Distance formulas supported by SCORE bands. See the paper for details."""
|
|
95
|
+
|
|
96
|
+
FORMULA_1 = 1
|
|
97
|
+
FORMULA_2 = 2
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class SCOREBandsConfig(BaseModel):
|
|
101
|
+
"""Configuration options for SCORE bands visualization."""
|
|
102
|
+
|
|
103
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
104
|
+
|
|
105
|
+
dimensions: list[str] | None = Field(default=None)
|
|
106
|
+
"""List of variable/objective names (i.e., column names in the data) to include in the visualization.
|
|
107
|
+
If None, all columns in the data are used. Defaults to None."""
|
|
108
|
+
descriptive_names: dict[str, str] | None = Field(default=None)
|
|
109
|
+
"""Optional dictionary mapping dimensions to descriptive names for display in the visualization.
|
|
110
|
+
If None, the original dimension names are used. Defaults to None."""
|
|
111
|
+
units: dict[str, str] | None = Field(default=None)
|
|
112
|
+
"""Optional dictionary mapping dimensions to their units for display in the visualization.
|
|
113
|
+
If None, no units are displayed. Defaults to None."""
|
|
114
|
+
axis_positions: dict[str, float] | None = Field(default=None)
|
|
115
|
+
"""Dictionary mapping objective names to their positions on the axes in the SCORE bands visualization. The first
|
|
116
|
+
objective is at position 0.0, and the last objective is at position 1.0. Use this option if you want to
|
|
117
|
+
manually set the axis positions. If None, the axis positions are calculated automatically based on correlations.
|
|
118
|
+
Defaults to None."""
|
|
119
|
+
clustering_algorithm: ClusteringOptions = Field(
|
|
120
|
+
default=DBSCANOptions(),
|
|
121
|
+
)
|
|
122
|
+
"""
|
|
123
|
+
Clustering algorithm to use. Currently supported options: "GMM", "DBSCAN",
|
|
124
|
+
and "KMeans". Defaults to "DBSCAN".
|
|
125
|
+
"""
|
|
126
|
+
distance_formula: DistanceFormula = Field(default=DistanceFormula.FORMULA_1)
|
|
127
|
+
"""Distance formula to use. The value should be 1 or 2. Check the paper for details. Defaults to 1."""
|
|
128
|
+
distance_parameter: float = Field(default=0.05)
|
|
129
|
+
"""Change the relative distances between the objective axes. Increase this value if objectives are placed too close
|
|
130
|
+
together. Decrease this value if the objectives are equidistant in a problem with objective clusters. Defaults
|
|
131
|
+
to 0.05."""
|
|
132
|
+
use_absolute_correlations: bool = Field(default=False)
|
|
133
|
+
"""Whether to use absolute value of the correlation to calculate the placement of axes. Defaults to False."""
|
|
134
|
+
include_solutions: bool = Field(default=False)
|
|
135
|
+
"""Whether to include individual solutions. Defaults to False. If True, the size of the resulting figure may be
|
|
136
|
+
very large for datasets with many solutions. Moreover, the individual traces are hidden by default, but can be
|
|
137
|
+
viewed interactively in the figure."""
|
|
138
|
+
include_medians: bool = Field(default=False)
|
|
139
|
+
"""Whether to include cluster medians. Defaults to False. If True, the median traces are hidden by default, but
|
|
140
|
+
can be viewed interactively in the figure."""
|
|
141
|
+
interval_size: float = Field(default=0.95)
|
|
142
|
+
"""The size (as a fraction) of the interval to use for the bands. Defaults to 0.95, meaning that 95% of the
|
|
143
|
+
middle solutions in a cluster will be included in the band. The rest will be considered outliers."""
|
|
144
|
+
scales: dict[str, tuple[float, float]] | None = Field(default=None)
|
|
145
|
+
"""Optional dictionary specifying the min and max values for each objective. The keys should be the
|
|
146
|
+
objective names (i.e., column names in the data), and the values should be tuples of (min, max).
|
|
147
|
+
If not provided, the min and max will be calculated from the data."""
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class SCOREBandsResult(BaseModel):
|
|
151
|
+
"""Pydantic/JSON model for representing SCORE Bands."""
|
|
152
|
+
|
|
153
|
+
model_config = ConfigDict(use_attribute_docstrings=True)
|
|
154
|
+
|
|
155
|
+
options: SCOREBandsConfig
|
|
156
|
+
"""Configuration options used to generate the SCORE bands."""
|
|
157
|
+
ordered_dimensions: list[str]
|
|
158
|
+
"""List of variable/objective names (i.e., column names in the data).
|
|
159
|
+
Ordered according to their placement in the SCORE bands visualization."""
|
|
160
|
+
clusters: list[int]
|
|
161
|
+
"""List of cluster IDs (one for each solution) indicating the cluster to which each solution belongs."""
|
|
162
|
+
axis_positions: dict[str, float]
|
|
163
|
+
"""Dictionary mapping objective names to their positions on the axes in the SCORE bands visualization. The first
|
|
164
|
+
objective is at position 0.0, and the last objective is at position 1.0."""
|
|
165
|
+
bands: dict[int, dict[str, tuple[float, float]]]
|
|
166
|
+
"""Dictionary mapping cluster IDs to dictionaries of objective names and their corresponding band
|
|
167
|
+
extremes (min, max)."""
|
|
168
|
+
medians: dict[int, dict[str, float]]
|
|
169
|
+
"""Dictionary mapping cluster IDs to dictionaries of objective names and their corresponding median values."""
|
|
170
|
+
cardinalities: dict[int, int]
|
|
171
|
+
"""Dictionary mapping cluster IDs to the number of solutions in each cluster."""
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _gaussianmixtureclusteringwithBIC(data: pl.DataFrame) -> np.ndarray:
|
|
175
|
+
"""Cluster the data using Gaussian Mixture Model with BIC scoring."""
|
|
176
|
+
data_copy = data.to_numpy()
|
|
177
|
+
data_copy = StandardScaler().fit_transform(data_copy)
|
|
25
178
|
lowest_bic = np.inf
|
|
26
179
|
bic = []
|
|
27
|
-
n_components_range = range(1, min(11, len(
|
|
28
|
-
cv_types = ["spherical", "tied", "diag", "full"]
|
|
180
|
+
n_components_range = range(1, min(11, len(data_copy)))
|
|
181
|
+
cv_types: list[Literal["full", "tied", "diag", "spherical"]] = ["spherical", "tied", "diag", "full"]
|
|
29
182
|
for cv_type in cv_types:
|
|
30
183
|
for n_components in n_components_range:
|
|
31
184
|
# Fit a Gaussian mixture with EM
|
|
32
185
|
gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
|
|
33
|
-
gmm.fit(
|
|
34
|
-
bic.append(gmm.score(
|
|
186
|
+
gmm.fit(data_copy)
|
|
187
|
+
bic.append(gmm.score(data_copy))
|
|
35
188
|
# bic.append(gmm.bic(data))
|
|
36
189
|
if bic[-1] < lowest_bic:
|
|
37
190
|
lowest_bic = bic[-1]
|
|
38
191
|
best_gmm = gmm
|
|
39
192
|
|
|
40
|
-
return best_gmm.predict(
|
|
193
|
+
return best_gmm.predict(data_copy)
|
|
41
194
|
|
|
42
195
|
|
|
43
|
-
def _gaussianmixtureclusteringwithsilhouette(data:
|
|
44
|
-
|
|
196
|
+
def _gaussianmixtureclusteringwithsilhouette(data: pl.DataFrame) -> np.ndarray:
|
|
197
|
+
"""Cluster the data using Gaussian Mixture Model with silhouette scoring."""
|
|
198
|
+
X = StandardScaler().fit_transform(data.to_numpy())
|
|
45
199
|
best_score = -np.inf
|
|
46
|
-
best_labels =
|
|
200
|
+
best_labels = np.ones(len(data))
|
|
47
201
|
n_components_range = range(1, min(11, len(data)))
|
|
48
|
-
cv_types = ["spherical", "tied", "diag", "full"]
|
|
202
|
+
cv_types: list[Literal["full", "tied", "diag", "spherical"]] = ["spherical", "tied", "diag", "full"]
|
|
49
203
|
for cv_type in cv_types:
|
|
50
204
|
for n_components in n_components_range:
|
|
51
205
|
# Fit a Gaussian mixture with EM
|
|
@@ -62,11 +216,12 @@ def _gaussianmixtureclusteringwithsilhouette(data: pd.DataFrame):
|
|
|
62
216
|
return best_labels
|
|
63
217
|
|
|
64
218
|
|
|
65
|
-
def _DBSCANClustering(data:
|
|
66
|
-
|
|
219
|
+
def _DBSCANClustering(data: pl.DataFrame) -> np.ndarray:
|
|
220
|
+
"""Cluster the data using DBSCAN with silhouette scoring to choose eps."""
|
|
221
|
+
X = StandardScaler().fit_transform(data.to_numpy())
|
|
67
222
|
eps_options = np.linspace(0.01, 1, 20)
|
|
68
223
|
best_score = -np.inf
|
|
69
|
-
best_labels =
|
|
224
|
+
best_labels = np.ones(len(data))
|
|
70
225
|
for eps_option in eps_options:
|
|
71
226
|
db = DBSCAN(eps=eps_option, min_samples=10, metric="cosine").fit(X)
|
|
72
227
|
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
|
|
@@ -83,250 +238,54 @@ def _DBSCANClustering(data: pd.DataFrame):
|
|
|
83
238
|
return best_labels
|
|
84
239
|
|
|
85
240
|
|
|
86
|
-
def
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
raise ValueError()
|
|
91
|
-
if algorithm == "DBSCAN":
|
|
92
|
-
return _DBSCANClustering(data)
|
|
93
|
-
if score == "silhoutte":
|
|
94
|
-
return _gaussianmixtureclusteringwithsilhouette(data)
|
|
95
|
-
else:
|
|
96
|
-
return _gaussianmixtureclusteringwithBIC(data)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def SCORE_bands(
|
|
100
|
-
data: pd.DataFrame,
|
|
101
|
-
axis_signs: np.ndarray = None,
|
|
102
|
-
color_groups: list | np.ndarray = None,
|
|
103
|
-
axis_positions: np.ndarray = None,
|
|
104
|
-
solutions: bool = True,
|
|
105
|
-
bands: bool = False,
|
|
106
|
-
medians: bool = False,
|
|
107
|
-
quantile: float = 0.25,
|
|
108
|
-
) -> go.Figure:
|
|
109
|
-
"""Generate SCORE bands figure from the provided data.
|
|
110
|
-
|
|
111
|
-
Args:
|
|
112
|
-
data (pd.DataFrame): Pandas dataframe where each column represents an objective and each row is an objective
|
|
113
|
-
vector. The column names are displayed as the objective names in the generated figure. Each element in the
|
|
114
|
-
dataframe must be numeric.
|
|
115
|
-
|
|
116
|
-
color_groups (Union[List, np.ndarray], optional): List or numpy array of the same length as the number of
|
|
117
|
-
objective vectors. The elements should be contiguous set of integers starting at 1. The element value represents
|
|
118
|
-
the Cluster ID of the corresponding objective vector. Defaults to None (though this behaviour is not fully
|
|
119
|
-
tested yet).
|
|
120
|
-
|
|
121
|
-
axis_positions (np.ndarray, optional): 1-D numpy array of the same length as the number of objectives. The value
|
|
122
|
-
represents the horizontal position of the corresponding objective axes. The value of the first and last element
|
|
123
|
-
should be 0 and 1 respectively, and all intermediate values should lie between 0 and 1.
|
|
124
|
-
Defaults to None, in which case all axes are positioned equidistant.
|
|
125
|
-
|
|
126
|
-
axis_signs (np.ndarray, optional): 1-D Numpy array of the same length as the number of objectives. Each element
|
|
127
|
-
can either be 1 or -1. A value of -1 flips the objective in the SCORE bands visualization. This feature is
|
|
128
|
-
experimental and should be ignored for now. Defaults to None.
|
|
129
|
-
|
|
130
|
-
solutions (bool, optional): Show or hide individual solutions. Defaults to True.
|
|
131
|
-
|
|
132
|
-
bands (bool, optional): Show or hide cluster bands. Defaults to False.
|
|
133
|
-
|
|
134
|
-
medians (bool, optional): Show or hide cluster medians. Defaults to False.
|
|
135
|
-
|
|
136
|
-
quantile (float, optional): The quantile value to calculate the band. The band represents the range between
|
|
137
|
-
(quantile) and (1 - quantile) quantiles of the objective values. Defaults to 0.25.
|
|
138
|
-
|
|
139
|
-
Returns:
|
|
140
|
-
go.Figure: SCORE bands plot.
|
|
141
|
-
|
|
142
|
-
"""
|
|
143
|
-
# show on render
|
|
144
|
-
show_solutions = "legendonly"
|
|
145
|
-
bands_visible = True
|
|
146
|
-
if bands:
|
|
147
|
-
show_medians = "legendonly"
|
|
148
|
-
if medians:
|
|
149
|
-
show_medians = True
|
|
150
|
-
# pio.templates.default = "simple_white"
|
|
151
|
-
column_names = data.columns
|
|
152
|
-
num_columns = len(column_names)
|
|
153
|
-
if axis_positions is None:
|
|
154
|
-
axis_positions = np.linspace(0, 1, num_columns)
|
|
155
|
-
if axis_signs is None:
|
|
156
|
-
axis_signs = np.ones_like(axis_positions)
|
|
157
|
-
if color_groups is None:
|
|
158
|
-
color_groups = "continuous"
|
|
159
|
-
colorscale = cm.get_cmap("viridis")
|
|
160
|
-
elif isinstance(color_groups, (np.ndarray, list)):
|
|
161
|
-
groups = list(np.unique(color_groups))
|
|
162
|
-
if len(groups) <= 8:
|
|
163
|
-
colorscale = cm.get_cmap("Accent", len(groups))
|
|
164
|
-
# print(len(groups))
|
|
165
|
-
# print("hi!")
|
|
166
|
-
else:
|
|
167
|
-
colorscale = cm.get_cmap("tab20", len(groups))
|
|
168
|
-
# colorscale = cm.get_cmap("viridis_r", len(groups))
|
|
169
|
-
data = data * axis_signs
|
|
170
|
-
num_labels = 6
|
|
171
|
-
|
|
172
|
-
# Scaling the objective values between 0 and 1.
|
|
173
|
-
scaled_data = data - data.min(axis=0)
|
|
174
|
-
scaled_data = scaled_data / scaled_data.max(axis=0)
|
|
175
|
-
scales = pd.DataFrame([data.min(axis=0), data.max(axis=0)], index=["min", "max"]) * axis_signs
|
|
241
|
+
def cluster_by_dimension(data: pl.DataFrame, options: DimensionClusterOptions) -> np.ndarray:
|
|
242
|
+
"""Cluster the data by a specific dimension."""
|
|
243
|
+
if options.dimension_name not in data.columns:
|
|
244
|
+
raise ValueError(f"Objective '{options.dimension_name}' not found in data.")
|
|
176
245
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)
|
|
180
|
-
fig.update_layout(plot_bgcolor="rgba(0,0,0,0)")
|
|
181
|
-
|
|
182
|
-
scaled_data.insert(0, "group", value=color_groups)
|
|
183
|
-
for cluster_id, solns in scaled_data.groupby("group"):
|
|
184
|
-
# TODO: Many things here are very inefficient. Improve when free.
|
|
185
|
-
num_solns = len(solns)
|
|
186
|
-
|
|
187
|
-
r, g, b, a = colorscale(cluster_id - 1) # Needed as cluster numbering starts at 1
|
|
188
|
-
a = 0.6
|
|
189
|
-
a_soln = 0.6
|
|
190
|
-
color_bands = f"rgba({r}, {g}, {b}, {a})"
|
|
191
|
-
color_soln = f"rgba({r}, {g}, {b}, {a_soln})"
|
|
192
|
-
|
|
193
|
-
low = solns.drop("group", axis=1).quantile(quantile)
|
|
194
|
-
high = solns.drop("group", axis=1).quantile(1 - quantile)
|
|
195
|
-
median = solns.drop("group", axis=1).median()
|
|
196
|
-
|
|
197
|
-
if bands is True:
|
|
198
|
-
# lower bound of the band
|
|
199
|
-
fig.add_scatter(
|
|
200
|
-
x=axis_positions,
|
|
201
|
-
y=low,
|
|
202
|
-
line={"color": color_bands},
|
|
203
|
-
name=f"{int(100 - 200 * quantile)}% band: Cluster {cluster_id}; {num_solns} Solutions ",
|
|
204
|
-
mode="lines",
|
|
205
|
-
legendgroup=f"{int(100 - 200 * quantile)}% band: Cluster {cluster_id}",
|
|
206
|
-
showlegend=True,
|
|
207
|
-
line_shape="spline",
|
|
208
|
-
hovertext=f"Cluster {cluster_id}",
|
|
209
|
-
visible=bands_visible,
|
|
210
|
-
)
|
|
211
|
-
# upper bound of the band
|
|
212
|
-
fig.add_scatter(
|
|
213
|
-
x=axis_positions,
|
|
214
|
-
y=high,
|
|
215
|
-
line={"color": color_bands},
|
|
216
|
-
name=f"Cluster {cluster_id}",
|
|
217
|
-
fillcolor=color_bands,
|
|
218
|
-
mode="lines",
|
|
219
|
-
legendgroup=f"{int(100 - 200 * quantile)}% band: Cluster {cluster_id}",
|
|
220
|
-
showlegend=False,
|
|
221
|
-
line_shape="spline",
|
|
222
|
-
fill="tonexty",
|
|
223
|
-
hovertext=f"Cluster {cluster_id}",
|
|
224
|
-
visible=bands_visible,
|
|
225
|
-
)
|
|
226
|
-
if medians is True:
|
|
227
|
-
# median
|
|
228
|
-
fig.add_scatter(
|
|
229
|
-
x=axis_positions,
|
|
230
|
-
y=median,
|
|
231
|
-
line={"color": color_bands},
|
|
232
|
-
name=f"Median: Cluster {cluster_id}",
|
|
233
|
-
mode="lines+markers",
|
|
234
|
-
marker={"line": {"color": "Black", "width": 2}},
|
|
235
|
-
legendgroup=f"Median: Cluster {cluster_id}",
|
|
236
|
-
showlegend=True,
|
|
237
|
-
visible=show_medians,
|
|
238
|
-
)
|
|
239
|
-
if solutions is True:
|
|
240
|
-
# individual solutions
|
|
241
|
-
legend = True
|
|
242
|
-
for _, soln in solns.drop("group", axis=1).iterrows():
|
|
243
|
-
fig.add_scatter(
|
|
244
|
-
x=axis_positions,
|
|
245
|
-
y=soln,
|
|
246
|
-
line={"color": color_soln},
|
|
247
|
-
name=f"Solutions: Cluster {cluster_id} ",
|
|
248
|
-
legendgroup=f"Solutions: Cluster {cluster_id}",
|
|
249
|
-
showlegend=legend,
|
|
250
|
-
visible=show_solutions,
|
|
251
|
-
)
|
|
252
|
-
legend = False
|
|
253
|
-
# Axis lines
|
|
254
|
-
for i, col_name in enumerate(column_names):
|
|
255
|
-
# better = "Upper" if axis_signs[i] == -1 else "Lower"
|
|
256
|
-
label_text = np.linspace(scales[col_name]["min"], scales[col_name]["max"], num_labels)
|
|
257
|
-
# label_text = ["{:.3g}".format(i) for i in label_text]
|
|
258
|
-
heights = np.linspace(0, 1, num_labels)
|
|
259
|
-
scale_factors = []
|
|
260
|
-
for current_label in label_text:
|
|
261
|
-
try:
|
|
262
|
-
with np.errstate(divide="ignore"):
|
|
263
|
-
scale_factors.append(int(np.floor(np.log10(np.abs(current_label)))))
|
|
264
|
-
except OverflowError:
|
|
265
|
-
pass
|
|
266
|
-
|
|
267
|
-
scale_factor = int(np.median(scale_factors))
|
|
268
|
-
if scale_factor == -1 or scale_factor == 1:
|
|
269
|
-
scale_factor = 0
|
|
270
|
-
|
|
271
|
-
# TODO: This sometimes doesn't generate the correct label text. Check with datasets where objs lie between (0,1).
|
|
272
|
-
label_text = label_text / 10 ** (scale_factor)
|
|
273
|
-
label_text = ["{:.1f}".format(i) for i in label_text]
|
|
274
|
-
scale_factor_text = f"e{scale_factor}" if scale_factor != 0 else ""
|
|
275
|
-
|
|
276
|
-
# Bottom axis label
|
|
277
|
-
fig.add_scatter(
|
|
278
|
-
x=[axis_positions[i]],
|
|
279
|
-
y=[heights[0]],
|
|
280
|
-
text=[label_text[0] + scale_factor_text],
|
|
281
|
-
textposition="bottom center",
|
|
282
|
-
mode="text",
|
|
283
|
-
line={"color": "black"},
|
|
284
|
-
showlegend=False,
|
|
285
|
-
)
|
|
286
|
-
# Top axis label
|
|
287
|
-
fig.add_scatter(
|
|
288
|
-
x=[axis_positions[i]],
|
|
289
|
-
y=[heights[-1]],
|
|
290
|
-
text=[label_text[-1] + scale_factor_text],
|
|
291
|
-
textposition="top center",
|
|
292
|
-
mode="text",
|
|
293
|
-
line={"color": "black"},
|
|
294
|
-
showlegend=False,
|
|
295
|
-
)
|
|
296
|
-
label_text[0] = ""
|
|
297
|
-
label_text[-1] = ""
|
|
298
|
-
# Intermediate axes labels
|
|
299
|
-
fig.add_scatter(
|
|
300
|
-
x=[axis_positions[i]] * num_labels,
|
|
301
|
-
y=heights,
|
|
302
|
-
text=label_text,
|
|
303
|
-
textposition="middle left",
|
|
304
|
-
mode="markers+lines+text",
|
|
305
|
-
line={"color": "black"},
|
|
306
|
-
showlegend=False,
|
|
307
|
-
)
|
|
246
|
+
# Select the dimension column for clustering
|
|
247
|
+
dimension = data[options.dimension_name]
|
|
308
248
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
249
|
+
# Perform clustering based on the specified method
|
|
250
|
+
if options.kind == "EqualWidth":
|
|
251
|
+
min_val: float = dimension.min()
|
|
252
|
+
max_val: float = dimension.max()
|
|
253
|
+
SMALL_VALUE = 1e-8
|
|
254
|
+
thresholds = np.linspace(
|
|
255
|
+
min_val * (1 - SMALL_VALUE), # Ensure the minimum value is included in the first cluster
|
|
256
|
+
max_val * (1 + SMALL_VALUE), # Ensure the maximum value is included in the last cluster
|
|
257
|
+
options.n_clusters + 1,
|
|
316
258
|
)
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
)
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
259
|
+
return np.digitize(dimension.to_numpy(), thresholds) # Cluster IDs start at 1
|
|
260
|
+
elif options.kind == "EqualFrequency":
|
|
261
|
+
levels: list[float] = [dimension.quantile(i / options.n_clusters) for i in range(1, options.n_clusters)]
|
|
262
|
+
thresholds = [-np.inf] + levels + [np.inf]
|
|
263
|
+
return np.digitize(dimension.to_numpy(), thresholds) # Cluster IDs start at 1
|
|
264
|
+
raise ValueError(f"Unknown clustering kind: {options.kind}")
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def cluster(data: pl.DataFrame, options: ClusteringOptions) -> np.ndarray:
|
|
268
|
+
"""Cluster the data using the specified clustering algorithm and options."""
|
|
269
|
+
if isinstance(options, DimensionClusterOptions):
|
|
270
|
+
return cluster_by_dimension(data, options)
|
|
271
|
+
if isinstance(options, KMeansOptions):
|
|
272
|
+
from sklearn.cluster import KMeans
|
|
273
|
+
|
|
274
|
+
X = StandardScaler().fit_transform(data.to_numpy())
|
|
275
|
+
kmeans = KMeans(n_clusters=options.n_clusters, random_state=0).fit(X)
|
|
276
|
+
return kmeans.labels_
|
|
277
|
+
if isinstance(options, DBSCANOptions):
|
|
278
|
+
return _DBSCANClustering(data)
|
|
279
|
+
if isinstance(options, GMMOptions):
|
|
280
|
+
if options.scoring_method == "silhouette":
|
|
281
|
+
return _gaussianmixtureclusteringwithsilhouette(data)
|
|
282
|
+
if options.scoring_method == "BIC":
|
|
283
|
+
return _gaussianmixtureclusteringwithBIC(data)
|
|
284
|
+
if isinstance(options, CustomClusterOptions):
|
|
285
|
+
if len(options.clusters) != len(data):
|
|
286
|
+
raise ValueError("Length of custom clusters must match number of solutions in data.")
|
|
287
|
+
return np.array(options.clusters)
|
|
288
|
+
raise ValueError(f"Unknown clustering algorithm: {options}")
|
|
330
289
|
|
|
331
290
|
|
|
332
291
|
def annotated_heatmap(correlation_matrix: np.ndarray, col_names: list, order: list | np.ndarray) -> go.Figure:
|
|
@@ -340,7 +299,7 @@ def annotated_heatmap(correlation_matrix: np.ndarray, col_names: list, order: li
|
|
|
340
299
|
Returns:
|
|
341
300
|
go.Figure: The heatmap
|
|
342
301
|
""" # noqa: D212, D213, D406, D407
|
|
343
|
-
corr =
|
|
302
|
+
corr = pl.DataFrame(correlation_matrix, index=col_names, columns=col_names)
|
|
344
303
|
corr = corr[col_names[order]].loc[col_names[order[::-1]]]
|
|
345
304
|
corr = np.rint(corr * 100) / 100 # Take upto two significant figures only to make heatmap readable.
|
|
346
305
|
fig = ff.create_annotated_heatmap(
|
|
@@ -353,13 +312,13 @@ def annotated_heatmap(correlation_matrix: np.ndarray, col_names: list, order: li
|
|
|
353
312
|
return fig
|
|
354
313
|
|
|
355
314
|
|
|
356
|
-
def
|
|
315
|
+
def order_dimensions(data: pl.DataFrame, use_absolute_corr: bool = False):
|
|
357
316
|
"""Calculate the order of objectives.
|
|
358
317
|
|
|
359
318
|
Also returns the correlation matrix.
|
|
360
319
|
|
|
361
320
|
Args:
|
|
362
|
-
data (
|
|
321
|
+
data (pl.DataFrame): Data to be visualized.
|
|
363
322
|
use_absolute_corr (bool, optional): Use absolute value of the correlation to calculate order. Defaults to False.
|
|
364
323
|
|
|
365
324
|
Returns:
|
|
@@ -381,84 +340,288 @@ def order_objectives(data: pd.DataFrame, use_absolute_corr: bool = False):
|
|
|
381
340
|
return corr, obj_order
|
|
382
341
|
|
|
383
342
|
|
|
384
|
-
def calculate_axes_positions(
    dimension_order: list[int],
    corr: np.ndarray,
    dist_parameter: float,
    distance_formula: DistanceFormula = DistanceFormula.FORMULA_1,
) -> np.ndarray:
    """Compute axis positions for the SCORE bands visualization from pairwise correlations.

    Adjacent axes are spaced proportionally to how (un)correlated the corresponding
    dimensions are, so strongly related dimensions end up plotted close together.

    Args:
        dimension_order (list[int]): Order of the variables to be plotted.
        corr (np.ndarray): Correlation (pearson) matrix.
        dist_parameter (float): Change the relative distances between the axes. Increase this value if the axes are
            placed too close together. Decrease this value if the axes are equidistant.
        distance_formula (DistanceFormula, optional): The value should be 1 or 2. Check the paper for details.
            Defaults to DistanceFormula.FORMULA_1.

    Returns:
        np.ndarray: Positions of the axes in the range [0, 1].
    """
    # Consecutive axis pairs in plotting order; strict=True is always satisfied
    # here since both slices come from the same list.
    adjacent_pairs = np.asarray(list(zip(dimension_order[:-1], dimension_order[1:], strict=True)))
    pair_corr = corr[adjacent_pairs[:, 0], adjacent_pairs[:, 1]]

    if distance_formula == DistanceFormula.FORMULA_1:
        gaps = 1 - pair_corr
    elif distance_formula == DistanceFormula.FORMULA_2:
        gaps = 1 / (np.abs(pair_corr) + 1)  # Reciprocal for reverse
    else:
        # Should never reach here
        raise ValueError("distance_formula should be either 1 or 2 (int)")

    # Enforce a minimum separation between axes, then normalize so the
    # cumulative positions span exactly [0, 1].
    gaps = gaps + dist_parameter
    gaps = gaps / sum(gaps)
    return np.cumsum(np.append(0, gaps))
def score_json(
    data: pl.DataFrame,
    options: SCOREBandsConfig,
) -> SCOREBandsResult:
    """Generate the SCORE Bands data for a given dataset and configuration options.

    Args:
        data (pl.DataFrame): Dataframe of variable (decision or objective) values.
            The column names should be the names of the variables to be plotted. Each row should be a solution.

        options (SCOREBandsConfig): Configuration options for generating the SCORE bands.

    Returns:
        SCOREBandsResult: The result containing all relevant data for the SCORE bands visualization.
    """
    # Copy so defaults filled in below do not mutate the caller's options object.
    options = deepcopy(options)
    # Calculating correlations and axes positions
    if options.dimensions is None:
        # Default: plot every column of the input dataframe.
        options.dimensions = data.columns
    # Restrict to the selected dimensions only.
    data_copy = data.select([pl.col(col) for col in options.dimensions])

    if options.axis_positions is None:
        # Derive axis order and spacing from the correlation structure.
        corr, dimension_order = order_dimensions(data_copy, use_absolute_corr=options.use_absolute_correlations)

        axis_dist = calculate_axes_positions(
            dimension_order,
            corr,
            dist_parameter=options.distance_parameter,
            distance_formula=options.distance_formula,
        )

        ordered_dimension_names = [data_copy.columns[i] for i in dimension_order]
        axis_positions = {name: axis_dist[i] for i, name in enumerate(ordered_dimension_names)}
    else:
        # Caller supplied explicit positions; plotting order follows ascending position.
        axis_positions = options.axis_positions
        ordered_dimension_names = sorted(axis_positions.keys(), key=axis_positions.get)

    # Cluster the solutions with the configured algorithm.
    clusters = cluster(data_copy, options.clustering_algorithm)

    if min(clusters) <= 0:
        clusters = clusters - np.min(clusters) + 1  # translate minimum to 1.

    # some sanity check: check if all cluster IDs are contiguous integers starting at 1, ending at number of clusters
    unique_clusters = np.unique(clusters)
    max_cluster_id = max(clusters)
    if not all(i in unique_clusters for i in range(1, max_cluster_id + 1)):
        warn(
            """Cluster IDs are not contiguous integers starting at 1.
            This may cause issues with the color mapping in the visualization.""",
            category=UserWarning,
            stacklevel=2,
        )

    # Avoid clobbering a user column literally named "cluster".
    cluster_column_name = "cluster"
    if cluster_column_name in data_copy.columns:
        cluster_column_name = "cluster_id"

    data_copy = data_copy.with_columns(pl.Series(cluster_column_name, clusters))
    grouped = data_copy.group_by(cluster_column_name)
    # Symmetric quantile band around the median, e.g. interval_size=0.9 -> [5%, 95%].
    min_percentile = (1 - options.interval_size) / 2
    max_percentile = 1 - min_percentile
    mins = grouped.quantile(min_percentile)
    maxs = grouped.quantile(max_percentile)
    medians = grouped.median()
    frequencies = grouped.len()
    # Per-cluster (low, high) band edges, keyed cluster_id -> column -> tuple.
    # NOTE(review): each entry does a full filter scan per (cluster, column) pair;
    # fine for small data, quadratic-ish for many clusters/columns.
    bands_dict = {
        cluster_id: {
            col_name: (
                mins.filter(pl.col(cluster_column_name) == cluster_id)[col_name][0],
                maxs.filter(pl.col(cluster_column_name) == cluster_id)[col_name][0],
            )
            for col_name in ordered_dimension_names
        }
        for cluster_id in mins[cluster_column_name].to_list()
    }
    # Per-cluster median value per column.
    medians_dict = {
        cluster_id: {
            col_name: medians.filter(pl.col(cluster_column_name) == cluster_id)[col_name][0]
            for col_name in ordered_dimension_names
        }
        for cluster_id in medians[cluster_column_name].to_list()
    }
    # Number of solutions per cluster ("len" column produced by GroupBy.len()).
    frequencies_dict = {
        cluster_id: frequencies.filter(pl.col(cluster_column_name) == cluster_id)["len"][0]
        for cluster_id in frequencies[cluster_column_name].to_list()
    }

    if options.scales is None:
        # Default axis scales: observed min/max of each plotted dimension.
        scales: dict[str, tuple[float, float]] = {
            dimension: (data_copy[dimension].min(), data_copy[dimension].max()) for dimension in ordered_dimension_names
        }
        options.scales = scales
    return SCOREBandsResult(
        options=options,
        ordered_dimensions=ordered_dimension_names,
        clusters=clusters.tolist(),
        axis_positions=axis_positions,
        bands=bands_dict,
        medians=medians_dict,
        cardinalities=frequencies_dict,
    )
def plot_score(data: pl.DataFrame, result: SCOREBandsResult) -> go.Figure:
    """Generate the SCORE Bands figure from the SCOREBandsResult data.

    Args:
        data (pl.DataFrame): Dataframe of objective values. The column names should be the objective names. Each row
            should be an objective vector.
        result (SCOREBandsResult): The result containing all relevant data for the SCORE bands visualization.

    Returns:
        go.Figure: The SCORE bands plot.

    Raises:
        ValueError: If `result.options.scales` is not set.
    """
    column_names = result.ordered_dimensions

    clusters = np.sort(np.unique(result.clusters))

    # Qualitative colormap: "Accent" has 8 distinct colors, "tab20" covers more clusters.
    # NOTE(review): cm.get_cmap(name, lut) is deprecated and removed in matplotlib >= 3.9;
    # consider matplotlib.colormaps[name].resampled(n) — confirm the pinned matplotlib version.
    if len(clusters) <= 8:
        colorscale = cm.get_cmap("Accent", len(clusters))
    else:
        colorscale = cm.get_cmap("tab20", len(clusters))

    if result.options.scales is None:
        raise ValueError("Scales must be provided in the SCOREBandsResult to plot the figure.")

    fig = go.Figure()
    fig.update_xaxes(showticklabels=False, showgrid=False, zeroline=False)
    fig.update_yaxes(showticklabels=False, showgrid=False, zeroline=False)
    fig.update_layout(plot_bgcolor="rgba(0,0,0,0)")

    if result.options.descriptive_names is None:
        descriptive_names = {name: name for name in column_names}
    else:
        descriptive_names = result.options.descriptive_names
    if result.options.units is None:
        units = {name: "" for name in column_names}
    else:
        units = result.options.units

    num_ticks = 6
    # Add axes
    for col_name in column_names:
        tick_values = np.linspace(result.options.scales[col_name][0], result.options.scales[col_name][1], num_ticks)
        label_text = ["{:.5g}".format(v) for v in tick_values]
        heights = np.linspace(0, 1, num_ticks)
        # Axis lines
        fig.add_scatter(
            x=[result.axis_positions[col_name]] * num_ticks,
            y=heights,
            text=label_text,
            textposition="middle left",
            mode="markers+lines+text",
            line={"color": "black"},
            showlegend=False,
        )
        # Column Name
        fig.add_scatter(
            x=[result.axis_positions[col_name]],
            y=[1.20],
            text=f"{descriptive_names[col_name]}",
            textfont={"size": 20},
            mode="text",
            showlegend=False,
        )
        # Units
        fig.add_scatter(
            x=[result.axis_positions[col_name]],
            y=[1.10],
            text=f"{units[col_name]}",
            textfont={"size": 12},
            mode="text",
            showlegend=False,
        )
    # Add bands
    for cluster_id in sorted(result.bands.keys()):
        # Matplotlib colormaps yield RGBA floats in [0, 1]; CSS rgba() channels are
        # 0-255, so scale up (previously the raw floats were used, rendering ~black).
        r, g, b, _ = colorscale(cluster_id - 1)  # Needed as cluster numbering starts at 1
        alpha = 0.6
        color_bands = f"rgba({int(r * 255)}, {int(g * 255)}, {int(b * 255)}, {alpha})"

        # Normalize band edges to [0, 1] with the per-dimension scales.
        lows = [
            (result.bands[cluster_id][col_name][0] - result.options.scales[col_name][0])
            / (result.options.scales[col_name][1] - result.options.scales[col_name][0])
            for col_name in column_names
        ]
        highs = [
            (result.bands[cluster_id][col_name][1] - result.options.scales[col_name][0])
            / (result.options.scales[col_name][1] - result.options.scales[col_name][0])
            for col_name in column_names
        ]

        # lower bound of the band
        fig.add_scatter(
            x=[result.axis_positions[col_name] for col_name in column_names],
            y=lows,
            line={"color": color_bands},
            name=f"{int(100 * result.options.interval_size)}% band: Cluster {cluster_id}; "
            f"{result.cardinalities[cluster_id]} Solutions ",
            mode="lines",
            legendgroup=f"{int(100 * result.options.interval_size)}% band: Cluster {cluster_id}",
            showlegend=True,
            line_shape="spline",
            hovertext=f"Cluster {cluster_id}",
        )
        # upper bound of the band; fill="tonexty" shades down to the previous
        # (lower-bound) trace, so trace order here matters.
        fig.add_scatter(
            x=[result.axis_positions[col_name] for col_name in column_names],
            y=highs,
            line={"color": color_bands},
            name=f"Cluster {cluster_id}",
            fillcolor=color_bands,
            mode="lines",
            legendgroup=f"{int(100 * result.options.interval_size)}% band: Cluster {cluster_id}",
            showlegend=False,
            line_shape="spline",
            fill="tonexty",
            hovertext=f"Cluster {cluster_id}",
        )

        if result.options.include_medians:
            # Only normalize the medians when they are actually drawn.
            medians = [
                (result.medians[cluster_id][col_name] - result.options.scales[col_name][0])
                / (result.options.scales[col_name][1] - result.options.scales[col_name][0])
                for col_name in column_names
            ]
            # median
            fig.add_scatter(
                x=[result.axis_positions[col_name] for col_name in column_names],
                y=medians,
                line={"color": color_bands},
                name=f"Median: Cluster {cluster_id}",
                mode="lines+markers",
                marker={"line": {"color": "Black", "width": 2}},
                legendgroup=f"Median: Cluster {cluster_id}",
                showlegend=True,
            )
    fig.update_layout(font_size=18)
    fig.update_layout(legend={"orientation": "h", "yanchor": "top"})
    return fig