validmind 2.8.29__py3-none-any.whl → 2.10.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. validmind/__init__.py +16 -5
  2. validmind/__version__.py +1 -1
  3. validmind/ai/utils.py +4 -24
  4. validmind/api_client.py +6 -17
  5. validmind/datasets/credit_risk/lending_club.py +13 -1
  6. validmind/datasets/nlp/cnn_dailymail.py +15 -1
  7. validmind/logging.py +48 -0
  8. validmind/tests/__init__.py +2 -0
  9. validmind/tests/__types__.py +18 -0
  10. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +14 -2
  11. validmind/tests/data_validation/DickeyFullerGLS.py +13 -2
  12. validmind/tests/data_validation/PhillipsPerronArch.py +13 -2
  13. validmind/tests/data_validation/SeasonalDecompose.py +14 -2
  14. validmind/tests/data_validation/ShapiroWilk.py +14 -1
  15. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +14 -1
  16. validmind/tests/data_validation/WOEBinPlots.py +14 -1
  17. validmind/tests/data_validation/WOEBinTable.py +13 -2
  18. validmind/tests/data_validation/ZivotAndrewsArch.py +13 -2
  19. validmind/tests/data_validation/nlp/CommonWords.py +14 -2
  20. validmind/tests/data_validation/nlp/LanguageDetection.py +14 -1
  21. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +13 -1
  22. validmind/tests/data_validation/nlp/Sentiment.py +13 -1
  23. validmind/tests/data_validation/nlp/StopWords.py +14 -2
  24. validmind/tests/data_validation/nlp/TextDescription.py +14 -2
  25. validmind/tests/data_validation/nlp/Toxicity.py +13 -1
  26. validmind/tests/model_validation/BertScore.py +13 -2
  27. validmind/tests/model_validation/BleuScore.py +13 -2
  28. validmind/tests/model_validation/ContextualRecall.py +13 -1
  29. validmind/tests/model_validation/MeteorScore.py +13 -2
  30. validmind/tests/model_validation/ModelPredictionResiduals.py +14 -1
  31. validmind/tests/model_validation/RegardScore.py +13 -2
  32. validmind/tests/model_validation/RougeScore.py +14 -1
  33. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +14 -1
  34. validmind/tests/model_validation/ToxicityScore.py +13 -1
  35. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +14 -2
  36. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +13 -2
  37. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +14 -2
  38. validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +14 -1
  39. validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +14 -1
  40. validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +14 -1
  41. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +14 -1
  42. validmind/tests/output.py +9 -2
  43. validmind/tests/plots/BoxPlot.py +260 -0
  44. validmind/tests/plots/CorrelationHeatmap.py +235 -0
  45. validmind/tests/plots/HistogramPlot.py +233 -0
  46. validmind/tests/plots/ViolinPlot.py +125 -0
  47. validmind/tests/plots/__init__.py +0 -0
  48. validmind/tests/stats/CorrelationAnalysis.py +251 -0
  49. validmind/tests/stats/DescriptiveStats.py +197 -0
  50. validmind/tests/stats/NormalityTests.py +147 -0
  51. validmind/tests/stats/OutlierDetection.py +173 -0
  52. validmind/tests/stats/__init__.py +0 -0
  53. validmind/unit_metrics/classification/individual/AbsoluteError.py +42 -0
  54. validmind/unit_metrics/classification/individual/BrierScore.py +56 -0
  55. validmind/unit_metrics/classification/individual/CalibrationError.py +77 -0
  56. validmind/unit_metrics/classification/individual/ClassBalance.py +65 -0
  57. validmind/unit_metrics/classification/individual/Confidence.py +52 -0
  58. validmind/unit_metrics/classification/individual/Correctness.py +41 -0
  59. validmind/unit_metrics/classification/individual/LogLoss.py +61 -0
  60. validmind/unit_metrics/classification/individual/OutlierScore.py +86 -0
  61. validmind/unit_metrics/classification/individual/ProbabilityError.py +54 -0
  62. validmind/unit_metrics/classification/individual/Uncertainty.py +60 -0
  63. validmind/unit_metrics/classification/individual/__init__.py +0 -0
  64. validmind/vm_models/dataset/dataset.py +147 -1
  65. validmind/vm_models/result/result.py +30 -6
  66. validmind-2.10.0rc1.dist-info/METADATA +845 -0
  67. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/RECORD +70 -49
  68. validmind-2.8.29.dist-info/METADATA +0 -137
  69. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/LICENSE +0 -0
  70. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/WHEEL +0 -0
  71. {validmind-2.8.29.dist-info → validmind-2.10.0rc1.dist-info}/entry_points.txt +0 -0
validmind/tests/plots/HistogramPlot.py
@@ -0,0 +1,233 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from typing import List, Optional, Union
+
+ import numpy as np
+ import plotly.graph_objects as go
+ from plotly.subplots import make_subplots
+ from scipy import stats
+
+ from validmind import tags, tasks
+ from validmind.errors import SkipTestError
+ from validmind.vm_models import VMDataset
+
+
+ def _validate_columns(dataset: VMDataset, columns: Optional[List[str]]):
+     """Validate and return numerical columns."""
+     if columns is None:
+         columns = dataset.feature_columns_numeric
+     else:
+         available_columns = set(dataset.feature_columns_numeric)
+         columns = [col for col in columns if col in available_columns]
+
+     if not columns:
+         raise SkipTestError("No numerical columns found for histogram plotting")
+
+     return columns
+
+
+ def _process_column_data(data, log_scale: bool, column: str):
+     """Process column data and return plot data and xlabel."""
+     plot_data = data
+     xlabel = column
+     if log_scale and (data > 0).all():
+         plot_data = np.log10(data)
+         xlabel = f"log10({column})"
+     return plot_data, xlabel
+
+
+ def _add_histogram_trace(
+     fig, plot_data, bins, color, opacity, normalize, column, row, col
+ ):
+     """Add histogram trace to figure."""
+     histnorm = "probability density" if normalize else None
+
+     fig.add_trace(
+         go.Histogram(
+             x=plot_data,
+             nbinsx=bins if isinstance(bins, int) else None,
+             name=f"Histogram - {column}",
+             marker_color=color,
+             opacity=opacity,
+             histnorm=histnorm,
+             showlegend=False,
+         ),
+         row=row,
+         col=col,
+     )
+
+
+ def _add_kde_trace(fig, plot_data, bins, normalize, column, row, col):
+     """Add KDE trace to figure if possible."""
+     try:
+         kde = stats.gaussian_kde(plot_data)
+         x_range = np.linspace(plot_data.min(), plot_data.max(), 100)
+         kde_values = kde(x_range)
+
+         if not normalize:
+             hist_max = (
+                 len(plot_data) / bins if isinstance(bins, int) else len(plot_data) / 30
+             )
+             kde_values = kde_values * hist_max / kde_values.max()
+
+         fig.add_trace(
+             go.Scatter(
+                 x=x_range,
+                 y=kde_values,
+                 mode="lines",
+                 name=f"KDE - {column}",
+                 line=dict(color="red", width=2),
+                 showlegend=False,
+             ),
+             row=row,
+             col=col,
+         )
+     except Exception:
+         pass
+
+
+ def _add_stats_annotation(fig, data, idx, row, col):
+     """Add statistics annotation to subplot."""
+     stats_text = f"Mean: {data.mean():.3f}<br>Std: {data.std():.3f}<br>N: {len(data)}"
+     fig.add_annotation(
+         text=stats_text,
+         x=0.02,
+         y=0.98,
+         xref=f"x{idx+1} domain" if idx > 0 else "x domain",
+         yref=f"y{idx+1} domain" if idx > 0 else "y domain",
+         showarrow=False,
+         align="left",
+         bgcolor="rgba(255,255,255,0.8)",
+         bordercolor="black",
+         borderwidth=1,
+         row=row,
+         col=col,
+     )
+
+
+ @tags("tabular_data", "visualization", "data_quality")
+ @tasks("classification", "regression", "clustering")
+ def HistogramPlot(
+     dataset: VMDataset,
+     columns: Optional[List[str]] = None,
+     bins: Union[int, str, List] = 30,
+     color: str = "steelblue",
+     opacity: float = 0.7,
+     show_kde: bool = True,
+     normalize: bool = False,
+     log_scale: bool = False,
+     title_prefix: str = "Histogram of",
+     width: int = 1200,
+     height: int = 800,
+     n_cols: int = 2,
+     vertical_spacing: float = 0.15,
+     horizontal_spacing: float = 0.1,
+ ) -> go.Figure:
+     """
+     Generates customizable histogram plots for numerical features in a dataset using Plotly.
+
+     ### Purpose
+
+     This test provides a flexible way to visualize the distribution of numerical features in a dataset.
+     It allows for extensive customization of the histogram appearance and behavior through parameters,
+     making it suitable for various exploratory data analysis tasks.
+
+     ### Test Mechanism
+
+     The test creates histogram plots for specified numerical columns (or all numerical columns if none specified).
+     It supports various customization options including:
+     - Number of bins or bin edges
+     - Color and opacity
+     - Kernel density estimation overlay
+     - Logarithmic scaling
+     - Normalization options
+     - Configurable subplot layout (columns and spacing)
+
+     ### Signs of High Risk
+
+     - Highly skewed distributions that may indicate data quality issues
+     - Unexpected bimodal or multimodal distributions
+     - Presence of extreme outliers
+     - Empty or sparse distributions
+
+     ### Strengths
+
+     - Highly customizable visualization options
+     - Interactive Plotly plots with zoom, pan, and hover capabilities
+     - Supports both single and multiple column analysis
+     - Provides insights into data distribution patterns
+     - Can handle different data types and scales
+     - Configurable subplot layout for better visualization
+
+     ### Limitations
+
+     - Limited to numerical features only
+     - Visual interpretation may be subjective
+     - May not be suitable for high-dimensional datasets
+     - Performance may degrade with very large datasets
+     """
+     # Validate inputs
+     columns = _validate_columns(dataset, columns)
+
+     # Calculate subplot layout
+     n_cols = min(n_cols, len(columns))
+     n_rows = (len(columns) + n_cols - 1) // n_cols
+
+     # Create subplots
+     subplot_titles = [f"{title_prefix} {col}" for col in columns]
+     fig = make_subplots(
+         rows=n_rows,
+         cols=n_cols,
+         subplot_titles=subplot_titles,
+         vertical_spacing=vertical_spacing,
+         horizontal_spacing=horizontal_spacing,
+     )
+
+     for idx, column in enumerate(columns):
+         row = (idx // n_cols) + 1
+         col = (idx % n_cols) + 1
+         data = dataset.df[column].dropna()
+
+         if len(data) == 0:
+             fig.add_annotation(
+                 text=f"No data available<br>for {column}",
+                 x=0.5,
+                 y=0.5,
+                 xref=f"x{idx+1}" if idx > 0 else "x",
+                 yref=f"y{idx+1}" if idx > 0 else "y",
+                 showarrow=False,
+                 row=row,
+                 col=col,
+             )
+             continue
+
+         # Process data
+         plot_data, xlabel = _process_column_data(data, log_scale, column)
+
+         # Add histogram
+         _add_histogram_trace(
+             fig, plot_data, bins, color, opacity, normalize, column, row, col
+         )
+
+         # Add KDE if requested
+         if show_kde and len(data) > 1:
+             _add_kde_trace(fig, plot_data, bins, normalize, column, row, col)
+
+         # Update axes and add annotations
+         fig.update_xaxes(title_text=xlabel, row=row, col=col)
+         ylabel = "Density" if normalize else "Frequency"
+         fig.update_yaxes(title_text=ylabel, row=row, col=col)
+         _add_stats_annotation(fig, data, idx, row, col)
+
+     # Update layout
+     fig.update_layout(
+         title_text="Dataset Feature Distributions",
+         showlegend=False,
+         width=width,
+         height=height,
+         template="plotly_white",
+     )
+
+     return fig
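
The docstring above describes HistogramPlot as a parameterized, Plotly-based distribution test. A minimal usage sketch follows, assuming the test is registered under the ID implied by its module path (validmind/tests/plots/HistogramPlot.py, i.e. "validmind.plots.HistogramPlot") and using an illustrative DataFrame; the test ID, column names, and dataset setup are assumptions, not part of this diff.

# Illustrative sketch only: test ID, column names, and dataset setup are assumed.
import pandas as pd
import validmind as vm
from validmind.tests import run_test

df = pd.DataFrame(
    {
        "age": [23, 45, 31, 52, 29, 61],
        "income": [40_000, 85_000, 56_000, 91_000, 48_000, 77_000],
        "segment": ["A", "B", "A", "B", "A", "B"],
    }
)
vm_dataset = vm.init_dataset(dataset=df, feature_columns=["age", "income"])

result = run_test(
    "validmind.plots.HistogramPlot",  # assumed ID derived from the module path
    inputs={"dataset": vm_dataset},
    params={"bins": 20, "show_kde": True, "n_cols": 2},
)
result.show()
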
validmind/tests/plots/ViolinPlot.py
@@ -0,0 +1,125 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from typing import List, Optional
+
+ import plotly.express as px
+
+ from validmind import tags, tasks
+ from validmind.errors import SkipTestError
+ from validmind.vm_models import VMDataset
+
+
+ @tags("tabular_data", "visualization", "distribution")
+ @tasks("classification", "regression", "clustering")
+ def ViolinPlot(
+     dataset: VMDataset,
+     columns: Optional[List[str]] = None,
+     group_by: Optional[str] = None,
+     width: int = 800,
+     height: int = 600,
+ ) -> px.violin:
+     """
+     Generates interactive violin plots for numerical features using Plotly.
+
+     ### Purpose
+
+     This test creates violin plots to visualize the distribution of numerical features,
+     showing both the probability density and summary statistics. Violin plots combine
+     aspects of box plots and kernel density estimation for rich distribution visualization.
+
+     ### Test Mechanism
+
+     The test creates violin plots for specified numerical columns, with optional
+     grouping by categorical variables. Each violin shows the distribution shape,
+     quartiles, and median values.
+
+     ### Signs of High Risk
+
+     - Multimodal distributions that might indicate mixed populations
+     - Highly skewed distributions suggesting data quality issues
+     - Large differences in distribution shapes across groups
+     - Unusual distribution patterns that contradict domain expectations
+
+     ### Strengths
+
+     - Shows detailed distribution shape information
+     - Interactive Plotly visualization with hover details
+     - Effective for comparing distributions across groups
+     - Combines density estimation with quartile information
+
+     ### Limitations
+
+     - Limited to numerical features only
+     - Requires sufficient data points for meaningful density estimation
+     - May not be suitable for discrete variables
+     - Can be misleading with very small sample sizes
+     """
+     # Get numerical columns
+     if columns is None:
+         columns = dataset.feature_columns_numeric
+     else:
+         available_columns = set(dataset.feature_columns_numeric)
+         columns = [col for col in columns if col in available_columns]
+
+     if not columns:
+         raise SkipTestError("No numerical columns found for violin plot")
+
+     # For violin plots, we'll melt the data to long format
+     data = dataset.df[columns].dropna()
+
+     if len(data) == 0:
+         raise SkipTestError("No valid data available for violin plot")
+
+     # Melt the dataframe to long format
+     melted_data = data.melt(var_name="Feature", value_name="Value")
+
+     # Add group column if specified
+     if group_by and group_by in dataset.df.columns:
+         # Repeat group values for each feature
+         group_values = []
+         for column in columns:
+             column_data = dataset.df[[column, group_by]].dropna()
+             group_values.extend(column_data[group_by].tolist())
+
+         if len(group_values) == len(melted_data):
+             melted_data["Group"] = group_values
+         else:
+             group_by = None  # Disable grouping if lengths don't match
+
+     # Create violin plot
+     if group_by and "Group" in melted_data.columns:
+         fig = px.violin(
+             melted_data,
+             x="Feature",
+             y="Value",
+             color="Group",
+             box=True,
+             title=f"Distribution of Features by {group_by}",
+             width=width,
+             height=height,
+         )
+     else:
+         fig = px.violin(
+             melted_data,
+             x="Feature",
+             y="Value",
+             box=True,
+             title="Feature Distributions",
+             width=width,
+             height=height,
+         )
+
+     # Update layout
+     fig.update_layout(
+         template="plotly_white",
+         title_x=0.5,
+         xaxis_title="Features",
+         yaxis_title="Values",
+     )
+
+     # Rotate x-axis labels for better readability
+     fig.update_xaxes(tickangle=45)
+
+     return fig
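
ViolinPlot melts the selected columns to long format and, when the group_by values line up with the melted rows, colors each violin by group. Continuing the sketch above under the same assumptions (inferred test ID, illustrative vm_dataset), a grouped invocation might look like this; the "segment" column is hypothetical.

# Illustrative sketch only, reusing vm_dataset from the previous sketch;
# "segment" is a hypothetical categorical column used for grouping.
result = run_test(
    "validmind.plots.ViolinPlot",  # assumed ID derived from the module path
    inputs={"dataset": vm_dataset},
    params={"columns": ["age", "income"], "group_by": "segment"},
)
result.show()
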
validmind/tests/plots/__init__.py
File without changes
validmind/tests/stats/CorrelationAnalysis.py
@@ -0,0 +1,251 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from typing import Any, Dict, List, Optional
+
+ import numpy as np
+ import pandas as pd
+ from scipy import stats
+
+ from validmind import tags, tasks
+ from validmind.errors import SkipTestError
+ from validmind.utils import format_records
+ from validmind.vm_models import VMDataset
+
+
+ def _validate_and_prepare_data(dataset: VMDataset, columns: Optional[List[str]]):
+     """Validate inputs and prepare data for correlation analysis."""
+     if columns is None:
+         columns = dataset.feature_columns_numeric
+     else:
+         available_columns = set(dataset.feature_columns_numeric)
+         columns = [col for col in columns if col in available_columns]
+
+     if not columns:
+         raise SkipTestError("No numerical columns found for correlation analysis")
+
+     if len(columns) < 2:
+         raise SkipTestError(
+             "At least 2 numerical columns required for correlation analysis"
+         )
+
+     # Get data and remove constant columns
+     data = dataset.df[columns].dropna()
+     data = data.loc[:, data.var() != 0]
+
+     if data.shape[1] < 2:
+         raise SkipTestError(
+             "Insufficient non-constant columns for correlation analysis"
+         )
+
+     return data
+
+
+ def _compute_correlation_matrices(data, method: str):
+     """Compute correlation and p-value matrices based on method."""
+     if method == "pearson":
+         return _compute_pearson_with_pvalues(data)
+     elif method == "spearman":
+         return _compute_spearman_with_pvalues(data)
+     elif method == "kendall":
+         return _compute_kendall_with_pvalues(data)
+     else:
+         raise ValueError(f"Unsupported correlation method: {method}")
+
+
+ def _create_correlation_pairs(
+     corr_matrix, p_matrix, significance_level: float, min_correlation: float
+ ):
+     """Create correlation pairs table."""
+     correlation_pairs = []
+
+     for i, col1 in enumerate(corr_matrix.columns):
+         for j, col2 in enumerate(corr_matrix.columns):
+             if i < j:  # Only upper triangle to avoid duplicates
+                 corr_val = corr_matrix.iloc[i, j]
+                 p_val = p_matrix.iloc[i, j]
+
+                 if abs(corr_val) >= min_correlation:
+                     pair_info = {
+                         "Feature 1": col1,
+                         "Feature 2": col2,
+                         "Correlation": corr_val,
+                         "Abs Correlation": abs(corr_val),
+                         "p-value": p_val,
+                         "Significant": "Yes" if p_val < significance_level else "No",
+                         "Strength": _correlation_strength(abs(corr_val)),
+                         "Direction": "Positive" if corr_val > 0 else "Negative",
+                     }
+                     correlation_pairs.append(pair_info)
+
+     # Sort by absolute correlation value
+     correlation_pairs.sort(key=lambda x: x["Abs Correlation"], reverse=True)
+     return correlation_pairs
+
+
+ def _create_summary_statistics(corr_matrix, correlation_pairs):
+     """Create summary statistics table."""
+     all_correlations = []
+     for i in range(len(corr_matrix.columns)):
+         for j in range(i + 1, len(corr_matrix.columns)):
+             all_correlations.append(abs(corr_matrix.iloc[i, j]))
+
+     significant_count = sum(
+         1 for pair in correlation_pairs if pair["Significant"] == "Yes"
+     )
+     high_corr_count = sum(
+         1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.7
+     )
+     very_high_corr_count = sum(
+         1 for pair in correlation_pairs if pair["Abs Correlation"] > 0.9
+     )
+
+     return {
+         "Total Feature Pairs": len(all_correlations),
+         "Pairs Above Threshold": len(correlation_pairs),
+         "Significant Correlations": significant_count,
+         "High Correlations (>0.7)": high_corr_count,
+         "Very High Correlations (>0.9)": very_high_corr_count,
+         "Mean Absolute Correlation": np.mean(all_correlations),
+         "Max Absolute Correlation": np.max(all_correlations),
+         "Median Absolute Correlation": np.median(all_correlations),
+     }
+
+
+ @tags("tabular_data", "statistics", "correlation")
+ @tasks("classification", "regression", "clustering")
+ def CorrelationAnalysis(
+     dataset: VMDataset,
+     columns: Optional[List[str]] = None,
+     method: str = "pearson",
+     significance_level: float = 0.05,
+     min_correlation: float = 0.1,
+ ) -> Dict[str, Any]:
+     """
+     Performs comprehensive correlation analysis with significance testing for numerical features.
+
+     ### Purpose
+
+     This test conducts detailed correlation analysis between numerical features, including
+     correlation coefficients, significance testing, and identification of significant
+     relationships. It helps identify multicollinearity, feature relationships, and
+     potential redundancies in the dataset.
+
+     ### Test Mechanism
+
+     The test computes correlation coefficients using the specified method and performs
+     statistical significance testing for each correlation pair. It provides:
+     - Correlation matrix with significance indicators
+     - List of significant correlations above threshold
+     - Summary statistics about correlation patterns
+     - Identification of highly correlated feature pairs
+
+     ### Signs of High Risk
+
+     - Very high correlations (>0.9) indicating potential multicollinearity
+     - Many significant correlations suggesting complex feature interactions
+     - Features with no significant correlations to others (potential isolation)
+     - Unexpected correlation patterns contradicting domain knowledge
+
+     ### Strengths
+
+     - Provides statistical significance testing for correlations
+     - Supports multiple correlation methods (Pearson, Spearman, Kendall)
+     - Identifies potentially problematic high correlations
+     - Filters results by minimum correlation threshold
+     - Comprehensive summary of correlation patterns
+
+     ### Limitations
+
+     - Limited to numerical features only
+     - Cannot detect non-linear relationships (except with Spearman)
+     - Significance testing assumes certain distributional properties
+     - Correlation does not imply causation
+     """
+     # Validate and prepare data
+     data = _validate_and_prepare_data(dataset, columns)
+
+     # Compute correlation matrices
+     corr_matrix, p_matrix = _compute_correlation_matrices(data, method)
+
+     # Create correlation pairs
+     correlation_pairs = _create_correlation_pairs(
+         corr_matrix, p_matrix, significance_level, min_correlation
+     )
+
+     # Build results
+     results = {}
+     if correlation_pairs:
+         results["Correlation Pairs"] = format_records(pd.DataFrame(correlation_pairs))
+
+     # Create summary statistics
+     summary_stats = _create_summary_statistics(corr_matrix, correlation_pairs)
+     results["Summary Statistics"] = format_records(pd.DataFrame([summary_stats]))
+
+     return results
+
+
+ def _compute_pearson_with_pvalues(data):
+     """Compute Pearson correlation with p-values"""
+     n_vars = data.shape[1]
+     corr_matrix = data.corr(method="pearson")
+     p_matrix = pd.DataFrame(
+         np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
+     )
+
+     for i, col1 in enumerate(data.columns):
+         for j, col2 in enumerate(data.columns):
+             if i != j:
+                 _, p_val = stats.pearsonr(data[col1], data[col2])
+                 p_matrix.iloc[i, j] = p_val
+
+     return corr_matrix, p_matrix
+
+
+ def _compute_spearman_with_pvalues(data):
+     """Compute Spearman correlation with p-values"""
+     n_vars = data.shape[1]
+     corr_matrix = data.corr(method="spearman")
+     p_matrix = pd.DataFrame(
+         np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
+     )
+
+     for i, col1 in enumerate(data.columns):
+         for j, col2 in enumerate(data.columns):
+             if i != j:
+                 _, p_val = stats.spearmanr(data[col1], data[col2])
+                 p_matrix.iloc[i, j] = p_val
+
+     return corr_matrix, p_matrix
+
+
+ def _compute_kendall_with_pvalues(data):
+     """Compute Kendall correlation with p-values"""
+     n_vars = data.shape[1]
+     corr_matrix = data.corr(method="kendall")
+     p_matrix = pd.DataFrame(
+         np.zeros((n_vars, n_vars)), index=corr_matrix.index, columns=corr_matrix.columns
+     )
+
+     for i, col1 in enumerate(data.columns):
+         for j, col2 in enumerate(data.columns):
+             if i != j:
+                 _, p_val = stats.kendalltau(data[col1], data[col2])
+                 p_matrix.iloc[i, j] = p_val
+
+     return corr_matrix, p_matrix
+
+
+ def _correlation_strength(abs_corr):
+     """Classify correlation strength"""
+     if abs_corr >= 0.9:
+         return "Very Strong"
+     elif abs_corr >= 0.7:
+         return "Strong"
+     elif abs_corr >= 0.5:
+         return "Moderate"
+     elif abs_corr >= 0.3:
+         return "Weak"
+     else:
+         return "Very Weak"