validmind 2.3.3__py3-none-any.whl → 2.3.5__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- validmind/__version__.py +1 -1
- validmind/datasets/regression/fred_timeseries.py +272 -0
- validmind/tests/__types__.py +10 -0
- validmind/tests/data_validation/SeasonalDecompose.py +68 -40
- validmind/tests/data_validation/TimeSeriesDescription.py +74 -0
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +76 -0
- validmind/tests/data_validation/TimeSeriesHistogram.py +29 -45
- validmind/tests/data_validation/TimeSeriesOutliers.py +30 -41
- validmind/tests/model_validation/ModelMetadataComparison.py +59 -0
- validmind/tests/model_validation/ModelPredictionResiduals.py +103 -0
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +131 -0
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +76 -0
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +103 -0
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +83 -0
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +76 -0
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +63 -0
- {validmind-2.3.3.dist-info → validmind-2.3.5.dist-info}/METADATA +70 -36
- {validmind-2.3.3.dist-info → validmind-2.3.5.dist-info}/RECORD +23 -12
- /validmind/datasets/regression/datasets/{lending_club_loan_rates.csv → leanding_club_loan_rates.csv} +0 -0
- {validmind-2.3.3.dist-info → validmind-2.3.5.dist-info}/LICENSE +0 -0
- {validmind-2.3.3.dist-info → validmind-2.3.5.dist-info}/WHEEL +0 -0
- {validmind-2.3.3.dist-info → validmind-2.3.5.dist-info}/entry_points.txt +0 -0
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.3.3"
+__version__ = "2.3.5"
validmind/datasets/regression/fred_timeseries.py
ADDED
@@ -0,0 +1,272 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import os
+
+import pandas as pd
+
+current_path = os.path.dirname(os.path.abspath(__file__))
+mortgage30us_path = os.path.join(current_path, "datasets", "fred", "MORTGAGE30US.csv")
+fedfunds_path = os.path.join(current_path, "datasets", "fred", "FEDFUNDS.csv")
+gs10_path = os.path.join(current_path, "datasets", "fred", "GS10.csv")
+unrate_path = os.path.join(current_path, "datasets", "fred", "UNRATE.csv")
+
+target_column = "MORTGAGE30US"
+feature_columns = ["FEDFUNDS", "GS10", "UNRATE"]
+
+
+def get_common_date_range(dfs):
+    start_dates = [df.index.min() for df in dfs]
+    end_dates = [df.index.max() for df in dfs]
+
+    common_start_date = max(start_dates)
+    common_end_date = min(end_dates)
+
+    return common_start_date, common_end_date
+
+
+def align_date_range(dfs, start_date, end_date):
+    return [df.loc[start_date:end_date] for df in dfs]
+
+
+def load_data():
+    mortgage30us = pd.read_csv(
+        mortgage30us_path, parse_dates=["DATE"], index_col="DATE"
+    )
+    fedfunds = pd.read_csv(fedfunds_path, parse_dates=["DATE"], index_col="DATE")
+    gs10 = pd.read_csv(gs10_path, parse_dates=["DATE"], index_col="DATE")
+    unrate = pd.read_csv(unrate_path, parse_dates=["DATE"], index_col="DATE")
+
+    # Resample mortgage30us to monthly frequency
+    mortgage30us = mortgage30us.resample("MS").last()
+
+    # Get the common date range
+    common_start_date, common_end_date = get_common_date_range(
+        [mortgage30us, fedfunds, gs10, unrate]
+    )
+
+    # Align the date range for all dataframes
+    mortgage30us, fedfunds, gs10, unrate = align_date_range(
+        [mortgage30us, fedfunds, gs10, unrate], common_start_date, common_end_date
+    )
+
+    # Combine into a single DataFrame
+    df = pd.concat([mortgage30us, fedfunds, gs10, unrate], axis=1, join="inner")
+    df.columns = [target_column] + feature_columns
+
+    return df
+
+
+# Convert data back to levels
+def convert_to_levels(diff_df, original_df, target_column):
+    """
+    Convert differenced data back to original levels.
+    """
+    previous_values = original_df[target_column].shift(1).dropna()
+    levels_df = diff_df.add(previous_values, axis=0)
+    return levels_df
+
+
+def get_demo_test_config(test_suite=None):
+
+    default_config = {}
+
+    default_config["validmind.data_validation.TimeSeriesDescription"] = {
+        "inputs": {
+            "dataset": "raw_ds",
+        }
+    }
+    default_config["validmind.data_validation.TimeSeriesLinePlot"] = {
+        "inputs": {
+            "dataset": "raw_ds",
+        }
+    }
+    default_config["validmind.data_validation.TimeSeriesMissingValues"] = {
+        "inputs": {
+            "dataset": "raw_ds",
+        }
+    }
+    default_config["validmind.data_validation.SeasonalDecompose"] = {
+        "inputs": {
+            "dataset": "raw_ds",
+        }
+    }
+    default_config[
+        "validmind.data_validation.TimeSeriesDescriptiveStatistics:train_diff_data"
+    ] = {
+        "inputs": {
+            "dataset": "train_diff_ds",
+        }
+    }
+    default_config[
+        "validmind.data_validation.TimeSeriesDescriptiveStatistics:test_diff_data"
+    ] = {
+        "inputs": {
+            "dataset": "test_diff_ds",
+        }
+    }
+    default_config["validmind.data_validation.TimeSeriesOutliers:train_diff_data"] = {
+        "inputs": {
+            "dataset": "train_diff_ds",
+        },
+        "params": {"zscore_threshold": 4},
+    }
+    default_config["validmind.data_validation.TimeSeriesOutliers:test_diff_data"] = {
+        "inputs": {
+            "dataset": "test_diff_ds",
+        },
+        "params": {"zscore_threshold": 4},
+    }
+    default_config["validmind.data_validation.TimeSeriesHistogram:train_diff_data"] = {
+        "inputs": {
+            "dataset": "train_diff_ds",
+        },
+        "params": {"nbins": 100},
+    }
+    default_config["validmind.data_validation.TimeSeriesHistogram:test_diff_data"] = {
+        "inputs": {
+            "dataset": "test_diff_ds",
+        },
+        "params": {"nbins": 100},
+    }
+    default_config["validmind.data_validation.DatasetSplit"] = {
+        "inputs": {
+            "datasets": ["train_diff_ds", "test_diff_ds"],
+        }
+    }
+    default_config["validmind.model_validation.ModelMetadataComparison"] = {
+        "inputs": {
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.RegressionErrorsComparison:train_data"
+    ] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.RegressionErrorsComparison:test_data"
+    ] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.RegressionR2SquareComparison:train_data"
+    ] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.RegressionR2SquareComparison:test_data"
+    ] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.TimeSeriesR2SquareBySegments:train_data"
+    ] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.TimeSeriesR2SquareBySegments:test_data"
+    ] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        },
+        "params": {
+            "segments": {
+                "start_date": ["2012-11-01", "2018-02-01"],
+                "end_date": ["2018-01-01", "2023-03-01"],
+            }
+        },
+    }
+    default_config[
+        "validmind.model_validation.TimeSeriesPredictionsPlot:train_data"
+    ] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config["validmind.model_validation.TimeSeriesPredictionsPlot:test_data"] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.TimeSeriesPredictionWithCI:random_forests_model"
+    ] = {
+        "inputs": {
+            "dataset": "test_ds",
+            "model": "random_forests_model",
+        }
+    }
+    default_config[
+        "validmind.model_validation.TimeSeriesPredictionWithCI:gradient_boosting_model"
+    ] = {
+        "inputs": {
+            "dataset": "test_ds",
+            "model": "gradient_boosting_model",
+        }
+    }
+    default_config["validmind.model_validation.ModelPredictionResiduals:train_data"] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config["validmind.model_validation.ModelPredictionResiduals:test_data"] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.FeatureImportanceComparison:train_data"
+    ] = {
+        "inputs": {
+            "datasets": ["train_ds", "train_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.FeatureImportanceComparison:test_data"
+    ] = {
+        "inputs": {
+            "datasets": ["test_ds", "test_ds"],
+            "models": ["random_forests_model", "gradient_boosting_model"],
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.PermutationFeatureImportance:random_forests_model"
+    ] = {
+        "inputs": {
+            "dataset": "test_ds",
+            "model": "random_forests_model",
+        }
+    }
+    default_config[
+        "validmind.model_validation.sklearn.PermutationFeatureImportance:gradient_boosting_model"
+    ] = {
+        "inputs": {
+            "dataset": "test_ds",
+            "model": "gradient_boosting_model",
+        }
+    }
+
+    return default_config
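The new module exposes `load_data`, `convert_to_levels`, and `get_demo_test_config`. Below is a minimal sketch of how it might be exercised end to end, assuming the import path implied by the file location; the differencing step itself is not part of this module and is shown only because the demo config's `*_diff_ds` dataset IDs suggest the demo models train on first-differenced data.

```python
from validmind.datasets.regression import fred_timeseries

# Load the four FRED series, resampled and aligned to a common monthly range
df = fred_timeseries.load_data()

# Assumed preprocessing: first-difference the series, then map back to
# levels using the prior period's observed target value
diff_df = df.diff().dropna()
levels_df = fred_timeseries.convert_to_levels(
    diff_df, df, fred_timeseries.target_column
)

# Test-ID-keyed configuration used by the demo
test_config = fred_timeseries.get_demo_test_config()
print(len(test_config), "configured tests")
```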
validmind/tests/__types__.py
CHANGED
@@ -18,9 +18,12 @@ TestID = Literal[
     "validmind.prompt_validation.NegativeInstruction",
     "validmind.prompt_validation.Conciseness",
     "validmind.prompt_validation.Delimitation",
+    "validmind.model_validation.ModelPredictionResiduals",
     "validmind.model_validation.BertScore",
+    "validmind.model_validation.TimeSeriesPredictionsPlot",
     "validmind.model_validation.RegardScore",
     "validmind.model_validation.BleuScore",
+    "validmind.model_validation.TimeSeriesPredictionWithCI",
     "validmind.model_validation.RegressionResidualsPlot",
     "validmind.model_validation.FeaturesAUC",
     "validmind.model_validation.ContextualRecall",
@@ -30,6 +33,8 @@ TestID = Literal[
     "validmind.model_validation.ClusterSizeDistribution",
     "validmind.model_validation.TokenDisparity",
     "validmind.model_validation.ToxicityScore",
+    "validmind.model_validation.ModelMetadataComparison",
+    "validmind.model_validation.TimeSeriesR2SquareBySegments",
     "validmind.model_validation.embeddings.CosineSimilarityComparison",
     "validmind.model_validation.embeddings.EmbeddingsVisualization2D",
     "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise",
@@ -77,11 +82,14 @@ TestID = Literal[
     "validmind.model_validation.sklearn.RegressionR2Square",
     "validmind.model_validation.sklearn.RegressionErrors",
     "validmind.model_validation.sklearn.ClusterPerformance",
+    "validmind.model_validation.sklearn.FeatureImportanceComparison",
     "validmind.model_validation.sklearn.TrainingTestDegradation",
+    "validmind.model_validation.sklearn.RegressionErrorsComparison",
     "validmind.model_validation.sklearn.HyperParametersTuning",
     "validmind.model_validation.sklearn.KMeansClustersOptimization",
     "validmind.model_validation.sklearn.ModelsPerformanceComparison",
     "validmind.model_validation.sklearn.WeakspotsDiagnosis",
+    "validmind.model_validation.sklearn.RegressionR2SquareComparison",
     "validmind.model_validation.sklearn.PopulationStabilityIndex",
     "validmind.model_validation.sklearn.MinimumAccuracy",
     "validmind.model_validation.statsmodels.RegressionModelsCoeffs",
@@ -118,6 +126,7 @@ TestID = Literal[
     "validmind.data_validation.TabularCategoricalBarPlots",
     "validmind.data_validation.AutoStationarity",
     "validmind.data_validation.DescriptiveStatistics",
+    "validmind.data_validation.TimeSeriesDescription",
     "validmind.data_validation.ANOVAOneWayTable",
     "validmind.data_validation.TargetRateBarPlots",
     "validmind.data_validation.PearsonCorrelationMatrix",
@@ -154,6 +163,7 @@ TestID = Literal[
     "validmind.data_validation.ClassImbalance",
     "validmind.data_validation.IQROutliersBarPlot",
     "validmind.data_validation.DFGLSArch",
+    "validmind.data_validation.TimeSeriesDescriptiveStatistics",
     "validmind.data_validation.AutoAR",
     "validmind.data_validation.TabularDateTimeHistograms",
     "validmind.data_validation.ADF",
validmind/tests/data_validation/SeasonalDecompose.py
CHANGED
@@ -4,10 +4,10 @@
 
 import warnings
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import …
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 from scipy import stats
 from statsmodels.tsa.seasonal import seasonal_decompose
 
@@ -132,7 +132,6 @@ class SeasonalDecompose(Metric):
             inferred_freq = pd.infer_freq(series.index)
 
             if inferred_freq is not None:
-                logger.info(f"Frequency of {col}: {inferred_freq}")
 
                 # Only take finite values to seasonal_decompose
                 sd = seasonal_decompose(
@@ -142,58 +141,87 @@ class SeasonalDecompose(Metric):
 
                 results[col] = self.serialize_seasonal_decompose(sd)
 
-                # Create subplots
-                fig …
-                …
+                # Create subplots using Plotly
+                fig = make_subplots(
+                    rows=3,
+                    cols=2,
+                    subplot_titles=(
+                        "Observed",
+                        "Trend",
+                        "Seasonal",
+                        "Residuals",
+                        "Histogram and KDE of Residuals",
+                        "Normal Q-Q Plot of Residuals",
+                    ),
+                    vertical_spacing=0.1,
                 )
 
-                # Original seasonal decomposition plots
                 # Observed
-                …
+                fig.add_trace(
+                    go.Scatter(x=sd.observed.index, y=sd.observed, name="Observed"),
+                    row=1,
+                    col=1,
+                )
 
                 # Trend
-                …
+                fig.add_trace(
+                    go.Scatter(x=sd.trend.index, y=sd.trend, name="Trend"),
+                    row=1,
+                    col=2,
+                )
 
                 # Seasonal
-                …
+                fig.add_trace(
+                    go.Scatter(x=sd.seasonal.index, y=sd.seasonal, name="Seasonal"),
+                    row=2,
+                    col=1,
+                )
 
                 # Residuals
-                …
+                fig.add_trace(
+                    go.Scatter(x=sd.resid.index, y=sd.resid, name="Residuals"),
+                    row=2,
+                    col=2,
+                )
 
                 # Histogram with KDE
                 residuals = sd.resid.dropna()
-                …
+                fig.add_trace(
+                    go.Histogram(x=residuals, nbinsx=100, name="Residuals"),
+                    row=3,
+                    col=1,
+                )
 
                 # Normal Q-Q plot
-                stats.probplot(residuals, plot=…
-                …
+                qq = stats.probplot(residuals, plot=None)
+                qq_line_slope, qq_line_intercept = stats.linregress(
+                    qq[0][0], qq[0][1]
+                )[:2]
+                qq_line = qq_line_slope * np.array(qq[0][0]) + qq_line_intercept
+
+                fig.add_trace(
+                    go.Scatter(
+                        x=qq[0][0], y=qq[0][1], mode="markers", name="QQ plot"
+                    ),
+                    row=3,
+                    col=2,
+                )
+                fig.add_trace(
+                    go.Scatter(
+                        x=qq[0][0],
+                        y=qq_line,
+                        mode="lines",
+                        name="QQ line",
+                    ),
+                    row=3,
+                    col=2,
+                )
 
-                …
+                fig.update_layout(
+                    height=1000,
+                    title_text=f"Seasonal Decomposition for {col}",
+                    showlegend=False,
+                )
 
                 figures.append(
                     Figure(
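The Q-Q construction above is easy to verify in isolation: `scipy.stats.probplot(..., plot=None)` returns the theoretical and ordered sample quantiles, and a linear fit over them reproduces the reference line that matplotlib's `probplot(..., plot=ax)` previously drew for this test. A self-contained sketch on synthetic residuals (not the metric's data):

```python
import numpy as np
import plotly.graph_objects as go
from scipy import stats

residuals = np.random.default_rng(0).normal(size=500)

# qq[0][0] holds theoretical quantiles, qq[0][1] the ordered sample values
qq = stats.probplot(residuals, plot=None)
theoretical, ordered = qq[0][0], qq[0][1]
slope, intercept = stats.linregress(theoretical, ordered)[:2]

fig = go.Figure()
fig.add_trace(go.Scatter(x=theoretical, y=ordered, mode="markers", name="QQ plot"))
fig.add_trace(
    go.Scatter(
        x=theoretical,
        y=slope * theoretical + intercept,
        mode="lines",
        name="QQ line",
    )
)
fig.show()
```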
validmind/tests/data_validation/TimeSeriesDescription.py
ADDED
@@ -0,0 +1,74 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+
+from validmind import tags, tasks
+
+
+@tags("time_series_data", "analysis")
+@tasks("regression")
+def TimeSeriesDescription(dataset):
+    """
+    Generates a detailed analysis for the provided time series dataset.
+
+    **Purpose**: The purpose of the TimeSeriesDescription function is to analyze an individual time series
+    by providing a summary of key statistics. This helps in understanding trends, patterns, and data quality issues
+    within the time series.
+
+    **Test Mechanism**: The function extracts the time series data and provides a summary of key statistics.
+    The dataset is expected to have a datetime index. The function checks this and raises an error if the index is
+    not in datetime format. For each variable (column) in the dataset, appropriate statistics including start date,
+    end date, frequency, number of missing values, count, min, and max values are calculated.
+
+    **Signs of High Risk**:
+    - If the index of the dataset is not in datetime format, it could lead to errors in time-series analysis.
+    - Inconsistent or missing data within the dataset might affect the analysis of trends and patterns.
+
+    **Strengths**:
+    - This function provides a comprehensive summary of key statistics for each variable, helping to identify data quality
+    issues such as missing values.
+    - The function helps in understanding the distribution and range of the data by including min and max values.
+
+    **Limitations**:
+    - This function assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access
+    the pandas DataFrame.
+    - It only analyzes datasets with a datetime index and will raise an error for other types of indices.
+    - The function does not handle large datasets efficiently, and performance may degrade with very large datasets.
+    """
+
+    summary = []
+
+    df = (
+        dataset.df
+    )  # Assuming DataFrameDataset objects have a .df attribute to get the pandas DataFrame
+
+    if not pd.api.types.is_datetime64_any_dtype(df.index):
+        raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
+
+    for column in df.columns:
+        start_date = df.index.min().strftime("%Y-%m-%d")
+        end_date = df.index.max().strftime("%Y-%m-%d")
+        frequency = pd.infer_freq(df.index)
+        num_missing_values = df[column].isna().sum()
+        count = df[column].count()
+        min_value = df[column].min()
+        max_value = df[column].max()
+
+        summary.append(
+            {
+                "Variable": column,
+                "Start Date": start_date,
+                "End Date": end_date,
+                "Frequency": frequency,
+                "Num of Missing Values": num_missing_values,
+                "Count": count,
+                "Min Value": min_value,
+                "Max Value": max_value,
+            }
+        )
+
+    result_df = pd.DataFrame(summary)
+
+    return result_df
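Because the new metric is a plain decorated function, it can be smoke-tested outside the full harness with any object exposing the `.df` and `.input_id` attributes it relies on. A sketch with a hypothetical stand-in wrapper, assuming the `@tags`/`@tasks` decorators leave the function directly callable:

```python
from dataclasses import dataclass

import pandas as pd

from validmind.tests.data_validation.TimeSeriesDescription import TimeSeriesDescription


@dataclass
class SimpleDataset:
    """Hypothetical stand-in for validmind's dataset wrapper."""

    df: pd.DataFrame
    input_id: str = "demo_ds"


idx = pd.date_range("2020-01-01", periods=24, freq="MS")
frame = pd.DataFrame({"rate": range(24)}, index=idx)

print(TimeSeriesDescription(SimpleDataset(frame)))
```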
validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py
ADDED
@@ -0,0 +1,76 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import pandas as pd
+from scipy.stats import kurtosis, skew
+
+from validmind import tags, tasks
+
+
+@tags("time_series_data", "analysis")
+@tasks("regression")
+def TimeSeriesDescriptiveStatistics(dataset):
+    """
+    Generates a detailed table of descriptive statistics for the provided time series dataset.
+
+    **Purpose**: The purpose of the TimeSeriesDescriptiveStatistics function is to analyze an individual time series
+    by providing a summary of key descriptive statistics. This helps in understanding trends, patterns, and data quality issues
+    within the time series.
+
+    **Test Mechanism**: The function extracts the time series data and provides a summary of key descriptive statistics.
+    The dataset is expected to have a datetime index. The function checks this and raises an error if the index is
+    not in datetime format. For each variable (column) in the dataset, appropriate statistics including start date,
+    end date, min, mean, max, skewness, kurtosis, and count are calculated.
+
+    **Signs of High Risk**:
+    - If the index of the dataset is not in datetime format, it could lead to errors in time-series analysis.
+    - Inconsistent or missing data within the dataset might affect the analysis of trends and patterns.
+
+    **Strengths**:
+    - This function provides a comprehensive summary of key descriptive statistics for each variable, helping to identify data quality
+    issues and understand the distribution of the data.
+
+    **Limitations**:
+    - This function assumes that the dataset is provided as a DataFrameDataset object with a .df attribute to access
+    the pandas DataFrame.
+    - It only analyzes datasets with a datetime index and will raise an error for other types of indices.
+    - The function does not handle large datasets efficiently, and performance may degrade with very large datasets.
+    """
+
+    summary = []
+
+    df = (
+        dataset.df
+    )  # Assuming DataFrameDataset objects have a .df attribute to get the pandas DataFrame
+
+    if not pd.api.types.is_datetime64_any_dtype(df.index):
+        raise ValueError(f"Dataset {dataset.input_id} must have a datetime index")
+
+    for column in df.columns:
+        start_date = df.index.min().strftime("%Y-%m-%d")
+        end_date = df.index.max().strftime("%Y-%m-%d")
+        count = df[column].count()
+        min_value = df[column].min()
+        mean_value = df[column].mean()
+        max_value = df[column].max()
+        skewness_value = skew(df[column].dropna())
+        kurtosis_value = kurtosis(df[column].dropna())
+
+        summary.append(
+            {
+                "Variable": column,
+                "Start Date": start_date,
+                "End Date": end_date,
+                "Min": min_value,
+                "Mean": mean_value,
+                "Max": max_value,
+                "Skewness": skewness_value,
+                "Kurtosis": kurtosis_value,
+                "Count": count,
+            }
+        )
+
+    result_df = pd.DataFrame(summary)
+
+    return result_df
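One reading note on the statistics above: `scipy.stats.kurtosis` defaults to Fisher's definition, so the Kurtosis column reports *excess* kurtosis (a normally distributed series scores near 0, not 3), and `skew` reports the biased sample skewness coefficient g1. A quick check:

```python
import numpy as np
from scipy.stats import kurtosis, skew

x = np.random.default_rng(1).normal(size=100_000)

print(round(skew(x), 3))                    # ≈ 0.0 for a normal sample
print(round(kurtosis(x), 3))                # ≈ 0.0 — excess kurtosis (fisher=True)
print(round(kurtosis(x, fisher=False), 3))  # ≈ 3.0 — Pearson kurtosis
```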