validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/TimeSeriesPredictionWithCI.py

@@ -14,28 +14,45 @@ from validmind import tags, tasks
 @tasks("regression", "time_series_forecasting")
 def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
     """
-    Plot actual vs predicted values for a time series with confidence intervals and compute breaches.
+    Assesses predictive accuracy and uncertainty in time series models, highlighting breaches beyond confidence
+    intervals.
 
-    **Purpose**: The purpose of this function is to visualize the actual versus predicted values for time series data, including confidence intervals, and to compute and report the number of breaches beyond these intervals.
+    ### Purpose
 
-    **Test Mechanism**: The function calculates the standard deviation of prediction errors, determines the confidence intervals, and counts the number of actual values that fall outside these intervals (breaches). It then generates a plot with the actual values, predicted values, and confidence intervals, and returns a DataFrame summarizing the breach information.
+    The purpose of the Time Series Prediction with Confidence Intervals (CI) test is to visualize the actual versus
+    predicted values for time series data, including confidence intervals, and to compute and report the number of
+    breaches beyond these intervals. This helps in evaluating the reliability and accuracy of the model's predictions.
 
-    **Signs of High Risk**:
-    - A high number of breaches indicates that the model's predictions are not reliable within the specified confidence level.
-    - Significant deviations between actual and predicted values may highlight model inadequacies or issues with data quality.
+    ### Test Mechanism
+
+    The function performs the following steps:
+
+    - Calculates the standard deviation of prediction errors.
+    - Determines the confidence intervals using a specified confidence level, typically 95%.
+    - Counts the number of actual values that fall outside the confidence intervals, referred to as breaches.
+    - Generates a plot visualizing the actual values, predicted values, and confidence intervals.
+    - Returns a DataFrame summarizing the breach information, including the total breaches, upper breaches, and lower
+    breaches.
+
+    ### Signs of High Risk
+
+    - A high number of breaches indicates that the model's predictions are not reliable within the specified confidence
+    level.
+    - Significant deviations between actual and predicted values may highlight model inadequacies or issues with data
+    quality.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a visual representation of prediction accuracy and the uncertainty around predictions.
     - Includes a statistical measure of prediction reliability through confidence intervals.
     - Computes and reports breaches, offering a quantitative assessment of prediction performance.
 
-    **Limitations**:
+    ### Limitations
+
     - Assumes that the dataset is provided as a DataFrameDataset object with a datetime index.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     - The calculation of confidence intervals assumes normally distributed errors, which may not hold for all datasets.
     """
-    dataset_name = dataset.input_id
-    model_name = model.input_id
     time_index = dataset.df.index  # Assuming the index of the dataset is datetime
 
     # Get actual and predicted values
@@ -77,7 +94,7 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
             x=time_index,
             y=y_true,
             mode="lines",
-            name="Actual Values",
+            name="Actual",
             line=dict(color="blue"),
         )
     )
@@ -88,7 +105,7 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
             x=time_index,
             y=y_pred,
             mode="lines",
-            name=f"Predicted by {model_name}",
+            name="Predicted",
             line=dict(color="red"),
         )
     )
@@ -121,10 +138,9 @@ def TimeSeriesPredictionWithCI(dataset, model, confidence=0.95):
 
     # Update layout
     fig.update_layout(
-        title=f"Time Series Actual vs Predicted Values for {dataset_name} and {model_name}",
+        title="Actual vs Predicted",
         xaxis_title="Time",
         yaxis_title="Values",
-        legend_title="Legend",
         template="plotly_white",
     )
 
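Across these hunks the only behavioral change is cosmetic: the per-input names pulled from `input_id` are dropped from the trace labels and title, while the breach computation described in the docstring is unchanged. As a point of reference, here is a minimal sketch of that normal-interval breach count (the helper name and variable names are hypothetical, not the package's actual code):

```python
import numpy as np
from scipy import stats

def count_ci_breaches(y_true, y_pred, confidence=0.95):
    """Count actuals falling outside symmetric CIs centered on predictions."""
    errors = np.asarray(y_true) - np.asarray(y_pred)
    z = stats.norm.ppf(1 - (1 - confidence) / 2)  # ~1.96 for 95%
    band = z * errors.std()                       # assumes roughly normal errors
    upper_breaches = int((errors > band).sum())   # actual above the upper bound
    lower_breaches = int((errors < -band).sum())  # actual below the lower bound
    return {
        "Total Breaches": upper_breaches + lower_breaches,
        "Upper Breaches": upper_breaches,
        "Lower Breaches": lower_breaches,
    }
```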
validmind/tests/model_validation/TimeSeriesPredictionsPlot.py

@@ -2,7 +2,6 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import plotly.express as px
 import plotly.graph_objects as go
 
 from validmind import tags, tasks
@@ -10,66 +9,64 @@ from validmind import tags, tasks
 
 @tags("model_predictions", "visualization")
 @tasks("regression", "time_series_forecasting")
-def TimeSeriesPredictionsPlot(datasets, models):
+def TimeSeriesPredictionsPlot(dataset, model):
     """
-    Plot actual vs predicted values for time series data and generate a visual comparison for each model.
+    Plot actual vs predicted values for time series data and generate a visual comparison for the model.
 
-    **Purpose**: The purpose of this function is to visualize the actual versus predicted values for time series data across different models.
+    ### Purpose
 
-    **Test Mechanism**: The function iterates through each dataset-model pair, plots the actual values from the dataset, and overlays the predicted values from each model using Plotly for interactive visualization.
+    The purpose of this function is to visualize the actual versus predicted values for time
+    series data for a single model.
+
+    ### Test Mechanism
+
+    The function plots the actual values from the dataset and overlays the predicted
+    values from the model using Plotly for interactive visualization.
 
-    **Signs of High Risk**:
     - Large discrepancies between actual and predicted values indicate poor model performance.
     - Systematic deviations in predicted values can highlight model bias or issues with data patterns.
 
-    **Strengths**:
+    ### Strengths
+
     - Provides a clear visual comparison of model predictions against actual values.
     - Uses Plotly for interactive and visually appealing plots.
-    - Can handle multiple models and datasets, displaying them with distinct colors.
 
-    **Limitations**:
+    ### Limitations
+
     - Assumes that the dataset is provided as a DataFrameDataset object with a datetime index.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
-    - Visualization might become cluttered with a large number of models or datasets.
     """
     fig = go.Figure()
 
-    # Use Plotly's color sequence for different model predictions
-    colors = px.colors.qualitative.Plotly
-
-    # Plot actual values from the first dataset
-    dataset = datasets[0]
+    # Plot actual values from the dataset
     time_index = dataset.df.index  # Assuming the index of the dataset is datetime
     fig.add_trace(
         go.Scatter(
             x=time_index,
             y=dataset.y,
             mode="lines",
-            name="Actual Values",
+            name="Actual",
             line=dict(color="blue"),
         )
     )
 
-    # Plot predicted values for each dataset-model pair
-    for idx, (dataset, model) in enumerate(zip(datasets, models)):
-        model_name = model.input_id
-        y_pred = dataset.y_pred(model)
-        fig.add_trace(
-            go.Scatter(
-                x=time_index,
-                y=y_pred,
-                mode="lines",
-                name=f"Predicted by {model_name}",
-                line=dict(color=colors[idx % len(colors)]),
-            )
+    # Plot predicted values for the model
+    y_pred = dataset.y_pred(model)
+    fig.add_trace(
+        go.Scatter(
+            x=time_index,
+            y=y_pred,
+            mode="lines",
+            name="Predicted",
+            line=dict(color="orange"),  # Using a distinct color for the prediction
         )
+    )
 
     # Update layout
     fig.update_layout(
-        title="Time Series Actual vs Predicted Values",
+        title="Actual vs Predicted",
        xaxis_title="Time",
         yaxis_title="Values",
-        legend_title="Legend",
         template="plotly_white",
     )
 
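The substantive change in this file is the signature: `TimeSeriesPredictionsPlot(datasets, models)` becomes `TimeSeriesPredictionsPlot(dataset, model)`, so the multi-model overlay (and the `plotly.express` color cycling that supported it) is gone and callers must invoke the test once per dataset-model pair. A sketch of the call-site migration, assuming the usual `run_test` entry point and already-initialized `vm_test_ds` / `vm_model_a` / `vm_model_b` inputs:

```python
from validmind.tests import run_test

# 2.5.6 and earlier: one run overlaid every dataset-model pair
# run_test(
#     "validmind.model_validation.TimeSeriesPredictionsPlot",
#     inputs={"datasets": [vm_test_ds, vm_test_ds], "models": [vm_model_a, vm_model_b]},
# )

# 2.5.15: one dataset and one model per run; loop for comparisons
for vm_model in [vm_model_a, vm_model_b]:
    run_test(
        "validmind.model_validation.TimeSeriesPredictionsPlot",
        inputs={"dataset": vm_test_ds, "model": vm_model},
    )
```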
validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py

@@ -12,75 +12,80 @@ from validmind import tags, tasks
 
 @tags("model_performance", "sklearn")
 @tasks("regression", "time_series_forecasting")
-def TimeSeriesR2SquareBySegments(datasets, models, segments=None):
+def TimeSeriesR2SquareBySegments(dataset, model, segments=None):
     """
-    Plot R-Squared values for each model over specified time segments and generate a bar chart
-    with the results.
+    Evaluates the R-Squared values of regression models over specified time segments in time series data to assess
+    segment-wise model performance.
 
-    **Purpose**: The purpose of this function is to plot the R-Squared values for different models applied to various segments of the time series data.
+    ### Purpose
 
-    **Parameters**:
-    - datasets: List of datasets to evaluate.
-    - models: List of models to evaluate.
-    - segments: Dictionary with 'start_date' and 'end_date' keys containing lists of start and end dates for each segments. If None, the time series will be segmented into two halves.
+    The TimeSeriesR2SquareBySegments test aims to evaluate the R-Squared values for several regression models across
+    different segments of time series data. This helps in determining how well the models explain the variability in
+    the data within each specific time segment.
 
-    **Test Mechanism**: The function iterates through each dataset-model pair, calculates the R-Squared values for specified time segments, and generates a bar chart with these results.
+    ### Test Mechanism
+    - Provides a visual representation of model performance across different time segments.
+    - Allows for identification of segments where the model performs poorly.
+    - Calculating the R-Squared values for each segment.
+    - Generating a bar chart to visually represent the R-Squared values across different models and segments.
 
-    **Signs of High Risk**:
-    - If the R-Squared values are significantly low for certain segments, it could indicate that the model is not explaining much of the variability in the dataset for those segments.
+    ### Signs of High Risk
 
-    **Strengths**:
-    - Provides a visual representation of model performance across different time segments.
-    - Allows for identification of segments where models perform poorly.
+    - Significantly low R-Squared values for certain time segments, indicating poor model performance in those periods.
+    - Large variability in R-Squared values across different segments for the same model, suggesting inconsistent
+    performance.
+
+    ### Strengths
+
+    - Provides a visual representation of how well models perform over different time periods.
+    - Helps identify time segments where models may need improvement or retraining.
+    - Facilitates comparison between multiple models in a straightforward manner.
+
+    ### Limitations
 
-    **Limitations**:
-    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
-    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
-    - Assumes that `y_true` and `y_pred` are pandas Series with datetime indices.
+    - Assumes datasets are provided as DataFrameDataset objects with the attributes `y`, `y_pred`, and
+    `feature_columns`.
+    - Requires that `dataset.y_pred(model)` returns predicted values for the model.
+    - Assumes that both `y_true` and `y_pred` are pandas Series with datetime indices, which may not always be the case.
+    - May not account for more nuanced temporal dependencies within the segments.
     """
     results_list = []
 
-    for dataset, model in zip(datasets, models):
-        dataset_name = dataset.input_id
-        model_name = model.input_id
-
-        y_true = dataset.y
-        y_pred = dataset.y_pred(model)
-
-        # Ensure y_true and y_pred are pandas Series with the same index
-        if not isinstance(y_true, pd.Series):
-            y_true = pd.Series(y_true, index=dataset.df.index)
-        if not isinstance(y_pred, pd.Series):
-            y_pred = pd.Series(y_pred, index=dataset.df.index)
-
-        index = dataset.df.index
-
-        if segments is None:
-            mid_point = len(index) // 2
-            segments = {
-                "start_date": [index.min(), index[mid_point]],
-                "end_date": [index[mid_point - 1], index.max()],
-            }
-
-        for segment_index, (start_date, end_date) in enumerate(
-            zip(segments["start_date"], segments["end_date"])
-        ):
-            mask = (index >= start_date) & (index <= end_date)
-            y_true_segment = y_true.loc[mask]
-            y_pred_segment = y_pred.loc[mask]
-
-            if len(y_true_segment) > 0 and len(y_pred_segment) > 0:
-                r2s = metrics.r2_score(y_true_segment, y_pred_segment)
-                results_list.append(
-                    {
-                        "Model": model_name,
-                        "Dataset": dataset_name,
-                        "Segments": f"Segment {segment_index + 1}",
-                        "Start Date": start_date,
-                        "End Date": end_date,
-                        "R-Squared": r2s,
-                    }
-                )
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Ensure y_true and y_pred are pandas Series with the same index
+    if not isinstance(y_true, pd.Series):
+        y_true = pd.Series(y_true, index=dataset.df.index)
+    if not isinstance(y_pred, pd.Series):
+        y_pred = pd.Series(y_pred, index=dataset.df.index)
+
+    index = dataset.df.index
+
+    if segments is None:
+        mid_point = len(index) // 2
+        segments = {
+            "start_date": [index.min(), index[mid_point]],
+            "end_date": [index[mid_point - 1], index.max()],
+        }
+
+    for segment_index, (start_date, end_date) in enumerate(
+        zip(segments["start_date"], segments["end_date"])
+    ):
+        mask = (index >= start_date) & (index <= end_date)
+        y_true_segment = y_true.loc[mask]
+        y_pred_segment = y_pred.loc[mask]
+
+        if len(y_true_segment) > 0 and len(y_pred_segment) > 0:
+            r2s = metrics.r2_score(y_true_segment, y_pred_segment)
+            results_list.append(
+                {
+                    "Segments": f"Segment {segment_index + 1}",
+                    "Start Date": start_date,
+                    "End Date": end_date,
+                    "R-Squared": r2s,
+                }
+            )
 
     # Convert results list to a DataFrame
     results_df = pd.DataFrame(results_list)
@@ -90,13 +95,13 @@ def TimeSeriesR2SquareBySegments(datasets, models, segments=None):
         results_df,
         x="Segments",
         y="R-Squared",
-        color="Model",
+        # color="Model",
         barmode="group",
-        title="R-Squared Comparison by Segment and Model",
+        title="R-Squared by Segment",
         labels={
             "R-Squared": "R-Squared Value",
-            "Segment": "Time Segment",
-            "Model": "Model",
+            "Segments": "Time Segment",
+            # "Model": "Model",
         },
     )
 
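The same single-input migration applies here, and the default segmentation is preserved: when `segments` is None, the datetime index is split into two halves. A standalone sketch of that per-segment scoring logic (the `r2_by_segments` helper name is hypothetical):

```python
import pandas as pd
from sklearn import metrics

def r2_by_segments(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame:
    """Score the two default halves of a datetime-indexed series."""
    index = y_true.index
    mid = len(index) // 2
    segments = [(index.min(), index[mid - 1]), (index[mid], index.max())]
    rows = []
    for i, (start, end) in enumerate(segments, start=1):
        mask = (index >= start) & (index <= end)
        rows.append({
            "Segments": f"Segment {i}",
            "Start Date": start,
            "End Date": end,
            "R-Squared": metrics.r2_score(y_true[mask], y_pred[mask]),
        })
    return pd.DataFrame(rows)
```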
validmind/tests/model_validation/TokenDisparity.py

@@ -12,33 +12,41 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def TokenDisparity(dataset, model):
     """
-    Evaluates the token disparity between reference and generated texts, visualizing the results through histograms
-    and bar charts, alongside compiling a comprehensive table of descriptive statistics for token counts.
-
-    **Purpose:**
-    This function is designed to assess the token disparity between reference and generated texts. Token disparity is
-    important for understanding how closely the length and token usage of generated texts match the reference texts.
-
-    **Test Mechanism:**
-    The function starts by extracting the true and predicted values from the provided dataset and model. It then calculates
-    the number of tokens in each reference and generated text. Histograms and bar charts are generated for the token counts
-    of both reference and generated texts to visualize their distribution. Additionally, a table of descriptive statistics
-    (mean, median, standard deviation, minimum, and maximum) is compiled for the token counts, providing a comprehensive
-    summary of the model's performance.
-
-    **Signs of High Risk:**
-    - Significant disparity in token counts between reference and generated texts could indicate issues with text generation
-    quality, such as verbosity or lack of detail.
+    Evaluates the token disparity between reference and generated texts, visualizing the results through histograms and
+    bar charts, alongside compiling a comprehensive table of descriptive statistics for token counts.
+
+    ### Purpose
+
+    The Token Disparity test aims to assess the difference in the number of tokens between reference texts and texts
+    generated by the model. Understanding token disparity is essential for evaluating how well the generated content
+    matches the expected length and richness of the reference texts.
+
+    ### Test Mechanism
+
+    The test extracts true and predicted values from the dataset and model. It computes the number of tokens in each
+    reference and generated text. The results are visualized using histograms and bar charts to display the
+    distribution of token counts. Additionally, a table of descriptive statistics, including the mean, median, standard
+    deviation, minimum, and maximum token counts, is compiled to provide a detailed summary of token usage.
+
+    ### Signs of High Risk
+
+    - Significant disparity in token counts between reference and generated texts could indicate issues with text
+    generation quality, such as verbosity or lack of detail.
     - Consistently low token counts in generated texts compared to references might suggest that the model is producing
-    incomplete or overly concise outputs.
+    incomplete or overly concise outputs.
+
+    ### Strengths
 
-    **Strengths:**
     - Provides a simple yet effective evaluation of text length and token usage.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of token counts.
-    - Descriptive statistics offer a concise summary of the model's performance in generating texts of appropriate length.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+    token counts.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts of appropriate
+    length.
+
+    ### Limitations
 
-    **Limitations:**
-    - Token counts alone do not provide a complete assessment of text quality and should be supplemented with other metrics and qualitative analysis.
+    - Token counts alone do not provide a complete assessment of text quality and should be supplemented with other
+    metrics and qualitative analysis.
     """
 
     # Extract true and predicted values
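The reworded docstring keeps the mechanism intact: count tokens per reference and generated text, then summarize. A minimal sketch of the statistics table it describes; whitespace tokenization is an assumption here, since this hunk does not show the test's actual tokenizer:

```python
import pandas as pd

def token_count_stats(reference_texts, generated_texts) -> pd.DataFrame:
    """Summarize token counts for both corpora (mean/median/std/min/max)."""
    counts = pd.DataFrame({
        "reference_tokens": [len(t.split()) for t in reference_texts],
        "generated_tokens": [len(t.split()) for t in generated_texts],
    })
    return counts.agg(["mean", "median", "std", "min", "max"])
```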
validmind/tests/model_validation/ToxicityScore.py

@@ -13,31 +13,40 @@ from validmind import tags, tasks
 @tasks("text_classification", "text_summarization")
 def ToxicityScore(dataset, model):
     """
-    Computes and visualizes the toxicity score for input text, true text, and predicted text, assessing content quality and potential risk.
+    Assesses the toxicity levels of texts generated by NLP models to identify and mitigate harmful or offensive content.
 
-    **Purpose:**
-    The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial for
-    identifying and mitigating harmful or offensive content in machine-generated texts.
+    ### Purpose
 
-    **Test Mechanism:**
-    The function starts by extracting the input, true, and predicted values from the provided dataset and model. The toxicity score is
-    computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into dataframes, and histograms
-    and bar charts are generated to visualize the distribution of toxicity scores. Additionally, a table of descriptive statistics
-    (mean, median, standard deviation, minimum, and maximum) is compiled for the toxicity scores, providing a comprehensive
-    summary of the model's performance.
+    The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial
+    for identifying and mitigating harmful or offensive content in machine-generated texts.
+
+    ### Test Mechanism
+
+    The function starts by extracting the input, true, and predicted values from the provided dataset and model. The
+    toxicity score is computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into
+    dataframes, and histograms and bar charts are generated to visualize the distribution of toxicity scores.
+    Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is
+    compiled for the toxicity scores, providing a comprehensive summary of the model's performance.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk:**
     - Drastic spikes in toxicity scores indicate potentially toxic content within the associated text segment.
-    - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation process.
+    - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation
+    process.
 
-    **Strengths:**
-    - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and appropriateness.
-    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of toxicity scores.
+    ### Strengths
+
+    - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and
+    appropriateness.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of
+    toxicity scores.
     - Descriptive statistics offer a concise summary of the model's performance in generating non-toxic texts.
 
-    **Limitations:**
+    ### Limitations
+
     - The accuracy of the toxicity scores is contingent upon the underlying `toxicity` tool.
-    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high toxicity.
+    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for
+    high toxicity.
     - Supplementary, in-depth analysis might be needed for granular insights.
     """
 
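The docstring's "preloaded `toxicity` evaluation tool" is consistent with the Hugging Face `evaluate` measurement of the same name, though this hunk does not pin the backend down. A sketch under that assumption:

```python
import evaluate
import pandas as pd

toxicity = evaluate.load("toxicity")  # downloads a hate-speech classifier on first use

texts = ["Thanks, that was genuinely helpful.", "You are an idiot."]
scores = toxicity.compute(predictions=texts)["toxicity"]

# The same descriptive statistics the docstring names
print(pd.Series(scores).agg(["mean", "median", "std", "min", "max"]))
```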
validmind/tests/model_validation/embeddings/ClusterDistribution.py

@@ -12,38 +12,42 @@ class ClusterDistribution(Metric):
     """
     Assesses the distribution of text embeddings across clusters produced by a model using KMeans clustering.
 
-    **Purpose:** The purpose of this metric is to analyze the distribution of the clusters produced by a text embedding
-    model. By dividing the text embeddings into different clusters, we can understand how the model is grouping or
-    categorizing the text data. This aids in visualizing the organization and segregation of the data and thus gives an
+    ### Purpose
+
+    The purpose of this metric is to analyze the distribution of the clusters produced by a text embedding model. By
+    dividing the text embeddings into different clusters, we can understand how the model is grouping or categorizing
+    the text data. This aids in visualizing the organization and segregation of the data, thereby giving an
     understanding of how the model is processing the data.
 
-    **Test Mechanism:** The metric applies the KMeans clustering algorithm on the predictions made by the model on the
-    testing dataset and divides the text embeddings into a pre-defined number of clusters. By default, this number is
-    set to 5 but can be customized as per requirements. The output of this test is a histogram plot that shows the
-    distribution of embeddings across these clusters.
+    ### Test Mechanism
+
+    The metric applies the KMeans clustering algorithm on the predictions made by the model on the testing dataset and
+    divides the text embeddings into a pre-defined number of clusters. By default, this number is set to 5 but can be
+    customized as per requirements. The output of this test is a histogram plot that shows the distribution of
+    embeddings across these clusters.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
-    - If the embeddings are skewed towards one or two clusters, that would indicate that the model is not effectively
+    - If the embeddings are skewed towards one or two clusters, it indicates that the model is not effectively
     differentiating the various categories in the text data.
     - Uniform distribution of the embeddings across the clusters might show a lack of proper categorization.
 
-    **Strengths:**
+    ### Strengths
 
-    - Great tool to visualize the text data categorization by the model. It provides a way to assess if the model is
-    distinguishing the categories effectively or not.
-    - It is flexible with the number of clusters (classes), so can be used on various types of data regardless of the
-    number of categories.
+    - Great tool to visualize the text data categorization by the model.
+    - Provides a way to assess if the model is distinguishing the categories effectively or not.
+    - Flexible with the number of clusters, so it can be used on various types of data regardless of the number of
+    categories.
 
-    **Limitations:**
+    ### Limitations
 
-    - The success or failure of this test is based on visual interpretation, which might not be enough for making solid
+    - Success or failure of this test is based on visual interpretation, which might not be enough for making solid
     conclusions or determining the exact points of failure.
-    - It assumes that the division of text embeddings across clusters should ideally be homogeneous, which might not
+    - Assumes that the division of text embeddings across clusters should ideally be homogeneous, which might not
     always be the case depending on the nature of the text data.
-    - It only applies to text embedding models, reducing its utility across various ML models.
-    - This test uses the KMeans clustering algorithm, which assumes that clusters are convex and isotropic. Thus, this
-    test may not work as intended if the true clusters in the data are not of this shape.
+    - Only applies to text embedding models, reducing its utility across various ML models.
+    - Uses the KMeans clustering algorithm, which assumes that clusters are convex and isotropic, and may not work as
+    intended if the true clusters in the data are not of this shape.
     """
 
     name = "Text Embeddings Cluster Distribution"
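The mechanism the docstring describes is plain KMeans over the model's embeddings followed by a per-cluster count. A self-contained sketch with random stand-in embeddings:

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
embeddings = rng.random((200, 32))  # stand-in for model-produced text embeddings

labels = KMeans(n_clusters=5, n_init=10, random_state=0).fit_predict(embeddings)

# Per-cluster counts: the distribution the test renders as a histogram
print(dict(enumerate(np.bincount(labels, minlength=5))))
```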
validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py

@@ -16,45 +16,48 @@ from validmind import tags, tasks
 @tasks("text_qa", "text_generation", "text_summarization")
 def CosineSimilarityComparison(dataset, models):
     """
-    Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
-    alongside compiling a comprehensive table of descriptive statistics for each model pair.
+    Assesses the similarity between embeddings generated by different models using Cosine Similarity, providing both
+    statistical and visual insights.
 
-    **Purpose:**
-    This function is designed to analyze and compare the embeddings produced by different models using Cosine Similarity.
-    Cosine Similarity, a measure calculating the cosine of the angle between two vectors, is widely used to determine
-    the alignment or similarity between vectors in high-dimensional spaces, such as text embeddings. This analysis helps
-    to understand how similar or different the models' predictions are in terms of embedding generation.
+    ### Purpose
 
-    **Test Mechanism:**
-    The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
-    cosine similarity for every possible pair of models, generating a similarity matrix. Each element of this matrix
-    represents the cosine similarity between two model embeddings. The function flattens this matrix and uses it to
-    create a bar chart for each model pair, visualizing their similarity distribution. Additionally, it compiles a table
-    with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the similarities of each
-    pair, including a reference to the compared models.
+    The Cosine Similarity Comparison test aims to analyze and compare the embeddings produced by different models using
+    Cosine Similarity. Cosine Similarity is a measure that calculates the cosine of the angle between two vectors,
+    widely used to determine the alignment or similarity between high-dimensional vectors, such as text embeddings.
+    This analysis helps understand how similar or different the models' predictions are in terms of embedding
+    generation.
 
-    **Signs of High Risk:**
+    ### Test Mechanism
+
+    The function starts by computing the embeddings for each model using the provided dataset. It then calculates the
+    cosine similarity for every possible pair of models, generating a similarity matrix wherein each element represents
+    the cosine similarity between two model embeddings. This matrix is flattened to create a bar chart for each model
+    pair, visualizing their similarity distribution. Additionally, a table with descriptive statistics (mean, median,
+    standard deviation, minimum, and maximum) for the similarities of each pair is compiled, referencing the compared
+    models.
+
+    ### Signs of High Risk
 
     - A high concentration of cosine similarity values close to 1 could suggest that the models are producing very
-    similar embeddings, which could be a sign of redundancy or lack of diversity in model training or design.
-    - Conversely, very low similarity values near -1 indicate strong dissimilarity, potentially highlighting models
-    that are too divergent, possibly focusing on very different features of the data.
+    similar embeddings, indicating redundancy or lack of diversity in model training or design.
+    - Very low similarity values near -1 highlight strong dissimilarity, suggesting models that are too divergent and
+    possibly focusing on very different features of the data.
 
-    **Strengths:**
+    ### Strengths
 
     - Enables detailed comparisons between multiple models' embedding strategies through visual and statistical means.
-    - Helps identify which models produce similar or dissimilar embeddings, useful for tasks requiring model diversity.
+    - Identifies models producing similar or dissimilar embeddings, useful for tasks requiring model diversity.
     - Provides quantitative and visual feedback on the degree of similarity, enhancing interpretability of model
-    behavior in embedding spaces.
+    behavior in embedding spaces.
 
-    **Limitations:**
+    ### Limitations
 
-    - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the models
-    in terms of their primary tasks (e.g., classification, regression).
+    - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the
+    models in terms of their primary tasks (e.g., classification, regression).
     - Assumes that the models are suitable for generating comparable embeddings, which might not always be the case,
-    especially across different types of models.
+    especially across different types of models.
-    - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of high-dimensional
-    embedding spaces.
+    - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of
+    high-dimensional embedding spaces.
     """
 
     figures = []
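The mechanism described here, a pairwise similarity matrix per model pair that is flattened and then summarized, reduces to a few lines of scikit-learn. A sketch with stand-in embeddings for two hypothetical models:

```python
from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
emb = {  # stand-ins for the embeddings each model produced on the same dataset
    "model_a": rng.random((100, 64)),
    "model_b": rng.random((100, 64)),
}

for name_a, name_b in combinations(emb, 2):
    sims = cosine_similarity(emb[name_a], emb[name_b]).flatten()
    print(name_a, "vs", name_b, {
        "mean": sims.mean(), "median": float(np.median(sims)),
        "std": sims.std(), "min": sims.min(), "max": sims.max(),
    })
```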