validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -20,36 +20,44 @@ from validmind.vm_models import (
 @dataclass
 class SilhouettePlot(Metric):
     """
-    Calculates and visualizes Silhouette Score, assessing degree of data point suitability to its cluster in ML models.
-
-    **Purpose:** This test calculates the Silhouette Score, which is a model performance metric used in clustering
-    applications. Primarily, the Silhouette Score evaluates how similar an object (data point) is to its own cluster
-    compared to other clusters. The metric ranges between -1 and 1, where a high value indicates that the object is
-    well matched to its own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high
-    Silhouette Score, implying well-separated clusters.
-
-    **Test Mechanism:** The test first extracts the true and predicted labels from the model's training data. The test
-    runs the Silhouette Score function, which takes as input the training dataset features and the predicted labels,
-    subsequently calculating the average score. This average Silhouette Score is printed for reference. The script then
-    calculates the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is
-    represented in this plot, with color distinguishing between different clusters. A red dashed line indicates the
-    average Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model
-    performance analysis and comparison.
-
-    **Signs of High Risk:**
+    Calculates and visualizes Silhouette Score, assessing the degree of data point suitability to its cluster in ML
+    models.
+
+    ### Purpose
+
+    This test calculates the Silhouette Score, which is a model performance metric used in clustering applications.
+    Primarily, the Silhouette Score evaluates how similar a data point is to its own cluster compared to other
+    clusters. The metric ranges between -1 and 1, where a high value indicates that the object is well matched to its
+    own cluster and poorly matched to neighboring clusters. Thus, the goal is to achieve a high Silhouette Score,
+    implying well-separated clusters.
+
+    ### Test Mechanism
+
+    The test first extracts the true and predicted labels from the model's training data. The test runs the Silhouette
+    Score function, which takes as input the training dataset features and the predicted labels, subsequently
+    calculating the average score. This average Silhouette Score is printed for reference. The script then calculates
+    the silhouette coefficients for each data point, helping to form the Silhouette Plot. Each cluster is represented
+    in this plot, with color distinguishing between different clusters. A red dashed line indicates the average
+    Silhouette Score. The Silhouette Scores are also collected into a structured table, facilitating model performance
+    analysis and comparison.
+
+    ### Signs of High Risk
+
     - A low Silhouette Score, potentially indicating that the clusters are not well separated and that data points may
     not be fitting well to their respective clusters.
     - A Silhouette Plot displaying overlapping clusters or the absence of clear distinctions between clusters visually
     also suggests poor clustering performance.
 
-    **Strengths:**
+    ### Strengths
+
     - The Silhouette Score provides a clear and quantitative measure of how well data points have been grouped into
     clusters, offering insights into model performance.
     - The Silhouette Plot provides an intuitive, graphical representation of the clustering mechanism, aiding visual
     assessments of model performance.
     - It does not require ground truth labels, so it's useful when true cluster assignments are not known.
 
-    **Limitations:**
+    ### Limitations
+
     - The Silhouette Score may be susceptible to the influence of outliers, which could impact its accuracy and
     reliability.
     - It assumes the clusters are convex and isotropic, which might not be the case with complex datasets.
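The mechanism described in the docstring above maps directly onto scikit-learn's silhouette utilities. A minimal, hypothetical sketch (not the package's implementation; the clustering model and data are stand-ins):

```python
# Rough sketch of the mechanism described above, assuming scikit-learn.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

X = np.random.RandomState(0).rand(200, 4)  # stand-in for the training features
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

avg_score = silhouette_score(X, labels)    # the "average Silhouette Score"
per_point = silhouette_samples(X, labels)  # coefficients used to build the plot
print(f"average silhouette: {avg_score:.3f}")
```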
@@ -32,33 +32,40 @@ class TrainingTestDegradation(ThresholdTest):
     """
     Tests if model performance degradation between training and test datasets exceeds a predefined threshold.
 
-    **Purpose**: The 'TrainingTestDegradation' class serves as a test to verify that the degradation in performance
-    between the training and test datasets does not exceed a predefined threshold. This test serves as a measure to
-    check the model's ability to generalize from its training data to unseen test data. It assesses key classification
-    metric scores such as accuracy, precision, recall and f1 score, to verify the model's robustness and reliability.
-
-    **Test Mechanism**: The code applies several predefined metrics including accuracy, precision, recall and f1 scores
-    to the model's predictions for both the training and test datasets. It calculates the degradation as the difference
-    between the training score and test score divided by the training score. The test is considered successful if the
-    degradation for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table
-    showing each metric's train score, test score, degradation percentage, and pass/fail status.
-
-    **Signs of High Risk**:
+    ### Purpose
+
+    The `TrainingTestDegradation` class serves as a test to verify that the degradation in performance between the
+    training and test datasets does not exceed a predefined threshold. This test measures the model's ability to
+    generalize from its training data to unseen test data, assessing key classification metrics such as accuracy,
+    precision, recall, and f1 score to verify the model's robustness and reliability.
+
+    ### Test Mechanism
+
+    The code applies several predefined metrics, including accuracy, precision, recall, and f1 scores, to the model's
+    predictions for both the training and test datasets. It calculates the degradation as the difference between the
+    training score and test score divided by the training score. The test is considered successful if the degradation
+    for each metric is less than the preset maximum threshold of 10%. The results are summarized in a table showing
+    each metric's train score, test score, degradation percentage, and pass/fail status.
+
+    ### Signs of High Risk
+
     - A degradation percentage that exceeds the maximum allowed threshold of 10% for any of the evaluated metrics.
     - A high difference or gap between the metric scores on the training and the test datasets.
     - The 'Pass/Fail' column displaying 'Fail' for any of the evaluated metrics.
 
-    **Strengths**:
-    - This test provides a quantitative measure of the model's ability to generalize to unseen data, which is key for
-    predicting its practical real-world performance.
+    ### Strengths
+
+    - Provides a quantitative measure of the model's ability to generalize to unseen data, which is key for predicting
+    its practical real-world performance.
     - By evaluating multiple metrics, it takes into account different facets of model performance and enables a more
     holistic evaluation.
     - The use of a variable predefined threshold allows the flexibility to adjust the acceptability criteria for
     different scenarios.
 
-    **Limitations**:
-    - The test compares raw performance on training and test data, but does not factor in the nature of the data. Areas
-    with less representation in the training set, for instance, might still perform poorly on unseen data.
+    ### Limitations
+
+    - The test compares raw performance on training and test data but does not factor in the nature of the data. Areas
+    with less representation in the training set might still perform poorly on unseen data.
     - It requires good coverage and balance in the test and training datasets to produce reliable results, which may
     not always be available.
     - The test is currently only designed for classification tasks.
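The degradation formula in the docstring is easy to restate in code. A hedged sketch of the pass/fail table it describes (the helper name and threshold handling are illustrative, not the library's code):

```python
# Sketch of the degradation check: (train_score - test_score) / train_score < 10%.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def degradation_table(y_train, train_pred, y_test, test_pred, max_threshold=0.10):
    metrics = {"accuracy": accuracy_score, "precision": precision_score,
               "recall": recall_score, "f1": f1_score}
    rows = []
    for name, fn in metrics.items():
        train_score = fn(y_train, train_pred)
        test_score = fn(y_test, test_pred)
        degradation = (train_score - test_score) / train_score
        rows.append((name, train_score, test_score, degradation, degradation < max_threshold))
    return rows  # (metric, train score, test score, degradation, pass/fail)
```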
@@ -14,42 +14,43 @@ class VMeasure(ClusterPerformance):
     """
     Evaluates homogeneity and completeness of a clustering model using the V Measure Score.
 
-    **1. Purpose:**
+    ### Purpose
+
     The purpose of this metric, V Measure Score (V Score), is to evaluate the performance of a clustering model. It
     measures the homogeneity and completeness of a set of cluster labels, where homogeneity refers to each cluster
     containing only members of a single class and completeness meaning all members of a given class are assigned to the
     same cluster.
 
-    **2. Test Mechanism:**
-    ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the v_measure_score
+    ### Test Mechanism
+
+    ClusterVMeasure is a class that inherits from another class, ClusterPerformance. It uses the `v_measure_score`
     function from the sklearn module's metrics package. The required inputs to perform this metric are the model, train
     dataset, and test dataset. The test is appropriate for models tasked with clustering.
 
-    **3. Signs of High Risk:**
+    ### Signs of High Risk
 
     - Low V Measure Score: A low V Measure Score indicates that the clustering model has poor homogeneity or
     completeness, or both. This might signal that the model is failing to correctly cluster the data.
 
-    **4. Strengths:**
+    ### Strengths
 
     - The V Measure Score is a harmonic mean between homogeneity and completeness. This ensures that both attributes
     are taken into account when evaluating the model, providing an overall measure of its cluster validity.
-
     - The metric does not require knowledge of the ground truth classes when measuring homogeneity and completeness,
     making it applicable in instances where such information is unavailable.
 
-    **5. Limitations:**
-
-    - The V Score can be influenced by the number of clusters, which means that it might not always reflect the quality
-    of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
-    completeness, leading to a low V Score even if the clustering might be useful.
+    ### Limitations
 
+    - The V Measure Score can be influenced by the number of clusters, which means that it might not always reflect the
+    quality of the clustering. Partitioning the data into many small clusters could lead to high homogeneity but low
+    completeness, leading to a low V Measure Score even if the clustering might be useful.
     - It assumes equal importance of homogeneity and completeness. In some applications, one may be more important than
-    the other. The V Score does not provide flexibility in assigning different weights to homogeneity and completeness.
+    the other. The V Measure Score does not provide flexibility in assigning different weights to homogeneity and
+    completeness.
     """
 
     name = "v_measure_score"
-    required_inputs = ["model", "datasets"]
+    required_inputs = ["model", "dataset"]
     tasks = ["clustering"]
     tags = [
         "sklearn",
@@ -27,21 +27,23 @@ class WeakspotsDiagnosis(ThresholdTest):
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.
 
-    **Purpose:**
+    ### Purpose
+
     The weak spots test is applied to evaluate the performance of a machine learning model within specific regions of
     its feature space. This test slices the feature space into various sections, evaluating the model's outputs within
     each section against specific performance metrics (e.g., accuracy, precision, recall, and F1 scores). The ultimate
     aim is to identify areas where the model's performance falls below the set thresholds, thereby exposing its
     possible weaknesses and limitations.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The test mechanism adopts an approach of dividing the feature space of the training dataset into numerous bins. The
     model's performance metrics (accuracy, precision, recall, F1 scores) are then computed for each bin on both the
     training and test datasets. A "weak spot" is identified if any of the performance metrics fall below a
     predetermined threshold for a particular bin on the test dataset. The test results are visually plotted as bar
     charts for each performance metric, indicating the bins which fail to meet the established threshold.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Any performance metric of the model dropping below the set thresholds.
     - Significant disparity in performance between the training and test datasets within a bin could be an indication
@@ -49,7 +51,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Regions or slices with consistently low performance metrics. Such instances could mean that the model struggles
     to handle specific types of input data adequately, resulting in potentially inaccurate predictions.
 
-    **Strengths:**
+    ### Strengths
 
     - The test helps pinpoint precise regions of the feature space where the model's performance is below par, allowing
     for more targeted improvements to the model.
@@ -58,7 +60,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - The test exhibits flexibility, letting users set different thresholds for various performance metrics according
     to the specific requirements of the application.
 
-    **Limitations:**
+    ### Limitations
 
     - The binning system utilized for the feature space in the test could over-simplify the model's behavior within
     each bin. The granularity of this slicing depends on the chosen 'bins' parameter and can sometimes be arbitrary.
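The binning-and-thresholding mechanism is easiest to see on a single feature. A rough sketch under assumed names (this is not the package's implementation, which handles multiple features and metrics):

```python
# Sketch: accuracy per feature bin, flagging bins below a threshold as weak spots.
import pandas as pd
from sklearn.metrics import accuracy_score

def weak_bins(feature_values, y_true, y_pred, bins=10, threshold=0.75):
    frame = pd.DataFrame({"feature": list(feature_values),
                          "y": list(y_true), "pred": list(y_pred)})
    frame["bin"] = pd.cut(frame["feature"], bins=bins)
    results = []
    for interval, grp in frame.groupby("bin", observed=True):
        acc = accuracy_score(grp["y"], grp["pred"])
        results.append((interval, acc, acc < threshold))  # True marks a weak spot
    return results
```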
@@ -15,13 +15,16 @@ class AutoARIMA(Metric):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.
 
-    **Purpose**: The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving
-    Average (ARIMA) models. These models are primarily used for forecasting time-series data. The validation test
-    automatically fits multiple ARIMA models, with varying parameters, to every variable within the given dataset. The
-    models are then ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC)
-    values, which provide a basis for the efficient model selection process.
+    ### Purpose
+
+    The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving Average (ARIMA)
+    models. These models are primarily used for forecasting time-series data. The validation test automatically fits
+    multiple ARIMA models, with varying parameters, to every variable within the given dataset. The models are then
+    ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) values, which
+    provide a basis for the efficient model selection process.
+
+    ### Test Mechanism
 
-    **Test Mechanism**:
     This metric proceeds by generating an array of feasible combinations of ARIMA model parameters which are within a
     prescribed limit. These limits include `max_p`, `max_d`, `max_q`; they represent the autoregressive, differencing,
     and moving average components respectively. Upon applying these sets of parameters, the validation test fits each
@@ -31,28 +34,31 @@ class AutoARIMA(Metric):
     found to be non-stationary, a warning message is sent out, given that ARIMA models necessitate input series to be
     stationary.
 
-    **Signs of High Risk**:
-    * If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
+    ### Signs of High Risk
+
+    - If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
     indicates that the series might not be stationary, leading to potentially inaccurate results.
-    * Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
+    - Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
     either the data or model stability.
 
-    **Strengths**:
-    * The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
+    ### Strengths
+
+    - The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
     on BIC and AIC criteria.
-    * The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
+    - The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
     ARIMA models.
-    * The exhaustive search through all possible combinations of model parameters enhances the likelihood of
+    - The exhaustive search through all possible combinations of model parameters enhances the likelihood of
     identifying the best-fit model.
 
-    **Limitations**:
-    * This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
+    ### Limitations
+
+    - This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
     every variable.
-    * Although the test checks for non-stationarity and logs warnings where present, it does not apply any
+    - Although the test checks for non-stationarity and logs warnings where present, it does not apply any
     transformations to the data to establish stationarity.
-    * The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
+    - The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
     all scenarios.
-    * The test is only applicable to regression tasks involving time-series data, and may not work effectively for
+    - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """
 
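A hedged sketch of the selection loop the two AutoARIMA hunks above describe, using statsmodels (the `max_p`/`max_d`/`max_q` names come from the docstring; everything else is illustrative):

```python
# Sketch: fit ARIMA(p, d, q) candidates within the limits and rank them by BIC/AIC.
import itertools
import warnings
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

def rank_arima(series, max_p=3, max_d=2, max_q=3):
    if adfuller(series.dropna())[1] > 0.05:
        warnings.warn("Series may be non-stationary; ARIMA results can be unreliable.")
    rows = []
    for p, d, q in itertools.product(range(max_p + 1), range(max_d + 1), range(max_q + 1)):
        try:
            res = ARIMA(series, order=(p, d, q)).fit()
            rows.append({"order": (p, d, q), "bic": res.bic, "aic": res.aic})
        except Exception:
            continue  # some parameter combinations fail to converge
    return pd.DataFrame(rows).sort_values("bic")
```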
@@ -11,31 +11,35 @@ class BoxPierce(Metric):
     """
     Detects autocorrelation in time-series data through the Box-Pierce test to validate model performance.
 
-    **Purpose:** The Box-Pierce test is utilized to detect the presence of autocorrelation in a time-series dataset.
+    ### Purpose
+
+    The Box-Pierce test is utilized to detect the presence of autocorrelation in a time-series dataset.
     Autocorrelation, or serial correlation, refers to the degree of similarity between observations based on the
     temporal spacing between them. This test is essential for affirming the quality of a time-series model by ensuring
     that the error terms in the model are random and do not adhere to a specific pattern.
 
-    **Test Mechanism:** The implementation of the Box-Pierce test involves calculating a test statistic along with a
-    corresponding p-value derived from the dataset features. These quantities are used to test the null hypothesis that
-    posits the data to be independently distributed. This is achieved by iterating over every feature column in the
-    time-series data and applying the `acorr_ljungbox` function of the statsmodels library. The function yields the
-    Box-Pierce test statistic as well as the respective p-value, all of which are cached as test results.
+    ### Test Mechanism
+
+    The implementation of the Box-Pierce test involves calculating a test statistic along with a corresponding p-value
+    derived from the dataset features. These quantities are used to test the null hypothesis that posits the data to be
+    independently distributed. This is achieved by iterating over every feature column in the time-series data and
+    applying the `acorr_ljungbox` function of the statsmodels library. The function yields the Box-Pierce test
+    statistic as well as the respective p-value, all of which are cached as test results.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - A low p-value, typically under 0.05 as per statistical convention, throws the null hypothesis of independence
     into question. This implies that the dataset potentially houses autocorrelations, thus indicating a high-risk
     scenario concerning model performance.
     - Large Box-Pierce test statistic values may indicate the presence of autocorrelation.
 
-    **Strengths:**
+    ### Strengths
 
     - Detects patterns in data that are supposed to be random, thereby ensuring no underlying autocorrelation.
     - Can be computed efficiently given its low computational complexity.
     - Can be widely applied to most regression problems, making it very versatile.
 
-    **Limitations:**
+    ### Limitations
 
     - Assumes homoscedasticity (constant variance) and normality of residuals, which may not always be the case in
     real-world datasets.
@@ -43,7 +47,7 @@ class BoxPierce(Metric):
     correlations.
     - It only provides a general indication of the existence of autocorrelation, without providing specific insights
     into the nature or patterns of the detected autocorrelation.
-    - In the presence of exhibits trends or seasonal patterns, the Box-Pierce test may yield misleading results.
+    - In the presence of trends or seasonal patterns, the Box-Pierce test may yield misleading results.
     - Applicability is limited to time-series data, which limits its overall utility.
     """
 
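The `acorr_ljungbox` call named in the docstring returns the Box-Pierce statistic when asked; a small illustrative example (the series is random stand-in data):

```python
import numpy as np
import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox

series = pd.Series(np.random.RandomState(0).normal(size=200))  # stand-in feature column
# Recent statsmodels versions return a DataFrame with Ljung-Box and Box-Pierce columns.
result = acorr_ljungbox(series, lags=[10], boxpierce=True)
print(result[["bp_stat", "bp_pvalue"]])
```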
@@ -2,138 +2,107 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from matplotlib import cm
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-@dataclass
-class CumulativePredictionProbabilities(Metric):
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
     regression models.
 
-    **Purpose**: This metric is utilized to evaluate the distribution of predicted probabilities for positive and
-    negative classes in a logistic regression model. It's not solely intended to measure the model's performance but
-    also provides a visual assessment of the model's behavior by plotting the cumulative probabilities for positive and
-    negative classes across both the training and test datasets.
+    ### Purpose
+
+    This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
+    in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+    probabilities for positive and negative classes across both the training and test datasets.
+
+    ### Test Mechanism
+
+    The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+    the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
+    for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
+    distributions of these probabilities are created for both positive and negative classes across both training and
+    test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for
+    the training data and the other for the test data, with lines representing cumulative distributions of positive and
+    negative classes.
 
-    **Test Mechanism**: The logistic regression model is evaluated by first computing the predicted probabilities for
-    each instance in both the training and test datasets, which are then added as a new column in these sets. The
-    cumulative probabilities for positive and negative classes are subsequently calculated and sorted in ascending
-    order. Cumulative distributions of these probabilities are created for both positive and negative classes across
-    both training and test datasets. These cumulative probabilities are represented visually in a plot, containing two
-    subplots - one for the training data and the other for the test data, with lines representing cumulative
-    distributions of positive and negative classes.
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - Imbalanced distribution of probabilities for either positive or negative classes.
     - Notable discrepancies or significant differences between the cumulative probability distributions for the
     training data versus the test data.
     - Marked discrepancies or large differences between the cumulative probability distributions for positive and
     negative classes.
 
-    **Strengths**:
-    - It offers not only numerical probabilities but also provides a visual illustration of data, which enhances the
-    ease of understanding and interpreting the model's behavior.
+    ### Strengths
+
+    - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's
+    behavior.
     - Allows for the comparison of model's behavior across training and testing datasets, providing insights about how
     well the model is generalized.
-    - It differentiates between positive and negative classes and their respective distribution patterns, which can aid
-    in problem diagnosis.
+    - Differentiates between positive and negative classes and their respective distribution patterns, aiding in
+    problem diagnosis.
+
+    ### Limitations
 
-    **Limitations**:
     - Exclusive to classification tasks and specifically to logistic regression models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
-    - The method does not give a solitary quantifiable measure of model risk, rather it offers a visual representation
-    and broad distributional information.
+    - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
+    representation and broad distributional information.
    - If the training and test datasets are not representative of the overall data distribution, the metric could
    provide misleading results.
     """
 
-    name = "cumulative_prediction_probabilities"
-    required_inputs = ["model", "datasets"]
-    tasks = ["classification"]
-    tags = ["logistic_regression", "visualization"]
-
-    default_params = {"title": "Cumulative Probabilities"}
-
-    @staticmethod
-    def plot_cumulative_prob(dataframes, dataset_titles, target_col, title):
-        figures = []
-
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-            for class_value in sorted(df[target_col].unique()):
-                # Calculate cumulative distribution for the current class
-                sorted_probs = np.sort(
-                    df[df[target_col] == class_value]["probabilities"]
-                )
-                cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
-
-                fig.add_trace(
-                    go.Scatter(
-                        x=sorted_probs,
-                        y=cumulative_probs,
-                        mode="lines",
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        line=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Probability",
-                yaxis_title="Cumulative Distribution",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-        title = self.params.get("title", self.default_params["title"])
-
-        dataframes = []
-        metric_value = {"cum_prob": {}}
-        for dataset in self.inputs.datasets:
-            df = dataset.df.copy()
-            y_prob = dataset.y_prob(self.inputs.model)
-            df["probabilities"] = y_prob
-            dataframes.append(df)
-            metric_value["cum_prob"][dataset.input_id] = list(df["probabilities"])
-
-        figures = self.plot_cumulative_prob(
-            dataframes, dataset_titles, target_column, title
-        )
+    df = dataset.df
+    df["probabilities"] = dataset.y_prob(model)
 
-        figures_list = [
-            Figure(
-                for_object=self,
-                key=f"cumulative_prob_{title.replace(' ', '_')}_{i+1}",
-                figure=fig,
+    fig = _plot_cumulative_prob(df, dataset.target_column, title)
+
+    return fig
+
+
+def _plot_cumulative_prob(df, target_col, title):
+
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+    for class_value in sorted(df[target_col].unique()):
+        # Calculate cumulative distribution for the current class
+        sorted_probs = np.sort(df[df[target_col] == class_value]["probabilities"])
+        cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
+
+        fig.add_trace(
+            go.Scatter(
+                x=sorted_probs,
+                y=cumulative_probs,
+                mode="lines",
+                name=f"{target_col} = {class_value}",
+                line=dict(
+                    color=color_dict[class_value],
+                ),
             )
-            for i, fig in enumerate(figures)
-        ]
+        )
+    fig.update_layout(
+        title_text=f"{title}",
+        xaxis_title="Probability",
+        yaxis_title="Cumulative Distribution",
+    )
 
-        return self.cache_results(metric_value=metric_value, figures=figures_list)
+    return fig
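With the refactor above, the test becomes a decorated function rather than a Metric subclass. Assuming the usual ValidMind workflow (the input IDs below are placeholders, not taken from this diff), it would typically be invoked through the test runner:

```python
# Hedged usage sketch; dataset/model input IDs are illustrative placeholders.
import validmind as vm

result = vm.tests.run_test(
    "validmind.model_validation.statsmodels.CumulativePredictionProbabilities",
    inputs={"dataset": "test_dataset", "model": "log_reg_model"},
    params={"title": "Cumulative Probabilities"},
)
```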
@@ -14,32 +14,39 @@ class DurbinWatsonTest(Metric):
     """
     Assesses autocorrelation in time series data features using the Durbin-Watson statistic.
 
-    **Purpose**: The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values
-    influences their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
+    ### Purpose
+
+    The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values influences
+    their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
     independence of residuals. A model with significant autocorrelation may give unreliable predictions.
 
-    **Test Mechanism**: Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson
-    (DW) Test metric generates a statistical value for each feature of the training dataset. The function is looped
-    over all columns of the dataset, calculating and caching the DW value for each column for further analysis. A DW
-    metric value nearing 2 indicates no autocorrelation. Conversely, values approaching 0 suggest positive
-    autocorrelation, and those leaning towards 4 imply negative autocorrelation.
+    ### Test Mechanism
+
+    Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson (DW) Test metric
+    generates a statistical value for each feature of the training dataset. The function is looped over all columns of
+    the dataset, calculating and caching the DW value for each column for further analysis. A DW metric value nearing 2
+    indicates no autocorrelation. Conversely, values approaching 0 suggest positive autocorrelation, and those leaning
+    towards 4 imply negative autocorrelation.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
     - If a feature's DW value significantly deviates from 2, it could signal a high risk due to potential
     autocorrelation issues in the dataset.
-    - A value closer to '0' could imply positive autocorrelation, while a value nearer to '4' could point to negative
+    - A value closer to 0 could imply positive autocorrelation, while a value nearer to 4 could point to negative
     autocorrelation, both leading to potentially unreliable prediction models.
 
-    **Strengths**:
+    ### Strengths
+
     - The metric specializes in identifying autocorrelation in prediction model residuals.
     - Autocorrelation detection assists in diagnosing violation of various modeling technique assumptions, particularly
     in regression analysis and time-series data modeling.
 
-    **Limitations**:
+    ### Limitations
+
     - The Durbin-Watson Test mainly detects linear autocorrelation and could overlook other types of relationships.
     - The metric is highly sensitive to data points order. Shuffling the order could lead to notably different results.
     - The test only checks for first-order autocorrelation (between a variable and its immediate predecessor) and fails
-    to detect higher order autocorrelation.
+    to detect higher-order autocorrelation.
     """
 
     name = "durbin_watson"
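The per-column loop described above reduces to repeated calls of statsmodels' `durbin_watson`; a brief sketch (the helper name is hypothetical):

```python
import pandas as pd
from statsmodels.stats.stattools import durbin_watson

def dw_by_column(df: pd.DataFrame) -> pd.Series:
    # Values near 2 suggest no autocorrelation; near 0 positive, near 4 negative.
    return pd.Series({col: durbin_watson(df[col].dropna()) for col in df.columns})
```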