validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -20,34 +20,41 @@ class WOEBinPlots(Metric):
  Generates visualizations of Weight of Evidence (WoE) and Information Value (IV) for understanding predictive power
  of categorical variables in a data set.
 
- **Purpose**: This test is designed to visualize the Weight of Evidence (WoE) and Information Value (IV) for
- categorical variables in a provided dataset. By showcasing the data distribution across different categories of
- each feature, it aids in understanding each variable's predictive power in the context of a classification-based
- machine learning model. Commonly used in credit scoring models, WoE and IV are robust statistical methods for
- evaluating a variable's predictive power.
-
- **Test Mechanism**: The test implementation follows defined steps. Initially, it selects non-numeric columns from
- the dataset and changes them to string type, paving the way for accurate binning. It then performs an automated WoE
- binning operation on these selected features, effectively categorizing the potential values of a variable into
- distinct bins. After the binning process, the function generates two separate visualizations (a scatter chart for
- WoE values and a bar chart for IV) for each variable. These visual presentations are formed according to the spread
- of each metric across various categories of each feature.
-
- **Signs of High Risk**:
+ ### Purpose
+
+ This test is designed to visualize the Weight of Evidence (WoE) and Information Value (IV) for categorical
+ variables in a provided dataset. By showcasing the data distribution across different categories of each feature,
+ it aids in understanding each variable's predictive power in the context of a classification-based machine learning
+ model. Commonly used in credit scoring models, WoE and IV are robust statistical methods for evaluating a
+ variable's predictive power.
+
+ ### Test Mechanism
+
+ The test implementation follows defined steps. Initially, it selects non-numeric columns from the dataset and
+ changes them to string type, paving the way for accurate binning. It then performs an automated WoE binning
+ operation on these selected features, effectively categorizing the potential values of a variable into distinct
+ bins. After the binning process, the function generates two separate visualizations (a scatter chart for WoE values
+ and a bar chart for IV) for each variable. These visual presentations are formed according to the spread of each
+ metric across various categories of each feature.
+
+ ### Signs of High Risk
+
  - Errors occurring during the binning process.
  - Challenges in converting non-numeric columns into string data type.
  - Misbalance in the distribution of WoE and IV, with certain bins overtaking others conspicuously. This could
  denote that the model is disproportionately dependent on certain variables or categories for predictions, an
  indication of potential risks to its robustness and generalizability.
 
- **Strengths**:
+ ### Strengths
+
  - Provides a detailed visual representation of the relationship between feature categories and the target variable.
  This grants an intuitive understanding of each feature's contribution to the model.
  - Allows for easy identification of features with high impact, facilitating feature selection and enhancing
  comprehension of the model's decision logic.
  - WoE conversions are monotonic, upholding the rank ordering of the original data points, which simplifies analysis.
 
- **Limitations**:
+ ### Limitations
+
  - The method is largely reliant on the binning process, and an inappropriate binning threshold or bin number choice
  might result in a misrepresentation of the variable's distribution.
  - While excellent for categorical data, the encoding of continuous variables into categorical can sometimes lead to
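For readers unfamiliar with the two quantities this docstring refers to, here is a minimal pandas sketch of how WoE and IV are computed for one categorical feature. The column names and data are hypothetical, sign conventions vary between libraries, and the actual test delegates the computation to `scorecardpy`:

```python
import numpy as np
import pandas as pd

# Hypothetical feature/target pair; 1 = event (e.g., default)
df = pd.DataFrame({
    "home_ownership": ["RENT", "OWN", "RENT", "MORTGAGE", "OWN", "RENT"],
    "default": [1, 0, 1, 0, 0, 1],
})

grouped = df.groupby("home_ownership")["default"].agg(["sum", "count"])
events = grouped["sum"]                 # events per bin
non_events = grouped["count"] - events  # non-events per bin

# Share of all events / non-events falling in each bin (clipped to avoid log(0))
dist_event = (events / events.sum()).clip(lower=1e-6)
dist_non_event = (non_events / non_events.sum()).clip(lower=1e-6)

woe = np.log(dist_non_event / dist_event)          # WoE per bin
iv = ((dist_non_event - dist_event) * woe).sum()   # IV for the whole feature
print(woe)
print(f"IV: {iv:.3f}")
```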
@@ -13,36 +13,41 @@ from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableM
  @dataclass
  class WOEBinTable(Metric):
  """
- Calculates and assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature in a ML model.
-
- **Purpose**: The Weight of Evidence (WoE) and Information Value (IV) test is intended to evaluate the predictive
- power of each feature in the machine learning model. The test generates binned groups of values from each feature
- in a dataset, computes the WoE value and the IV value for each bin. These values provide insights on the
- relationship between each feature and the target variable and their contribution towards the predictive output of
- the model.
-
- **Test Mechanism**: The metric leverages the `scorecardpy.woebin` method to perform WoE-based automatic binning on
- the dataset. Depending on the parameter `breaks_adj`, the method adjusts the cut-off points for binning numeric
- variables. The bins are then used to calculate the WoE and IV. The metric requires a dataset with the target
- variable defined. The metric outputs a dataframe that comprises the bin boundaries, WoE, and IV values for each
- feature.
-
- **Signs of High Risk**:
- - High IV values, which denote variables with too much predictive power which might lead to overfitting
- - Errors during the binning process, which might be due to inappropriate data types or poorly defined bins
-
- **Strengths**:
- - The WoE and IV test is highly effective for feature selection in binary classification problems, as it quantifies
- how much predictive information is packed within each feature regarding the binary outcome
- - The WoE transformation creates a monotonic relationship between the target and independent variables
-
- **Limitations**:
- - Mainly designed for binary classification tasks, therefore it might not be applicable or reliable for multi-class
- classification or regression tasks
- - If the dataset has many features or the features are not binnable or they are non-numeric, this process might
- encounter difficulties
- - This metric doesn't help in identifying if the predictive factor being observed is a coincidence or a real
- phenomenon due to data randomness
+ Assesses the Weight of Evidence (WoE) and Information Value (IV) of each feature to evaluate its predictive power
+ in a binary classification model.
+
+ ### Purpose
+
+ The Weight of Evidence (WoE) and Information Value (IV) test is designed to evaluate the predictive power of each
+ feature in a machine learning model. This test generates binned groups of values from each feature, computes the
+ WoE and IV for each bin, and provides insights into the relationship between each feature and the target variable,
+ illustrating their contribution to the model's predictive capabilities.
+
+ ### Test Mechanism
+
+ The test uses the `scorecardpy.woebin` method to perform automatic binning of the dataset based on WoE. The method
+ adjusts the cut-off points for binning numeric variables based on the parameter `breaks_adj`. The bins are then
+ used to calculate the WoE and IV values, effectively creating a dataframe that includes the bin boundaries, WoE,
+ and IV values for each feature. A target variable is required in the dataset to perform this analysis.
+
+ ### Signs of High Risk
+
+ - High IV values, indicating variables with excessive predictive power which might lead to overfitting.
+ - Errors during the binning process, potentially due to inappropriate data types or poorly defined bins.
+
+ ### Strengths
+
+ - Highly effective for feature selection in binary classification problems, as it quantifies the predictive
+ information within each feature concerning the binary outcome.
+ - The WoE transformation creates a monotonic relationship between the target and independent variables.
+
+ ### Limitations
+
+ - Primarily designed for binary classification tasks, making it less applicable or reliable for multi-class
+ classification or regression tasks.
+ - Potential difficulties if the dataset has many features, non-binnable features, or non-numeric features.
+ - The metric does not help in distinguishing whether the observed predictive factor is due to data randomness or a
+ true phenomenon.
  """
 
  name = "woe_bin_table"
@@ -17,36 +17,42 @@ logger = get_logger(__name__)
  @dataclass
  class ZivotAndrewsArch(Metric):
  """
- Evaluates the order of integration and stationarity of time series data using Zivot-Andrews unit root test.
-
- **Purpose**: The Zivot-Andrews Arch metric is used to evaluate the order of integration for a time series data in a
- machine learning model. It's designed to test for stationarity, a crucial aspect in time series analysis where data
- points are not dependent on time. Stationarity means that the statistical properties such as mean, variance and
- autocorrelation are all constant over time.
-
- **Test Mechanism**: The Zivot-Andrews unit root test is performed on each feature in the dataset using the
- `ZivotAndrews` function from the `arch.unitroot` module. This function returns the Zivot-Andrews metric for each
- feature, which includes the statistical value, p-value (probability value), the number of used lags, and the number
- of observations. The p-value is later used to decide on the null hypothesis (the time series has a unit root and is
- non-stationary) based on a chosen level of significance.
-
- **Signs of High Risk**:
- - A high p-value can suggest high risk. This might indicate that there's insufficient evidence to reject the null
- hypothesis, which would mean the time series has a unit root and is therefore non-stationary.
+ Evaluates the order of integration and stationarity of time series data using the Zivot-Andrews unit root test.
+
+ ### Purpose
+
+ The Zivot-Andrews Arch metric is used to evaluate the order of integration for time series data in a machine
+ learning model. It's designed to test for stationarity, a crucial aspect of time series analysis, where data points
+ are independent of time. Stationarity means that the statistical properties such as mean, variance, and
+ autocorrelation are constant over time.
+
+ ### Test Mechanism
+
+ The Zivot-Andrews unit root test is performed on each feature in the dataset using the `ZivotAndrews` function from
+ the `arch.unitroot` module. This function returns several metrics for each feature, including the statistical
+ value, p-value (probability value), the number of lags used, and the number of observations. The p-value is used to
+ decide on the null hypothesis (the time series has a unit root and is non-stationary) based on a chosen level of
+ significance.
+
+ ### Signs of High Risk
+
+ - A high p-value suggests high risk, indicating insufficient evidence to reject the null hypothesis, implying that
+ the time series has a unit root and is non-stationary.
  - Non-stationary time series data can lead to misleading statistics and unreliable machine learning models.
 
- **Strengths**:
- - The Zivot-Andrews Arch metric dynamically tests for stationarity against structural breaks in time series data,
- offering robust evaluation of stationarity in features.
- - This metric is especially beneficial with financial, economic, or other time-series data where data observations
- lack a consistent pattern and structural breaks may occur.
-
- **Limitations**:
- - The Zivot-Andrews Arch metric assumes that data is derived from a single-equation, autoregressive model. It may,
- therefore, not be appropriate for multivariate time series data or data which does not align with the
- autoregressive model assumption.
- - It might not take into account unexpected shocks or changes in the series trend which can both have a significant
- impact on the stationarity of the data.
+ ### Strengths
+
+ - Dynamically tests for stationarity against structural breaks in time series data, offering robust evaluation of
+ stationarity in features.
+ - Especially beneficial with financial, economic, or other time-series data where data observations lack a
+ consistent pattern and structural breaks may occur.
+
+ ### Limitations
+
+ - Assumes data is derived from a single-equation, autoregressive model, making it less appropriate for multivariate
+ time series data or data not aligning with this model.
+ - May not account for unexpected shocks or changes in the series trend, both of which can significantly impact data
+ stationarity.
  """
 
  name = "zivot_andrews"
@@ -19,33 +19,40 @@ from ....vm_models import Figure, Metric, VMDataset
  @dataclass
  class CommonWords(Metric):
  """
- Identifies and visualizes the 40 most frequent non-stopwords in a specified text column within a dataset.
+ Assesses the most frequent non-stopwords in a text column for identifying prevalent language patterns.
 
- **Purpose**: The CommonWords metric is used to identify and visualize the most prevalent words within a specified
- text column of a dataset. This provides insights into the prevalent language patterns and vocabulary, especially
- useful in Natural Language Processing (NLP) tasks such as text classification and text summarization.
+ ### Purpose
 
- **Test Mechanism**: The test methodology involves splitting the specified text column's entries into words,
- collating them into a corpus, and then counting the frequency of each word using the Counter. The forty most
- frequently occurring non-stopwords are then visualized in a bar chart, where the x-axis represents the words, and
- the y-axis indicates their frequency of occurrence.
+ The CommonWords metric is used to identify and visualize the most prevalent words within a specified text column of
+ a dataset. This provides insights into the prevalent language patterns and vocabulary, especially useful in Natural
+ Language Processing (NLP) tasks such as text classification and text summarization.
+
+ ### Test Mechanism
+
+ The test methodology involves splitting the specified text column's entries into words, collating them into a
+ corpus, and then counting the frequency of each word using the Counter. The forty most frequently occurring
+ non-stopwords are then visualized in a bar chart, where the x-axis represents the words, and the y-axis indicates
+ their frequency of occurrence.
+
+ ### Signs of High Risk
 
- **Signs of High Risk**:
  - A lack of distinct words within the list, or the most common words being stopwords.
  - Frequent occurrence of irrelevant or inappropriate words could point out a poorly curated or noisy dataset.
- - An error returned due to the absence of a valid Dataset object indicates high risk as the metric cannot be
+ - An error returned due to the absence of a valid Dataset object, indicating high risk as the metric cannot be
  effectively implemented without it.
 
- **Strengths**:
+ ### Strengths
+
  - The metric provides clear insights into the language features – specifically word frequency – of unstructured
  text data.
  - It can reveal prominent vocabulary and language patterns, which prove vital for feature extraction in NLP tasks.
  - The visualization helps in quickly capturing the patterns and understanding the data intuitively.
 
- **Limitations**:
+ ### Limitations
+
  - The test disregards semantic or context-related information as it solely focuses on word frequency.
- - It intentionally ignores stopwords which might carry necessary significance in certain scenarios.
- - The applicability is limited to English language text data as English stopwords are used for filtering, hence
+ - It intentionally ignores stopwords, which might carry necessary significance in certain scenarios.
+ - The applicability is limited to English-language text data as English stopwords are used for filtering, hence
  cannot account for data in other languages.
  - The metric requires a valid Dataset object, indicating a dependency condition that limits its broader
  applicability.
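The Counter-based mechanism the docstring describes amounts to the following sketch. The stopword list is an assumption (NLTK's English list); the sample texts are hypothetical:

```python
from collections import Counter

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)  # one-time download of the stopword corpus

texts = [
    "the model predicts credit risk",
    "credit risk models need validation",
]  # hypothetical text column

corpus = " ".join(texts).lower().split()
stop_words = set(stopwords.words("english"))

# Frequency of the 40 most common non-stopwords, ready for a bar chart
top_words = Counter(w for w in corpus if w not in stop_words).most_common(40)
print(top_words)
```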
@@ -9,8 +9,7 @@ Threshold based tests
 import re
 from dataclasses import dataclass
 
-import matplotlib.pyplot as plt
-import seaborn as sns
+import plotly.graph_objects as go
 
 from validmind.vm_models import Figure, ThresholdTest, VMDataset
 
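This hunk swaps the plotting backend from matplotlib/seaborn to Plotly; the rotated-label bar chart the old code produced maps onto Plotly roughly as follows (data is illustrative):

```python
import plotly.graph_objects as go

labels = ["ml", "ai", "nlp"]  # illustrative categories
counts = [42, 17, 9]          # illustrative frequencies

fig = go.Figure(go.Bar(x=labels, y=counts))
fig.update_layout(xaxis_tickangle=-90)  # Plotly analogue of plt.xticks(rotation=90)
fig.show()
```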
@@ -20,37 +19,44 @@ class Hashtags(ThresholdTest):
  """
  Assesses hashtag frequency in a text column, highlighting usage trends and potential dataset bias or spam.
 
- **Purpose**: The Hashtags test is designed to measure the frequency of hashtags used within a given text column in
- a dataset. It is particularly useful for natural language processing tasks such as text classification and text
- summarization. The goal is to identify common trends and patterns in the use of hashtags, which can serve as
- critical indicators or features within a machine learning model.
+ ### Purpose
 
- **Test Mechanism**: The test implements a regular expression (regex) to extract all hashtags from the specified
- text column. For each hashtag found, it makes a tally of its occurrences. It then outputs a list of the top N
- hashtags (default is 25, but customizable), sorted by their counts in descending order. The results are also
- visualized in a bar plot, with frequency counts on the y-axis and the corresponding hashtags on the x-axis.
+ The Hashtags test is designed to measure the frequency of hashtags used within a given text column in a dataset. It
+ is particularly useful for natural language processing tasks such as text classification and text summarization.
+ The goal is to identify common trends and patterns in the use of hashtags, which can serve as critical indicators
+ or features within a machine learning model.
+
+ ### Test Mechanism
+
+ The test implements a regular expression (regex) to extract all hashtags from the specified text column. For each
+ hashtag found, it makes a tally of its occurrences. It then outputs a list of the top N hashtags (default is 25,
+ but customizable), sorted by their counts in descending order. The results are also visualized in a bar plot, with
+ frequency counts on the y-axis and the corresponding hashtags on the x-axis.
+
+ ### Signs of High Risk
 
- **Signs of High Risk**:
  - A low diversity in the usage of hashtags, as indicated by a few hashtags being used disproportionately more than
  others.
  - Repeated usage of one or few hashtags can be indicative of spam or a biased dataset.
  - If there are no or extremely few hashtags found in the dataset, it perhaps signifies that the text data does not
  contain structured social media data.
 
- **Strengths**:
- - It provides a concise visual representation of the frequency of hashtags, which can be critical for understanding
+ ### Strengths
+
+ - Provides a concise visual representation of the frequency of hashtags, which can be critical for understanding
  trends about a particular topic in text data.
- - It is instrumental in tasks specifically related to social media text analytics, such as opinion analysis and
- trend discovery.
- - The test is adaptable, allowing the flexibility to determine the number of top hashtags to be analyzed.
+ - Instrumental in tasks specifically related to social media text analytics, such as opinion analysis and trend
+ discovery.
+ - Adaptable, allowing the flexibility to determine the number of top hashtags to be analyzed.
 
- **Limitations**:
- - The test assumes the presence of hashtags and therefore may not be applicable for text datasets that do not
- contain hashtags (e.g., formal documents, scientific literature).
+ ### Limitations
+
+ - Assumes the presence of hashtags and therefore may not be applicable for text datasets that do not contain
+ hashtags (e.g., formal documents, scientific literature).
  - Language-specific limitations of hashtag formulations are not taken into account.
- - It does not account for typographical errors, variations, or synonyms in hashtags.
- - This test does not provide context or sentiment associated with the hashtags, so the information provided may
- have limited utility on its own.
+ - Does not account for typographical errors, variations, or synonyms in hashtags.
+ - Does not provide context or sentiment associated with the hashtags, so the information provided may have limited
+ utility on its own.
  """
 
  name = "hashtags"
@@ -67,25 +73,23 @@ class Hashtags(ThresholdTest):
         text_column = self.inputs.dataset.text_column
 
         def find_hash(text):
-            line = re.findall(r"(?<=#)\w+", text)
-            return " ".join(line)
-
-        temp = (
-            self.inputs.dataset.df[text_column]
-            .apply(lambda x: find_hash(x))
-            .value_counts()[:][1 : self.params["top_hashtags"]]
-        )
-        temp = (
-            temp.to_frame()
-            .reset_index()
-            .rename(columns={"index": "Hashtag", text_column: "count"})
-        )
+            return re.findall(r"(?<=#)\w+", str(text))
+
+        # Extract hashtags from the text column and count occurrences
+        hashtags = self.inputs.dataset.df[text_column].apply(find_hash).explode()
+        temp = hashtags.value_counts().head(self.params["top_hashtags"])
+
+        print(f"temp: {temp}")
 
         figures = []
         if not temp.empty:
-            fig = plt.figure()
-            sns.barplot(x="Hashtag", y="count", data=temp)
-            plt.xticks(rotation=90)
+            fig = go.Figure(data=[go.Bar(x=temp.index, y=temp.values)])
+            fig.update_layout(
+                title="Top Hashtags",
+                xaxis_title="Hashtag",
+                yaxis_title="Count",
+                xaxis_tickangle=-45,
+            )
             figures.append(
                 Figure(
                     for_object=self,
@@ -93,7 +97,5 @@ class Hashtags(ThresholdTest):
                     figure=fig,
                 )
             )
-        # Do this if you want to prevent the figure from being displayed
-        plt.close("all")
 
         return self.cache_results([], passed=True, figures=figures)
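Stripped of the ValidMind plumbing, the new extraction-and-plot pipeline in the two hunks above boils down to this self-contained sketch (sample data is illustrative; note that the `print(f"temp: {temp}")` call above is a leftover debug statement that ships as-is in this version):

```python
import re

import pandas as pd
import plotly.graph_objects as go

df = pd.DataFrame({"text": ["launch day #ml #ai", "loving #ml", "no tags here"]})

# One row per hashtag occurrence; rows without hashtags explode to NaN,
# which value_counts() skips by default
hashtags = df["text"].apply(lambda t: re.findall(r"(?<=#)\w+", str(t))).explode()
top = hashtags.value_counts().head(25)

fig = go.Figure(go.Bar(x=top.index, y=top.values))
fig.update_layout(title="Top Hashtags", xaxis_tickangle=-45)
fig.show()
```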
@@ -17,24 +17,43 @@ from validmind import tags, tasks
  @tasks("text_classification", "text_summarization")
  def LanguageDetection(dataset):
  """
- Detects the language of each text entry in a dataset and visualizes the distribution of languages
- as a histogram.
+ Assesses the diversity of languages in a textual dataset by detecting and visualizing the distribution of languages.
 
- This method checks for a specified text column in the dataset's dataframe, uses a language detection
- library to determine the language of each text entry, and returns a histogram plot of the language
- distribution.
+ ### Purpose
 
- Args:
- dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
- and a `text_column` attribute indicating the name of the column containing text. If the
- `text_column` attribute is not set, a ValueError is raised.
+ The Language Detection test aims to identify and visualize the distribution of languages present within a textual
+ dataset. This test helps in understanding the diversity of languages in the data, which is crucial for developing
+ and validating multilingual models.
 
- Returns:
- plotly.graph_objs._figure.Figure: A Plotly histogram plot showing the distribution of detected
- languages across the dataset's text entries.
+ ### Test Mechanism
 
- Raises:
- ValueError: If the `text_column` is not specified in the dataset object.
+ This test operates by:
+
+ - Checking if the dataset has a specified text column.
+ - Using a language detection library to determine the language of each text entry in the dataset.
+ - Generating a histogram plot of the language distribution, with language codes on the x-axis and their frequencies
+ on the y-axis.
+
+ If the text column is not specified, a ValueError is raised to ensure proper dataset configuration.
+
+ ### Signs of High Risk
+
+ - A high proportion of entries returning "Unknown" language codes.
+ - Detection of unexpectedly diverse or incorrect language codes, indicating potential data quality issues.
+ - Significant imbalance in language distribution, which might indicate potential biases in the dataset.
+
+ ### Strengths
+
+ - Provides a visual representation of language diversity within the dataset.
+ - Helps identify data quality issues related to incorrect or unknown language detection.
+ - Useful for ensuring that multilingual models have adequate and appropriate representation from various languages.
+
+ ### Limitations
+
+ - Dependency on the accuracy of the language detection library, which may not be perfect.
+ - Languages with similar structures or limited text length may be incorrectly classified.
+ - The test returns "Unknown" for entries where language detection fails, which might mask underlying issues with
+ certain languages or text formats.
  """
  # check text column
  if not dataset.text_column:
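The diff does not show which detection library the function uses; assuming `langdetect` (any equivalent detector fits the described behavior), the mechanism reduces to:

```python
import pandas as pd
import plotly.express as px
from langdetect import detect  # assumption: the implementation's library is not shown in this diff

df = pd.DataFrame({"text": ["hello world", "bonjour le monde", "hola mundo"]})

def detect_language(text):
    try:
        return detect(str(text))
    except Exception:  # detection fails on empty or ambiguous entries
        return "Unknown"

languages = df["text"].apply(detect_language)
fig = px.histogram(x=languages, labels={"x": "Language"}, title="Language distribution")
fig.show()
```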
@@ -20,31 +20,37 @@ class Mentions(ThresholdTest):
  """
  Calculates and visualizes frequencies of '@' prefixed mentions in a text-based dataset for NLP model analysis.
 
- **Purpose**: This test, termed "Mentions", is designed to gauge the quality of data in a Natural Language
- Processing (NLP) or text-focused Machine Learning model. The primary objective is to identify and calculate the
- frequency of 'mentions' within a chosen text column of a dataset. A 'mention' in this context refers to individual
- text elements that are prefixed by '@'. The output of this test reveals the most frequently mentioned entities or
- usernames, which can be integral for applications such as social media analyses, customer sentiment analyses, and
- so on.
-
- **Test Mechanism**: The test first verifies the existence of a text column in the provided dataset. It then employs
- a regular expression pattern to extract mentions from the text. Subsequently, the frequency of each unique mention
- is calculated. The test selects the most frequent mentions based on default or user-defined parameters, the default
+ ### Purpose
+
+ The "Mentions" test is designed to gauge the quality of data in a Natural Language Processing (NLP) or text-focused
+ Machine Learning model. The primary objective is to identify and calculate the frequency of 'mentions' within a
+ chosen text column of a dataset. A 'mention' in this context refers to individual text elements that are prefixed
+ by '@'. The output of this test reveals the most frequently mentioned entities or usernames, which can be integral
+ for applications such as social media analyses or customer sentiment analyses.
+
+ ### Test Mechanism
+
+ The test first verifies the existence of a text column in the provided dataset. It then employs a regular
+ expression pattern to extract mentions from the text. Subsequently, the frequency of each unique mention is
+ calculated. The test selects the most frequent mentions based on default or user-defined parameters, the default
  being the top 25, for representation. This process of thresholding forms the core of the test. A treemap plot
  visualizes the test results, where the size of each rectangle corresponds to the frequency of a particular mention.
 
- **Signs of High Risk**:
+ ### Signs of High Risk
+
  - The lack of a valid text column in the dataset, which would result in the failure of the test execution.
  - The absence of any mentions within the text data, indicating that there might not be any text associated with
- '@'. This situation could point towards sparse or poor-quality data, thereby hampering the model's generalization
- or learning capabilities.
+ '@'. This situation could point toward sparse or poor-quality data, thereby hampering the model's generalization or
+ learning capabilities.
+
+ ### Strengths
 
- **Strengths**:
  - The test is specifically optimized for text-based datasets which gives it distinct power in the context of NLP.
  - It enables quick identification and visually appealing representation of the predominant elements or mentions.
  - It can provide crucial insights about the most frequently mentioned entities or usernames.
 
- **Limitations**:
+ ### Limitations
+
  - The test only recognizes mentions that are prefixed by '@', hence useful textual aspects not preceded by '@'
  might be ignored.
  - This test isn't suited for datasets devoid of textual data.
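A hedged sketch of the described mechanism follows. The exact regex in the implementation is not shown in this diff, so the pattern below mirrors the hashtag test's lookbehind approach, and the flat treemap hangs every mention off a single invisible root:

```python
import re
from collections import Counter

import plotly.express as px

texts = ["thanks @alice and @bob", "@alice shipped it"]  # hypothetical text column

mentions = Counter(m for t in texts for m in re.findall(r"(?<=@)\w+", t))
names, counts = zip(*mentions.most_common(25))

# Every mention is a child of one invisible root, giving a flat treemap
fig = px.treemap(names=names, parents=[""] * len(names), values=counts)
fig.show()
```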
@@ -14,18 +14,41 @@ from validmind import tags, tasks
  @tasks("nlp")
  def PolarityAndSubjectivity(dataset):
  """
- Analyzes the polarity and subjectivity of text data within a dataset.
+ Analyzes the polarity and subjectivity of text data within a given dataset to visualize the sentiment distribution.
 
- This method processes a dataset containing textual data to compute the polarity and
- subjectivity scores using TextBlob, and returns a Plotly scatter plot visualizing
- these scores.
+ ### Purpose
 
- Args:
- dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
- and a `text_column` attribute indicating the name of the column containing text.
+ The Polarity and Subjectivity test is designed to evaluate the sentiment expressed in textual data. By analyzing
+ these aspects, it helps to identify the emotional tone and subjectivity of the dataset, which could be crucial in
+ understanding customer feedback, social media sentiments, or other text-related data.
 
- Returns:
- plotly.graph_objs._figure.Figure: A Plotly scatter plot of polarity vs subjectivity.
+ ### Test Mechanism
+
+ This test uses TextBlob to compute the polarity and subjectivity scores of textual data in a given dataset. The
+ mechanism includes:
+
+ - Iterating through each text entry in the specified column of the dataset.
+ - Applying the TextBlob library to compute the polarity (ranging from -1 for negative sentiment to +1 for positive
+ sentiment) and subjectivity (ranging from 0 for objective to 1 for subjective) for each entry.
+ - Creating a scatter plot using Plotly to visualize the relationship between polarity and subjectivity.
+
+ ### Signs of High Risk
+
+ - High concentration of negative polarity values indicating prevalent negative sentiments.
+ - High subjectivity scores suggesting the text data is largely opinion-based rather than factual.
+ - Disproportionate clusters of extreme scores (e.g., many points near -1 or +1 polarity).
+
+ ### Strengths
+
+ - Quantifies sentiment and subjectivity which can provide actionable insights.
+ - Visualizes sentiment distribution, aiding in easy interpretation.
+ - Utilizes well-established TextBlob library for sentiment analysis.
+
+ ### Limitations
+
+ - Polarity and subjectivity calculations may oversimplify nuanced text sentiments.
+ - Reliance on TextBlob which may not be accurate for all domains or contexts.
+ - Visualization could become cluttered with very large datasets, making interpretation difficult.
  """
 
  # Function to calculate sentiment and subjectivity
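The TextBlob scoring the docstring describes is a two-attribute lookup per entry. A minimal sketch with hypothetical texts:

```python
import pandas as pd
import plotly.express as px
from textblob import TextBlob

df = pd.DataFrame({"text": [
    "I love this product",
    "This is a terrible experience",
    "The report has ten pages",
]})

sentiments = df["text"].apply(lambda t: TextBlob(str(t)).sentiment)
polarity = sentiments.apply(lambda s: s.polarity)          # -1 (negative) .. +1 (positive)
subjectivity = sentiments.apply(lambda s: s.subjectivity)  # 0 (objective) .. 1 (subjective)

fig = px.scatter(x=polarity, y=subjectivity, labels={"x": "Polarity", "y": "Subjectivity"})
fig.show()
```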
@@ -20,34 +20,38 @@ class Punctuations(Metric):
  """
  Analyzes and visualizes the frequency distribution of punctuation usage in a given text dataset.
 
- **1. Purpose:** The Punctuations Metric's primary purpose is to analyze the frequency of punctuation usage within a
- given text dataset. This is often used in Natural Language Processing tasks, such as text classification and text
+ ### Purpose
+
+ The Punctuations Metric's primary purpose is to analyze the frequency of punctuation usage within a given text
+ dataset. This is often used in Natural Language Processing tasks, such as text classification and text
  summarization.
 
- **2. Test Mechanism:** The test begins by verifying that the input "dataset" is of the type VMDataset. Following
- that, a corpus is created from the dataset by splitting its text on spaces. Each unique punctuation character in
- the text corpus is then tallied. Then, the frequency distribution of each punctuation symbol is visualized as a bar
- graph, with these results being stored as Figures and associated with the main Punctuations object.
+ ### Test Mechanism
+
+ The test begins by verifying that the input "dataset" is of the type VMDataset. Following that, a corpus is created
+ from the dataset by splitting its text on spaces. Each unique punctuation character in the text corpus is then
+ tallied. The frequency distribution of each punctuation symbol is visualized as a bar graph, with these results
+ being stored as Figures and associated with the main Punctuations object.
 
- **3. Signs of High Risk:**
+ ### Signs of High Risk
 
- - High risk can be indicated by the excessive or unusual frequency of specific punctuation marks, potentially
- denoting dubious quality, data corruption, or skewed data.
+ - Excessive or unusual frequency of specific punctuation marks, potentially denoting dubious quality, data
+ corruption, or skewed data.
 
- **4. Strengths:**
+ ### Strengths
 
- - The Punctuations Metric provides valuable insights into the distribution of punctuation usage in a text dataset.
- - This insight can be important in validating the quality, consistency, and nature of the data.
- - It can provide hints about the style or tonality of the text corpus. For example, frequent usage of exclamation
- marks may suggest a more informal and emotional context.
+ - Provides valuable insights into the distribution of punctuation usage in a text dataset.
+ - Important in validating the quality, consistency, and nature of the data.
+ - Can provide hints about the style or tonality of the text corpus, such as informal and emotional context
+ indicated by frequent exclamation marks.
 
- **5. Limitations:**
+ ### Limitations
 
- - The metric focuses solely on punctuation usage and can miss other important textual characteristics.
- - It's important not to make general cultural or tonality assumptions based solely on punctuation distribution,
- since these can vary greatly across different languages and contexts.
- - The metric may be less effective with languages that use non-standard or different punctuation.
- - The visualization may lack interpretability when there are many unique punctuation marks in the dataset.
+ - Focuses solely on punctuation usage, potentially missing other important textual characteristics.
+ - General cultural or tonality assumptions based on punctuation distribution can be misguiding, as these vary
+ across different languages and contexts.
+ - Less effective with languages that use non-standard or different punctuation.
+ - Visualization may lack interpretability when there are many unique punctuation marks in the dataset.
  """
 
  name = "punctuations"