validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (127)
  1. validmind/__init__.py +6 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +193 -0
  4. validmind/api_client.py +45 -31
  5. validmind/client.py +33 -6
  6. validmind/datasets/classification/customer_churn.py +2 -2
  7. validmind/datasets/credit_risk/__init__.py +11 -0
  8. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  9. validmind/datasets/credit_risk/lending_club.py +394 -0
  10. validmind/datasets/nlp/__init__.py +5 -0
  11. validmind/datasets/nlp/cnn_dailymail.py +98 -0
  12. validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
  13. validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
  14. validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
  15. validmind/errors.py +11 -1
  16. validmind/logging.py +9 -2
  17. validmind/models/huggingface.py +2 -2
  18. validmind/models/pytorch.py +3 -3
  19. validmind/models/sklearn.py +4 -4
  20. validmind/template.py +2 -2
  21. validmind/test_suites/__init__.py +4 -2
  22. validmind/tests/__init__.py +130 -45
  23. validmind/tests/data_validation/DatasetDescription.py +0 -1
  24. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  25. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  26. validmind/tests/data_validation/ScatterPlot.py +8 -2
  27. validmind/tests/data_validation/nlp/StopWords.py +1 -6
  28. validmind/tests/data_validation/nlp/TextDescription.py +20 -9
  29. validmind/tests/decorator.py +313 -0
  30. validmind/tests/model_validation/BertScore.py +1 -1
  31. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  32. validmind/tests/model_validation/BleuScore.py +1 -1
  33. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  34. validmind/tests/model_validation/ContextualRecall.py +1 -1
  35. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  36. validmind/tests/model_validation/MeteorScore.py +92 -0
  37. validmind/tests/model_validation/RegardHistogram.py +6 -7
  38. validmind/tests/model_validation/RegardScore.py +4 -6
  39. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  40. validmind/tests/model_validation/RougeMetrics.py +7 -5
  41. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  42. validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
  43. validmind/tests/model_validation/TokenDisparity.py +1 -1
  44. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  45. validmind/tests/model_validation/ToxicityScore.py +1 -1
  46. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  47. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  48. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
  49. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  50. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
  51. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  52. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  53. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  54. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  55. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  56. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  57. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  58. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  59. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  60. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  61. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  62. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  63. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
  64. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  65. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
  66. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  67. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  68. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  69. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  70. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  72. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  73. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  74. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  75. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
  76. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  77. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  78. validmind/tests/prompt_validation/ai_powered_test.py +2 -0
  79. validmind/tests/test_providers.py +14 -124
  80. validmind/unit_metrics/__init__.py +75 -70
  81. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  82. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  83. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  84. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  85. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  86. validmind/unit_metrics/composite.py +228 -0
  87. validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
  88. validmind/unit_metrics/regression/HuberLoss.py +23 -0
  89. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
  90. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
  91. validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
  92. validmind/unit_metrics/regression/QuantileLoss.py +15 -0
  93. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
  94. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
  95. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
  96. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
  97. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
  98. validmind/utils.py +20 -31
  99. validmind/vm_models/__init__.py +0 -2
  100. validmind/vm_models/dataset.py +623 -29
  101. validmind/vm_models/figure.py +52 -17
  102. validmind/vm_models/test/metric.py +33 -31
  103. validmind/vm_models/test/output_template.py +0 -27
  104. validmind/vm_models/test/result_wrapper.py +68 -36
  105. validmind/vm_models/test/test.py +4 -2
  106. validmind/vm_models/test/threshold_test.py +24 -14
  107. validmind/vm_models/test_context.py +7 -0
  108. validmind/vm_models/test_suite/runner.py +1 -1
  109. validmind/vm_models/test_suite/summary.py +3 -3
  110. validmind/vm_models/test_suite/test.py +1 -1
  111. validmind/vm_models/test_suite/test_suite.py +2 -1
  112. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
  113. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
  114. validmind-2.1.0.dist-info/entry_points.txt +3 -0
  115. validmind/tests/__types__.py +0 -62
  116. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  117. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  118. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  119. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  120. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
  121. validmind/unit_metrics/sklearn/classification/F1.py +0 -22
  122. validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
  123. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
  124. validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
  125. validmind/vm_models/test/unit_metric.py +0 -88
  126. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  127. {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0

validmind/datasets/credit_risk/lending_club.py (new file)
@@ -0,0 +1,394 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import os
+
+ import numpy as np
+ import pandas as pd
+ import scorecardpy as sc
+ import statsmodels.api as sm
+ from sklearn.model_selection import train_test_split
+
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ dataset_path = os.path.join(current_path, "datasets")
+
+ # URLs or file paths for online and offline data
+ online_data_file = "https://vmai.s3.us-west-1.amazonaws.com/datasets/lending_club_loan_data_2007_2014.csv"
+ offline_data_file = os.path.join(
+     dataset_path, "lending_club_loan_data_2007_2014_clean.csv.gz"
+ )
+
+ target_column = "loan_status"
+
+ drop_columns = [
+     "Unnamed: 0",
+     "id",
+     "member_id",
+     "funded_amnt",
+     "emp_title",
+     "url",
+     "desc",
+     "application_type",
+     "title",
+     "zip_code",
+     "delinq_2yrs",
+     "mths_since_last_delinq",
+     "mths_since_last_record",
+     "mths_since_last_major_derog",
+     "revol_bal",
+     "total_rec_prncp",
+     "total_rec_late_fee",
+     "recoveries",
+     "out_prncp_inv",
+     "out_prncp",
+     "collection_recovery_fee",
+     "next_pymnt_d",
+     "initial_list_status",
+     "pub_rec",
+     "collections_12_mths_ex_med",
+     "policy_code",
+     "acc_now_delinq",
+     "pymnt_plan",
+     "tot_coll_amt",
+     "tot_cur_bal",
+     "total_rev_hi_lim",
+     "last_pymnt_d",
+     "last_credit_pull_d",
+     "earliest_cr_line",
+     "issue_d",
+     "addr_state",
+     "dti",
+     "revol_util",
+     "total_pymnt_inv",
+     "inq_last_6mths",
+     "total_rec_int",
+     "last_pymnt_amnt",
+ ]
+
+ drop_features = [
+     "loan_amnt",
+     "funded_amnt_inv",
+     "total_pymnt",
+ ]
+
+ categorical_variables = [
+     "term",
+     "grade",
+     "sub_grade",
+     "emp_length",
+     "home_ownership",
+     "verification_status",
+     "purpose",
+ ]
+
+ breaks_adj = {
+     "loan_amnt": [5000, 10000, 15000, 20000, 25000],
+     "int_rate": [10, 15, 20],
+     "annual_inc": [50000, 100000, 150000],
+ }
+
+ score_params = {
+     "target_score": 600,
+     "target_odds": 50,
+     "pdo": 20,
+ }
+
+
+ def load_data(source="online"):
+     """
+     Load data from either an online source or offline files, automatically dropping specified columns for offline data.
+
+     :param source: 'online' for online data, 'offline' for offline files. Defaults to 'online'.
+     :return: DataFrame containing the loaded data.
+     """
+
+     if source == "online":
+         print(f"Loading data from an online source: {online_data_file}")
+         df = pd.read_csv(online_data_file)
+         df = _clean_data(df)
+
+     elif source == "offline":
+         print(f"Loading data from an offline .gz file: {offline_data_file}")
+         # Since we know the offline_data_file path ends with '.zip', we replace it with '.csv.gz'
+         gzip_file_path = offline_data_file.replace(".zip", ".csv.gz")
+         print(f"Attempting to read from .gz file: {gzip_file_path}")
+         # Read the CSV file directly from the .gz archive
+         df = pd.read_csv(gzip_file_path, compression="gzip")
+         print("Data loaded successfully.")
+     else:
+         raise ValueError("Invalid source specified. Choose 'online' or 'offline'.")
+
+     print(
+         f"Rows: {df.shape[0]}, Columns: {df.shape[1]}, Missing values: {df.isnull().sum().sum()}"
+     )
+     return df
+
+
+ def _clean_data(df):
+     df = df.copy()
+
+     # Drop columns not relevant for application scorecards
+     df = df.drop(columns=drop_columns)
+
+     # Drop rows with missing target values
+     df.dropna(subset=[target_column], inplace=True)
+     print("Dropping rows with missing target values:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Drop columns with more than N percent missing values
+     missing_values = df.isnull().mean()
+     df = df.loc[:, missing_values < 0.7]
+     print("Dropping columns with more than 70% missing values:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Drop columns with only one unique value
+     unique_values = df.nunique()
+     df = df.loc[:, unique_values > 1]
+     print("Dropping columns with only one unique value:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Define the target variable for the model, representing loan default status.
+     df[target_column] = df[target_column].map({"Fully Paid": 0, "Charged Off": 1})
+
+     # Drop rows with NaN in target_column after mapping
+     df.dropna(subset=[target_column], inplace=True)
+     print("Dropping rows with missing target values:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     return df
+
+
+ def preprocess(df):
+     df = df.copy()
+
+     # Convert the target variable to integer type for modeling.
+     df[target_column] = df[target_column].astype(int)
+
+     # Keep rows where purpose is 'debt_consolidation' or 'credit_card'
+     df = df[df["purpose"].isin(["debt_consolidation", "credit_card"])]
+     print("Filtering 'purpose' to 'debt_consolidation' and 'credit_card':")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Remove rows where grade is 'F' or 'G'
+     df = df[~df["grade"].isin(["F", "G"])]
+     print("Filtering out 'grade' F and G:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Remove rows where sub_grade starts with 'F' or 'G'
+     df = df[~df["sub_grade"].str.startswith(("F", "G"))]
+     print("Filtering out 'sub_grade' F and G:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
+     df = df[~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])]
+     print("Filtering out 'home_ownership' OTHER, NONE, ANY:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Drop features that are not useful for modeling
+     df.drop(drop_features, axis=1, inplace=True)
+     print("Dropping specified features:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Drop rows with missing values
+     df.dropna(inplace=True)
+     print("Dropping rows with any missing values:")
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     # Preprocess emp_length column
+     df = _preprocess_emp_length(df)
+
+     # Preprocess term column
+     df = _preprocess_term(df)
+
+     return df
+
+
+ def _preprocess_term(df):
+     df = df.copy()
+
+     # Remove ' months' and convert to integer
+     df["term"] = df["term"].str.replace(" months", "").astype(object)
+
+     return df
+
+
+ def _preprocess_emp_length(df):
+     df = df.copy()
+
+     # Mapping string values to numbers
+     emp_length_map = {
+         "10+ years": 10,
+         "< 1 year": 0,
+         "1 year": 1,
+         "2 years": 2,
+         "3 years": 3,
+         "4 years": 4,
+         "5 years": 5,
+         "6 years": 6,
+         "7 years": 7,
+         "8 years": 8,
+         "9 years": 9,
+     }
+
+     # Apply the mapping to the emp_length column
+     df["emp_length"] = df["emp_length"].map(emp_length_map).astype(object)
+
+     # Drop rows where emp_length is NaN after mapping
+     # df.dropna(subset=["emp_length"], inplace=True)
+
+     return df
+
+
+ def feature_engineering(df):
+     df = df.copy()
+
+     # WoE encoding of numerical and categorical features
+     df = woe_encoding(df)
+
+     print(
+         f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}\nMissing values: {df.isnull().sum().sum()}\n"
+     )
+
+     return df
+
+
+ def woe_encoding(df):
+     df = df.copy()
+
+     woe = _woebin(df)
+     bins = _woe_to_bins(woe)
+
+     # Make sure we don't transform the target column
+     if target_column in bins:
+         del bins[target_column]
+         print(f"Excluded {target_column} from WoE transformation.")
+
+     # Apply the WoE transformation
+     df = sc.woebin_ply(df, bins=bins)
+
+     print("Successfully converted features to WoE values.")
+
+     return df
+
+
+ def _woe_to_bins(woe):
+     # Select and rename columns
+     transformed_df = woe[
+         [
+             "variable",
+             "bin",
+             "count",
+             "count_distr",
+             "good",
+             "bad",
+             "badprob",
+             "woe",
+             "bin_iv",
+             "total_iv",
+         ]
+     ].copy()
+     transformed_df.rename(columns={"bin_iv": "total_iv"}, inplace=True)
+
+     # Create 'is_special_values' column (assuming there are no special values)
+     transformed_df["is_special_values"] = False
+
+     # Transform 'bin' column into interval format and store it in 'breaks' column
+     transformed_df["breaks"] = transformed_df["bin"].apply(
+         lambda x: "[-inf, %s)" % x if isinstance(x, float) else "[%s, inf)" % x
+     )
+
+     # Group by 'variable' to create bins dictionary
+     bins = {}
+     for variable, group in transformed_df.groupby("variable"):
+         bins[variable] = group
+
+     return bins
+
+
+ def _woebin(df):
+     """
+     This function performs automatic binning using WoE.
+     df: A pandas dataframe
+     target_column: The target variable in quotes, e.g. 'loan_status'
+     """
+
+     non_numeric_cols = df.select_dtypes(exclude=["int64", "float64"]).columns
+     df[non_numeric_cols] = df[non_numeric_cols].astype(str)
+
+     try:
+         print(
+             f"Performing binning with breaks_adj: {breaks_adj}"
+         )  # print the breaks_adj being used
+         bins = sc.woebin(df, target_column, breaks_list=breaks_adj)
+     except Exception as e:
+         print("Error during binning: ")
+         print(e)
+     else:
+         bins_df = pd.concat(bins.values(), keys=bins.keys())
+         bins_df.reset_index(inplace=True)
+         bins_df.drop(columns=["variable"], inplace=True)
+         bins_df.rename(columns={"level_0": "variable"}, inplace=True)
+
+         bins_df["bin_number"] = bins_df.groupby("variable").cumcount()
+
+         return bins_df
+
+
+ def split(df, add_constant=False):
+     df = df.copy()
+
+     # Splitting the dataset into training and test sets
+     train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+
+     if add_constant:
+         # Add a constant to the model for both training and testing datasets
+         train_df = sm.add_constant(train_df)
+         test_df = sm.add_constant(test_df)
+
+     # Calculate and print details for the training dataset
+     print("After splitting the dataset into training and test sets:")
+     print(
+         f"Training Dataset:\nRows: {train_df.shape[0]}\nColumns: {train_df.shape[1]}\nMissing values: {train_df.isnull().sum().sum()}\n"
+     )
+
+     # Calculate and print details for the test dataset
+     print(
+         f"Test Dataset:\nRows: {test_df.shape[0]}\nColumns: {test_df.shape[1]}\nMissing values: {test_df.isnull().sum().sum()}\n"
+     )
+
+     return train_df, test_df
+
+
+ def compute_scores(probabilities):
+
+     target_score = score_params["target_score"]
+     target_odds = score_params["target_odds"]
+     pdo = score_params["pdo"]
+
+     factor = pdo / np.log(2)
+     offset = target_score - (factor * np.log(target_odds))
+
+     scores = offset + factor * np.log(probabilities / (1 - probabilities))
+
+     return scores
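
Taken together, the helpers above form a small application-scorecard pipeline: load and clean the LendingClub data, filter and WoE-encode the features, split into train and test sets, and map predicted default probabilities to scorecard points using standard points-to-double-the-odds scaling (factor = pdo / ln 2, offset = target_score - factor * ln(target_odds)). The sketch below shows one way the pieces might be chained; the statsmodels Logit fit and the variable names are illustrative assumptions, and only the lending_club helpers themselves come from this file.

import statsmodels.api as sm

from validmind.datasets.credit_risk import lending_club

# Load, clean, and WoE-encode the LendingClub data using the new helpers
df = lending_club.load_data(source="offline")
df = lending_club.preprocess(df)
df = lending_club.feature_engineering(df)

# 80/20 train/test split; add_constant=True prepends an intercept column
train_df, test_df = lending_club.split(df, add_constant=True)

x_train = train_df.drop(columns=[lending_club.target_column])
y_train = train_df[lending_club.target_column]

# Assumed model choice: a plain statsmodels Logit on the WoE-encoded features
model = sm.Logit(y_train, x_train).fit()

# Convert predicted probabilities of default into scorecard points
scores = lending_club.compute_scores(model.predict(x_train))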

validmind/datasets/nlp/__init__.py
@@ -5,3 +5,8 @@
  """
  Example datasets that can be used with the developer framework.
  """
+
+ __all__ = [
+     "cnn_dailymail",
+     "twitter_covide_19",
+ ]

validmind/datasets/nlp/cnn_dailymail.py (new file)
@@ -0,0 +1,98 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import os
+ import textwrap
+
+ import pandas as pd
+ from datasets import load_dataset
+ from IPython.display import HTML, display
+ from tabulate import tabulate
+
+ # Define column names
+ text_column = "article"
+ target_column = "highlights"
+ gpt_35_prediction_column = "gpt_35_prediction"
+ t5_prediction = "t5_prediction"
+
+ # Define the path to the dataset directory
+ current_path = os.path.dirname(os.path.abspath(__file__))
+ dataset_path = os.path.join(current_path, "datasets")
+
+
+ def load_data(source="online", dataset_size=None):
+     """
+     Load data from either online source or offline files.
+
+     :param source: 'online' for online data, 'offline' for offline data. Defaults to 'online'.
+     :param dataset_size: Applicable if source is 'offline'. '300k' or '500k' for dataset size. Defaults to None.
+     :return: DataFrame containing the loaded data.
+     """
+     if source == "online":
+         # Load online data without predictions
+         cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")
+         train_df = cnn_dataset["train"].to_pandas()
+         test_df = cnn_dataset["test"].to_pandas()
+
+         # Process the DataFrame to include necessary columns
+         train_df = train_df[["article", "highlights"]]
+         test_df = test_df[["article", "highlights"]]
+
+         return train_df, test_df
+
+     elif source == "offline":
+         # Determine the file name based on the dataset size
+         if dataset_size == "100":
+             data_file_name = "cnn_dailymail_100_with_predictions.csv"
+         elif dataset_size == "500":
+             data_file_name = "cnn_dailymail_500_with_predictions.csv"
+         else:
+             raise ValueError("Invalid dataset_size specified. Choose '100' or '500'.")
+
+         # Construct the file path
+         data_file = os.path.join(dataset_path, data_file_name)
+
+         # Load the dataset
+         df = pd.read_csv(data_file)
+         df = df[["article", "highlights", "gpt_35_prediction", "t5_prediction"]]
+
+         train_df = df.sample(frac=0.7, random_state=42)
+         test_df = df.drop(train_df.index)
+         return train_df, test_df
+
+     else:
+         raise ValueError("Invalid source specified. Choose 'online' or 'offline'.")
+
+
+ def _format_cell_text(text, width=50):
+     """Private function to format a cell's text."""
+     return "\n".join([textwrap.fill(line, width=width) for line in text.split("\n")])
+
+
+ def _format_dataframe_for_tabulate(df):
+     """Private function to format the entire DataFrame for tabulation."""
+     df_out = df.copy()
+
+     # Format all string columns
+     for column in df_out.columns:
+         # Check if column is of type object (likely strings)
+         if df_out[column].dtype == object:
+             df_out[column] = df_out[column].apply(_format_cell_text)
+     return df_out
+
+
+ def _dataframe_to_html_table(df):
+     """Private function to convert a DataFrame to an HTML table."""
+     headers = df.columns.tolist()
+     table_data = df.values.tolist()
+     return tabulate(table_data, headers=headers, tablefmt="html")
+
+
+ def display_nice(df, num_rows=None):
+     """Primary function to format and display a DataFrame."""
+     if num_rows is not None:
+         df = df.head(num_rows)
+     formatted_df = _format_dataframe_for_tabulate(df)
+     html_table = _dataframe_to_html_table(formatted_df)
+     display(HTML(html_table))
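
The cnn_dailymail module pairs a loader with a small HTML display helper for notebooks. A minimal usage sketch, assuming the bundled offline CSVs are used; the function names and column constants come from the file above, and the variable names are illustrative.

from validmind.datasets.nlp import cnn_dailymail

# Offline mode reads the packaged CSVs that already include GPT-3.5 and T5 predictions
train_df, test_df = cnn_dailymail.load_data(source="offline", dataset_size="100")

# Render the first few rows as a wrapped HTML table inside a notebook
cnn_dailymail.display_nice(test_df, num_rows=5)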