utilsds-models 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ """
2
+ NGR Metrics Calculation Module
3
+
4
+ This module provides functions for calculating various error metrics
5
+ for NGR (Net Gaming Revenue) predictions with business-optimal weighting.
6
+ """
7
+
8
+ from typing import Any, Dict, Optional
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+
14
def calculate_ngr_metrics(
    df: pd.DataFrame, late_stage_day: Optional[int] = None, late_stage_correction: Optional[float] = None
) -> Dict[str, Any]:
    """
    Calculate NGR metrics with business-optimal weighting.

    Predictions and actuals are first averaged per ``days_since_ftd`` (days >= 365 are
    dropped); every metric is then computed over those per-day means, not over raw rows.
    Errors are defined as ``actual - predicted``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing prediction results with columns:
        - 'days_since_ftd': days since first time deposit
        - 'remaining_ngr': actual remaining NGR
        - 'remaining_ngr_pred': predicted remaining NGR
    late_stage_day : int, optional
        Day after which the late-stage correction starts to be applied.
    late_stage_correction : float, optional
        Multiplier reached at day 364. The multiplier applied to the per-day mean
        prediction moves linearly from 1.0 at ``late_stage_day`` to this value at
        day 364. The correction is applied only when both ``late_stage_day`` and
        ``late_stage_correction`` are given, and is skipped when
        ``late_stage_day >= 364`` (there is no ramp left to apply).

    Returns
    -------
    dict
        Dictionary containing calculated metrics:
        - 'Standard MAE': Mean Absolute Error (rounded to 4 decimals)
        - 'Standard MAPE (%)': Mean Absolute Percentage Error (1 decimal)
        - 'Standard ME': Mean Error (4 decimals)
        - 'Standard MPE (%)': Mean Percentage Error (1 decimal)
        - 'Business Optimal MAE': Weighted MAE (2 decimals)
        - 'Business Optimal MAPE (%)': Weighted MAPE (1 decimal)
        - 'Business Optimal ME': Weighted ME (2 decimals)
        - 'Business Optimal MPE (%)': Weighted MPE (1 decimal)
        - 'mean_abs_error_by_bin': DataFrame with errors by period
        - 'mean_values': DataFrame with per-day means (prediction column includes
          the late-stage correction when one was applied)
        - 'weights': Array of business weights, one per row of 'mean_values'

        When no day below 365 is present, the scalar metrics are NaN.

    Notes
    -----
    Business-optimal weights by day ranges (upper bounds inclusive):
    - Days <= 7: 0.05 (learning period - insufficient data)
    - Days 8-14: 0.85 (early signals emerging)
    - Days 15-45: 1.00 (SUPER CRITICAL - optimal intervention window)
    - Days 46-90: 0.90 (confirmation period - high value)
    - Days 91-180: 0.60 (established patterns - moderate value)
    - Days 181-270: 0.40 (mature behavior - operational value)
    - Days 271+: 0.30 (end-game precision - tactical value)

    Percentage metrics divide by the actual value; actuals whose magnitude is below
    1e-6 are replaced by 1e-6 to avoid division by zero.

    Examples
    --------
    >>> metrics = calculate_ngr_metrics(
    ...     df,
    ...     late_stage_day=320,
    ...     late_stage_correction=0.95
    ... )
    >>> print(f"Business MAE: {metrics['Business Optimal MAE']:.2f}")
    """
    # Single source of truth for the business periods: inclusive upper edge of each
    # period and the weight of each period (the last weight covers everything above
    # the final edge). The reporting bins below are derived from the same edges.
    period_edges = np.array([7, 14, 45, 90, 180, 270])
    period_weights = np.array([0.05, 0.85, 1.00, 0.90, 0.60, 0.40, 0.30])
    last_day = 364
    labels = ["1-7", "8-14", "15-45", "46-90", "91-180", "181-270", "271-364"]

    # Per-day means of actual and predicted values.
    mean_values = (
        df.groupby("days_since_ftd")
        .agg({"remaining_ngr": "mean", "remaining_ngr_pred": "mean"})
        .reset_index()
    )
    # Explicit copy so the column assignment below never writes into a filtered view.
    mean_values = mean_values[mean_values["days_since_ftd"] < 365].copy()

    # Optional late-stage correction: linear ramp from 1.0 to late_stage_correction.
    if late_stage_day is not None and late_stage_correction is not None:
        ramp_length = last_day - late_stage_day
        # With ramp_length <= 0 there is nothing to interpolate over; skipping also
        # avoids dividing by zero when late_stage_day == 364.
        if ramp_length > 0:
            days_array = mean_values["days_since_ftd"].to_numpy()
            # Clipping at 0 keeps the factor at exactly 1.0 up to late_stage_day.
            progress = np.clip((days_array - late_stage_day) / ramp_length, 0.0, None)
            correction_factor = 1.0 - (1.0 - late_stage_correction) * progress
            mean_values["remaining_ngr_pred"] = mean_values["remaining_ngr_pred"] * correction_factor

    days = mean_values["days_since_ftd"].to_numpy().astype(np.float64)

    # side="left" maps a day d to the first period whose upper edge is >= d,
    # i.e. the same "days <= edge" cascade as a chain of comparisons.
    weights = period_weights[np.searchsorted(period_edges, days, side="left")]
    weight_total = np.sum(weights)

    def _weighted_mean(values):
        """Return the weight-averaged value, or NaN when there is nothing to average."""
        if weight_total == 0:
            return np.float64("nan")
        return np.sum(values * weights) / weight_total

    actual = mean_values["remaining_ngr"]
    errors = actual - mean_values["remaining_ngr_pred"]
    abs_errors = errors.abs()

    # Percentage errors relative to the actual value, guarding against ~0 actuals.
    epsilon = 1e-6
    safe_remaining_ngr = np.where(np.abs(actual) < epsilon, epsilon, actual)
    errors_percentage = (errors / safe_remaining_ngr) * 100
    percentage_abs_errors = errors_percentage.abs()

    # Assign each day to a reporting period and average the errors per period.
    bins = [0, *period_edges.tolist(), last_day]
    day_bins = pd.cut(mean_values["days_since_ftd"], bins=bins, labels=labels, right=True, include_lowest=True)
    period_masks = [day_bins == label for label in labels]
    mean_abs_error_by_bin = pd.DataFrame(
        {
            "bin": labels,
            "mean_abs_error": [abs_errors[mask].mean() for mask in period_masks],
            "mean_error": [errors[mask].mean() for mask in period_masks],
            "mape": [percentage_abs_errors[mask].mean() for mask in period_masks],
        }
    )

    return {
        "Standard MAE": round(np.mean(abs_errors), 4),
        "Standard MAPE (%)": round(np.mean(percentage_abs_errors), 1),
        "Standard ME": round(np.mean(errors), 4),
        "Standard MPE (%)": round(np.mean(errors_percentage), 1),
        "Business Optimal MAE": round(_weighted_mean(abs_errors), 2),
        "Business Optimal MAPE (%)": round(_weighted_mean(percentage_abs_errors), 1),
        "Business Optimal ME": round(_weighted_mean(errors), 2),
        "Business Optimal MPE (%)": round(_weighted_mean(errors_percentage), 1),
        "mean_abs_error_by_bin": mean_abs_error_by_bin,
        "mean_values": mean_values,
        "weights": weights,
    }
@@ -0,0 +1,106 @@
1
+ Metadata-Version: 2.4
2
+ Name: utilsds-models
3
+ Version: 0.0.1
4
+ Summary: Solution for specific models
5
+ Author-email: DS Team <ds@sts.pl>
6
+ License: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.12
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: pandas>=2.2.2
13
+ Requires-Dist: numpy>=1.26.0
14
+ Requires-Dist: scikit-learn>=1.5.0
15
+ Requires-Dist: matplotlib>=3.9.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: pre-commit>=3.5.0; extra == "dev"
18
+
19
+ # utilsds
20
+
21
+ Utilsds is a library that includes classes and functions used in data science projects such as:
22
+
23
+ - **algorithm**:
24
+ - `Algorithm`: Base class for fitting, training, and getting hyperparameters of machine learning models.
25
+
26
+ - **data_ops**:
27
+ - `DataOperations`: Handle data operations locally and with Google Cloud services (BigQuery and Cloud Storage).
28
+ - BigQuery operations:
29
+ - `load_bq_data`: Load data from tables, views, and SQL files.
30
+ - `save_bq_view`, `save_bq_table`: Save views and tables.
31
+ - `load_bq_procedure`: Execute stored procedures.
32
+ - `load_bq_details`: Get table/view details and schema.
33
+ - `delete_bq_data`: Delete data with safety confirmations.
34
+ - `dry_run`: Perform dry runs to estimate query costs.
35
+ - Cloud Storage operations:
36
+ - `save_gcs_bucket`: Create buckets.
37
+ - `save_gcs_file`, `load_gcs_file`: Save and load files (.pkl, .json, .csv, .html, .sql).
38
+ - Local file operations:
39
+ - `save_local_file`, `load_local_file`: Save and load files (.pkl, .json, .csv, .html, .sql).
40
+
41
+ - **data_processing**:
42
+ - `SkewnessTransformer`: Transform skewed data using various methods (IHS, neglog, Yeo-Johnson, quantile).
43
+ - `NullReplacer`: Replace null values in specified columns with configurable strategies.
44
+ - `ColumnDropper`: Drop specified columns from a DataFrame.
45
+ - `OutliersCleaner`: Clean outliers by clipping values outside specified percentile ranges.
46
+ - `CategoricalMapper`: Map values in categorical columns according to a specified mapping scheme.
47
+ - `NumericalMapper`: Convert numerical columns to categorical by binning.
48
+ - `Encoder`: One-hot encode categorical columns in the data.
49
+ - `Normalizer`: Normalize numerical columns using a provided scaler.
50
+
51
+ - **data_split**:
52
+ - `train_test_validation_split`: Split data into training, testing, and validation sets.
53
+ - `resample_X_y`: Resample the training data and its target column.
54
+
55
+ - **ds_statistics**:
56
+ - `test_kruskal_wallis`: Perform the Kruskal-Wallis statistical test.
57
+ - `test_agosto_pearsona`: Test for normality using the D'Agostino-Pearson test.
58
+
59
+ - **evaluate**:
60
+ - `ModelEvaluator`: Evaluate models and generate plots for diagnostics.
61
+ - `ShapExplainer`: Explain model predictions using SHAP values.
62
+
63
+ - **experiments**:
64
+ - `VertexExperiment`: Manage experiments with Vertex AI.
65
+
66
+ - **optuna**:
67
+ - `Optuna`: Optimize hyperparameters using Optuna.
68
+
69
+ - **metrics**:
70
+ - `Metrics`: Calculate metrics for both classification and regression models.
71
+
72
+ - **modeling**:
73
+ - `Modeling`: Manage modeling, metrics, and logging with Vertex AI.
74
+
75
+ - **Supervised**:
76
+ - `LazyClassifier`: A classifier that automatically trains and evaluates multiple models.
77
+ - `LazyRegressor`: A regressor that automatically trains and evaluates multiple models.
78
+ - `get_card_split`: Function to split data into card-like groups.
79
+ - `adjusted_rsquared`: Calculate adjusted R-squared for regression models.
80
+
81
+ - **visualization**:
82
+ - `MetricsPlot`: Compare metrics for different parameter values.
83
+ - `Radar`: Create radar plots for visualizing data.
84
+ - `cluster_characteristics`: Analyze cluster characteristics.
85
+ - `comparison_density`: Compare density distributions.
86
+ - `elbow_visualisation`: Visualize the elbow method for clustering.
87
+ - `describe_clusters_metrics`: Describe metrics for clusters.
88
+ - `category_null_variables`: Visualize null variables in categorical data.
89
+ - `normal_distr_plots`: Visualize normal distribution plots.
90
+ - `distplot_limitations`: Visualize limitations of distplot.
91
+ - `boxplot_limitations`: Visualize limitations of boxplot.
92
+ - `violinplot_limitations`: Visualize limitations of violinplot.
93
+ - `countplot_limitations`: Visualize limitations of countplot.
94
+ - `categorical_variable_perc`: Visualize percentage of categorical variables.
95
+ - `spearman_correlation`: Visualize the Spearman correlation.
96
+ - `calculate_crammers_v`: Calculate Cramér's V.
97
+
98
+ - **what_if_streamlit**:
99
+ - `ShapSaver`: Save SHAP explainer components for lazy loading in what-if analysis.
100
+ - `ColumnMetadataGenerator`: Generate column metadata from a DataFrame or CSV file.
101
+
102
+ - **monitoring**:
103
+ - `mapping`: Create column mapping from configuration file for Evidently.
104
+ - `test_data`: Test data for issues using Evidently test suites.
105
+ - `check_data_drift`: Check data for drift using Evidently metrics.
106
+ - `send_email_with_table`: Send email notifications with HTML tables for monitoring alerts.
@@ -0,0 +1,10 @@
1
+ utilsds-models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ utilsds-models/custom_metrics.py,sha256=bHoLUsbvI4ypCFWtV6dFguj2prF0BI4HGH6ZYtzYQBY,21067
3
+ utilsds-models/data_processing.py,sha256=fNM9fk8bfn_OEwrK1eCqRCwU4h5JR3ujtwVxwmzxL-0,14087
4
+ utilsds-models/evip_dynamic.py,sha256=m90XHO9V1sFN501NjWpPVBlfM33n8W-QHNcjfO302pg,4227
5
+ utilsds-models/metrics.py,sha256=sUb0LbIjumIu13DU2lNONYoTRtHjQLo0Z-gX39Pg77U,7016
6
+ utilsds-models/visualization.py,sha256=sUb0LbIjumIu13DU2lNONYoTRtHjQLo0Z-gX39Pg77U,7016
7
+ utilsds_models-0.0.1.dist-info/METADATA,sha256=CjTuHtwSPl3oFB8dUnvwRqvaUa5UaBXBSTUNVRaBRjY,4973
8
+ utilsds_models-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ utilsds_models-0.0.1.dist-info/top_level.txt,sha256=TopNmyfabH891p-7KDjmfWjIUDYBEvo-4a2vBWz5uyU,15
10
+ utilsds_models-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ utilsds-models