imsciences 0.9.5.3__py3-none-any.whl → 0.9.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/mmm.py +139 -2
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/METADATA +9 -2
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/RECORD +7 -7
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/WHEEL +0 -0
- {imsciences-0.9.5.3.dist-info → imsciences-0.9.5.4.dist-info}/top_level.txt +0 -0
imsciences/mmm.py
CHANGED
|
@@ -6,6 +6,9 @@ import re
|
|
|
6
6
|
from datetime import datetime, timedelta
|
|
7
7
|
import subprocess
|
|
8
8
|
import json
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
import xgboost as xgb
|
|
11
|
+
from sklearn.ensemble import RandomForestRegressor
|
|
9
12
|
|
|
10
13
|
class dataprocessing:
|
|
11
14
|
|
|
@@ -180,7 +183,12 @@ class dataprocessing:
|
|
|
180
183
|
print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
|
|
181
184
|
print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
|
|
182
185
|
print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
|
|
183
|
-
|
|
186
|
+
|
|
187
|
+
print("\n35. seasonality_feature_extraction")
|
|
188
|
+
print(" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.")
|
|
189
|
+
print(" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)")
|
|
190
|
+
print(" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)")
|
|
191
|
+
|
|
184
192
|
def get_wd_levels(self, levels):
|
|
185
193
|
"""
|
|
186
194
|
Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
|
|
@@ -1417,4 +1425,133 @@ class dataprocessing:
|
|
|
1417
1425
|
new_col = f"week_start_{week_commencing}"
|
|
1418
1426
|
df[new_col] = df[date_col].apply(map_to_week_start)
|
|
1419
1427
|
|
|
1420
|
-
return df
|
|
1428
|
+
return df
|
|
1429
|
+
|
|
1430
|
+
def seasonality_feature_extraction(self, df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False):
    """
    Extract seasonality-related features via model feature importances.

    Splits ``df`` into train/test sets, trains an XGBoost regressor and a
    Random Forest regressor on all features, selects the top
    ``n_features`` from each model by feature importance, merges the two
    lists, retrains each model on its own top features and on the
    combined set, and returns the feature lists, held-out-set
    performance, and the fitted models.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing ``kpi_var`` (the target) and an
        'OBS' date/index column, which is excluded from the features.
    kpi_var : str
        Name of the target column in ``df``.
    n_features : int, optional
        Number of top features to extract from each model (default 10).
    test_size : float, optional
        Fraction of rows held out for testing (default 0.1).
    random_state : int, optional
        Random state for reproducibility (default 42).
    shuffle : bool, optional
        Whether to shuffle before splitting (default False; keep False
        for time-ordered data so the test set is the most recent slice).

    Returns
    -------
    dict
        - "top_features_xgb":  top ``n_features`` from XGBoost
        - "top_features_rf":   top ``n_features`` from Random Forest
        - "combined_features": merged unique feature list (sorted)
        - "performance":       R^2 on the held-out test set per model
        - "models":            the fitted model objects
    """
    # ---------------------------------------------------------------------
    # 1. Prepare the data: target y, feature matrix X (drop date/index col)
    # ---------------------------------------------------------------------
    y = df[kpi_var]
    X = df.drop(columns=['OBS', kpi_var])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )

    def _top_features(fitted_model):
        # Rank all features by the fitted model's importance scores and
        # return the names of the strongest n_features.
        ranking = (
            pd.DataFrame({
                'feature': X.columns,
                'importance': fitted_model.feature_importances_,
            })
            .sort_values('importance', ascending=False)
            .reset_index(drop=True)
        )
        return ranking['feature'].head(n_features).tolist()

    # ---------------------------------------------------------------------
    # 2. XGBoost: fit on all features, then refit on its top-N subset
    # ---------------------------------------------------------------------
    xgb_model_full = xgb.XGBRegressor(random_state=random_state)
    xgb_model_full.fit(X_train, y_train)
    top_features_xgb = _top_features(xgb_model_full)

    xgb_model_topN = xgb.XGBRegressor(random_state=random_state)
    xgb_model_topN.fit(X_train[top_features_xgb], y_train)

    # ---------------------------------------------------------------------
    # 3. Random Forest: fit on all features, then refit on its top-N subset
    # ---------------------------------------------------------------------
    rf_model_full = RandomForestRegressor(random_state=random_state)
    rf_model_full.fit(X_train, y_train)
    top_features_rf = _top_features(rf_model_full)

    rf_model_topN = RandomForestRegressor(random_state=random_state)
    rf_model_topN.fit(X_train[top_features_rf], y_train)

    # ---------------------------------------------------------------------
    # 4. Merge the two top-feature lists and refit each model on the union
    # ---------------------------------------------------------------------
    # sorted() gives a deterministic ordering; bare set order is not stable.
    combined_features = sorted(set(top_features_xgb + top_features_rf))

    xgb_model_combined = xgb.XGBRegressor(random_state=random_state)
    xgb_model_combined.fit(X_train[combined_features], y_train)

    rf_model_combined = RandomForestRegressor(random_state=random_state)
    rf_model_combined.fit(X_train[combined_features], y_train)

    # ---------------------------------------------------------------------
    # 5. Evaluate every fitted model on the held-out test set.
    #    .score() returns R^2 for both sklearn and xgboost regressors.
    # ---------------------------------------------------------------------
    performance = {
        "xgb_full": xgb_model_full.score(X_test, y_test),
        "xgb_topN": xgb_model_topN.score(X_test[top_features_xgb], y_test),
        "xgb_combined": xgb_model_combined.score(X_test[combined_features], y_test),
        "rf_full": rf_model_full.score(X_test, y_test),
        "rf_topN": rf_model_topN.score(X_test[top_features_rf], y_test),
        "rf_combined": rf_model_combined.score(X_test[combined_features], y_test),
    }

    # Return everything the docstring promises (the original returned only
    # "combined_features", which is kept for backward compatibility).
    return {
        "top_features_xgb": top_features_xgb,
        "top_features_rf": top_features_rf,
        "combined_features": combined_features,
        "performance": performance,
        "models": {
            "xgb_full": xgb_model_full,
            "xgb_topN": xgb_model_topN,
            "xgb_combined": xgb_model_combined,
            "rf_full": rf_model_full,
            "rf_topN": rf_model_topN,
            "rf_combined": rf_model_combined,
        },
    }
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: imsciences
|
|
3
|
-
Version: 0.9.5.3
|
|
3
|
+
Version: 0.9.5.4
|
|
4
4
|
Summary: IMS Data Processing Package
|
|
5
5
|
Author: IMS
|
|
6
6
|
Author-email: cam@im-sciences.com
|
|
7
|
-
Keywords:
|
|
7
|
+
Keywords: data processing,apis,data analysis,data visualization,machine learning
|
|
8
8
|
Classifier: Development Status :: 3 - Alpha
|
|
9
9
|
Classifier: Intended Audience :: Developers
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -17,6 +17,8 @@ Requires-Dist: pandas
|
|
|
17
17
|
Requires-Dist: plotly
|
|
18
18
|
Requires-Dist: numpy
|
|
19
19
|
Requires-Dist: fredapi
|
|
20
|
+
Requires-Dist: xgboost
|
|
21
|
+
Requires-Dist: scikit-learn
|
|
20
22
|
Requires-Dist: bs4
|
|
21
23
|
Requires-Dist: yfinance
|
|
22
24
|
Requires-Dist: holidays
|
|
@@ -222,6 +224,11 @@ Table of Contents
|
|
|
222
224
|
- **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
|
|
223
225
|
- **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
|
|
224
226
|
|
|
227
|
+
## 35. `seasonality_feature_extraction`
|
|
228
|
+
- **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
|
|
229
|
+
- **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
|
|
230
|
+
- **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
|
|
231
|
+
|
|
225
232
|
---
|
|
226
233
|
|
|
227
234
|
## Data Processing for Incrementality Testing
|
|
@@ -6,7 +6,7 @@ imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWL
|
|
|
6
6
|
imsciences/datafunctions.py,sha256=WZrXNLO-SYrCuFt0pAbha74psMOZPY7meWJ7yWEbRpk,169953
|
|
7
7
|
imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
|
|
8
8
|
imsciences/geo.py,sha256=J8AkLk1Nyty3VBkPFqcseXjtlSvXVNkHW_nymERz3nA,13472
|
|
9
|
-
imsciences/mmm.py,sha256=
|
|
9
|
+
imsciences/mmm.py,sha256=w2A90eJPvMH0Mp3jh8booKaLGm0BKFqW-H92FR4OpV8,80490
|
|
10
10
|
imsciences/pull.py,sha256=bGz8B7bBQ5b9hrx3ipCFTWl_eebEb7rPL4dANKiVWTY,74015
|
|
11
11
|
imsciences/unittesting.py,sha256=DYGqVCsZHrs_tZ-EXDW8q8CdlcsTnG8HsnmWjEE521c,45691
|
|
12
12
|
imsciences/vis.py,sha256=2izdHQhmWEReerRqIxhY4Ai10VjL7xoUqyWyZC7-2XI,8931
|
|
@@ -14,9 +14,9 @@ imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_a
|
|
|
14
14
|
imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
|
|
15
15
|
imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
|
|
16
16
|
imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
|
|
17
|
-
imsciences-0.9.5.
|
|
18
|
-
imsciences-0.9.5.
|
|
19
|
-
imsciences-0.9.5.
|
|
20
|
-
imsciences-0.9.5.
|
|
21
|
-
imsciences-0.9.5.
|
|
22
|
-
imsciences-0.9.5.
|
|
17
|
+
imsciences-0.9.5.4.dist-info/LICENSE.txt,sha256=lVq2QwcExPX4Kl2DHeEkRrikuItcDB1Pr7yF7FQ8_z8,1108
|
|
18
|
+
imsciences-0.9.5.4.dist-info/METADATA,sha256=wCS5rgCM0s3XEwXIPZCwBpaej5PxebfgHB1MaKy-5us,17644
|
|
19
|
+
imsciences-0.9.5.4.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
|
|
20
|
+
imsciences-0.9.5.4.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
|
|
21
|
+
imsciences-0.9.5.4.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
|
|
22
|
+
imsciences-0.9.5.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|