imsciences 0.9.5.3__py3-none-any.whl → 0.9.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/mmm.py CHANGED
@@ -6,6 +6,9 @@ import re
6
6
  from datetime import datetime, timedelta
7
7
  import subprocess
8
8
  import json
9
+ from sklearn.model_selection import train_test_split
10
+ import xgboost as xgb
11
+ from sklearn.ensemble import RandomForestRegressor
9
12
 
10
13
  class dataprocessing:
11
14
 
@@ -180,7 +183,12 @@ class dataprocessing:
180
183
  print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
181
184
  print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
182
185
  print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
183
-
186
+
187
+ print("\n35. seasonality_feature_extraction")
188
+ print(" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.")
189
+ print(" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)")
190
+ print(" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)")
191
+
184
192
  def get_wd_levels(self, levels):
185
193
  """
186
194
  Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
@@ -1417,4 +1425,133 @@ class dataprocessing:
1417
1425
  new_col = f"week_start_{week_commencing}"
1418
1426
  df[new_col] = df[date_col].apply(map_to_week_start)
1419
1427
 
1420
- return df
1428
+ return df
1429
+
1430
def seasonality_feature_extraction(self, df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False):
    """
    Rank features against a KPI using XGBoost and Random Forest importances.

    Splits `df` into train/test sets, trains both models on all features,
    extracts the top `n_features` from each by feature importance, merges
    the two lists, retrains each model on its top features and on the
    combined set, and scores every fitted model on the held-out test split.

    Notes
    -----
    - `df[kpi_var]` is the target (y).
    - `df['OBS']` is assumed to be a date/index column and is excluded
      from the feature matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing `kpi_var` (target) and 'OBS' (date/index).
    kpi_var : str
        Name of the target column.
    n_features : int, optional
        Number of top features to extract from each model (default=10).
    test_size : float, optional
        Test size for train_test_split (default=0.1).
    random_state : int, optional
        Random state for reproducibility (default=42).
    shuffle : bool, optional
        Whether to shuffle the data before splitting (default=False).

    Returns
    -------
    dict
        - "top_features_xgb": top n_features from XGBoost
        - "top_features_rf": top n_features from Random Forest
        - "combined_features": merged unique feature list (order-stable)
        - "performance": R^2 of each fitted model on the test split
        - "models": dictionary of all fitted models
    """
    # ---------------------------------------------------------------------
    # 1. Prepare data (X, y) and split into train/test
    # ---------------------------------------------------------------------
    y = df[kpi_var]
    X = df.drop(columns=['OBS', kpi_var])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle
    )

    def _top_n_features(fitted_model):
        # Rank columns by the fitted model's importance scores and keep
        # the n_features highest-scoring names.
        importances = pd.Series(fitted_model.feature_importances_, index=X.columns)
        return importances.sort_values(ascending=False).head(n_features).index.tolist()

    # ---------------------------------------------------------------------
    # 2. XGBoost: full model -> top-N features -> retrained top-N model
    # ---------------------------------------------------------------------
    xgb_model_full = xgb.XGBRegressor(random_state=random_state)
    xgb_model_full.fit(X_train, y_train)
    top_features_xgb = _top_n_features(xgb_model_full)

    xgb_model_topN = xgb.XGBRegressor(random_state=random_state)
    xgb_model_topN.fit(X_train[top_features_xgb], y_train)

    # ---------------------------------------------------------------------
    # 3. Random Forest: full model -> top-N features -> retrained model
    # ---------------------------------------------------------------------
    rf_model_full = RandomForestRegressor(random_state=random_state)
    rf_model_full.fit(X_train, y_train)
    top_features_rf = _top_n_features(rf_model_full)

    rf_model_topN = RandomForestRegressor(random_state=random_state)
    rf_model_topN.fit(X_train[top_features_rf], y_train)

    # ---------------------------------------------------------------------
    # 4. Combine top features from both models and retrain on the union
    # ---------------------------------------------------------------------
    # dict.fromkeys de-duplicates while preserving insertion order, so the
    # combined list is deterministic (list(set(...)) is not).
    combined_features = list(dict.fromkeys(top_features_xgb + top_features_rf))
    X_train_combined = X_train[combined_features]

    xgb_model_combined = xgb.XGBRegressor(random_state=random_state)
    xgb_model_combined.fit(X_train_combined, y_train)

    rf_model_combined = RandomForestRegressor(random_state=random_state)
    rf_model_combined.fit(X_train_combined, y_train)

    # ---------------------------------------------------------------------
    # 5. Score every fitted model on the held-out test split (R^2) so the
    #    documented "performance" key is actually populated.
    # ---------------------------------------------------------------------
    performance = {
        "xgb_full_r2": xgb_model_full.score(X_test, y_test),
        "xgb_topN_r2": xgb_model_topN.score(X_test[top_features_xgb], y_test),
        "rf_full_r2": rf_model_full.score(X_test, y_test),
        "rf_topN_r2": rf_model_topN.score(X_test[top_features_rf], y_test),
        "xgb_combined_r2": xgb_model_combined.score(X_test[combined_features], y_test),
        "rf_combined_r2": rf_model_combined.score(X_test[combined_features], y_test),
    }

    # Return everything the docstring promises (the previous release only
    # returned "combined_features"); the old key is kept for compatibility.
    return {
        "top_features_xgb": top_features_xgb,
        "top_features_rf": top_features_rf,
        "combined_features": combined_features,
        "performance": performance,
        "models": {
            "xgb_full": xgb_model_full,
            "xgb_topN": xgb_model_topN,
            "rf_full": rf_model_full,
            "rf_topN": rf_model_topN,
            "xgb_combined": xgb_model_combined,
            "rf_combined": rf_model_combined,
        },
    }
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.9.5.3
3
+ Version: 0.9.5.4
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
7
- Keywords: python,data processing,apis
7
+ Keywords: data processing,apis,data analysis,data visualization,machine learning
8
8
  Classifier: Development Status :: 3 - Alpha
9
9
  Classifier: Intended Audience :: Developers
10
10
  Classifier: Programming Language :: Python :: 3
@@ -17,6 +17,8 @@ Requires-Dist: pandas
17
17
  Requires-Dist: plotly
18
18
  Requires-Dist: numpy
19
19
  Requires-Dist: fredapi
20
+ Requires-Dist: xgboost
21
+ Requires-Dist: scikit-learn
20
22
  Requires-Dist: bs4
21
23
  Requires-Dist: yfinance
22
24
  Requires-Dist: holidays
@@ -222,6 +224,11 @@ Table of Contents
222
224
  - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
223
225
  - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
224
226
 
227
+ ## 35. `seasonality_feature_extraction`
228
+ - **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
229
+ - **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
230
+ - **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
231
+
225
232
  ---
226
233
 
227
234
  ## Data Processing for Incrementality Testing
@@ -6,7 +6,7 @@ imsciences/datafunctions-IMS-24Ltp-3.py,sha256=3Snv-0iE_03StmyjtT-riOU9f4v8TaJWL
6
6
  imsciences/datafunctions.py,sha256=WZrXNLO-SYrCuFt0pAbha74psMOZPY7meWJ7yWEbRpk,169953
7
7
  imsciences/datapull.py,sha256=TPY0LDgOkcKTBk8OekbD0Grg5x0SomAK2dZ7MuT6X1E,19000
8
8
  imsciences/geo.py,sha256=J8AkLk1Nyty3VBkPFqcseXjtlSvXVNkHW_nymERz3nA,13472
9
- imsciences/mmm.py,sha256=W7e46fw0c2V9n_-fU3O6F0X1P5tbC2dkMrqnDLJH28g,74230
9
+ imsciences/mmm.py,sha256=w2A90eJPvMH0Mp3jh8booKaLGm0BKFqW-H92FR4OpV8,80490
10
10
  imsciences/pull.py,sha256=bGz8B7bBQ5b9hrx3ipCFTWl_eebEb7rPL4dANKiVWTY,74015
11
11
  imsciences/unittesting.py,sha256=DYGqVCsZHrs_tZ-EXDW8q8CdlcsTnG8HsnmWjEE521c,45691
12
12
  imsciences/vis.py,sha256=2izdHQhmWEReerRqIxhY4Ai10VjL7xoUqyWyZC7-2XI,8931
@@ -14,9 +14,9 @@ imsciencesdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_a
14
14
  imsciencesdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
15
15
  imsdataprocessing/__init__.py,sha256=quSwsLs6IuLoA5Rzi0ZD40xZaQudwDteF7_ai9JfTPk,32
16
16
  imsdataprocessing/datafunctions.py,sha256=vE1vsZ8xOSbR9Bwlp9SWXwEHXQ0nFydwGkvzHXf2f1Y,41
17
- imsciences-0.9.5.3.dist-info/LICENSE.txt,sha256=lVq2QwcExPX4Kl2DHeEkRrikuItcDB1Pr7yF7FQ8_z8,1108
18
- imsciences-0.9.5.3.dist-info/METADATA,sha256=J6mScUPG8way2ZfnDauRvqqKDUSriU3dZ1POLQuJmno,16994
19
- imsciences-0.9.5.3.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
20
- imsciences-0.9.5.3.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
21
- imsciences-0.9.5.3.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
22
- imsciences-0.9.5.3.dist-info/RECORD,,
17
+ imsciences-0.9.5.4.dist-info/LICENSE.txt,sha256=lVq2QwcExPX4Kl2DHeEkRrikuItcDB1Pr7yF7FQ8_z8,1108
18
+ imsciences-0.9.5.4.dist-info/METADATA,sha256=wCS5rgCM0s3XEwXIPZCwBpaej5PxebfgHB1MaKy-5us,17644
19
+ imsciences-0.9.5.4.dist-info/PKG-INFO-IMS-24Ltp-3,sha256=yqZbigwHjnYoqyI81PGz_AeofRFfOrwH_Vyawyef-mg,854
20
+ imsciences-0.9.5.4.dist-info/WHEEL,sha256=ixB2d4u7mugx_bCBycvM9OzZ5yD7NmPXFRtKlORZS2Y,91
21
+ imsciences-0.9.5.4.dist-info/top_level.txt,sha256=hsENS-AlDVRh8tQJ6-426iUQlla9bPcGc0-UlFF0_iU,11
22
+ imsciences-0.9.5.4.dist-info/RECORD,,