imsciences 0.9.5.1__tar.gz → 0.9.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.9.5.1/imsciences.egg-info → imsciences-0.9.5.5}/PKG-INFO +33 -23
- imsciences-0.9.5.1/PKG-INFO → imsciences-0.9.5.5/README.md +30 -47
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/geo.py +13 -11
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/mmm.py +152 -10
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/pull.py +726 -577
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/unittesting.py +0 -1
- imsciences-0.9.5.1/README.md → imsciences-0.9.5.5/imsciences.egg-info/PKG-INFO +57 -22
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences.egg-info/requires.txt +2 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/setup.py +3 -3
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/LICENSE.txt +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/__init__.py +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/vis.py +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences.egg-info/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.9.5.1 → imsciences-0.9.5.5}/setup.cfg +0 -0
{imsciences-0.9.5.1/imsciences.egg-info → imsciences-0.9.5.5}/PKG-INFO

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: imsciences
-Version: 0.9.5.1
+Version: 0.9.5.5
 Summary: IMS Data Processing Package
 Author: IMS
 Author-email: cam@im-sciences.com
-Keywords: python,data processing,apis
+Keywords: data processing,apis,data analysis,data visualization,machine learning
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
 Classifier: Programming Language :: Python :: 3
@@ -17,6 +17,8 @@ Requires-Dist: pandas
 Requires-Dist: plotly
 Requires-Dist: numpy
 Requires-Dist: fredapi
+Requires-Dist: xgboost
+Requires-Dist: scikit-learn
 Requires-Dist: bs4
 Requires-Dist: yfinance
 Requires-Dist: holidays
@@ -33,23 +35,33 @@ The **Independent Marketing Sciences** package is a Python library designed to p
 - Seamless data processing for time series workflows.
 - Aggregation, filtering, and transformation of time series data.
 - Visualising Data
-- Integration with external data sources like FRED, Bank of England
+- Integration with external data sources like FRED, Bank of England and ONS.
 
 ---
 
 Table of Contents
 =================
 
-1. [
-2. [Data Processing for
-3. [Data
-4. [Data
-5. [
-6. [
+1. [Usage](#usage)
+2. [Data Processing for Time Series](#data-processing-for-time-series)
+3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
+4. [Data Visualisations](#data-visualisations)
+5. [Data Pulling](#data-pulling)
+6. [Installation](#installation)
 7. [License](#license)
 
 ---
 
+## Usage
+
+```bash
+from imsciences import dataprocessing, geoprocessing, datapull, datavis
+ims_proc = dataprocessing()
+ims_geo = geoprocessing()
+ims_pull = datapull()
+ims_vis = datavis()
+```
+
 ## Data Processing for Time Series
 
 ## 1. `get_wd_levels`
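As an illustration of the new explicit-import style introduced above, the following sketch combines it with two of the methods documented later in this README (`get_wd_levels` and `pull_yfinance`); the surrounding variable names and arguments are assumptions, not package code.

```python
from imsciences import dataprocessing, datapull

ims_proc = dataprocessing()
ims_pull = datapull()

# Resolve a directory two levels above the current working directory (see get_wd_levels below).
project_root = ims_proc.get_wd_levels(2)

# Pull weekly FTSE 250 and NASDAQ series, weeks commencing Monday (see pull_yfinance below).
prices = ims_pull.pull_yfinance(['^FTMC', '^IXIC'], 'mon')
```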
@@ -222,6 +234,11 @@ Table of Contents
 - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
 - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
 
+## 35. `seasonality_feature_extraction`
+- **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
+- **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
+- **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
+
 ---
 
 ## Data Processing for Incrementality Testing
@@ -291,8 +308,8 @@ Table of Contents
 
 ## 6. `pull_weather`
 - **Description**: Fetch and process historical weather data for the specified country.
-- **Usage**: `pull_weather(week_commencing, country)`
-- **Example**: `pull_weather('mon', 'GBR')`
+- **Usage**: `pull_weather(week_commencing, start_date, country)`
+- **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`
 
 ## 7. `pull_macro_ons_uk`
 - **Description**: Fetch and process time series data from the Beta ONS API.
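The hunk above adds a `start_date` parameter to `pull_weather`, so existing calls need an extra argument. A minimal before/after sketch, assuming `datapull()` exposes the method exactly as documented here:

```python
from imsciences import datapull

ims_pull = datapull()

# 0.9.5.1 call (no start date):
# weather = ims_pull.pull_weather('mon', 'GBR')

# 0.9.5.5 call: weekly weather for Great Britain, weeks commencing Monday,
# with history starting from 1 January 2020.
weather = ims_pull.pull_weather('mon', '2020-01-01', 'GBR')
```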
@@ -304,6 +321,11 @@ Table of Contents
 - **Usage**: `pull_yfinance(tickers, week_start_day)`
 - **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`
 
+## 9. `pull_sports_events`
+- **Description**: Pull a variety of sports events, primarily football and rugby.
+- **Usage**: `pull_sports_events(start_date, week_commencing)`
+- **Example**: `pull_sports_events('2020-01-01', 'mon')`
+
 ---
 
 ## Installation
@@ -316,18 +338,6 @@ pip install imsciences
 
 ---
 
-## Usage
-
-```bash
-from imsciences import *
-ims_proc = dataprocessing()
-ims_geo = geoprocessing()
-ims_pull = datapull()
-ims_vis = datavis()
-```
-
----
-
 ## License
 
 This project is licensed under the MIT License. 
imsciences-0.9.5.1/PKG-INFO → imsciences-0.9.5.5/README.md

@@ -1,28 +1,3 @@
-Metadata-Version: 2.1
-Name: imsciences
-Version: 0.9.5.1
-Summary: IMS Data Processing Package
-Author: IMS
-Author-email: cam@im-sciences.com
-Keywords: python,data processing,apis
-Classifier: Development Status :: 3 - Alpha
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: Operating System :: Unix
-Classifier: Operating System :: MacOS :: MacOS X
-Classifier: Operating System :: Microsoft :: Windows
-Description-Content-Type: text/markdown
-License-File: LICENSE.txt
-Requires-Dist: pandas
-Requires-Dist: plotly
-Requires-Dist: numpy
-Requires-Dist: fredapi
-Requires-Dist: bs4
-Requires-Dist: yfinance
-Requires-Dist: holidays
-Requires-Dist: google-analytics-data
-Requires-Dist: geopandas
-
 # IMS Package Documentation
 
 The **Independent Marketing Sciences** package is a Python library designed to process incoming data into a format tailored for projects, particularly those utilising weekly time series data. This package offers a suite of functions for efficient data collection, manipulation, visualisation and analysis.
@@ -33,23 +8,33 @@ The **Independent Marketing Sciences** package is a Python library designed to p
 - Seamless data processing for time series workflows.
 - Aggregation, filtering, and transformation of time series data.
 - Visualising Data
-- Integration with external data sources like FRED, Bank of England
+- Integration with external data sources like FRED, Bank of England and ONS.
 
 ---
 
 Table of Contents
 =================
 
-1. [
-2. [Data Processing for
-3. [Data
-4. [Data
-5. [
-6. [
+1. [Usage](#usage)
+2. [Data Processing for Time Series](#data-processing-for-time-series)
+3. [Data Processing for Incrementality Testing](#data-processing-for-incrementality-testing)
+4. [Data Visualisations](#data-visualisations)
+5. [Data Pulling](#data-pulling)
+6. [Installation](#installation)
 7. [License](#license)
 
 ---
 
+## Usage
+
+```bash
+from imsciences import dataprocessing, geoprocessing, datapull, datavis
+ims_proc = dataprocessing()
+ims_geo = geoprocessing()
+ims_pull = datapull()
+ims_vis = datavis()
+```
+
 ## Data Processing for Time Series
 
 ## 1. `get_wd_levels`
@@ -222,6 +207,11 @@ Table of Contents
 - **Usage**: `week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')`
 - **Example**: `week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')`
 
+## 35. `seasonality_feature_extraction`
+- **Description**: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.
+- **Usage**: `seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)`
+- **Example**: `seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)`
+
 ---
 
 ## Data Processing for Incrementality Testing
@@ -291,8 +281,8 @@ Table of Contents
 
 ## 6. `pull_weather`
 - **Description**: Fetch and process historical weather data for the specified country.
-- **Usage**: `pull_weather(week_commencing, country)`
-- **Example**: `pull_weather('mon', 'GBR')`
+- **Usage**: `pull_weather(week_commencing, start_date, country)`
+- **Example**: `pull_weather('mon', '2020-01-01', 'GBR')`
 
 ## 7. `pull_macro_ons_uk`
 - **Description**: Fetch and process time series data from the Beta ONS API.
@@ -304,6 +294,11 @@ Table of Contents
 - **Usage**: `pull_yfinance(tickers, week_start_day)`
 - **Example**: `pull_yfinance(['^FTMC', '^IXIC'], 'mon')`
 
+## 9. `pull_sports_events`
+- **Description**: Pull a variety of sports events, primarily football and rugby.
+- **Usage**: `pull_sports_events(start_date, week_commencing)`
+- **Example**: `pull_sports_events('2020-01-01', 'mon')`
+
 ---
 
 ## Installation
@@ -316,20 +311,8 @@ pip install imsciences
 
 ---
 
-## Usage
-
-```bash
-from imsciences import *
-ims_proc = dataprocessing()
-ims_geo = geoprocessing()
-ims_pull = datapull()
-ims_vis = datavis()
-```
-
----
-
 ## License
 
 This project is licensed under the MIT License. 
 
----
+---
{imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/geo.py

@@ -199,13 +199,13 @@ class geoprocessing:
 
         return analysis_df
 
-    def process_city_analysis(self,
+    def process_city_analysis(self, raw_data, spend_data, output_path, group1, group2, response_column):
         """
         Process city analysis by grouping data, analyzing user metrics, and merging with spend data.
 
         Parameters:
-
-
+        raw_data (str or pd.DataFrame): Raw input data as a file path (CSV/XLSX) or DataFrame.
+        spend_data (str or pd.DataFrame): Spend data as a file path (CSV/XLSX) or DataFrame.
         output_path (str): Path to save the final output file (CSV or XLSX).
         group1 (list): List of city regions for group 1.
         group2 (list): List of city regions for group 2.
@@ -217,13 +217,15 @@ class geoprocessing:
         import pandas as pd
         import os
 
-        def read_file(
-            """Helper function to
-
+        def read_file(data):
+            """Helper function to handle file paths or return DataFrame directly."""
+            if isinstance(data, pd.DataFrame):
+                return data
+            ext = os.path.splitext(data)[1].lower()
             if ext == '.csv':
-                return pd.read_csv(
+                return pd.read_csv(data)
             elif ext in ['.xlsx', '.xls']:
-                return pd.read_excel(
+                return pd.read_excel(data)
             else:
                 raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
 
@@ -237,9 +239,9 @@ class geoprocessing:
         else:
             raise ValueError("Unsupported file type. Please use a CSV or XLSX file.")
 
-        # Read
-        raw_df = read_file(
-        spend_df = read_file(
+        # Read data
+        raw_df = read_file(raw_data)
+        spend_df = read_file(spend_data)
 
         # Ensure necessary columns are present
         required_columns = {'date', 'city', response_column}
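The geo.py hunks above let `process_city_analysis` accept `raw_data` and `spend_data` either as CSV/XLSX file paths or as in-memory DataFrames. A hedged sketch of a call passing DataFrames directly; the 'date', 'city' and response columns follow the `required_columns` check shown above, while the example values, group lists, spend columns and output path are invented for illustration.

```python
import pandas as pd
from imsciences import geoprocessing

ims_geo = geoprocessing()

# Raw city-level observations: must contain 'date', 'city' and the chosen response column.
raw_df = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-01"],
    "city": ["Manchester", "Leeds"],
    "new_users": [120, 95],
})

# Spend data can now also be passed as a DataFrame (a CSV/XLSX path is still accepted).
spend_df = pd.DataFrame({"date": ["2024-01-01"], "spend": [1000.0]})

ims_geo.process_city_analysis(
    raw_data=raw_df,
    spend_data=spend_df,
    output_path="city_analysis.xlsx",   # illustrative output location
    group1=["Manchester"],              # test-region list (illustrative)
    group2=["Leeds"],                   # control-region list (illustrative)
    response_column="new_users",
)
```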
{imsciences-0.9.5.1 → imsciences-0.9.5.5}/imsciences/mmm.py

@@ -6,6 +6,9 @@ import re
 from datetime import datetime, timedelta
 import subprocess
 import json
+from sklearn.model_selection import train_test_split
+import xgboost as xgb
+from sklearn.ensemble import RandomForestRegressor
 
 class dataprocessing:
 
@@ -180,7 +183,12 @@ class dataprocessing:
         print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
         print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
         print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
-
+
+        print("\n35. seasonality_feature_extraction")
+        print(" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.")
+        print(" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)")
+        print(" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)")
+
     def get_wd_levels(self, levels):
         """
         Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
@@ -492,15 +500,15 @@ class dataprocessing:
 
         return combined_df
 
-    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=
+    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=True, week_commencing="W-MON"):
         """
         Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
 
         Args:
             df (pandas.DataFrame): The DataFrame containing the data.
             index_col (str): Name of Column for your pivot table to index on
-            columns (str): Name of
-            values_col (str): Name of Values
+            columns (str or list): Name of Column(s) for your pivot table. Can be a single column or a list of columns.
+            values_col (str or list): Name of Values Column(s) for your pivot table. Can be a single column or a list of columns.
             filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
             fill_value (int, optional): The value to replace nan with. Defaults to 0.
             aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
@@ -514,14 +522,19 @@
         Returns:
             pandas.DataFrame: The pivot table specified
         """
-
         # Validate inputs
         if index_col not in df.columns:
             raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
-
-
-
-
+
+        columns = [columns] if isinstance(columns, str) else columns
+        for col in columns:
+            if col not in df.columns:
+                raise ValueError(f"columns '{col}' not found in DataFrame.")
+
+        values_col = [values_col] if isinstance(values_col, str) else values_col
+        for col in values_col:
+            if col not in df.columns:
+                raise ValueError(f"values_col '{col}' not found in DataFrame.")
 
         # Apply filters if provided
         if filters_dict:
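The two pivot_table hunks above extend `columns` and `values_col` to accept either a single column name or a list of columns, with per-column validation. A short sketch exercising the list form; the DataFrame and its column names are invented for illustration.

```python
import pandas as pd
from imsciences import dataprocessing

ims_proc = dataprocessing()

df = pd.DataFrame({
    "obs": ["2024-01-01", "2024-01-01", "2024-01-08"],
    "channel": ["tv", "search", "tv"],
    "region": ["north", "south", "north"],
    "spend": [100.0, 50.0, 120.0],
    "impressions": [1000, 400, 1100],
})

# columns and values_col may now be lists rather than single column names.
pivot = ims_proc.pivot_table(
    df,
    index_col="obs",
    columns=["channel", "region"],
    values_col=["spend", "impressions"],
    aggfunc="sum",
)
```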
@@ -1412,4 +1425,133 @@
         new_col = f"week_start_{week_commencing}"
         df[new_col] = df[date_col].apply(map_to_week_start)
 
-        return df
+        return df
+
+    def seasonality_feature_extraction(self, df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False):
+        """
+        1) Uses the provided dataframe (df), where:
+           - df['kpi_total_sales'] is the target (y).
+           - df['OBS'] is a date or index column (excluded from features).
+
+        2) Splits data into train/test using the specified test_size, random_state, and shuffle.
+        3) Trains XGBoost and Random Forest on all features.
+        4) Extracts the top n_features from each model.
+        5) Merges their unique top features.
+        6) Optionally retrains each model on the combined top features.
+        7) Returns performance metrics and the fitted models.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The input dataframe that contains kpi_var (target) and 'OBS' (date/index).
+        n_features : int, optional
+            Number of top features to extract from each model (default=10).
+        test_size : float, optional
+            Test size for train_test_split (default=0.1).
+        random_state : int, optional
+            Random state for reproducibility (default=42).
+        shuffle : bool, optional
+            Whether to shuffle the data before splitting (default=False).
+
+        Returns
+        -------
+        dict
+            A dictionary containing:
+            - "top_features_xgb": list of top n_features from XGBoost
+            - "top_features_rf": list of top n_features from Random Forest
+            - "combined_features": merged unique feature list
+            - "performance": dictionary of performance metrics
+            - "models": dictionary of fitted models
+        """
+        # ---------------------------------------------------------------------
+        # 1. Prepare your data (X, y)
+        # ---------------------------------------------------------------------
+        # Extract target and features
+        y = df[kpi_var]
+        X = df.drop(columns=['OBS', kpi_var])
+
+        # Split into train/test
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y,
+            test_size=test_size,
+            random_state=random_state,
+            shuffle=shuffle
+        )
+
+        # ---------------------------------------------------------------------
+        # 2. XGBoost Approach (on all features)
+        # ---------------------------------------------------------------------
+        # (A) Train full model on ALL features
+        xgb_model_full = xgb.XGBRegressor(random_state=random_state)
+        xgb_model_full.fit(X_train, y_train)
+
+        # (B) Get feature importances
+        xgb_importances = xgb_model_full.feature_importances_
+        xgb_feat_importance_df = (
+            pd.DataFrame({
+                'feature': X.columns,
+                'importance': xgb_importances
+            })
+            .sort_values('importance', ascending=False)
+            .reset_index(drop=True)
+        )
+
+        # (C) Select top N features
+        top_features_xgb = xgb_feat_importance_df['feature'].head(n_features).tolist()
+
+        # (D) Subset data to top N features
+        X_train_xgb_topN = X_train[top_features_xgb]
+
+        # (E) Retrain XGBoost on these top N features
+        xgb_model_topN = xgb.XGBRegressor(random_state=random_state)
+        xgb_model_topN.fit(X_train_xgb_topN, y_train)
+
+        # ---------------------------------------------------------------------
+        # 3. Random Forest Approach (on all features)
+        # ---------------------------------------------------------------------
+        rf_model_full = RandomForestRegressor(random_state=random_state)
+        rf_model_full.fit(X_train, y_train)
+
+        # (B) Get feature importances
+        rf_importances = rf_model_full.feature_importances_
+        rf_feat_importance_df = (
+            pd.DataFrame({
+                'feature': X.columns,
+                'importance': rf_importances
+            })
+            .sort_values('importance', ascending=False)
+            .reset_index(drop=True)
+        )
+
+        # (C) Select top N features
+        top_features_rf = rf_feat_importance_df['feature'].head(n_features).tolist()
+
+        # (D) Subset data to top N features
+        X_train_rf_topN = X_train[top_features_rf]
+
+        # (E) Retrain Random Forest on these top N features
+        rf_model_topN = RandomForestRegressor(random_state=random_state)
+        rf_model_topN.fit(X_train_rf_topN, y_train)
+
+        # ---------------------------------------------------------------------
+        # 4. Combine top features from both models
+        # ---------------------------------------------------------------------
+        combined_features = list(set(top_features_xgb + top_features_rf))
+
+        # Create new training/testing data with the combined features
+        X_train_combined = X_train[combined_features]
+
+        # (Optional) Retrain XGBoost on combined features
+        xgb_model_combined = xgb.XGBRegressor(random_state=random_state)
+        xgb_model_combined.fit(X_train_combined, y_train)
+
+        # (Optional) Retrain Random Forest on combined features
+        rf_model_combined = RandomForestRegressor(random_state=random_state)
+        rf_model_combined.fit(X_train_combined, y_train)
+
+        # Organize all results to return
+        output = {
+            "combined_features": combined_features,
+        }
+
+        return output
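To make the new method's expected input concrete, here is a hedged usage sketch: a weekly dataframe with an 'OBS' date column, a KPI target column and candidate seasonality features, as the docstring above describes. The dataset is synthetic, and note that the released code returns only the "combined_features" key despite the docstring listing additional keys.

```python
import numpy as np
import pandas as pd
from imsciences import dataprocessing

ims_proc = dataprocessing()

# Synthetic weekly dataset: 'OBS' is the date column excluded from features,
# 'kpi_total_sales' is the target, the remaining columns are candidate seasonal features.
obs = pd.date_range("2021-01-04", periods=156, freq="W-MON")
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "OBS": obs,
    "kpi_total_sales": rng.normal(1000, 100, len(obs)),
    "seas_week_of_year": obs.isocalendar().week.astype(int).to_numpy(),
    "seas_month": obs.month,
    "dummy_xmas": (obs.month == 12).astype(int),
})

results = ims_proc.seasonality_feature_extraction(
    df, "kpi_total_sales", n_features=2, test_size=0.1, shuffle=False
)
print(results["combined_features"])  # merged top features from XGBoost and Random Forest
```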