imsciences 0.9.7.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/mmm.py CHANGED
@@ -1,93 +1,152 @@
-import pandas as pd
 import calendar
+import json
 import os
-import numpy as np
-import re
-from datetime import datetime, timedelta
 import subprocess
-import json
-from sklearn.model_selection import train_test_split
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
 import xgboost as xgb
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+
 
 class dataprocessing:
-
     def help(self):
-
         print("\n1. get_wd_levels")
-        print(" - Description: Get the working directory with the option of moving up parents.")
+        print(
+            " - Description: Get the working directory with the option of moving up parents.",
+        )
         print(" - Usage: get_wd_levels(levels)")
         print(" - Example: get_wd_levels(0)")
 
         print("\n2. aggregate_daily_to_wc_long")
-        print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
-        print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
-        print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")
+        print(
+            " - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.",
+        )
+        print(
+            " - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')",
+        )
+        print(
+            " - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')",
+        )
 
         print("\n3. convert_monthly_to_daily")
-        print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
+        print(
+            " - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.",
+        )
         print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
         print(" - Example: convert_monthly_to_daily(df, 'date')")
 
         print("\n4. week_of_year_mapping")
-        print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
+        print(
+            " - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.",
+        )
         print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
         print(" - Example: week_of_year_mapping(df, 'week', 'mon')")
 
         print("\n5. rename_cols")
-        print(" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.")
+        print(
+            " - Description: Renames columns in a pandas DataFrame with a specified prefix or format.",
+        )
         print(" - Usage: rename_cols(df, name='ame_')")
         print(" - Example: rename_cols(df, 'ame_facebook')")
 
         print("\n6. merge_new_and_old")
-        print(" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.")
-        print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
-        print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")
+        print(
+            " - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.",
+        )
+        print(
+            " - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')",
+        )
+        print(
+            " - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')",
+        )
 
         print("\n7. merge_dataframes_on_column")
         print(" - Description: Merge a list of DataFrames on a common column.")
-        print(" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')")
-        print(" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')")
+        print(
+            " - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')",
+        )
+        print(
+            " - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')",
+        )
 
         print("\n8. merge_and_update_dfs")
-        print(" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.")
+        print(
+            " - Description: Merges two dataframes, updating columns from the second dataframe where values are available.",
+        )
         print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
-        print(" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")
+        print(
+            " - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')",
+        )
 
         print("\n9. convert_us_to_uk_dates")
-        print(" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.")
+        print(
+            " - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.",
+        )
         print(" - Usage: convert_us_to_uk_dates(df, date_col)")
         print(" - Example: convert_us_to_uk_dates(df, 'date')")
 
         print("\n10. combine_sheets")
-        print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
+        print(
+            " - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.",
+        )
         print(" - Usage: combine_sheets(all_sheets)")
         print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
 
         print("\n11. pivot_table")
-        print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
-        print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')")
-        print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)")
+        print(
+            " - Description: Dynamically pivots a DataFrame based on specified columns.",
+        )
+        print(
+            " - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')",
+        )
+        print(
+            " - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)",
+        )
 
         print("\n12. apply_lookup_table_for_columns")
-        print(" - Description: Maps substrings in columns to new values based on a dictionary.")
-        print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
-        print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
+        print(
+            " - Description: Maps substrings in columns to new values based on a dictionary.",
+        )
+        print(
+            " - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')",
+        )
+        print(
+            " - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')",
+        )
 
         print("\n13. aggregate_daily_to_wc_wide")
-        print(" - Description: Aggregates daily data into weekly data and pivots it to wide format.")
-        print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)")
-        print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)")
+        print(
+            " - Description: Aggregates daily data into weekly data and pivots it to wide format.",
+        )
+        print(
+            " - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)",
+        )
+        print(
+            " - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)",
+        )
 
         print("\n14. merge_cols_with_seperator")
-        print(" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.")
-        print(" - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')")
-        print(" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')")
+        print(
+            " - Description: Merges multiple columns in a DataFrame into one column with a specified separator.",
+        )
+        print(
+            " - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')",
+        )
+        print(
+            " - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')",
+        )
 
         print("\n15. check_sum_of_df_cols_are_equal")
-        print(" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.")
+        print(
+            " - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.",
+        )
         print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
-        print(" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')")
+        print(
+            " - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')",
+        )
 
         print("\n16. convert_2_df_cols_to_dict")
         print(" - Description: Creates a dictionary from two DataFrame columns.")
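For orientation, a minimal sketch of how this API is reached from outside the package; the import path is an assumption based on the file name imsciences/mmm.py, not something the diff itself states:

    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    dp.help()  # prints the numbered method index being reformatted in this diff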
@@ -95,128 +154,229 @@ class dataprocessing:
         print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
 
         print("\n17. create_FY_and_H_columns")
-        print(" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.")
-        print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')")
-        print(" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')")
+        print(
+            " - Description: Adds financial year and half-year columns to a DataFrame based on a start date.",
+        )
+        print(
+            " - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')",
+        )
+        print(
+            " - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')",
+        )
 
         print("\n18. keyword_lookup_replacement")
-        print(" - Description: Updates values in a column based on a lookup dictionary with conditional logic.")
-        print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')")
-        print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')")
+        print(
+            " - Description: Updates values in a column based on a lookup dictionary with conditional logic.",
+        )
+        print(
+            " - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')",
+        )
+        print(
+            " - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')",
+        )
 
         print("\n19. create_new_version_of_col_using_LUT")
-        print(" - Description: Creates a new column based on a lookup table applied to an existing column.")
-        print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
-        print(" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)")
+        print(
+            " - Description: Creates a new column based on a lookup table applied to an existing column.",
+        )
+        print(
+            " - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')",
+        )
+        print(
+            " - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)",
+        )
 
         print("\n20. convert_df_wide_2_long")
-        print(" - Description: Converts a wide-format DataFrame into a long-format DataFrame.")
-        print(" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')")
-        print(" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')")
+        print(
+            " - Description: Converts a wide-format DataFrame into a long-format DataFrame.",
+        )
+        print(
+            " - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')",
+        )
+        print(
+            " - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')",
+        )
 
         print("\n21. manually_edit_data")
-        print(" - Description: Manually updates specified cells in a DataFrame based on filters.")
-        print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
-        print(" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')")
+        print(
+            " - Description: Manually updates specified cells in a DataFrame based on filters.",
+        )
+        print(
+            " - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)",
+        )
+        print(
+            " - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')",
+        )
 
         print("\n22. format_numbers_with_commas")
-        print(" - Description: Formats numerical columns with commas and a specified number of decimal places.")
+        print(
+            " - Description: Formats numerical columns with commas and a specified number of decimal places.",
+        )
         print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
         print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")
 
         print("\n23. filter_df_on_multiple_conditions")
-        print(" - Description: Filters a DataFrame based on multiple column conditions.")
+        print(
+            " - Description: Filters a DataFrame based on multiple column conditions.",
+        )
         print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
-        print(" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})")
+        print(
+            " - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})",
+        )
 
         print("\n24. read_and_concatenate_files")
-        print(" - Description: Reads and concatenates files from a specified folder into a single DataFrame.")
+        print(
+            " - Description: Reads and concatenates files from a specified folder into a single DataFrame.",
+        )
         print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
-        print(" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')")
+        print(
+            " - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')",
+        )
 
         print("\n25. upgrade_outdated_packages")
-        print(" - Description: Upgrades all outdated Python packages except specified ones.")
+        print(
+            " - Description: Upgrades all outdated Python packages except specified ones.",
+        )
         print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
-        print(" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])")
+        print(
+            " - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])",
+        )
 
         print("\n26. convert_mixed_formats_dates")
-        print(" - Description: Converts mixed-format date columns into standardized datetime format.")
+        print(
+            " - Description: Converts mixed-format date columns into standardized datetime format.",
+        )
         print(" - Usage: convert_mixed_formats_dates(df, column_name)")
         print(" - Example: convert_mixed_formats_dates(df, 'date_col')")
 
         print("\n27. fill_weekly_date_range")
-        print(" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.")
+        print(
+            " - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.",
+        )
         print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
         print(" - Example: fill_weekly_date_range(df, 'date_col')")
 
         print("\n28. add_prefix_and_suffix")
-        print(" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.")
-        print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
-        print(" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')")
+        print(
+            " - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.",
+        )
+        print(
+            " - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)",
+        )
+        print(
+            " - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')",
+        )
 
         print("\n29. create_dummies")
-        print(" - Description: Creates dummy variables for columns, with an option to add a total dummy column.")
-        print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
-        print(" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)")
+        print(
+            " - Description: Creates dummy variables for columns, with an option to add a total dummy column.",
+        )
+        print(
+            " - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')",
+        )
+        print(
+            " - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)",
+        )
 
         print("\n30. replace_substrings")
-        print(" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.")
-        print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
-        print(" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')")
+        print(
+            " - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.",
+        )
+        print(
+            " - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)",
+        )
+        print(
+            " - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')",
+        )
 
         print("\n31. add_total_column")
-        print(" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.")
-        print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
+        print(
+            " - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.",
+        )
+        print(
+            " - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')",
+        )
         print(" - Example: add_total_column(df, exclude_col='date_col')")
 
         print("\n32. apply_lookup_table_based_on_substring")
-        print(" - Description: Categorizes text in a column using a lookup table based on substrings.")
-        print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
-        print(" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})")
+        print(
+            " - Description: Categorizes text in a column using a lookup table based on substrings.",
+        )
+        print(
+            " - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')",
+        )
+        print(
+            " - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})",
+        )
 
         print("\n33. compare_overlap")
-        print(" - Description: Compares overlapping periods between two DataFrames and summarizes differences.")
+        print(
+            " - Description: Compares overlapping periods between two DataFrames and summarizes differences.",
+        )
         print(" - Usage: compare_overlap(df1, df2, date_col)")
         print(" - Example: compare_overlap(df1, df2, 'date_col')")
 
         print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
-        print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
-        print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
-        print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
-
+        print(
+            " - Description: Maps dates to the start of the current ISO week based on a specified weekday.",
+        )
+        print(
+            " - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')",
+        )
+        print(
+            " - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')",
+        )
+
         print("\n35. seasonality_feature_extraction")
-        print(" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.")
-        print(" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)")
-        print(" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)")
+        print(
+            " - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.",
+        )
+        print(
+            " - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)",
+        )
+        print(
+            " - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)",
+        )
 
     def get_wd_levels(self, levels):
         """
         Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
 
-        Parameters:
+        Parameters
+        ----------
         - data_frame: pandas DataFrame
             The input data frame.
         - num_rows_to_remove: int
             The number of levels to move up pathways.
 
-        Returns:
+        Returns
+        -------
         - Current wd
-        """
 
+        """
         directory = os.getcwd()
         for _ in range(levels):
             directory = os.path.dirname(directory)
         return directory
-
-    def aggregate_daily_to_wc_long(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum') -> pd.DataFrame:
+
+    def aggregate_daily_to_wc_long(
+        self,
+        df: pd.DataFrame,
+        date_column: str,
+        group_columns: list[str],
+        sum_columns: list[str],
+        wc: str = "sun",
+        aggregation: str = "sum",
+    ) -> pd.DataFrame:
         """
-        Aggregates daily data into weekly data, starting on a specified day of the week,
-        and groups the data by additional specified columns. It aggregates specified numeric columns
-        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+        Aggregates daily data into weekly data, starting on a specified day of the week,
+        and groups the data by additional specified columns. It aggregates specified numeric columns
+        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
         The day column is renamed from 'Day' to 'OBS'.
 
-        Parameters:
+        Parameters
+        ----------
        - df: pandas DataFrame
            The input DataFrame containing daily data.
        - date_column: string
@@ -230,18 +390,21 @@ class dataprocessing:
         - aggregation: string, optional (default 'sum')
             Aggregation method, either 'sum', 'average', or 'count'.
 
-        Returns:
+        Returns
+        -------
         - pandas DataFrame
             A new DataFrame with weekly aggregated data. The index is reset,
-            and columns represent the grouped and aggregated metrics. The DataFrame
-            is in long format, with separate columns for each combination of
+            and columns represent the grouped and aggregated metrics. The DataFrame
+            is in long format, with separate columns for each combination of
             grouped metrics.
-        """
 
+        """
         # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
-        days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
+        days = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
         if wc.lower() not in days:
-            return print(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")
+            return print(
+                f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).",
+            )
 
         start_day = days[wc.lower()]
 
@@ -252,26 +415,40 @@ class dataprocessing:
         df_copy[date_column] = pd.to_datetime(df_copy[date_column])
 
         # Determine the start of each week
-        df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))
+        df_copy["week_start"] = df_copy[date_column].apply(
+            lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7),
+        )
 
         # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
         for col in sum_columns:
-            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
+            df_copy[col] = pd.to_numeric(df_copy[col], errors="coerce").fillna(0)
 
         # Group by the new week start column and additional columns, then aggregate the numeric columns
-        if aggregation == 'average':
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
-        elif aggregation == 'count':
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
+        if aggregation == "average":
+            grouped = (
+                df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                .mean()
+                .reset_index()
+            )
+        elif aggregation == "count":
+            grouped = (
+                df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                .count()
+                .reset_index()
+            )
         else:  # Default to 'sum' if any other value is provided
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()
+            grouped = (
+                df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                .sum()
+                .reset_index()
+            )
 
         # Rename 'week_start' column to 'OBS'
-        grouped = grouped.rename(columns={'week_start': 'OBS'})
+        grouped = grouped.rename(columns={"week_start": "OBS"})
 
         return grouped
-
-    def convert_monthly_to_daily(self, df, date_column, divide = True):
+
+    def convert_monthly_to_daily(self, df, date_column, divide=True):
         """
         Convert a DataFrame with monthly data to daily data.
         This function takes a DataFrame and a date column, then it expands each
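The hunk above completes the reformatted aggregate_daily_to_wc_long. A usage sketch on hypothetical daily data (import path and dp instance assumed, as before):

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    daily = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=14, freq="D"),
        "platform": ["meta"] * 7 + ["google"] * 7,
        "cost": range(14),
    })
    # Weeks start on Monday; cost is summed per week and platform, and the
    # week-start column comes back renamed to "OBS"
    weekly = dp.aggregate_daily_to_wc_long(daily, "date", ["platform"], ["cost"], wc="mon")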
@@ -282,7 +459,6 @@ class dataprocessing:
         :param divide: boolean divide by the number of days in a month (default True)
         :return: A new DataFrame with daily data.
         """
-
         # Convert date_column to datetime
         df[date_column] = pd.to_datetime(df[date_column])
 
@@ -292,7 +468,10 @@ class dataprocessing:
         # Iterate over each row in the DataFrame
         for _, row in df.iterrows():
             # Calculate the number of days in the month
-            num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]
+            num_days = calendar.monthrange(
+                row[date_column].year,
+                row[date_column].month,
+            )[1]
 
             # Create a new record for each day of the month
             for day in range(1, num_days + 1):
@@ -304,32 +483,41 @@ class dataprocessing:
                 if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
                     if divide is True:
                         daily_row[col] = row[col] / num_days
-                    else:
+                    else:
                         daily_row[col] = row[col]
             daily_records.append(daily_row)
 
         # Convert the list of daily records into a DataFrame
         daily_df = pd.DataFrame(daily_records)
-
+
         return daily_df
-
-    def week_of_year_mapping(self,df, week_col, start_day_str):
 
+    def week_of_year_mapping(self, df, week_col, start_day_str):
         # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
         day_mapping = {
-            'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
+            "mon": 1,
+            "tue": 2,
+            "wed": 3,
+            "thu": 4,
+            "fri": 5,
+            "sat": 6,
+            "sun": 7,
         }
 
         # Convert the day string to a number, or raise an error if not valid
         start_day = day_mapping.get(start_day_str.lower())
         if start_day is None:
-            raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")
+            raise ValueError(
+                f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.",
+            )
 
         # Function to convert week number to start date of the week
         def week_to_startdate(week_str, start_day):
-            year, week = map(int, week_str.split('-W'))
+            year, week = map(int, week_str.split("-W"))
             first_day_of_year = datetime(year, 1, 1)
-            first_weekday_of_year = first_day_of_year.weekday() # Monday is 0 and Sunday is 6
+            first_weekday_of_year = (
+                first_day_of_year.weekday()
+            )  # Monday is 0 and Sunday is 6
 
             # Calculate days to adjust to the desired start day of the week
             days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
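convert_monthly_to_daily closed out near the top of the hunk above; a sketch with hypothetical monthly data (assumptions as before):

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    monthly = pd.DataFrame({
        "date": pd.to_datetime(["2024-01-01", "2024-02-01"]),
        "spend": [3100.0, 2900.0],
    })
    # divide=True (the default) spreads each monthly value evenly over that
    # month's days: 31 rows of 100.0 for January, 29 rows of 100.0 for
    # February (2024 is a leap year)
    daily = dp.convert_monthly_to_daily(monthly, "date")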
@@ -340,25 +528,38 @@ class dataprocessing:
             return start_of_week
 
         # Apply the function to each row in the specified week column
-        df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
+        df["OBS"] = (
+            df[week_col]
+            .apply(lambda x: week_to_startdate(x, start_day))
+            .dt.strftime("%d/%m/%Y")
+        )
         return df
-
-    def rename_cols(self, df, name = 'ame_'):
+
+    def rename_cols(self, df, name="ame_"):
         new_columns = {}
         for col in df.columns:
-            if col != 'OBS':
+            if col != "OBS":
                 new_col_name = name + col.replace(" ", "_").lower()
             else:
                 new_col_name = col
             new_columns[col] = new_col_name
         return df.rename(columns=new_columns)
-
-    def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
+
+    def merge_new_and_old(
+        self,
+        old_df,
+        old_col,
+        new_df,
+        new_col,
+        cutoff_date,
+        date_col_name="OBS",
+    ):
         """
         Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
         Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
 
-        Parameters:
+        Parameters
+        ----------
         - old_df: pandas DataFrame
             The old DataFrame from which to take the numeric values up to the specified date.
         - old_col: str
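week_of_year_mapping and rename_cols are both fully visible at this point; a combined sketch (hypothetical frame, assumptions as above):

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    weeks = pd.DataFrame({"week": ["2024-W01", "2024-W02"], "Media Cost": [10, 12]})
    # Adds an "OBS" column holding each week's start date as dd/mm/YYYY
    weeks = dp.week_of_year_mapping(weeks, "week", "mon")
    # Lower-cases, snake-cases and prefixes every column except "OBS",
    # e.g. "Media Cost" -> "ame_facebook_media_cost"
    weeks = dp.rename_cols(weeks, "ame_facebook_")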
@@ -372,11 +573,12 @@ class dataprocessing:
         - date_col_name: str, optional (default 'OBS')
             The name of the date column in both DataFrames.
 
-        Returns:
+        Returns
+        -------
         - pandas DataFrame
             A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
-        """
 
+        """
         # Convert date columns in both dataframes to datetime for comparison
         old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
         new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
@@ -389,67 +591,93 @@ class dataprocessing:
         new_values = new_df[new_df[date_col_name] > cutoff_date]
 
         # Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
-        merged_df = pd.DataFrame({
-            'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
-            new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
-        })
+        merged_df = pd.DataFrame(
+            {
+                "OBS": pd.concat(
+                    [old_values[date_col_name], new_values[date_col_name]],
+                    ignore_index=True,
+                ),
+                new_col: pd.concat(
+                    [old_values[old_col], new_values[new_col]],
+                    ignore_index=True,
+                ),
+            },
+        )
 
         return merged_df
-
-    def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
+
+    def merge_dataframes_on_column(
+        self,
+        dataframes,
+        common_column="OBS",
+        merge_how="outer",
+    ):
         """
         Merge a list of DataFrames on a common column.
 
-        Parameters:
+        Parameters
+        ----------
         - dataframes: A list of DataFrames to merge.
         - common_column: The name of the common column to merge on.
         - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').
 
-        Returns:
+        Returns
+        -------
         - A merged DataFrame.
+
         """
         if not dataframes:
             return None
-
+
         merged_df = dataframes[0]  # Start with the first DataFrame
 
         for df in dataframes[1:]:
             merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)
 
         # Check if the common column is of datetime dtype
-        if merged_df[common_column].dtype == 'datetime64[ns]':
+        if merged_df[common_column].dtype == "datetime64[ns]":
             merged_df[common_column] = pd.to_datetime(merged_df[common_column])
             merged_df = merged_df.sort_values(by=common_column)
             merged_df = merged_df.fillna(0)
-
+
         return merged_df
-
+
     def merge_and_update_dfs(self, df1, df2, key_column):
         """
         Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
         and returns a dataframe sorted by the key column.
 
-        Parameters:
+        Parameters
+        ----------
         df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
         df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
         key_column (str): The name of the column to merge and sort by (e.g., 'OBS').
 
-        Returns:
+        Returns
+        -------
         DataFrame: The merged and updated dataframe.
-        """
 
+        """
         # Sort both DataFrames by the key column
         df1_sorted = df1.sort_values(by=key_column)
         df2_sorted = df2.sort_values(by=key_column)
 
         # Perform the full outer merge
-        merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))
+        merged_df = pd.merge(
+            df1_sorted,
+            df2_sorted,
+            on=key_column,
+            how="outer",
+            suffixes=("", "_finalised"),
+        )
 
         # Update with non-null values from df2
         for column in merged_df.columns:
-            if column.endswith('_finalised'):
-                original_column = column.replace('_finalised', '')
-                merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
+            if column.endswith("_finalised"):
+                original_column = column.replace("_finalised", "")
+                merged_df.loc[merged_df[column].notnull(), original_column] = (
+                    merged_df.loc[merged_df[column].notnull(), column]
+                )
                 merged_df.drop(column, axis=1, inplace=True)
 
         # Sort the merged DataFrame by the key column
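merge_new_and_old, merge_dataframes_on_column and merge_and_update_dfs are all reformatted above; a sketch of the list-merge helper on hypothetical frames:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    obs = pd.date_range("2024-01-01", periods=3, freq="W-MON")
    df1 = pd.DataFrame({"OBS": obs, "spend": [1.0, 2.0, 3.0]})
    df2 = pd.DataFrame({"OBS": obs, "clicks": [40, 50, 60]})
    # Outer-joins every frame in the list on "OBS", sorts by the datetime
    # column and fills any gaps with 0
    combined = dp.merge_dataframes_on_column([df1, df2], common_column="OBS")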
@@ -459,25 +687,30 @@ class dataprocessing:
         merged_df.fillna(0, inplace=True)
 
         return merged_df
-
+
     def convert_us_to_uk_dates(self, df, date_col):
         """
-        Processes the date column of a DataFrame to remove hyphens and slashes,
+        Processes the date column of a DataFrame to remove hyphens and slashes,
         and converts it to a datetime object.
-
-        Parameters:
+
+        Parameters
+        ----------
         df (pd.DataFrame): The DataFrame containing the date column.
         date_col (str): The name of the date column.
-
-        Returns:
+
+        Returns
+        -------
         pd.DataFrame: The DataFrame with the processed date column.
+
         """
-        df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
+        df[date_col] = df[date_col].str.replace(r"[-/]", "", regex=True)
         df[date_col] = pd.to_datetime(
-            df[date_col].str.slice(0, 2) + '/' +
-            df[date_col].str.slice(2, 4) + '/' +
-            df[date_col].str.slice(4, 8),
-            format='%m/%d/%Y'
+            df[date_col].str.slice(0, 2)
+            + "/"
+            + df[date_col].str.slice(2, 4)
+            + "/"
+            + df[date_col].str.slice(4, 8),
+            format="%m/%d/%Y",
         )
         return df
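A sketch of convert_us_to_uk_dates on hypothetical US-style strings (assumptions as before):

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    us = pd.DataFrame({"date": ["01/31/2024", "02-28-2024"]})
    # Hyphens and slashes are stripped, then the eight digits are re-parsed
    # as mm/dd/YYYY, so both rows become proper datetimes
    uk = dp.convert_us_to_uk_dates(us, "date")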
 
@@ -486,21 +719,40 @@ class dataprocessing:
         Combines multiple DataFrames from a dictionary into a single DataFrame.
         Adds a column 'SheetName' indicating the origin sheet of each row.
 
-        Parameters:
+        Parameters
+        ----------
         all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.
 
-        Returns:
+        Returns
+        -------
         DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
+
         """
         combined_df = pd.DataFrame()
 
         for sheet_name, df in all_sheets.items():
-            df['SheetName'] = sheet_name
+            df["SheetName"] = sheet_name
             combined_df = pd.concat([combined_df, df], ignore_index=True)
 
         return combined_df
-
-    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=True, week_commencing="W-MON"):
+
+    def pivot_table(
+        self,
+        df,
+        index_col,
+        columns,
+        values_col,
+        filters_dict=None,
+        fill_value=0,
+        aggfunc="sum",
+        margins=False,
+        margins_name="Total",
+        datetime_trans_needed=True,
+        date_format="%Y-%m-%d",
+        reverse_header_order=False,
+        fill_missing_weekly_dates=True,
+        week_commencing="W-MON",
+    ):
         """
         Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
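combine_sheets is complete in the hunk above; a sketch with a hypothetical dict of sheets:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    sheets = {
        "Jan": pd.DataFrame({"spend": [1, 2]}),
        "Feb": pd.DataFrame({"spend": [3]}),
    }
    # Rows are stacked and each row records its origin in a "SheetName" column
    combined = dp.combine_sheets(sheets)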
 
@@ -521,6 +773,7 @@ class dataprocessing:
 
         Returns:
             pandas.DataFrame: The pivot table specified
+
         """
         # Validate inputs
         if index_col not in df.columns:
@@ -544,7 +797,10 @@ class dataprocessing:
 
         # Ensure index column is in datetime format if needed
         if datetime_trans_needed:
-            df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
+            df_filtered[index_col] = pd.to_datetime(
+                df_filtered[index_col],
+                dayfirst=True,
+            )
 
         # Create the pivot table
         pivoted_df = df_filtered.pivot_table(
@@ -559,7 +815,9 @@ class dataprocessing:
         # Handle column headers
         if isinstance(pivoted_df.columns, pd.MultiIndex):
             pivoted_df.columns = [
-                "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                "_".join(
+                    reversed(map(str, col)) if reverse_header_order else map(str, col),
+                )
                 for col in pivoted_df.columns.values
             ]
         else:
@@ -570,7 +828,10 @@ class dataprocessing:
 
         # Handle sorting and formatting of index column
         if datetime_trans_needed:
-            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+            pivoted_df[index_col] = pd.to_datetime(
+                pivoted_df[index_col],
+                errors="coerce",
+            )
             pivoted_df.sort_values(by=index_col, inplace=True)
             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
 
@@ -579,35 +840,49 @@ class dataprocessing:
 
         # Fill missing weekly dates if specified
         if fill_missing_weekly_dates:
-            pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
+            pivoted_df = self.fill_weekly_date_range(
+                pivoted_df,
+                index_col,
+                freq=week_commencing,
+            )
 
         return pivoted_df
 
-    def apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
+    def apply_lookup_table_for_columns(
+        df,
+        col_names,
+        to_find_dict,
+        if_not_in_dict="Other",
+        new_column_name="Mapping",
+    ):
         """
         Creates a new DataFrame column based on a look up table, using exact matches.
 
-        Parameters:
+        Parameters
+        ----------
         df (pandas.DataFrame): The DataFrame containing the data.
         col_names (list of str): List of column names to use for lookup. If more than one, values are merged with '|'.
         to_find_dict (dict): Lookup dictionary with exact keys to match.
         if_not_in_dict (str, optional): Value used if no match is found. Defaults to "Other".
         new_column_name (str, optional): Name of new output column. Defaults to "Mapping".
 
-        Returns:
+        Returns
+        -------
         pandas.DataFrame: DataFrame with a new column containing lookup results.
-        """
 
+        """
         # Preprocess DataFrame if multiple columns
         if len(col_names) > 1:
-            df["Merged"] = df[col_names].astype(str).agg('|'.join, axis=1)
+            df["Merged"] = df[col_names].astype(str).agg("|".join, axis=1)
             col_to_use = "Merged"
         else:
             col_to_use = col_names[0]
 
         # Normalize case for matching
         lookup = {k.lower(): v for k, v in to_find_dict.items()}
-        df[new_column_name] = df[col_to_use].str.lower().map(lookup).fillna(if_not_in_dict)
+        df[new_column_name] = (
+            df[col_to_use].str.lower().map(lookup).fillna(if_not_in_dict)
+        )
 
         # Drop intermediate column if created
         if len(col_names) > 1:
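That closes the reformatting of pivot_table; a minimal long-to-wide sketch (hypothetical frame; dates are parsed day-first because datetime_trans_needed defaults to True):

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    long_df = pd.DataFrame({
        "OBS": ["01/01/2024", "01/01/2024", "08/01/2024"],
        "Channel": ["TV", "Radio", "TV"],
        "Value": [100, 50, 80],
    })
    # One row per OBS week, one column per Channel, Values summed
    wide = dp.pivot_table(long_df, "OBS", "Channel", "Value", fill_value=0)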
@@ -615,15 +890,25 @@ class dataprocessing:
 
         return df
 
-    def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
+    def aggregate_daily_to_wc_wide(
+        self,
+        df: pd.DataFrame,
+        date_column: str,
+        group_columns: list[str],
+        sum_columns: list[str],
+        wc: str = "sun",
+        aggregation: str = "sum",
+        include_totals: bool = False,
+    ) -> pd.DataFrame:
         """
-        Aggregates daily data into weekly data, starting on a specified day of the week,
-        and groups the data by additional specified columns. It aggregates specified numeric columns
-        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+        Aggregates daily data into weekly data, starting on a specified day of the week,
+        and groups the data by additional specified columns. It aggregates specified numeric columns
+        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
         The day column is renamed from 'Day' to 'OBS'.
 
-        Parameters:
+        Parameters
+        ----------
         - df: pandas DataFrame
             The input DataFrame containing daily data.
         - date_column: string
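Note that apply_lookup_table_for_columns, completed at the top of the hunk above, is defined without a self parameter in both versions, so the sketch below calls it through the class; an instance call would bind the DataFrame to the first slot:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    df = pd.DataFrame({"metric": ["Total Spend", "Impressions"]})
    # Exact but case-insensitive matching of values against the dict keys;
    # misses fall back to if_not_in_dict
    df = dataprocessing.apply_lookup_table_for_columns(
        df, ["metric"], {"total spend": "spd"}, new_column_name="Metrics Short",
    )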
@@ -639,26 +924,36 @@ class dataprocessing:
         - include_totals: boolean, optional (default False)
             If True, include total columns for each sum_column.
 
-        Returns:
+        Returns
+        -------
         - pandas DataFrame
             A new DataFrame with weekly aggregated data. The index is reset,
-            and columns represent the grouped and aggregated metrics. The DataFrame
-            is in wide format, with separate columns for each combination of
+            and columns represent the grouped and aggregated metrics. The DataFrame
+            is in wide format, with separate columns for each combination of
             grouped metrics.
+
         """
-
-        grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
-
+        grouped = self.aggregate_daily_to_wc_long(
+            df,
+            date_column,
+            group_columns,
+            sum_columns,
+            wc,
+            aggregation,
+        )
+
         # Pivot the data to wide format
         if group_columns:
-            wide_df = grouped.pivot_table(index='OBS',
-                                          columns=group_columns,
-                                          values=sum_columns,
-                                          aggfunc='first')
+            wide_df = grouped.pivot_table(
+                index="OBS",
+                columns=group_columns,
+                values=sum_columns,
+                aggfunc="first",
+            )
             # Flatten the multi-level column index and create combined column names
-            wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
+            wide_df.columns = ["_".join(col).strip() for col in wide_df.columns.values]
         else:
-            wide_df = grouped.set_index('OBS')
+            wide_df = grouped.set_index("OBS")
 
         # Fill NaN values with 0
         wide_df = wide_df.fillna(0)
@@ -666,9 +961,11 @@ class dataprocessing:
         # Adding total columns for each unique sum_column, if include_totals is True
         if include_totals:
             for col in sum_columns:
-                total_column_name = f'Total {col}'
+                total_column_name = f"Total {col}"
                 if group_columns:
-                    columns_to_sum = [column for column in wide_df.columns if col in column]
+                    columns_to_sum = [
+                        column for column in wide_df.columns if col in column
+                    ]
                 else:
                     columns_to_sum = [col]
                 wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
@@ -678,11 +975,20 @@ class dataprocessing:
 
         return wide_df
 
-    def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
+    def merge_cols_with_seperator(
+        self,
+        df,
+        col_names,
+        seperator="_",
+        output_column_name="Merged",
+        starting_prefix_str=None,
+        ending_prefix_str=None,
+    ):
         """
         Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up
 
-        Parameters:
+        Parameters
+        ----------
         df (pandas.DataFrame): Dataframe to make changes to.
         col_names (list): list of columm names ot merge.
         seperator (str, optional): Name of column outputted. Defaults to "_".
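aggregate_daily_to_wc_wide wrapped up at the top of the previous hunk; a sketch mirroring the long-format example earlier:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    daily = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=14, freq="D"),
        "platform": ["meta"] * 7 + ["google"] * 7,
        "cost": [10.0] * 14,
    })
    # Wide output keyed on "OBS": one cost_<platform> column per group, plus
    # a "Total cost" column because include_totals=True
    wide = dp.aggregate_daily_to_wc_wide(
        daily, "date", ["platform"], ["cost"], wc="mon", include_totals=True,
    )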
@@ -690,76 +996,99 @@ class dataprocessing:
         starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
         ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
 
-        Raises:
+        Raises
+        ------
         ValueError: if more less than two column names are inputted in the list there is nothing to merge on
 
-        Returns:
+        Returns
+        -------
         pandas.DataFrame: DataFrame with additional merged column
+
         """
         # Specify more than one column must be entered
         if len(col_names) < 2:
             raise ValueError("2 or more columns must be specified to merge")
-
+
         # Create a new column with the merged columns
         df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
 
-        # Add string before
+        # Add string before
         if starting_prefix_str is not None:
-            df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
-
+            df[output_column_name] = starting_prefix_str + df[
+                output_column_name
+            ].astype(str)
+
         # Add string after
         if ending_prefix_str is not None:
-            df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
-
+            df[output_column_name] = (
+                df[output_column_name].astype(str) + ending_prefix_str
+            )
+
         return df
 
-    def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
+    def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
         """
         Checks the sum of two different dataframe column or columns are equal
 
-        Parameters:
+        Parameters
+        ----------
         df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
         df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
         cols_1 (list of str): Columns from first dataframe to sum.
         cols_2 (list of str): Columns from second dataframe to sum.
 
-        Returns:
+        Returns
+        -------
         Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
+
         """
         # Find the sum of both sets of columns
         df_1_sum = df_1[cols_1].sum().sum()
         df_2_sum = df_2[cols_2].sum().sum()
-
-        # If the the two columns are
+
+        # If the the two columns are
         if df_1_sum == df_2_sum:
             Answer = "They are equal"
         if df_1_sum != df_2_sum:
-            Answer = "They are different by " + str(df_2_sum-df_1_sum)
-
-        return Answer,df_1_sum,df_2_sum
-
+            Answer = "They are different by " + str(df_2_sum - df_1_sum)
+
+        return Answer, df_1_sum, df_2_sum
+
     def convert_2_df_cols_to_dict(self, df, key_col, value_col):
         """
         Create a dictionary mapping from two columns of a DataFrame.
 
-        Parameters:
+        Parameters
+        ----------
         df (pd.DataFrame): The DataFrame containing the data.
         key_col (str): The column name to use as keys in the dictionary.
         value_col (str): The column name to use as values in the dictionary.
 
-        Returns:
+        Returns
+        -------
         dict: A dictionary with keys from 'key_col' and values from 'value_col'.
+
         """
         if key_col not in df or value_col not in df:
             raise ValueError("Specified columns are not in the DataFrame")
 
         return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
-
-    def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
+
+    def create_FY_and_H_columns(
+        self,
+        df,
+        index_col,
+        start_date,
+        starting_FY,
+        short_format="No",
+        half_years="No",
+        combined_FY_and_H="No",
+    ):
         """
-        Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
+        Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
 
-        Parameters:
+        Parameters
+        ----------
         df (pandas.DataFrame): Dataframe to operate on.
         index_col (str): Name of the column to use for datetime
         start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
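check_sum_of_df_cols_are_equal and convert_2_df_cols_to_dict are both complete above; a short sketch of each:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    dp = dataprocessing()
    df = pd.DataFrame({"Campaign": ["c1", "c2"], "Channel": ["TV", "Radio"]})
    lookup = dp.convert_2_df_cols_to_dict(df, "Campaign", "Channel")
    # lookup == {"c1": "TV", "c2": "Radio"}

    answer, s1, s2 = dp.check_sum_of_df_cols_are_equal(
        pd.DataFrame({"Media Cost": [1.0, 2.0]}),
        pd.DataFrame({"Spend": [3.0]}),
        ["Media Cost"],
        ["Spend"],
    )
    # answer == "They are equal", since both sides sum to 3.0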
@@ -768,16 +1097,17 @@ class dataprocessing:
         half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
         combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".
 
-        Returns:
+        Returns
+        -------
         pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
+
         """
-
         try:
-            start_date = datetime.strptime(start_date, '%Y-%m-%d')
+            start_date = datetime.strptime(start_date, "%Y-%m-%d")
         except ValueError:
             print("Error: Date must be of format yyyy-mm-dd")
             return df
-
+
         df["OBS"] = pd.to_datetime(df[index_col])
         df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
 
@@ -787,35 +1117,51 @@ class dataprocessing:
787
1117
 
788
1118
  def calculate_FY_vectorized(date_series):
789
1119
  years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
790
- fy = 'FY' + (start_year + years_since_start).astype(str)
1120
+ fy = "FY" + (start_year + years_since_start).astype(str)
791
1121
  if short_format == "Yes":
792
- fy = 'FY' + fy.str[-2:]
1122
+ fy = "FY" + fy.str[-2:]
793
1123
  return fy
794
1124
 
795
- df['FY'] = calculate_FY_vectorized(df[index_col])
1125
+ df["FY"] = calculate_FY_vectorized(df[index_col])
796
1126
 
797
1127
  if half_years == "Yes" or combined_FY_and_H == "Yes":
1128
+
798
1129
  def calculate_half_year_vectorized(date_series):
799
- fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
800
- fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
801
- fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
802
- half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
1130
+ fy_years_since_start = (
1131
+ (date_series - start_date).dt.days / 364
1132
+ ).astype(int)
1133
+ fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(
1134
+ years=1,
1135
+ )
1136
+ fy_end_of_h1 = (
1137
+ fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
1138
+ )
1139
+ half_year = np.where(date_series <= fy_end_of_h1, "H1", "H2")
803
1140
  return half_year
804
-
805
- df['Half Years'] = calculate_half_year_vectorized(df[index_col])
806
-
1141
+
1142
+ df["Half Years"] = calculate_half_year_vectorized(df[index_col])
1143
+
807
1144
  if combined_FY_and_H == "Yes":
808
- df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
1145
+ df["Financial Half Years"] = df["FY"] + " " + df["Half Years"]
809
1146
 
810
1147
  return df
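
A usage sketch under the same assumptions (imsciences.mmm import path, illustrative data). The exact format expected for starting_FY sits in docstring lines elided by the hunk above, so the integer year below is an assumption consistent with the start_year arithmetic in calculate_FY_vectorized.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"OBS": pd.date_range("2021-11-29", periods=104, freq="W-MON")})
    df = dp.create_FY_and_H_columns(df, "OBS", "2021-11-29", 2022,
                                    half_years="Yes", combined_FY_and_H="Yes")
    # adds 'FY' (e.g. 'FY2022'), 'Half Years' ('H1'/'H2') and 'Financial Half Years'
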
811
-
812
- def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
1148
+
1149
+ def keyword_lookup_replacement(
1150
+ self,
1151
+ df,
1152
+ col,
1153
+ replacement_rows,
1154
+ cols_to_merge,
1155
+ replacement_lookup_dict,
1156
+ output_column_name="Updated Column",
1157
+ ):
813
1158
  """
814
1159
  This function updates values in a specified column of the DataFrame based on a lookup dictionary.
815
1160
  It first merges several columns into a new 'Merged' column, then uses this merged column to determine
816
1161
  if replacements are needed based on the dictionary.
817
1162
 
818
- Parameters:
1163
+ Parameters
1164
+ ----------
819
1165
  df (pd.DataFrame): The DataFrame to process.
820
1166
  col (str): The name of the column whose values are potentially replaced.
821
1167
  replacement_rows (str): The specific value in 'col' to check for replacements.
@@ -823,65 +1169,102 @@ class dataprocessing:
823
1169
  replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
824
1170
  output_column_name (str, optional): Name of the output column. Defaults to "Updated Column".
825
1171
 
826
- Returns:
1172
+ Returns
1173
+ -------
827
1174
  pd.DataFrame: The modified DataFrame with updated values in the specified column.
1175
+
828
1176
  """
829
1177
  # Create a merged column from specified columns
830
- df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
831
-
1178
+ df["Merged"] = df[cols_to_merge].apply(
1179
+ lambda row: "|".join(row.values.astype(str)),
1180
+ axis=1,
1181
+ )
1182
+
832
1183
  # Replace values in the specified column based on the lookup
833
1184
  def replace_values(x):
834
1185
  if x[col] == replacement_rows:
835
- merged_value = x['Merged']
1186
+ merged_value = x["Merged"]
836
1187
  if merged_value in replacement_lookup_dict:
837
1188
  return replacement_lookup_dict[merged_value]
838
1189
  return x[col]
839
-
1190
+
840
1191
  # Apply replacement logic
841
1192
  df[output_column_name] = df.apply(replace_values, axis=1)
842
-
1193
+
843
1194
  # Drop the intermediate 'Merged' column
844
- df.drop(columns=['Merged'], inplace=True)
845
-
1195
+ df.drop(columns=["Merged"], inplace=True)
1196
+
846
1197
  return df
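
A usage sketch (same import assumptions, illustrative data): row 0 holds the replacement_rows value 'Unknown' and its merged key 'brand|uk' is in the lookup, so it gets replaced.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"channel": ["Unknown", "tv"],
                       "campaign": ["brand", "promo"],
                       "region": ["uk", "us"]})
    df = dp.keyword_lookup_replacement(df, "channel", "Unknown",
                                       ["campaign", "region"], {"brand|uk": "search"},
                                       output_column_name="channel_fixed")
    # channel_fixed -> ['search', 'tv']
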
847
1198
 
848
- def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
1199
+ def create_new_version_of_col_using_LUT(
1200
+ self,
1201
+ df,
1202
+ keys_col,
1203
+ value_col,
1204
+ dict_for_specific_changes,
1205
+ new_col_name="New Version of Old Col",
1206
+ ):
849
1207
  """
850
- Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
1208
+ Creates a new column in a dataframe, which takes an old column and uses a lookup table to change values in the new column to reflect the lookup table.
851
1209
  The lookup is based on a column in the dataframe. Can only input one column and output one new column.
852
1210
 
853
- Parameters:
1211
+ Parameters
1212
+ ----------
854
1213
  df (pandas.DataFrame): The DataFrame containing the data.
855
1214
  keys_col (str): The name of the column which the LUT will reference to output a value.
856
1215
  value_col (str): The name of the column which the new column will be based on. If a key in the key column is not found in the LUT, the values from this column are used instead.
857
1216
  dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
858
1217
  new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
859
1218
 
860
- Returns:
1219
+ Returns
1220
+ -------
861
1221
  pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
1222
+
862
1223
  """
863
-
864
1224
  # Extract columns to change using new dictionary
865
- smaller_df = df[[keys_col,value_col]]
1225
+ smaller_df = df[[keys_col, value_col]]
866
1226
 
867
1227
  # Use the new dictionary to create a new LUT
868
- smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
869
-
1228
+ smaller_df_with_LUT = self.apply_lookup_table_for_columns(
1229
+ smaller_df,
1230
+ [keys_col, value_col],
1231
+ dict_for_specific_changes,
1232
+ )
1233
+
870
1234
  # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
871
- smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
1235
+ smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(
1236
+ lambda x: x["Mapping"] if x["Mapping"] != "Other" else x[value_col],
1237
+ axis=1,
1238
+ )
872
1239
 
873
1240
  # Drop the extra unnecessary cols
874
- smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
875
-
1241
+ smaller_df_with_LUT.drop([keys_col, "Mapping"], axis=1, inplace=True)
1242
+
876
1243
  # Output dataframes as dictionary to be used in a LUT
877
- new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
1244
+ new_dict = self.convert_2_df_cols_to_dict(
1245
+ smaller_df_with_LUT,
1246
+ value_col,
1247
+ "Updated Col",
1248
+ )
878
1249
 
879
1250
  # # Use new dictionary to create a new version of an old column
880
- df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
881
-
1251
+ df_final = self.apply_lookup_table_for_columns(
1252
+ df,
1253
+ [keys_col],
1254
+ new_dict,
1255
+ "other",
1256
+ new_col_name,
1257
+ )
1258
+
882
1259
  return df_final
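
A call sketch only (same import assumptions): the heavy lifting is delegated to apply_lookup_table_for_columns, defined elsewhere in this module, so the result comment below describes the docstring's contract rather than a verified output.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"code": ["a1", "b2"], "label": ["alpha", "beta"]})
    df = dp.create_new_version_of_col_using_LUT(df, "code", "label",
                                                {"a1": "alpha (renamed)"},
                                                new_col_name="label_v2")
    # 'label_v2' mirrors 'label', with LUT-matched rows swapped to the new values
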
883
-
884
- def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
1260
+
1261
+ def convert_df_wide_2_long(
1262
+ self,
1263
+ df,
1264
+ value_cols,
1265
+ variable_col_name="Stacked",
1266
+ value_col_name="Value",
1267
+ ):
885
1268
  """
886
1269
  Changes a dataframe from wide to long format.
887
1270
 
@@ -896,16 +1279,25 @@ class dataprocessing:
896
1279
 
897
1280
  Raises:
898
1281
  ValueError: If the number of columns to depivot is less than 2.
1282
+
899
1283
  """
900
1284
  # Check length of value_cols is greater than 1
901
1285
  if len(value_cols) < 2:
902
1286
  raise ValueError("Number of inputs in list must be greater than 1")
903
1287
 
904
1288
  # Find the columns that are not to be depivoted into one column
905
- id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
1289
+ id_vars = [
1290
+ col for col in df.columns if col not in value_cols
1291
+ ] # Preserve column order in the DataFrame
906
1292
 
907
1293
  # Melt all columns chosen into one column
908
- df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
1294
+ df_final = pd.melt(
1295
+ df,
1296
+ id_vars=id_vars,
1297
+ value_vars=value_cols,
1298
+ var_name=variable_col_name,
1299
+ value_name=value_col_name,
1300
+ )
909
1301
 
910
1302
  # Sort column order to match expected output
911
1303
  ordered_columns = id_vars + [variable_col_name, value_col_name]
@@ -913,7 +1305,19 @@ class dataprocessing:
913
1305
 
914
1306
  return df_final
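
A usage sketch (same import assumptions); this is a thin wrapper over pd.melt, so the id columns are everything not listed in value_cols.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    wide = pd.DataFrame({"OBS": ["2024-01-01"], "tv": [100], "radio": [50]})
    long = dp.convert_df_wide_2_long(wide, ["tv", "radio"],
                                     variable_col_name="channel",
                                     value_col_name="spend")
    # columns: OBS | channel | spend, one row per (OBS, channel) pair
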
915
1307
 
916
- def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
1308
+ def manually_edit_data(
1309
+ self,
1310
+ df,
1311
+ filters_dict,
1312
+ col_to_change,
1313
+ new_value,
1314
+ change_in_existing_df_col="No",
1315
+ new_col_to_change_name="New",
1316
+ manual_edit_col_name=None,
1317
+ add_notes="No",
1318
+ existing_note_col_name=None,
1319
+ note=None,
1320
+ ):
917
1321
  """
918
1322
  Allows manually updating any cell in the dataframe by applying filters and choosing a column to edit
919
1323
 
@@ -936,31 +1340,44 @@ class dataprocessing:
936
1340
 
937
1341
  Returns:
938
1342
  pandas.DataFrame: Dataframe with manual changes added
1343
+
939
1344
  """
940
-
941
1345
  # Raise type error if more than one col is supplied
942
1346
  if isinstance(col_to_change, list):
943
1347
  raise TypeError("Col to change must be specified as a string, not a list")
944
1348
 
945
1349
  # Raises value error if input is invalid for change_in_existing_df_col
946
1350
  if change_in_existing_df_col not in ["Yes", "No"]:
947
- raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
1351
+ raise ValueError(
1352
+ "Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']",
1353
+ )
948
1354
 
949
1355
  # Raises value error if input is invalid for add_notes
950
1356
  if add_notes not in ["Yes", "No"]:
951
- raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
1357
+ raise ValueError(
1358
+ "Invalid input value for add_notes. Allowed values are: ['Yes', 'No']",
1359
+ )
952
1360
 
953
1361
  # Validate filters_dict format
954
1362
  for col, cond in filters_dict.items():
955
1363
  if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
956
- raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
1364
+ raise ValueError(
1365
+ f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'",
1366
+ )
957
1367
 
958
1368
  # Create the filtered df by applying the conditions
959
1369
  df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
960
1370
 
961
1371
  # Create a new column to add the changes if desired, else edit in the current chosen column
962
- col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
963
- if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
1372
+ col_to_update = (
1373
+ col_to_change
1374
+ if change_in_existing_df_col == "Yes"
1375
+ else new_col_to_change_name
1376
+ )
1377
+ if (
1378
+ change_in_existing_df_col == "No"
1379
+ and new_col_to_change_name not in df.columns
1380
+ ):
964
1381
  df = df.copy()
965
1382
  df[new_col_to_change_name] = df[col_to_change]
966
1383
 
@@ -972,19 +1389,19 @@ class dataprocessing:
972
1389
  if manual_edit_col_name not in df.columns:
973
1390
  df[manual_edit_col_name] = 0
974
1391
  df.loc[df_filtered.index, manual_edit_col_name] = 1
975
- elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
976
- df['Manual Changes'] = 0
977
- df.loc[df_filtered.index, 'Manual Changes'] = 1
1392
+ elif not manual_edit_col_name and "Manual Changes" not in df.columns:
1393
+ df["Manual Changes"] = 0
1394
+ df.loc[df_filtered.index, "Manual Changes"] = 1
978
1395
 
979
1396
  # Add note if desired in new column or an existing column
980
1397
  if add_notes == "Yes":
981
- note_col = existing_note_col_name if existing_note_col_name else 'Notes'
1398
+ note_col = existing_note_col_name if existing_note_col_name else "Notes"
982
1399
  if note_col not in df.columns:
983
1400
  df[note_col] = None
984
1401
  df.loc[df_filtered.index, note_col] = note
985
1402
 
986
1403
  return df
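
A usage sketch (same import assumptions); per the docstring, rows matching every filter receive new_value in the target column, and each filter string follows the 'operator value' format validated above.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"channel": ["tv", "radio"], "spend": [100, 50]})
    df = dp.manually_edit_data(df, {"channel": "== 'tv'"}, "spend", 0,
                               new_col_to_change_name="spend_adj",
                               add_notes="Yes", note="zeroed tv spend")
    # adds 'spend_adj', a 'Manual Changes' flag column and a 'Notes' column
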
987
-
1404
+
988
1405
  def format_numbers_with_commas(self, df, decimal_length_chosen=2):
989
1406
  """
990
1407
  Converts numeric data into strings with thousands separators and a chosen decimal place length.
@@ -995,24 +1412,26 @@ class dataprocessing:
995
1412
 
996
1413
  Returns:
997
1414
  pandas.DataFrame: The DataFrame with the chosen updated format.
1415
+
998
1416
  """
1417
+
999
1418
  def format_number_with_commas(x, decimal_length=decimal_length_chosen):
1000
1419
  if pd.isna(x): # Preserve None/NaN values
1001
1420
  return pd.NA # Explicitly normalize to pd.NA
1002
- elif isinstance(x, (int, float)):
1421
+ if isinstance(x, (int, float)):
1003
1422
  if decimal_length is not None:
1004
1423
  format_str = f"{{:,.{decimal_length}f}}"
1005
1424
  return format_str.format(x)
1006
- else:
1007
- return f"{x:,}"
1008
- else:
1009
- return x # Return unchanged if not a number
1425
+ return f"{x:,}"
1426
+ return x # Return unchanged if not a number
1010
1427
 
1011
1428
  # Apply formatting column by column
1012
- formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
1429
+ formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
1430
+ value=pd.NA,
1431
+ )
1013
1432
 
1014
1433
  return formatted_df
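
A usage sketch (same import assumptions); note the output holds formatted strings rather than numbers, so this is best kept for display.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"spend": [1234567.891, None]})
    out = dp.format_numbers_with_commas(df, decimal_length_chosen=1)
    # out['spend'] -> ['1,234,567.9', <NA>]
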
1015
-
1434
+
1016
1435
  def filter_df_on_multiple_conditions(self, df, filters_dict):
1017
1436
  """
1018
1437
  Filter a dataframe based on multiple conditions
@@ -1023,59 +1442,62 @@ class dataprocessing:
1023
1442
 
1024
1443
  Returns:
1025
1444
  pandas.DataFrame: Filtered DataFrame
1445
+
1026
1446
  """
1027
1447
  mask = pd.Series(True, index=df.index)
1028
1448
  for col, cond in filters_dict.items():
1029
1449
  cond = cond.strip()
1030
1450
  operator, value = cond.split(maxsplit=1)
1031
-
1451
+
1032
1452
  # If value is a string condition make sure to check if there are new lines
1033
1453
  if "'" in value:
1034
1454
  value = value.strip().strip("'\"")
1035
1455
  # If not a string (e.g. a datetime or number condition), evaluate the text into a value
1036
1456
  else:
1037
- value = eval(value)
1457
+ value = eval(value)
1038
1458
 
1039
1459
  if operator == "==":
1040
- temp_mask = (df[col] == value)
1460
+ temp_mask = df[col] == value
1041
1461
  elif operator == "!=":
1042
- temp_mask = (df[col] != value)
1462
+ temp_mask = df[col] != value
1043
1463
  elif operator == ">=":
1044
- temp_mask = (df[col] >= value)
1464
+ temp_mask = df[col] >= value
1045
1465
  elif operator == "<=":
1046
- temp_mask = (df[col] <= value)
1466
+ temp_mask = df[col] <= value
1047
1467
  elif operator == ">":
1048
- temp_mask = (df[col] > value)
1468
+ temp_mask = df[col] > value
1049
1469
  elif operator == "<":
1050
- temp_mask = (df[col] < value)
1470
+ temp_mask = df[col] < value
1051
1471
  mask &= temp_mask
1052
1472
 
1053
1473
  # Create the filtered df by applying the conditions
1054
1474
  df_filtered = df[mask]
1055
-
1475
+
1056
1476
  return df_filtered
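
A usage sketch (same import assumptions); quoted values are compared as strings, while unquoted ones are eval'd, which is what lets numeric conditions like ">= 100" work. All conditions are ANDed.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"channel": ["tv", "radio"], "spend": [100, 50]})
    subset = dp.filter_df_on_multiple_conditions(
        df, {"channel": "== 'tv'", "spend": ">= 100"})
    # subset keeps only the tv row
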
1057
-
1058
- def read_and_concatenate_files(self, folder_path, file_type='csv'):
1477
+
1478
+ def read_and_concatenate_files(self, folder_path, file_type="csv"):
1059
1479
  """
1060
- Reads all files of a specified type (CSV or XLSX) from a given folder
1480
+ Reads all files of a specified type (CSV or XLSX) from a given folder
1061
1481
  and concatenates them into a single DataFrame.
1062
-
1063
- Parameters:
1482
+
1483
+ Parameters
1484
+ ----------
1064
1485
  folder_path (str): The path to the folder containing the files.
1065
1486
  file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
1066
-
1067
- Returns:
1487
+
1488
+ Returns
1489
+ -------
1068
1490
  pd.DataFrame: A DataFrame containing the concatenated data from all files.
1491
+
1069
1492
  """
1070
-
1071
1493
  # Initialize an empty list to hold dataframes
1072
1494
  dataframes = []
1073
1495
 
1074
1496
  # Define file extension based on file_type
1075
- if file_type == 'csv':
1076
- extension = '.csv'
1077
- elif file_type == 'xlsx':
1078
- extension = '.xlsx'
1497
+ if file_type == "csv":
1498
+ extension = ".csv"
1499
+ elif file_type == "xlsx":
1500
+ extension = ".xlsx"
1079
1501
  else:
1080
1502
  raise ValueError("file_type must be either 'csv' or 'xlsx'")
1081
1503
 
@@ -1085,19 +1507,19 @@ class dataprocessing:
1085
1507
  if filename.endswith(extension):
1086
1508
  file_path = os.path.join(folder_path, filename)
1087
1509
  # Read the file into a DataFrame
1088
- if file_type == 'csv':
1510
+ if file_type == "csv":
1089
1511
  df = pd.read_csv(file_path)
1090
- elif file_type == 'xlsx':
1512
+ elif file_type == "xlsx":
1091
1513
  df = pd.read_excel(file_path)
1092
1514
  # Append the DataFrame to the list
1093
1515
  dataframes.append(df)
1094
1516
 
1095
1517
  # Concatenate all DataFrames into a single DataFrame
1096
1518
  combined_df = pd.concat(dataframes, ignore_index=True)
1097
-
1519
+
1098
1520
  return combined_df
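
A self-contained sketch (same import assumptions) that writes two CSVs into a temporary folder so the call has something to read:

    import os, tempfile
    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    folder = tempfile.mkdtemp()
    pd.DataFrame({"a": [1]}).to_csv(os.path.join(folder, "one.csv"), index=False)
    pd.DataFrame({"a": [2]}).to_csv(os.path.join(folder, "two.csv"), index=False)
    combined = dp.read_and_concatenate_files(folder, file_type="csv")
    # two rows, re-indexed 0..1 thanks to ignore_index=True
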
1099
-
1100
- def upgrade_outdated_packages(self, exclude_packages=['twine']):
1521
+
1522
+ def upgrade_outdated_packages(self, exclude_packages=["twine"]):
1101
1523
  """
1102
1524
  Upgrade all outdated Python packages except those specified in `exclude_packages`.
1103
1525
 
@@ -1108,32 +1530,49 @@ class dataprocessing:
1108
1530
  try:
1109
1531
  # Get all installed packages
1110
1532
  installed_packages_result = subprocess.run(
1111
- "pip list --format=json", shell=True, capture_output=True, text=True
1533
+ "pip list --format=json",
1534
+ check=False,
1535
+ shell=True,
1536
+ capture_output=True,
1537
+ text=True,
1112
1538
  )
1113
1539
  installed_packages = json.loads(installed_packages_result.stdout)
1114
1540
 
1115
1541
  # Get the list of outdated packages
1116
1542
  outdated_packages_result = subprocess.run(
1117
- "pip list --outdated --format=json", shell=True, capture_output=True, text=True
1543
+ "pip list --outdated --format=json",
1544
+ check=False,
1545
+ shell=True,
1546
+ capture_output=True,
1547
+ text=True,
1118
1548
  )
1119
1549
  outdated_packages = json.loads(outdated_packages_result.stdout)
1120
1550
 
1121
1551
  # Create a set of outdated package names for quick lookup
1122
- outdated_package_names = {pkg['name'] for pkg in outdated_packages}
1552
+ outdated_package_names = {pkg["name"] for pkg in outdated_packages}
1123
1553
 
1124
1554
  # Upgrade only outdated packages, excluding specified packages
1125
1555
  for package in installed_packages:
1126
- package_name = package['name']
1127
- if package_name in outdated_package_names and package_name not in exclude_packages:
1556
+ package_name = package["name"]
1557
+ if (
1558
+ package_name in outdated_package_names
1559
+ and package_name not in exclude_packages
1560
+ ):
1128
1561
  try:
1129
1562
  print(f"Upgrading package: {package_name}")
1130
1563
  upgrade_result = subprocess.run(
1131
- f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True
1564
+ f"pip install --upgrade {package_name}",
1565
+ check=False,
1566
+ shell=True,
1567
+ capture_output=True,
1568
+ text=True,
1132
1569
  )
1133
1570
  if upgrade_result.returncode == 0:
1134
1571
  print(f"Successfully upgraded {package_name}")
1135
1572
  else:
1136
- print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
1573
+ print(
1574
+ f"Failed to upgrade {package_name}: {upgrade_result.stderr}",
1575
+ )
1137
1576
  except Exception as e:
1138
1577
  print(f"An error occurred while upgrading {package_name}: {e}")
1139
1578
  elif package_name in exclude_packages:
@@ -1145,12 +1584,12 @@ class dataprocessing:
1145
1584
 
1146
1585
  def convert_mixed_formats_dates(self, df, column_name):
1147
1586
  # Convert initial dates to datetime with coercion to handle errors
1148
- df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
1587
+ df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
1149
1588
  df[column_name] = df[column_name].astype(str)
1150
1589
  corrected_dates = []
1151
-
1590
+
1152
1591
  for date_str in df[column_name]:
1153
- date_str = date_str.replace('-', '').replace('/', '')
1592
+ date_str = date_str.replace("-", "").replace("/", "")
1154
1593
  if len(date_str) == 8:
1155
1594
  year = date_str[:4]
1156
1595
  month = date_str[4:6]
@@ -1161,39 +1600,45 @@ class dataprocessing:
1161
1600
  else:
1162
1601
  corrected_date_str = f"{year}-{month}-{day}"
1163
1602
  # Convert to datetime
1164
- corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
1603
+ corrected_date = pd.to_datetime(corrected_date_str, errors="coerce")
1165
1604
  else:
1166
- corrected_date = pd.to_datetime(date_str, errors='coerce')
1167
-
1605
+ corrected_date = pd.to_datetime(date_str, errors="coerce")
1606
+
1168
1607
  corrected_dates.append(corrected_date)
1169
-
1608
+
1170
1609
  # Check length of the corrected_dates list
1171
1610
  if len(corrected_dates) != len(df):
1172
- raise ValueError("Length of corrected_dates does not match the original DataFrame")
1173
-
1611
+ raise ValueError(
1612
+ "Length of corrected_dates does not match the original DataFrame",
1613
+ )
1614
+
1174
1615
  # Assign the corrected dates back to the DataFrame
1175
1616
  df[column_name] = corrected_dates
1176
1617
  return df
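
A usage sketch (same import assumptions); the day/month disambiguation branch sits in the hunk elided above, so the example sticks to unambiguous inputs.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"date": ["2024-01-07", "2024/01/14"]})
    df = dp.convert_mixed_formats_dates(df, "date")
    # both rows end up as proper datetimes (2024-01-07 and 2024-01-14)
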
1177
1618
 
1178
- def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
1619
+ def fill_weekly_date_range(self, df, date_column, freq="W-MON"):
1179
1620
  # Ensure the date column is in datetime format
1180
1621
  df[date_column] = pd.to_datetime(df[date_column])
1181
-
1622
+
1182
1623
  # Generate the full date range with the specified frequency
1183
- full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
1184
-
1624
+ full_date_range = pd.date_range(
1625
+ start=df[date_column].min(),
1626
+ end=df[date_column].max(),
1627
+ freq=freq,
1628
+ )
1629
+
1185
1630
  # Create a new dataframe with the full date range
1186
1631
  full_date_df = pd.DataFrame({date_column: full_date_range})
1187
-
1632
+
1188
1633
  # Merge the original dataframe with the new full date range dataframe
1189
- df_full = full_date_df.merge(df, on=date_column, how='left')
1190
-
1634
+ df_full = full_date_df.merge(df, on=date_column, how="left")
1635
+
1191
1636
  # Fill missing values with 0
1192
1637
  df_full.fillna(0, inplace=True)
1193
-
1638
+
1194
1639
  return df_full
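
A usage sketch (same import assumptions); the range runs from the column's min to its max at the chosen frequency, and gaps are zero-filled.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"week": pd.to_datetime(["2024-01-01", "2024-01-15"]),
                       "spend": [10, 30]})
    full = dp.fill_weekly_date_range(df, "week", freq="W-MON")
    # inserts the missing 2024-01-08 row with spend == 0
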
1195
-
1196
- def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
1640
+
1641
+ def add_prefix_and_suffix(self, df, prefix="", suffix="", date_col=None):
1197
1642
  """
1198
1643
  Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.
1199
1644
 
@@ -1205,19 +1650,28 @@ class dataprocessing:
1205
1650
 
1206
1651
  Returns:
1207
1652
  pd.DataFrame: The DataFrame with updated column names.
1653
+
1208
1654
  """
1209
-
1210
1655
  # If there is no date column
1211
1656
  if date_col is None:
1212
1657
  # Add prefixes and suffixes to all columns
1213
1658
  df.columns = [prefix + col + suffix for col in df.columns]
1214
1659
  else:
1215
1660
  # Add prefixes and suffixes to all columns except the date column
1216
- df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]
1217
-
1661
+ df.columns = [
1662
+ prefix + col + suffix if col != date_col else col for col in df.columns
1663
+ ]
1664
+
1218
1665
  return df
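
A usage sketch (same import assumptions):

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"OBS": ["2024-01-01"], "spend": [10]})
    df = dp.add_prefix_and_suffix(df, prefix="media_", suffix="_gbp", date_col="OBS")
    # columns become ['OBS', 'media_spend_gbp']; the date column is left alone
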
1219
1666
 
1220
- def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
1667
+ def create_dummies(
1668
+ self,
1669
+ df,
1670
+ date_col=None,
1671
+ dummy_threshold=0,
1672
+ add_total_dummy_col="No",
1673
+ total_col_name="total",
1674
+ ):
1221
1675
  """
1222
1676
  Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
1223
1677
  Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
@@ -1231,13 +1685,15 @@ class dataprocessing:
1231
1685
 
1232
1686
  Returns:
1233
1687
  pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
1234
- """
1235
1688
 
1689
+ """
1236
1690
  # If there is no date column
1237
1691
  if date_col is None:
1238
- df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
1692
+ df = df.apply(
1693
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
1694
+ )
1239
1695
 
1240
- if add_total_dummy_col != 'No':
1696
+ if add_total_dummy_col != "No":
1241
1697
  # Find max value of rows
1242
1698
  df[total_col_name] = df.max(axis=1)
1243
1699
 
@@ -1245,18 +1701,25 @@ class dataprocessing:
1245
1701
  else:
1246
1702
  # Create dummies for all columns except the date column
1247
1703
  df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
1248
- lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
1704
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
1249
1705
  )
1250
1706
 
1251
- if add_total_dummy_col != 'No':
1707
+ if add_total_dummy_col != "No":
1252
1708
  # Find max value of rows
1253
1709
  df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)
1254
1710
 
1255
1711
  return df
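
A usage sketch (same import assumptions); values above dummy_threshold become 1, everything else 0, and the optional total column is the row-wise max of the dummies.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"OBS": ["w1", "w2"], "tv": [0, 250], "radio": [80, 0]})
    df = dp.create_dummies(df, date_col="OBS", add_total_dummy_col="Yes")
    # tv -> [0, 1], radio -> [1, 0], total -> [1, 1]
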
1256
1712
 
1257
- def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
1713
+ def replace_substrings(
1714
+ self,
1715
+ df,
1716
+ column,
1717
+ replacements,
1718
+ to_lower=False,
1719
+ new_column=None,
1720
+ ):
1258
1721
  """
1259
- Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1722
+ Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1260
1723
  Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.
1261
1724
 
1262
1725
  Args:
@@ -1268,6 +1731,7 @@ class dataprocessing:
1268
1731
 
1269
1732
  Returns:
1270
1733
  pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
1734
+
1271
1735
  """
1272
1736
  if new_column is not None:
1273
1737
  # Create a new column for replacements
@@ -1287,7 +1751,7 @@ class dataprocessing:
1287
1751
 
1288
1752
  return df
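
A usage sketch (same import assumptions), following the docstring's contract for new_column and to_lower:

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"campaign": ["TV_Brand", "Radio_Promo"]})
    df = dp.replace_substrings(df, "campaign", {"_": " "}, to_lower=True,
                               new_column="campaign_clean")
    # campaign_clean -> ['tv brand', 'radio promo']
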
1289
1753
 
1290
- def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
1754
+ def add_total_column(self, df, exclude_col=None, total_col_name="Total"):
1291
1755
  """
1292
1756
  Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.
1293
1757
 
@@ -1298,17 +1762,27 @@ class dataprocessing:
1298
1762
 
1299
1763
  Returns:
1300
1764
  pd.DataFrame: The DataFrame with an added total column.
1765
+
1301
1766
  """
1302
1767
  if exclude_col and exclude_col in df.columns:
1303
1768
  # Ensure the column to exclude exists before dropping
1304
- df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1769
+ df[total_col_name] = df.drop(columns=[exclude_col], errors="ignore").sum(
1770
+ axis=1,
1771
+ )
1305
1772
  else:
1306
1773
  # Sum across all columns if no column is specified to exclude
1307
1774
  df[total_col_name] = df.sum(axis=1)
1308
-
1775
+
1309
1776
  return df
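
A usage sketch (same import assumptions):

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"OBS": ["w1"], "tv": [100], "radio": [50]})
    df = dp.add_total_column(df, exclude_col="OBS")
    # df['Total'] == 150, summing every column except 'OBS'
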
1310
1777
 
1311
- def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
1778
+ def apply_lookup_table_based_on_substring(
1779
+ self,
1780
+ df,
1781
+ column_name,
1782
+ category_dict,
1783
+ new_col_name="Category",
1784
+ other_label="Other",
1785
+ ):
1312
1786
  """
1313
1787
  Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
1314
1788
 
@@ -1321,6 +1795,7 @@ class dataprocessing:
1321
1795
 
1322
1796
  Returns:
1323
1797
  pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
1798
+
1324
1799
  """
1325
1800
 
1326
1801
  def categorize_text(text):
@@ -1331,11 +1806,14 @@ class dataprocessing:
1331
1806
  text (str): The text string to categorize.
1332
1807
 
1333
1808
  Returns:
1334
- str: The category assigned based on the first matching substring found in the text. If no
1809
+ str: The category assigned based on the first matching substring found in the text. If no
1335
1810
  matching substring is found, returns other_label.
1811
+
1336
1812
  """
1337
1813
  for key, category in category_dict.items():
1338
- if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
1814
+ if (
1815
+ key.lower() in text.lower()
1816
+ ): # Check if the substring is in the text (case-insensitive)
1339
1817
  return category
1340
1818
  return other_label # Default category if no match is found
1341
1819
 
@@ -1354,6 +1832,7 @@ class dataprocessing:
1354
1832
 
1355
1833
  Returns:
1356
1834
  tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
1835
+
1357
1836
  """
1358
1837
  # Ensure date columns are in datetime format
1359
1838
  df1[date_col] = pd.to_datetime(df1[date_col])
@@ -1368,29 +1847,43 @@ class dataprocessing:
1368
1847
  df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
1369
1848
 
1370
1849
  # Merge the DataFrames on the date column
1371
- merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
1850
+ merged_df = pd.merge(
1851
+ df1_overlap,
1852
+ df2_overlap,
1853
+ on=date_col,
1854
+ suffixes=("_df1", "_df2"),
1855
+ )
1372
1856
 
1373
1857
  # Get common columns, excluding the date column
1374
- common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
1858
+ common_cols = [
1859
+ col for col in df1.columns if col != date_col and col in df2.columns
1860
+ ]
1375
1861
 
1376
1862
  # Create a DataFrame for differences
1377
1863
  diff_df = pd.DataFrame({date_col: merged_df[date_col]})
1378
1864
 
1379
1865
  total_diff_list = []
1380
1866
  for col in common_cols:
1381
- diff_col = f'diff_{col}'
1382
- diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
1867
+ diff_col = f"diff_{col}"
1868
+ diff_df[diff_col] = (
1869
+ merged_df[f"{col}_df1"] - merged_df[f"{col}_df2"]
1870
+ ) # Corrected subtraction order
1383
1871
 
1384
1872
  # Sum differences for the column
1385
1873
  total_diff = diff_df[diff_col].sum()
1386
- total_diff_list.append({'Column': col, 'Total Difference': total_diff})
1874
+ total_diff_list.append({"Column": col, "Total Difference": total_diff})
1387
1875
 
1388
1876
  # Create summary DataFrame
1389
1877
  total_diff_df = pd.DataFrame(total_diff_list)
1390
1878
 
1391
1879
  return diff_df, total_diff_df
1392
1880
 
1393
- def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
1881
+ def week_commencing_2_week_commencing_conversion_isoweekday(
1882
+ self,
1883
+ df,
1884
+ date_col,
1885
+ week_commencing="mon",
1886
+ ):
1394
1887
  """
1395
1888
  Convert a DataFrame's date column so that each date is mapped back
1396
1889
  to the 'week_commencing' day of the *current ISO week*.
@@ -1398,7 +1891,7 @@ class dataprocessing:
1398
1891
  Args:
1399
1892
  df (pandas.DataFrame): The DataFrame with date-based data.
1400
1893
  date_col (str): The name of the date column.
1401
- week_commencing (str): The desired start of the week.
1894
+ week_commencing (str): The desired start of the week.
1402
1895
  ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
1403
1896
  Uses ISO day numbering (Mon=1, ..., Sun=7).
1404
1897
 
@@ -1406,9 +1899,18 @@ class dataprocessing:
1406
1899
  pandas.DataFrame: Original DataFrame with an extra column
1407
1900
  'week_start_<week_commencing>' containing the
1408
1901
  start-of-week date for each row.
1902
+
1409
1903
  """
1410
1904
  # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
1411
- iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
1905
+ iso_day_dict = {
1906
+ "mon": 1,
1907
+ "tue": 2,
1908
+ "wed": 3,
1909
+ "thur": 4,
1910
+ "fri": 5,
1911
+ "sat": 6,
1912
+ "sun": 7,
1913
+ }
1412
1914
 
1413
1915
  target_day = iso_day_dict[week_commencing]
1414
1916
 
@@ -1419,15 +1921,23 @@ class dataprocessing:
1419
1921
  # Apply the transformation
1420
1922
  new_col = f"week_start_{week_commencing}"
1421
1923
  df[new_col] = df[date_col].apply(map_to_week_start)
1422
-
1924
+
1423
1925
  return df
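
A usage sketch (same import assumptions); per the docstring, a Wednesday maps back to the Monday of its ISO week when week_commencing='mon'.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"date": pd.to_datetime(["2024-01-03"])})  # a Wednesday
    df = dp.week_commencing_2_week_commencing_conversion_isoweekday(
        df, "date", week_commencing="mon")
    # df['week_start_mon'] -> [Timestamp('2024-01-01')]
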
1424
-
1425
- def seasonality_feature_extraction(self, df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False):
1926
+
1927
+ def seasonality_feature_extraction(
1928
+ self,
1929
+ df,
1930
+ kpi_var,
1931
+ n_features=10,
1932
+ test_size=0.1,
1933
+ random_state=42,
1934
+ shuffle=False,
1935
+ ):
1426
1936
  """
1427
1937
  1) Uses the provided dataframe (df), where:
1428
1938
  - df['kpi_total_sales'] is the target (y).
1429
1939
  - df['OBS'] is a date or index column (excluded from features).
1430
-
1940
+
1431
1941
  2) Splits data into train/test using the specified test_size, random_state, and shuffle.
1432
1942
  3) Trains XGBoost and Random Forest on all features.
1433
1943
  4) Extracts the top n_features from each model.
@@ -1457,20 +1967,22 @@ class dataprocessing:
1457
1967
  - "combined_features": merged unique feature list
1458
1968
  - "performance": dictionary of performance metrics
1459
1969
  - "models": dictionary of fitted models
1970
+
1460
1971
  """
1461
1972
  # ---------------------------------------------------------------------
1462
1973
  # 1. Prepare your data (X, y)
1463
1974
  # ---------------------------------------------------------------------
1464
1975
  # Extract target and features
1465
1976
  y = df[kpi_var]
1466
- X = df.drop(columns=['OBS', kpi_var])
1977
+ X = df.drop(columns=["OBS", kpi_var])
1467
1978
 
1468
1979
  # Split into train/test
1469
1980
  X_train, X_test, y_train, y_test = train_test_split(
1470
- X, y,
1981
+ X,
1982
+ y,
1471
1983
  test_size=test_size,
1472
1984
  random_state=random_state,
1473
- shuffle=shuffle
1985
+ shuffle=shuffle,
1474
1986
  )
1475
1987
 
1476
1988
  # ---------------------------------------------------------------------
@@ -1483,16 +1995,13 @@ class dataprocessing:
1483
1995
  # (B) Get feature importances
1484
1996
  xgb_importances = xgb_model_full.feature_importances_
1485
1997
  xgb_feat_importance_df = (
1486
- pd.DataFrame({
1487
- 'feature': X.columns,
1488
- 'importance': xgb_importances
1489
- })
1490
- .sort_values('importance', ascending=False)
1998
+ pd.DataFrame({"feature": X.columns, "importance": xgb_importances})
1999
+ .sort_values("importance", ascending=False)
1491
2000
  .reset_index(drop=True)
1492
2001
  )
1493
2002
 
1494
2003
  # (C) Select top N features
1495
- top_features_xgb = xgb_feat_importance_df['feature'].head(n_features).tolist()
2004
+ top_features_xgb = xgb_feat_importance_df["feature"].head(n_features).tolist()
1496
2005
 
1497
2006
  # (D) Subset data to top N features
1498
2007
  X_train_xgb_topN = X_train[top_features_xgb]
@@ -1510,16 +2019,13 @@ class dataprocessing:
1510
2019
  # (B) Get feature importances
1511
2020
  rf_importances = rf_model_full.feature_importances_
1512
2021
  rf_feat_importance_df = (
1513
- pd.DataFrame({
1514
- 'feature': X.columns,
1515
- 'importance': rf_importances
1516
- })
1517
- .sort_values('importance', ascending=False)
2022
+ pd.DataFrame({"feature": X.columns, "importance": rf_importances})
2023
+ .sort_values("importance", ascending=False)
1518
2024
  .reset_index(drop=True)
1519
2025
  )
1520
2026
 
1521
2027
  # (C) Select top N features
1522
- top_features_rf = rf_feat_importance_df['feature'].head(n_features).tolist()
2028
+ top_features_rf = rf_feat_importance_df["feature"].head(n_features).tolist()
1523
2029
 
1524
2030
  # (D) Subset data to top N features
1525
2031
  X_train_rf_topN = X_train[top_features_rf]
@@ -1551,25 +2057,45 @@ class dataprocessing:
1551
2057
 
1552
2058
  return output
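
A usage sketch (same import assumptions) on synthetic weekly data; xgboost and scikit-learn must be installed, as imported at the top of this module.

    import numpy as np
    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    rng = np.random.default_rng(0)
    df = pd.DataFrame({"OBS": pd.date_range("2022-01-03", periods=104, freq="W-MON"),
                       "kpi_total_sales": rng.normal(1000, 50, 104),
                       **{f"seas_{i}": rng.random(104) for i in range(20)}})
    result = dp.seasonality_feature_extraction(df, "kpi_total_sales", n_features=5)
    # result["combined_features"] merges the top-5 lists from XGBoost and Random Forest
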
1553
2059
 
1554
- def quid_pr (self, df):
2060
+ def quid_pr(self, df):
1555
2061
  def convert_date(date_str):
1556
2062
  try:
1557
- return datetime.strptime(date_str, '%b %d, %Y')
2063
+ return datetime.strptime(date_str, "%b %d, %Y")
1558
2064
  except ValueError:
1559
2065
  return None # Return None if conversion fails
2066
+
1560
2067
  # Apply conversion to create new columns
1561
- df['Start Date'] = df['Earliest Published'].astype(str).apply(convert_date)
1562
- df['End Date'] = df['Latest Published'].astype(str).apply(convert_date)
1563
- df['Days Duration'] = (df['End Date'] - df['Start Date']).dt.days + 1 # Ensure inclusive range
1564
- df['Count per Day'] = df['Published Count'] / df['Days Duration'] # Calculate count per day
1565
- df['Social Engagement per Day'] = df['Social Engagement'] / df['Days Duration']
1566
- df['Week Start'] = df['Start Date'].apply(lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None)
1567
- count_df = df.groupby('Week Start')['Count per Day'].sum().reset_index()
1568
- total_engagement_per_company = df.groupby('Company (Primary Mention)')['Social Engagement'].sum().reset_index() # Caluclates Social Engagement across whole period
1569
- valid_companies = total_engagement_per_company[total_engagement_per_company['Social Engagement'] > 0]['Company (Primary Mention)'] # Filters out Companies with no Social Engagement
1570
- social_engagement_df = df[df['Company (Primary Mention)'].isin(valid_companies)].groupby(['Week Start', 'Company (Primary Mention)'])[
1571
- 'Social Engagement'
1572
- ].sum().reset_index()
1573
- total_social_engagement_df = df.groupby('Week Start')['Social Engagement per Day'].sum().reset_index()
1574
-
1575
- return count_df, total_social_engagement_df, social_engagement_df
2068
+ df["Start Date"] = df["Earliest Published"].astype(str).apply(convert_date)
2069
+ df["End Date"] = df["Latest Published"].astype(str).apply(convert_date)
2070
+ df["Days Duration"] = (
2071
+ df["End Date"] - df["Start Date"]
2072
+ ).dt.days + 1 # Ensure inclusive range
2073
+ df["Count per Day"] = (
2074
+ df["Published Count"] / df["Days Duration"]
2075
+ ) # Calculate count per day
2076
+ df["Social Engagement per Day"] = df["Social Engagement"] / df["Days Duration"]
2077
+ df["Week Start"] = df["Start Date"].apply(
2078
+ lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None,
2079
+ )
2080
+ count_df = df.groupby("Week Start")["Count per Day"].sum().reset_index()
2081
+ total_engagement_per_company = (
2082
+ df.groupby("Company (Primary Mention)")["Social Engagement"]
2083
+ .sum()
2084
+ .reset_index()
2085
+ ) # Calculates Social Engagement across whole period
2086
+ valid_companies = total_engagement_per_company[
2087
+ total_engagement_per_company["Social Engagement"] > 0
2088
+ ][
2089
+ "Company (Primary Mention)"
2090
+ ] # Filters out Companies with no Social Engagement
2091
+ social_engagement_df = (
2092
+ df[df["Company (Primary Mention)"].isin(valid_companies)]
2093
+ .groupby(["Week Start", "Company (Primary Mention)"])["Social Engagement"]
2094
+ .sum()
2095
+ .reset_index()
2096
+ )
2097
+ total_social_engagement_df = (
2098
+ df.groupby("Week Start")["Social Engagement per Day"].sum().reset_index()
2099
+ )
2100
+
2101
+ return count_df, total_social_engagement_df, social_engagement_df
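
A usage sketch (same import assumptions); the column names below are the ones the function itself indexes, with dates in '%b %d, %Y' form.

    import pandas as pd
    from imsciences.mmm import dataprocessing

    dp = dataprocessing()
    df = pd.DataFrame({"Earliest Published": ["Jan 01, 2024"],
                       "Latest Published": ["Jan 07, 2024"],
                       "Published Count": [14],
                       "Social Engagement": [70],
                       "Company (Primary Mention)": ["Acme"]})
    counts, total_eng, eng_by_co = dp.quid_pr(df)
    # a 7-day spread -> 'Count per Day' of 2, aggregated weekly on 'Week Start'
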