imsciences 0.9.6.9__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/mmm.py CHANGED
@@ -1,93 +1,152 @@
- import pandas as pd
  import calendar
+ import json
  import os
- import numpy as np
- import re
- from datetime import datetime, timedelta
  import subprocess
- import json
- from sklearn.model_selection import train_test_split
+ from datetime import datetime, timedelta
+
+ import numpy as np
+ import pandas as pd
  import xgboost as xgb
  from sklearn.ensemble import RandomForestRegressor
+ from sklearn.model_selection import train_test_split
+

  class dataprocessing:
-
      def help(self):
-
          print("\n1. get_wd_levels")
-         print(" - Description: Get the working directory with the option of moving up parents.")
+         print(
+             " - Description: Get the working directory with the option of moving up parents.",
+         )
          print(" - Usage: get_wd_levels(levels)")
          print(" - Example: get_wd_levels(0)")

          print("\n2. aggregate_daily_to_wc_long")
-         print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
-         print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
-         print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")
+         print(
+             " - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.",
+         )
+         print(
+             " - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')",
+         )
+         print(
+             " - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')",
+         )

          print("\n3. convert_monthly_to_daily")
-         print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
+         print(
+             " - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.",
+         )
          print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
          print(" - Example: convert_monthly_to_daily(df, 'date')")

          print("\n4. week_of_year_mapping")
-         print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
+         print(
+             " - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.",
+         )
          print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
          print(" - Example: week_of_year_mapping(df, 'week', 'mon')")

          print("\n5. rename_cols")
-         print(" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.")
+         print(
+             " - Description: Renames columns in a pandas DataFrame with a specified prefix or format.",
+         )
          print(" - Usage: rename_cols(df, name='ame_')")
          print(" - Example: rename_cols(df, 'ame_facebook')")

          print("\n6. merge_new_and_old")
-         print(" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.")
-         print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
-         print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")
+         print(
+             " - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.",
+         )
+         print(
+             " - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')",
+         )
+         print(
+             " - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')",
+         )

          print("\n7. merge_dataframes_on_column")
          print(" - Description: Merge a list of DataFrames on a common column.")
-         print(" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')")
-         print(" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')")
+         print(
+             " - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')",
+         )
+         print(
+             " - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')",
+         )

          print("\n8. merge_and_update_dfs")
-         print(" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.")
+         print(
+             " - Description: Merges two dataframes, updating columns from the second dataframe where values are available.",
+         )
          print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
-         print(" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")
+         print(
+             " - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')",
+         )

          print("\n9. convert_us_to_uk_dates")
-         print(" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.")
+         print(
+             " - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.",
+         )
          print(" - Usage: convert_us_to_uk_dates(df, date_col)")
          print(" - Example: convert_us_to_uk_dates(df, 'date')")

          print("\n10. combine_sheets")
-         print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
+         print(
+             " - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.",
+         )
          print(" - Usage: combine_sheets(all_sheets)")
          print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")

          print("\n11. pivot_table")
-         print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
-         print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')")
-         print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)")
+         print(
+             " - Description: Dynamically pivots a DataFrame based on specified columns.",
+         )
+         print(
+             " - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')",
+         )
+         print(
+             " - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)",
+         )

          print("\n12. apply_lookup_table_for_columns")
-         print(" - Description: Maps substrings in columns to new values based on a dictionary.")
-         print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
-         print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
+         print(
+             " - Description: Maps substrings in columns to new values based on a dictionary.",
+         )
+         print(
+             " - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')",
+         )
+         print(
+             " - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')",
+         )

          print("\n13. aggregate_daily_to_wc_wide")
-         print(" - Description: Aggregates daily data into weekly data and pivots it to wide format.")
-         print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)")
-         print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)")
+         print(
+             " - Description: Aggregates daily data into weekly data and pivots it to wide format.",
+         )
+         print(
+             " - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)",
+         )
+         print(
+             " - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)",
+         )

          print("\n14. merge_cols_with_seperator")
-         print(" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.")
-         print(" - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')")
-         print(" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')")
+         print(
+             " - Description: Merges multiple columns in a DataFrame into one column with a specified separator.",
+         )
+         print(
+             " - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')",
+         )
+         print(
+             " - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')",
+         )

          print("\n15. check_sum_of_df_cols_are_equal")
-         print(" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.")
+         print(
+             " - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.",
+         )
          print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
-         print(" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')")
+         print(
+             " - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')",
+         )

          print("\n16. convert_2_df_cols_to_dict")
          print(" - Description: Creates a dictionary from two DataFrame columns.")
@@ -95,128 +154,229 @@ class dataprocessing:
          print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")

          print("\n17. create_FY_and_H_columns")
-         print(" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.")
-         print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')")
-         print(" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')")
+         print(
+             " - Description: Adds financial year and half-year columns to a DataFrame based on a start date.",
+         )
+         print(
+             " - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')",
+         )
+         print(
+             " - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')",
+         )

          print("\n18. keyword_lookup_replacement")
-         print(" - Description: Updates values in a column based on a lookup dictionary with conditional logic.")
-         print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')")
-         print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')")
+         print(
+             " - Description: Updates values in a column based on a lookup dictionary with conditional logic.",
+         )
+         print(
+             " - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')",
+         )
+         print(
+             " - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')",
+         )

          print("\n19. create_new_version_of_col_using_LUT")
-         print(" - Description: Creates a new column based on a lookup table applied to an existing column.")
-         print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
-         print(" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)")
+         print(
+             " - Description: Creates a new column based on a lookup table applied to an existing column.",
+         )
+         print(
+             " - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')",
+         )
+         print(
+             " - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)",
+         )

          print("\n20. convert_df_wide_2_long")
-         print(" - Description: Converts a wide-format DataFrame into a long-format DataFrame.")
-         print(" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')")
-         print(" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')")
+         print(
+             " - Description: Converts a wide-format DataFrame into a long-format DataFrame.",
+         )
+         print(
+             " - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')",
+         )
+         print(
+             " - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')",
+         )

          print("\n21. manually_edit_data")
-         print(" - Description: Manually updates specified cells in a DataFrame based on filters.")
-         print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
-         print(" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')")
+         print(
+             " - Description: Manually updates specified cells in a DataFrame based on filters.",
+         )
+         print(
+             " - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)",
+         )
+         print(
+             " - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')",
+         )

          print("\n22. format_numbers_with_commas")
-         print(" - Description: Formats numerical columns with commas and a specified number of decimal places.")
+         print(
+             " - Description: Formats numerical columns with commas and a specified number of decimal places.",
+         )
          print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
          print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")

          print("\n23. filter_df_on_multiple_conditions")
-         print(" - Description: Filters a DataFrame based on multiple column conditions.")
+         print(
+             " - Description: Filters a DataFrame based on multiple column conditions.",
+         )
          print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
-         print(" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})")
+         print(
+             " - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})",
+         )

          print("\n24. read_and_concatenate_files")
-         print(" - Description: Reads and concatenates files from a specified folder into a single DataFrame.")
+         print(
+             " - Description: Reads and concatenates files from a specified folder into a single DataFrame.",
+         )
          print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
-         print(" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')")
+         print(
+             " - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')",
+         )

          print("\n25. upgrade_outdated_packages")
-         print(" - Description: Upgrades all outdated Python packages except specified ones.")
+         print(
+             " - Description: Upgrades all outdated Python packages except specified ones.",
+         )
          print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
-         print(" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])")
+         print(
+             " - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])",
+         )

          print("\n26. convert_mixed_formats_dates")
-         print(" - Description: Converts mixed-format date columns into standardized datetime format.")
+         print(
+             " - Description: Converts mixed-format date columns into standardized datetime format.",
+         )
          print(" - Usage: convert_mixed_formats_dates(df, column_name)")
          print(" - Example: convert_mixed_formats_dates(df, 'date_col')")

          print("\n27. fill_weekly_date_range")
-         print(" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.")
+         print(
+             " - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.",
+         )
          print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
          print(" - Example: fill_weekly_date_range(df, 'date_col')")

          print("\n28. add_prefix_and_suffix")
-         print(" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.")
-         print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
-         print(" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')")
+         print(
+             " - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.",
+         )
+         print(
+             " - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)",
+         )
+         print(
+             " - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')",
+         )

          print("\n29. create_dummies")
-         print(" - Description: Creates dummy variables for columns, with an option to add a total dummy column.")
-         print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
-         print(" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)")
+         print(
+             " - Description: Creates dummy variables for columns, with an option to add a total dummy column.",
+         )
+         print(
+             " - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')",
+         )
+         print(
+             " - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)",
+         )

          print("\n30. replace_substrings")
-         print(" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.")
-         print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
-         print(" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')")
+         print(
+             " - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.",
+         )
+         print(
+             " - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)",
+         )
+         print(
+             " - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')",
+         )

          print("\n31. add_total_column")
-         print(" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.")
-         print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
+         print(
+             " - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.",
+         )
+         print(
+             " - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')",
+         )
          print(" - Example: add_total_column(df, exclude_col='date_col')")

          print("\n32. apply_lookup_table_based_on_substring")
-         print(" - Description: Categorizes text in a column using a lookup table based on substrings.")
-         print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
-         print(" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})")
+         print(
+             " - Description: Categorizes text in a column using a lookup table based on substrings.",
+         )
+         print(
+             " - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')",
+         )
+         print(
+             " - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})",
+         )

          print("\n33. compare_overlap")
-         print(" - Description: Compares overlapping periods between two DataFrames and summarizes differences.")
+         print(
+             " - Description: Compares overlapping periods between two DataFrames and summarizes differences.",
+         )
          print(" - Usage: compare_overlap(df1, df2, date_col)")
          print(" - Example: compare_overlap(df1, df2, 'date_col')")

          print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
-         print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
-         print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
-         print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
-
+         print(
+             " - Description: Maps dates to the start of the current ISO week based on a specified weekday.",
+         )
+         print(
+             " - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')",
+         )
+         print(
+             " - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')",
+         )
+
          print("\n35. seasonality_feature_extraction")
-         print(" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.")
-         print(" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)")
-         print(" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)")
+         print(
+             " - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.",
+         )
+         print(
+             " - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)",
+         )
+         print(
+             " - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)",
+         )

      def get_wd_levels(self, levels):
          """
          Gets the current wd of whoever is working on it and gives the options to move the number of levels up.

-         Parameters:
+         Parameters
+         ----------
          - data_frame: pandas DataFrame
              The input data frame.
          - num_rows_to_remove: int
              The number of levels to move up pathways.

-         Returns:
+         Returns
+         -------
          - Current wd
-         """

+         """
          directory = os.getcwd()
          for _ in range(levels):
              directory = os.path.dirname(directory)
          return directory
-
-     def aggregate_daily_to_wc_long(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum') -> pd.DataFrame:
+
+     def aggregate_daily_to_wc_long(
+         self,
+         df: pd.DataFrame,
+         date_column: str,
+         group_columns: list[str],
+         sum_columns: list[str],
+         wc: str = "sun",
+         aggregation: str = "sum",
+     ) -> pd.DataFrame:
          """
-         Aggregates daily data into weekly data, starting on a specified day of the week,
-         and groups the data by additional specified columns. It aggregates specified numeric columns
-         by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-         of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+         Aggregates daily data into weekly data, starting on a specified day of the week,
+         and groups the data by additional specified columns. It aggregates specified numeric columns
+         by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+         of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
          The day column is renamed from 'Day' to 'OBS'.

-         Parameters:
+         Parameters
+         ----------
          - df: pandas DataFrame
              The input DataFrame containing daily data.
          - date_column: string
@@ -230,18 +390,21 @@ class dataprocessing:
          - aggregation: string, optional (default 'sum')
              Aggregation method, either 'sum', 'average', or 'count'.

-         Returns:
+         Returns
+         -------
          - pandas DataFrame
              A new DataFrame with weekly aggregated data. The index is reset,
-             and columns represent the grouped and aggregated metrics. The DataFrame
-             is in long format, with separate columns for each combination of
+             and columns represent the grouped and aggregated metrics. The DataFrame
+             is in long format, with separate columns for each combination of
              grouped metrics.
-         """

+         """
          # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
-         days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
+         days = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
          if wc.lower() not in days:
-             return print(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")
+             return print(
+                 f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).",
+             )

          start_day = days[wc.lower()]

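
Everything in `aggregate_daily_to_wc_long` is a mechanical reformat (double quotes, wrapped calls); behaviour is unchanged, including the quirk that an invalid `wc` still returns `print(...)`, i.e. `None`, rather than raising. A usage sketch with illustrative data, assuming the import path from this diff:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),
    "platform": ["meta"] * 7 + ["tiktok"] * 7,
    "cost": range(14),
})
# One row per (week-start 'OBS', platform), with cost summed per week.
weekly = dp.aggregate_daily_to_wc_long(daily, "date", ["platform"], ["cost"], wc="mon")
print(weekly)
```
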
@@ -252,26 +415,40 @@ class dataprocessing:
          df_copy[date_column] = pd.to_datetime(df_copy[date_column])

          # Determine the start of each week
-         df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))
+         df_copy["week_start"] = df_copy[date_column].apply(
+             lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7),
+         )

          # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
          for col in sum_columns:
-             df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
+             df_copy[col] = pd.to_numeric(df_copy[col], errors="coerce").fillna(0)

          # Group by the new week start column and additional columns, then aggregate the numeric columns
-         if aggregation == 'average':
-             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
-         elif aggregation == 'count':
-             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
+         if aggregation == "average":
+             grouped = (
+                 df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                 .mean()
+                 .reset_index()
+             )
+         elif aggregation == "count":
+             grouped = (
+                 df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                 .count()
+                 .reset_index()
+             )
          else: # Default to 'sum' if any other value is provided
-             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()
+             grouped = (
+                 df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+                 .sum()
+                 .reset_index()
+             )

          # Rename 'week_start' column to 'OBS'
-         grouped = grouped.rename(columns={'week_start': 'OBS'})
+         grouped = grouped.rename(columns={"week_start": "OBS"})

          return grouped
-
-     def convert_monthly_to_daily(self, df, date_column, divide = True):
+
+     def convert_monthly_to_daily(self, df, date_column, divide=True):
          """
          Convert a DataFrame with monthly data to daily data.
          This function takes a DataFrame and a date column, then it expands each
@@ -282,7 +459,6 @@ class dataprocessing:
          :param divide: boolean divide by the number of days in a month (default True)
          :return: A new DataFrame with daily data.
          """
-
          # Convert date_column to datetime
          df[date_column] = pd.to_datetime(df[date_column])

@@ -292,7 +468,10 @@ class dataprocessing:
          # Iterate over each row in the DataFrame
          for _, row in df.iterrows():
              # Calculate the number of days in the month
-             num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]
+             num_days = calendar.monthrange(
+                 row[date_column].year,
+                 row[date_column].month,
+             )[1]

              # Create a new record for each day of the month
              for day in range(1, num_days + 1):
@@ -304,32 +483,41 @@ class dataprocessing:
                      if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
                          if divide is True:
                              daily_row[col] = row[col] / num_days
-                         else:
+                         else:
                              daily_row[col] = row[col]
                  daily_records.append(daily_row)

          # Convert the list of daily records into a DataFrame
          daily_df = pd.DataFrame(daily_records)
-
+
          return daily_df
-
-     def week_of_year_mapping(self,df, week_col, start_day_str):

+     def week_of_year_mapping(self, df, week_col, start_day_str):
          # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
          day_mapping = {
-             'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
+             "mon": 1,
+             "tue": 2,
+             "wed": 3,
+             "thu": 4,
+             "fri": 5,
+             "sat": 6,
+             "sun": 7,
          }

          # Convert the day string to a number, or raise an error if not valid
          start_day = day_mapping.get(start_day_str.lower())
          if start_day is None:
-             raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")
+             raise ValueError(
+                 f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.",
+             )

          # Function to convert week number to start date of the week
          def week_to_startdate(week_str, start_day):
-             year, week = map(int, week_str.split('-W'))
+             year, week = map(int, week_str.split("-W"))
              first_day_of_year = datetime(year, 1, 1)
-             first_weekday_of_year = first_day_of_year.weekday()  # Monday is 0 and Sunday is 6
+             first_weekday_of_year = (
+                 first_day_of_year.weekday()
+             )  # Monday is 0 and Sunday is 6

              # Calculate days to adjust to the desired start day of the week
              days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
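
`convert_monthly_to_daily` is likewise only rewrapped. With `divide=True` (the default) each numeric value is split evenly across the days of its month, so monthly totals are preserved. A usage sketch:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
monthly = pd.DataFrame({
    "date": pd.to_datetime(["2024-01-01", "2024-02-01"]),
    "spend": [3100.0, 2900.0],
})
daily = dp.convert_monthly_to_daily(monthly, "date")
print(len(daily))            # 31 + 29 = 60 rows (2024 is a leap year)
print(daily["spend"].sum())  # ~6000.0: each value divided by days in month
```
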
@@ -340,25 +528,38 @@ class dataprocessing:
              return start_of_week

          # Apply the function to each row in the specified week column
-         df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
+         df["OBS"] = (
+             df[week_col]
+             .apply(lambda x: week_to_startdate(x, start_day))
+             .dt.strftime("%d/%m/%Y")
+         )
          return df
-
-     def rename_cols(self, df, name = 'ame_'):
+
+     def rename_cols(self, df, name="ame_"):
          new_columns = {}
          for col in df.columns:
-             if col != 'OBS':
+             if col != "OBS":
                  new_col_name = name + col.replace(" ", "_").lower()
              else:
                  new_col_name = col
              new_columns[col] = new_col_name
          return df.rename(columns=new_columns)
-
-     def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
+
+     def merge_new_and_old(
+         self,
+         old_df,
+         old_col,
+         new_df,
+         new_col,
+         cutoff_date,
+         date_col_name="OBS",
+     ):
          """
          Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
          Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.

-         Parameters:
+         Parameters
+         ----------
          - old_df: pandas DataFrame
              The old DataFrame from which to take the numeric values up to the specified date.
          - old_col: str
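
`week_of_year_mapping` keeps its logic; note that `week_str.split("-W")` still requires the literal `-W` separator, so the `'yyyy-ww'` variant mentioned in `help()` would fail. A sketch (expected output inferred from the ISO-week arithmetic above):

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
df = pd.DataFrame({"week": ["2024-W01", "2024-W02"], "kpi": [10, 20]})
out = dp.week_of_year_mapping(df, "week", "mon")
print(out["OBS"].tolist())  # e.g. ['01/01/2024', '08/01/2024'] as dd/mm/YYYY
```
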
@@ -372,11 +573,12 @@ class dataprocessing:
          - date_col_name: str, optional (default 'OBS')
              The name of the date column in both DataFrames.

-         Returns:
+         Returns
+         -------
          - pandas DataFrame
              A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
-         """

+         """
          # Convert date columns in both dataframes to datetime for comparison
          old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
          new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
@@ -389,67 +591,93 @@ class dataprocessing:
          new_values = new_df[new_df[date_col_name] > cutoff_date]

          # Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
-         merged_df = pd.DataFrame({
-             'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
-             new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
-         })
+         merged_df = pd.DataFrame(
+             {
+                 "OBS": pd.concat(
+                     [old_values[date_col_name], new_values[date_col_name]],
+                     ignore_index=True,
+                 ),
+                 new_col: pd.concat(
+                     [old_values[old_col], new_values[new_col]],
+                     ignore_index=True,
+                 ),
+             },
+         )

          return merged_df
-
-     def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
+
+     def merge_dataframes_on_column(
+         self,
+         dataframes,
+         common_column="OBS",
+         merge_how="outer",
+     ):
          """
          Merge a list of DataFrames on a common column.

-         Parameters:
+         Parameters
+         ----------
          - dataframes: A list of DataFrames to merge.
          - common_column: The name of the common column to merge on.
          - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').

-         Returns:
+         Returns
+         -------
          - A merged DataFrame.
+
          """
          if not dataframes:
              return None
-
+
          merged_df = dataframes[0] # Start with the first DataFrame

          for df in dataframes[1:]:
              merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)

          # Check if the common column is of datetime dtype
-         if merged_df[common_column].dtype == 'datetime64[ns]':
+         if merged_df[common_column].dtype == "datetime64[ns]":
              merged_df[common_column] = pd.to_datetime(merged_df[common_column])
              merged_df = merged_df.sort_values(by=common_column)
              merged_df = merged_df.fillna(0)
-
+
          return merged_df
-
+
      def merge_and_update_dfs(self, df1, df2, key_column):
          """
          Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
          and returns a dataframe sorted by the key column.

-         Parameters:
+         Parameters
+         ----------
          df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
          df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
          key_column (str): The name of the column to merge and sort by (e.g., 'OBS').

-         Returns:
+         Returns
+         -------
          DataFrame: The merged and updated dataframe.
-         """

+         """
          # Sort both DataFrames by the key column
          df1_sorted = df1.sort_values(by=key_column)
          df2_sorted = df2.sort_values(by=key_column)

          # Perform the full outer merge
-         merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))
+         merged_df = pd.merge(
+             df1_sorted,
+             df2_sorted,
+             on=key_column,
+             how="outer",
+             suffixes=("", "_finalised"),
+         )

          # Update with non-null values from df2
          for column in merged_df.columns:
-             if column.endswith('_finalised'):
-                 original_column = column.replace('_finalised', '')
-                 merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
+             if column.endswith("_finalised"):
+                 original_column = column.replace("_finalised", "")
+                 merged_df.loc[merged_df[column].notnull(), original_column] = (
+                     merged_df.loc[merged_df[column].notnull(), column]
+                 )
                  merged_df.drop(column, axis=1, inplace=True)

          # Sort the merged DataFrame by the key column
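
A usage sketch for `merge_dataframes_on_column`, whose behaviour (outer merge on the key, sort when the key is datetime, `fillna(0)`) is unchanged by this release:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
obs = pd.date_range("2024-01-01", periods=3, freq="W-MON")
df1 = pd.DataFrame({"OBS": obs, "spend": [1.0, 2.0, 3.0]})
df2 = pd.DataFrame({"OBS": obs[1:], "clicks": [30, 40]})
merged = dp.merge_dataframes_on_column([df1, df2])
print(merged)  # week 1 has clicks == 0 after fillna(0)
```
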
@@ -459,25 +687,30 @@ class dataprocessing:
          merged_df.fillna(0, inplace=True)

          return merged_df
-
+
      def convert_us_to_uk_dates(self, df, date_col):
          """
-         Processes the date column of a DataFrame to remove hyphens and slashes,
+         Processes the date column of a DataFrame to remove hyphens and slashes,
          and converts it to a datetime object.
-
-         Parameters:
+
+         Parameters
+         ----------
          df (pd.DataFrame): The DataFrame containing the date column.
          date_col (str): The name of the date column.
-
-         Returns:
+
+         Returns
+         -------
          pd.DataFrame: The DataFrame with the processed date column.
+
          """
-         df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
+         df[date_col] = df[date_col].str.replace(r"[-/]", "", regex=True)
          df[date_col] = pd.to_datetime(
-             df[date_col].str.slice(0, 2) + '/' +
-             df[date_col].str.slice(2, 4) + '/' +
-             df[date_col].str.slice(4, 8),
-             format='%m/%d/%Y'
+             df[date_col].str.slice(0, 2)
+             + "/"
+             + df[date_col].str.slice(2, 4)
+             + "/"
+             + df[date_col].str.slice(4, 8),
+             format="%m/%d/%Y",
          )
          return df

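
`convert_us_to_uk_dates` still strips separators and re-parses strictly as `%m/%d/%Y`; despite the "mixed US and UK" wording, a genuinely day-first string such as `'31/01/2024'` would raise. A sketch:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
df = pd.DataFrame({"date": ["01-31-2024", "02/29/2024"]})
out = dp.convert_us_to_uk_dates(df, "date")
# Month-first parse: [Timestamp('2024-01-31'), Timestamp('2024-02-29')]
print(out["date"].tolist())
```
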
@@ -486,21 +719,40 @@ class dataprocessing:
          Combines multiple DataFrames from a dictionary into a single DataFrame.
          Adds a column 'SheetName' indicating the origin sheet of each row.

-         Parameters:
+         Parameters
+         ----------
          all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.

-         Returns:
+         Returns
+         -------
          DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
+
          """
          combined_df = pd.DataFrame()

          for sheet_name, df in all_sheets.items():
-             df['SheetName'] = sheet_name
+             df["SheetName"] = sheet_name
              combined_df = pd.concat([combined_df, df], ignore_index=True)

          return combined_df
-
-     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=True, week_commencing="W-MON"):
+
+     def pivot_table(
+         self,
+         df,
+         index_col,
+         columns,
+         values_col,
+         filters_dict=None,
+         fill_value=0,
+         aggfunc="sum",
+         margins=False,
+         margins_name="Total",
+         datetime_trans_needed=True,
+         date_format="%Y-%m-%d",
+         reverse_header_order=False,
+         fill_missing_weekly_dates=True,
+         week_commencing="W-MON",
+     ):
          """
          Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

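
A usage sketch for `combine_sheets`, e.g. over the dict returned by `pd.read_excel(..., sheet_name=None)`:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
sheets = {
    "Sheet1": pd.DataFrame({"OBS": [1, 2], "spend": [10, 20]}),
    "Sheet2": pd.DataFrame({"OBS": [3], "spend": [30]}),
}
combined = dp.combine_sheets(sheets)
print(combined["SheetName"].tolist())  # ['Sheet1', 'Sheet1', 'Sheet2']
```
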
@@ -521,6 +773,7 @@ class dataprocessing:

          Returns:
              pandas.DataFrame: The pivot table specified
+
          """
          # Validate inputs
          if index_col not in df.columns:
@@ -544,7 +797,10 @@ class dataprocessing:

          # Ensure index column is in datetime format if needed
          if datetime_trans_needed:
-             df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
+             df_filtered[index_col] = pd.to_datetime(
+                 df_filtered[index_col],
+                 dayfirst=True,
+             )

          # Create the pivot table
          pivoted_df = df_filtered.pivot_table(
@@ -559,7 +815,9 @@ class dataprocessing:
          # Handle column headers
          if isinstance(pivoted_df.columns, pd.MultiIndex):
              pivoted_df.columns = [
-                 "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
+                 "_".join(
+                     reversed(map(str, col)) if reverse_header_order else map(str, col),
+                 )
                  for col in pivoted_df.columns.values
              ]
          else:
@@ -570,7 +828,10 @@ class dataprocessing:

          # Handle sorting and formatting of index column
          if datetime_trans_needed:
-             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+             pivoted_df[index_col] = pd.to_datetime(
+                 pivoted_df[index_col],
+                 errors="coerce",
+             )
              pivoted_df.sort_values(by=index_col, inplace=True)
              pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)

@@ -579,56 +840,75 @@ class dataprocessing:

          # Fill missing weekly dates if specified
          if fill_missing_weekly_dates:
-             pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
+             pivoted_df = self.fill_weekly_date_range(
+                 pivoted_df,
+                 index_col,
+                 freq=week_commencing,
+             )

          return pivoted_df

-     def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
+     def apply_lookup_table_for_columns(
+         df,
+         col_names,
+         to_find_dict,
+         if_not_in_dict="Other",
+         new_column_name="Mapping",
+     ):
          """
-         Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
+         Creates a new DataFrame column based on a look up table, using exact matches.

-         Parameters:
+         Parameters
+         ----------
          df (pandas.DataFrame): The DataFrame containing the data.
-         col_names (list of str): these are the columns which are used for the lookup. One column or several columns can be inputted as a list, provided there is a merged column to lookup on. If there are multiple columns to look up on then a merged column must be inputted as the key of the dictionary of format e.g. col1|col2|col3
-         to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
-         if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
-         new_column_name (str, optional): name of new column. Defaults to "Mapping".
+         col_names (list of str): List of column names to use for lookup. If more than one, values are merged with '|'.
+         to_find_dict (dict): Lookup dictionary with exact keys to match.
+         if_not_in_dict (str, optional): Value used if no match is found. Defaults to "Other".
+         new_column_name (str, optional): Name of new output column. Defaults to "Mapping".

-         Returns:
-             pandas.DataFrame: DataFrame with a new column containing the look up table results.
-         """
+         Returns
+         -------
+         pandas.DataFrame: DataFrame with a new column containing lookup results.

-         # Create regex pattern with word boundaries from the dictionary
-         regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())
-
+         """
          # Preprocess DataFrame if multiple columns
          if len(col_names) > 1:
-             df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
+             df["Merged"] = df[col_names].astype(str).agg("|".join, axis=1)
              col_to_use = "Merged"
          else:
              col_to_use = col_names[0]

-         # Extract the first match using the regex pattern
-         matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
-
-         # Map the matches to the corresponding values in the dictionary
-         df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
-
+         # Normalize case for matching
+         lookup = {k.lower(): v for k, v in to_find_dict.items()}
+         df[new_column_name] = (
+             df[col_to_use].str.lower().map(lookup).fillna(if_not_in_dict)
+         )
+
          # Drop intermediate column if created
          if len(col_names) > 1:
              df.drop(columns=["Merged"], inplace=True)

          return df

-     def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
+     def aggregate_daily_to_wc_wide(
+         self,
+         df: pd.DataFrame,
+         date_column: str,
+         group_columns: list[str],
+         sum_columns: list[str],
+         wc: str = "sun",
+         aggregation: str = "sum",
+         include_totals: bool = False,
+     ) -> pd.DataFrame:
          """
-         Aggregates daily data into weekly data, starting on a specified day of the week,
-         and groups the data by additional specified columns. It aggregates specified numeric columns
-         by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-         of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+         Aggregates daily data into weekly data, starting on a specified day of the week,
+         and groups the data by additional specified columns. It aggregates specified numeric columns
+         by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+         of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
          The day column is renamed from 'Day' to 'OBS'.

-         Parameters:
+         Parameters
+         ----------
          - df: pandas DataFrame
              The input DataFrame containing daily data.
          - date_column: string
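
This is the one genuine behaviour change in the file: `apply_lookup_table_for_columns` drops the old word-boundary regex search (hence the removed `re` import) in favour of an exact, case-insensitive match, while the `help()` entry above still says "Maps substrings". Note also that the refactored signature as rendered here has no `self`; if that matches the released wheel, instance calls like `dp.apply_lookup_table_for_columns(df, ...)` will misbind the first argument, and the method only works when called via the class. A sketch under those assumptions:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

df = pd.DataFrame({"metric": ["Spend", "Impressions", "spend total"]})
# Called via the class, assuming the missing `self` is real.
out = dataprocessing.apply_lookup_table_for_columns(
    df, ["metric"], {"spend": "spd"}, if_not_in_dict="Other",
)
# Exact match only: 'spend total' no longer hits the 'spend' key.
print(out["Mapping"].tolist())  # ['spd', 'Other', 'Other']
```
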
@@ -644,26 +924,36 @@ class dataprocessing:
          - include_totals: boolean, optional (default False)
              If True, include total columns for each sum_column.

-         Returns:
+         Returns
+         -------
          - pandas DataFrame
              A new DataFrame with weekly aggregated data. The index is reset,
-             and columns represent the grouped and aggregated metrics. The DataFrame
-             is in wide format, with separate columns for each combination of
+             and columns represent the grouped and aggregated metrics. The DataFrame
+             is in wide format, with separate columns for each combination of
              grouped metrics.
+
          """
-
-         grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
-
+         grouped = self.aggregate_daily_to_wc_long(
+             df,
+             date_column,
+             group_columns,
+             sum_columns,
+             wc,
+             aggregation,
+         )
+
          # Pivot the data to wide format
          if group_columns:
-             wide_df = grouped.pivot_table(index='OBS',
-                                           columns=group_columns,
-                                           values=sum_columns,
-                                           aggfunc='first')
+             wide_df = grouped.pivot_table(
+                 index="OBS",
+                 columns=group_columns,
+                 values=sum_columns,
+                 aggfunc="first",
+             )
              # Flatten the multi-level column index and create combined column names
-             wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
+             wide_df.columns = ["_".join(col).strip() for col in wide_df.columns.values]
          else:
-             wide_df = grouped.set_index('OBS')
+             wide_df = grouped.set_index("OBS")

          # Fill NaN values with 0
          wide_df = wide_df.fillna(0)
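
A usage sketch for `aggregate_daily_to_wc_wide` (reformat only): the long output is pivoted so each sum/group combination becomes its own column, with optional totals:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),
    "platform": ["meta", "tiktok"] * 7,
    "cost": [1.0] * 14,
})
wide = dp.aggregate_daily_to_wc_wide(
    daily, "date", ["platform"], ["cost"], wc="mon", include_totals=True,
)
print(wide.columns.tolist())  # e.g. ['OBS', 'cost_meta', 'cost_tiktok', 'Total cost']
```
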
@@ -671,9 +961,11 @@ class dataprocessing:
          # Adding total columns for each unique sum_column, if include_totals is True
          if include_totals:
              for col in sum_columns:
-                 total_column_name = f'Total {col}'
+                 total_column_name = f"Total {col}"
                  if group_columns:
-                     columns_to_sum = [column for column in wide_df.columns if col in column]
+                     columns_to_sum = [
+                         column for column in wide_df.columns if col in column
+                     ]
                  else:
                      columns_to_sum = [col]
                  wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
@@ -683,11 +975,20 @@ class dataprocessing:

          return wide_df

-     def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
+     def merge_cols_with_seperator(
+         self,
+         df,
+         col_names,
+         seperator="_",
+         output_column_name="Merged",
+         starting_prefix_str=None,
+         ending_prefix_str=None,
+     ):
          """
          Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up

-         Parameters:
+         Parameters
+         ----------
          df (pandas.DataFrame): Dataframe to make changes to.
          col_names (list): list of columm names ot merge.
          seperator (str, optional): Name of column outputted. Defaults to "_".
@@ -695,76 +996,99 @@ class dataprocessing:
          starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
          ending_prefix_str (str, optional): string of optional text to be added after the merged column str value

-         Raises:
+         Raises
+         ------
          ValueError: if more less than two column names are inputted in the list there is nothing to merge on

-         Returns:
+         Returns
+         -------
          pandas.DataFrame: DataFrame with additional merged column
+
          """
          # Specify more than one column must be entered
          if len(col_names) < 2:
              raise ValueError("2 or more columns must be specified to merge")
-
+
          # Create a new column with the merged columns
          df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)

-         # Add string before
+         # Add string before
          if starting_prefix_str is not None:
-             df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
-
+             df[output_column_name] = starting_prefix_str + df[
+                 output_column_name
+             ].astype(str)
+
          # Add string after
          if ending_prefix_str is not None:
-             df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
-
+             df[output_column_name] = (
+                 df[output_column_name].astype(str) + ending_prefix_str
+             )
+
          return df

-     def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
+     def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
          """
          Checks the sum of two different dataframe column or columns are equal

-         Parameters:
+         Parameters
+         ----------
          df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
          df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
          cols_1 (list of str): Columns from first dataframe to sum.
          cols_2 (list of str): Columns from second dataframe to sum.

-         Returns:
+         Returns
+         -------
          Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
+
          """
          # Find the sum of both sets of columns
          df_1_sum = df_1[cols_1].sum().sum()
          df_2_sum = df_2[cols_2].sum().sum()
-
-         # If the the two columns are
+
+         # If the the two columns are
          if df_1_sum == df_2_sum:
              Answer = "They are equal"
          if df_1_sum != df_2_sum:
-             Answer = "They are different by " + str(df_2_sum-df_1_sum)
-
-         return Answer,df_1_sum,df_2_sum
-
+             Answer = "They are different by " + str(df_2_sum - df_1_sum)
+
+         return Answer, df_1_sum, df_2_sum
+
      def convert_2_df_cols_to_dict(self, df, key_col, value_col):
          """
          Create a dictionary mapping from two columns of a DataFrame.

-         Parameters:
+         Parameters
+         ----------
          df (pd.DataFrame): The DataFrame containing the data.
          key_col (str): The column name to use as keys in the dictionary.
          value_col (str): The column name to use as values in the dictionary.

-         Returns:
+         Returns
+         -------
          dict: A dictionary with keys from 'key_col' and values from 'value_col'.
+
          """
          if key_col not in df or value_col not in df:
              raise ValueError("Specified columns are not in the DataFrame")

          return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
-
-     def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
+
+     def create_FY_and_H_columns(
+         self,
+         df,
+         index_col,
+         start_date,
+         starting_FY,
+         short_format="No",
+         half_years="No",
+         combined_FY_and_H="No",
+     ):
          """
-         Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
+         Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year

-         Parameters:
+         Parameters
+         ----------
          df (pandas.DataFrame): Dataframe to operate on.
          index_col (str): Name of the column to use for datetime
          start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
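
Usage sketch for `merge_cols_with_seperator` (the parameter really is spelled `seperator` in the source, even though `help()` advertises `separator`) and `check_sum_of_df_cols_are_equal`:

```python
import pandas as pd
from imsciences.mmm import dataprocessing

dp = dataprocessing()
df = pd.DataFrame({"Campaign": ["Brand", "Generic"], "Product": ["A", "B"],
                   "Media Cost": [100.0, 200.0]})
df = dp.merge_cols_with_seperator(df, ["Campaign", "Product"], seperator="|",
                                  output_column_name="Merged Columns")
print(df["Merged Columns"].tolist())  # ['Brand|A', 'Generic|B']

answer, s1, s2 = dp.check_sum_of_df_cols_are_equal(df, df, ["Media Cost"], ["Media Cost"])
print(answer)  # 'They are equal'
```
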
@@ -773,16 +1097,17 @@ class dataprocessing:
773
1097
  half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
774
1098
  combined_FY_and_H (str, optional): String used to specify if a combined half year and FY column is desired. Defaults to "No".
775
1099
 
776
- Returns:
1100
+ Returns
1101
+ -------
777
1102
  pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
1103
+
778
1104
  """
779
-
780
1105
  try:
781
- start_date = datetime.strptime(start_date, '%Y-%m-%d')
1106
+ start_date = datetime.strptime(start_date, "%Y-%m-%d")
782
1107
  except ValueError:
783
1108
  print("Error: Date must be of format yyyy-mm-dd")
784
1109
  return df
785
-
1110
+
786
1111
  df["OBS"] = pd.to_datetime(df[index_col])
787
1112
  df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
788
1113
 
@@ -792,35 +1117,51 @@ class dataprocessing:
792
1117
 
793
1118
  def calculate_FY_vectorized(date_series):
794
1119
  years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
795
- fy = 'FY' + (start_year + years_since_start).astype(str)
1120
+ fy = "FY" + (start_year + years_since_start).astype(str)
796
1121
  if short_format == "Yes":
797
- fy = 'FY' + fy.str[-2:]
1122
+ fy = "FY" + fy.str[-2:]
798
1123
  return fy
799
1124
 
800
- df['FY'] = calculate_FY_vectorized(df[index_col])
1125
+ df["FY"] = calculate_FY_vectorized(df[index_col])
801
1126
 
802
1127
  if half_years == "Yes" or combined_FY_and_H == "Yes":
1128
+
803
1129
  def calculate_half_year_vectorized(date_series):
804
- fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
805
- fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
806
- fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
807
- half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
1130
+ fy_years_since_start = (
1131
+ (date_series - start_date).dt.days / 364
1132
+ ).astype(int)
1133
+ fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(
1134
+ years=1,
1135
+ )
1136
+ fy_end_of_h1 = (
1137
+ fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
1138
+ )
1139
+ half_year = np.where(date_series <= fy_end_of_h1, "H1", "H2")
808
1140
  return half_year
809
-
810
- df['Half Years'] = calculate_half_year_vectorized(df[index_col])
811
-
1141
+
1142
+ df["Half Years"] = calculate_half_year_vectorized(df[index_col])
1143
+
812
1144
  if combined_FY_and_H == "Yes":
813
- df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
1145
+ df["Financial Half Years"] = df["FY"] + " " + df["Half Years"]
814
1146
 
815
1147
  return df
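A hedged sketch of typical usage; the exact form expected for starting_FY is documented in the elided parameter lines above, so the integer below is an assumption:

    dp = dataprocessing()
    df = pd.DataFrame({"OBS": pd.date_range("2022-04-04", periods=60, freq="W-MON")})
    df = dp.create_FY_and_H_columns(df, "OBS", "2022-04-04", 2023, half_years="Yes", combined_FY_and_H="Yes")
    # adds 'FY', 'Half Years' and 'Financial Half Years' columns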
816
-
817
- def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
1148
+
1149
+ def keyword_lookup_replacement(
1150
+ self,
1151
+ df,
1152
+ col,
1153
+ replacement_rows,
1154
+ cols_to_merge,
1155
+ replacement_lookup_dict,
1156
+ output_column_name="Updated Column",
1157
+ ):
818
1158
  """
819
1159
  This function updates values in a specified column of the DataFrame based on a lookup dictionary.
820
1160
  It first merges several columns into a new 'Merged' column, then uses this merged column to determine
821
1161
  if replacements are needed based on the dictionary.
822
1162
 
823
- Parameters:
1163
+ Parameters
1164
+ ----------
824
1165
  df (pd.DataFrame): The DataFrame to process.
825
1166
  col (str): The name of the column whose values are potentially replaced.
826
1167
  replacement_rows (str): The specific value in 'col' to check for replacements.
@@ -828,65 +1169,102 @@ class dataprocessing:
828
1169
  replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
829
1170
  output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
830
1171
 
831
- Returns:
1172
+ Returns
1173
+ -------
832
1174
  pd.DataFrame: The modified DataFrame with updated values in the specified column.
1175
+
833
1176
  """
834
1177
  # Create a merged column from specified columns
835
- df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
836
-
1178
+ df["Merged"] = df[cols_to_merge].apply(
1179
+ lambda row: "|".join(row.values.astype(str)),
1180
+ axis=1,
1181
+ )
1182
+
837
1183
  # Replace values in the specified column based on the lookup
838
1184
  def replace_values(x):
839
1185
  if x[col] == replacement_rows:
840
- merged_value = x['Merged']
1186
+ merged_value = x["Merged"]
841
1187
  if merged_value in replacement_lookup_dict:
842
1188
  return replacement_lookup_dict[merged_value]
843
1189
  return x[col]
844
-
1190
+
845
1191
  # Apply replacement logic
846
1192
  df[output_column_name] = df.apply(replace_values, axis=1)
847
-
1193
+
848
1194
  # Drop the intermediate 'Merged' column
849
- df.drop(columns=['Merged'], inplace=True)
850
-
1195
+ df.drop(columns=["Merged"], inplace=True)
1196
+
851
1197
  return df
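An illustrative call (the frame and lookup values are made up):

    dp = dataprocessing()
    df = pd.DataFrame({
        "channel": ["Unknown", "TV"],
        "region": ["UK", "UK"],
        "campaign": ["brand", "gen"],
    })
    df = dp.keyword_lookup_replacement(df, "channel", "Unknown", ["region", "campaign"], {"UK|brand": "Search"})
    # adds 'Updated Column'; only the row where channel == 'Unknown' and region|campaign == 'UK|brand' becomes 'Search'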
852
1198
 
853
- def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
1199
+ def create_new_version_of_col_using_LUT(
1200
+ self,
1201
+ df,
1202
+ keys_col,
1203
+ value_col,
1204
+ dict_for_specific_changes,
1205
+ new_col_name="New Version of Old Col",
1206
+ ):
854
1207
  """
855
- Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
1208
+ Creates a new column in a dataframe, which takes an old column and uses a lookup table to change values in the new column to reflect the lookup table.
856
1209
  The lookup is based on a column in the dataframe. Can only input one column and output one new column.
857
1210
 
858
- Parameters:
1211
+ Parameters
1212
+ ----------
859
1213
  df (pandas.DataFrame): The DataFrame containing the data.
860
1214
  keys_col (str): The name of the column which the LUT will be referencing to output a value.
861
1215
  value_col (str): The name of the column which the new column will be based on. If a key in the key column is not found in the LUT, the values from this column are used instead.
862
1216
  dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
863
1217
  new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
864
1218
 
865
- Returns:
1219
+ Returns
1220
+ -------
866
1221
  pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
1222
+
867
1223
  """
868
-
869
1224
  # Extract columns to change using new dictionary
870
- smaller_df = df[[keys_col,value_col]]
1225
+ smaller_df = df[[keys_col, value_col]]
871
1226
 
872
1227
  # Use the new dictionary to create a new LUT
873
- smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
874
-
1228
+ smaller_df_with_LUT = self.apply_lookup_table_for_columns(
1229
+ smaller_df,
1230
+ [keys_col, value_col],
1231
+ dict_for_specific_changes,
1232
+ )
1233
+
875
1234
  # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
876
- smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
1235
+ smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(
1236
+ lambda x: x["Mapping"] if x["Mapping"] != "Other" else x[value_col],
1237
+ axis=1,
1238
+ )
877
1239
 
878
1240
  # Drop the extra unnecessary cols
879
- smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
880
-
1241
+ smaller_df_with_LUT.drop([keys_col, "Mapping"], axis=1, inplace=True)
1242
+
881
1243
  # Output dataframes as dictionary to be used in a LUT
882
- new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
1244
+ new_dict = self.convert_2_df_cols_to_dict(
1245
+ smaller_df_with_LUT,
1246
+ value_col,
1247
+ "Updated Col",
1248
+ )
883
1249
 
884
1250
  # # Use new dictionary to create a new version of an old column
885
- df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
886
-
1251
+ df_final = self.apply_lookup_table_for_columns(
1252
+ df,
1253
+ [keys_col],
1254
+ new_dict,
1255
+ "other",
1256
+ new_col_name,
1257
+ )
1258
+
887
1259
  return df_final
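A sketch under assumed column names ('campaign', 'channel'); note the method delegates to apply_lookup_table_for_columns, defined elsewhere in this class:

    dp = dataprocessing()
    df = dp.create_new_version_of_col_using_LUT(df, "campaign", "channel", {"summer_sale": "Promo"}, new_col_name="channel_v2")
    # 'channel_v2' equals 'channel' except where a campaign matches the LUT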
888
-
889
- def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
1260
+
1261
+ def convert_df_wide_2_long(
1262
+ self,
1263
+ df,
1264
+ value_cols,
1265
+ variable_col_name="Stacked",
1266
+ value_col_name="Value",
1267
+ ):
890
1268
  """
891
1269
  Changes a dataframe from wide to long format.
892
1270
 
@@ -901,16 +1279,25 @@ class dataprocessing:
901
1279
 
902
1280
  Raises:
903
1281
  ValueError: If the number of columns to depivot is less than 2.
1282
+
904
1283
  """
905
1284
  # Check length of value_cols is greater than 1
906
1285
  if len(value_cols) < 2:
907
1286
  raise ValueError("Number of inputs in list must be greater than 1")
908
1287
 
909
1288
  # Find the columns that are not to be depivoted into one column
910
- id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
1289
+ id_vars = [
1290
+ col for col in df.columns if col not in value_cols
1291
+ ] # Preserve column order in the DataFrame
911
1292
 
912
1293
  # Melt all columns chosen into one column
913
- df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
1294
+ df_final = pd.melt(
1295
+ df,
1296
+ id_vars=id_vars,
1297
+ value_vars=value_cols,
1298
+ var_name=variable_col_name,
1299
+ value_name=value_col_name,
1300
+ )
914
1301
 
915
1302
  # Sort column order to match expected output
916
1303
  ordered_columns = id_vars + [variable_col_name, value_col_name]
@@ -918,7 +1305,19 @@ class dataprocessing:
918
1305
 
919
1306
  return df_final
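For instance, depivoting two assumed spend columns:

    dp = dataprocessing()
    wide = pd.DataFrame({"OBS": ["2024-01-01"], "tv": [100], "radio": [40]})
    long_df = dp.convert_df_wide_2_long(wide, ["tv", "radio"], variable_col_name="Channel", value_col_name="Spend")
    # columns: OBS, Channel, Spend (one row per depivoted channel)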
920
1307
 
921
- def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
1308
+ def manually_edit_data(
1309
+ self,
1310
+ df,
1311
+ filters_dict,
1312
+ col_to_change,
1313
+ new_value,
1314
+ change_in_existing_df_col="No",
1315
+ new_col_to_change_name="New",
1316
+ manual_edit_col_name=None,
1317
+ add_notes="No",
1318
+ existing_note_col_name=None,
1319
+ note=None,
1320
+ ):
922
1321
  """
923
1322
  Allows any cell in a dataframe to be manually updated by applying filters and choosing a column to edit
924
1323
 
@@ -941,31 +1340,44 @@ class dataprocessing:
941
1340
 
942
1341
  Returns:
943
1342
  pandas.DataFrame: Dataframe with manual changes added
1343
+
944
1344
  """
945
-
946
1345
  # Raise TypeError if more than one col is supplied
947
1346
  if isinstance(col_to_change, list):
948
1347
  raise TypeError("Col to change must be specified as a string, not a list")
949
1348
 
950
1349
  # Raises value error if input is invalid for change_in_existing_df_col
951
1350
  if change_in_existing_df_col not in ["Yes", "No"]:
952
- raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
1351
+ raise ValueError(
1352
+ "Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']",
1353
+ )
953
1354
 
954
1355
  # Raises value error if input is invalid for add_notes
955
1356
  if add_notes not in ["Yes", "No"]:
956
- raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
1357
+ raise ValueError(
1358
+ "Invalid input value for add_notes. Allowed values are: ['Yes', 'No']",
1359
+ )
957
1360
 
958
1361
  # Validate filters_dict format
959
1362
  for col, cond in filters_dict.items():
960
1363
  if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
961
- raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
1364
+ raise ValueError(
1365
+ f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'",
1366
+ )
962
1367
 
963
1368
  # Create the filtered df by applying the conditions
964
1369
  df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
965
1370
 
966
1371
  # Create a new column to add the changes if desired, else edit in the current chosen column
967
- col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
968
- if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
1372
+ col_to_update = (
1373
+ col_to_change
1374
+ if change_in_existing_df_col == "Yes"
1375
+ else new_col_to_change_name
1376
+ )
1377
+ if (
1378
+ change_in_existing_df_col == "No"
1379
+ and new_col_to_change_name not in df.columns
1380
+ ):
969
1381
  df = df.copy()
970
1382
  df[new_col_to_change_name] = df[col_to_change]
971
1383
 
@@ -977,19 +1389,19 @@ class dataprocessing:
977
1389
  if manual_edit_col_name not in df.columns:
978
1390
  df[manual_edit_col_name] = 0
979
1391
  df.loc[df_filtered.index, manual_edit_col_name] = 1
980
- elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
981
- df['Manual Changes'] = 0
982
- df.loc[df_filtered.index, 'Manual Changes'] = 1
1392
+ elif not manual_edit_col_name and "Manual Changes" not in df.columns:
1393
+ df["Manual Changes"] = 0
1394
+ df.loc[df_filtered.index, "Manual Changes"] = 1
983
1395
 
984
1396
  # Add note if desired in new column or an existing column
985
1397
  if add_notes == "Yes":
986
- note_col = existing_note_col_name if existing_note_col_name else 'Notes'
1398
+ note_col = existing_note_col_name if existing_note_col_name else "Notes"
987
1399
  if note_col not in df.columns:
988
1400
  df[note_col] = None
989
1401
  df.loc[df_filtered.index, note_col] = note
990
1402
 
991
1403
  return df
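An illustrative edit (the frame, filters and note text are assumptions):

    dp = dataprocessing()
    df = dp.manually_edit_data(
        df,
        {"OBS": "== '2024-01-01'", "channel": "== 'tv'"},
        "spend",
        0,
        add_notes="Yes",
        note="Zeroed out duplicated invoice",
    )
    # with the defaults, the edit lands in a new 'New' column, is flagged in 'Manual Changes' and logged in 'Notes'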
992
-
1404
+
993
1405
  def format_numbers_with_commas(self, df, decimal_length_chosen=2):
994
1406
  """
995
1407
  Formats numeric values with thousands-separator commas and a chosen number of decimal places.
@@ -1000,24 +1412,26 @@ class dataprocessing:
1000
1412
 
1001
1413
  Returns:
1002
1414
  pandas.DataFrame: The DataFrame with the chosen updated format.
1415
+
1003
1416
  """
1417
+
1004
1418
  def format_number_with_commas(x, decimal_length=decimal_length_chosen):
1005
1419
  if pd.isna(x): # Preserve None/NaN values
1006
1420
  return pd.NA # Explicitly normalize to pd.NA
1007
- elif isinstance(x, (int, float)):
1421
+ if isinstance(x, (int, float)):
1008
1422
  if decimal_length is not None:
1009
1423
  format_str = f"{{:,.{decimal_length}f}}"
1010
1424
  return format_str.format(x)
1011
- else:
1012
- return f"{x:,}"
1013
- else:
1014
- return x # Return unchanged if not a number
1425
+ return f"{x:,}"
1426
+ return x # Return unchanged if not a number
1015
1427
 
1016
1428
  # Apply formatting column by column
1017
- formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
1429
+ formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
1430
+ value=pd.NA,
1431
+ )
1018
1432
 
1019
1433
  return formatted_df
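For example:

    dp = dataprocessing()
    df = pd.DataFrame({"spend": [1234.5, None]})
    dp.format_numbers_with_commas(df)
    # -> '1,234.50' and <NA>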
1020
-
1434
+
1021
1435
  def filter_df_on_multiple_conditions(self, df, filters_dict):
1022
1436
  """
1023
1437
  Filter a dataframe based on multiple conditions
@@ -1028,59 +1442,62 @@ class dataprocessing:
1028
1442
 
1029
1443
  Returns:
1030
1444
  pandas.DataFrame: Filtered DataFrame
1445
+
1031
1446
  """
1032
1447
  mask = pd.Series(True, index=df.index)
1033
1448
  for col, cond in filters_dict.items():
1034
1449
  cond = cond.strip()
1035
1450
  operator, value = cond.split(maxsplit=1)
1036
-
1451
+
1037
1452
  # If the value is a quoted string, strip surrounding whitespace and quotes
1038
1453
  if "'" in value:
1039
1454
  value = value.strip().strip("'\"")
1040
1455
  # If not a string (e.g. a datetime or number condition), evaluate the string into a value
1041
1456
  else:
1042
- value = eval(value)
1457
+ value = eval(value)
1043
1458
 
1044
1459
  if operator == "==":
1045
- temp_mask = (df[col] == value)
1460
+ temp_mask = df[col] == value
1046
1461
  elif operator == "!=":
1047
- temp_mask = (df[col] != value)
1462
+ temp_mask = df[col] != value
1048
1463
  elif operator == ">=":
1049
- temp_mask = (df[col] >= value)
1464
+ temp_mask = df[col] >= value
1050
1465
  elif operator == "<=":
1051
- temp_mask = (df[col] <= value)
1466
+ temp_mask = df[col] <= value
1052
1467
  elif operator == ">":
1053
- temp_mask = (df[col] > value)
1468
+ temp_mask = df[col] > value
1054
1469
  elif operator == "<":
1055
- temp_mask = (df[col] < value)
1470
+ temp_mask = df[col] < value
1056
1471
  mask &= temp_mask
1057
1472
 
1058
1473
  # Create the filtered df by applying the conditions
1059
1474
  df_filtered = df[mask]
1060
-
1475
+
1061
1476
  return df_filtered
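A usage sketch; because unquoted values go through eval, the condition strings should come from trusted input only:

    dp = dataprocessing()
    filtered = dp.filter_df_on_multiple_conditions(df, {"spend": ">= 100", "channel": "== 'tv'"})
    # keeps rows where spend >= 100 and channel == 'tv'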
1062
-
1063
- def read_and_concatenate_files(self, folder_path, file_type='csv'):
1477
+
1478
+ def read_and_concatenate_files(self, folder_path, file_type="csv"):
1064
1479
  """
1065
- Reads all files of a specified type (CSV or XLSX) from a given folder
1480
+ Reads all files of a specified type (CSV or XLSX) from a given folder
1066
1481
  and concatenates them into a single DataFrame.
1067
-
1068
- Parameters:
1482
+
1483
+ Parameters
1484
+ ----------
1069
1485
  folder_path (str): The path to the folder containing the files.
1070
1486
  file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
1071
-
1072
- Returns:
1487
+
1488
+ Returns
1489
+ -------
1073
1490
  pd.DataFrame: A DataFrame containing the concatenated data from all files.
1491
+
1074
1492
  """
1075
-
1076
1493
  # Initialize an empty list to hold dataframes
1077
1494
  dataframes = []
1078
1495
 
1079
1496
  # Define file extension based on file_type
1080
- if file_type == 'csv':
1081
- extension = '.csv'
1082
- elif file_type == 'xlsx':
1083
- extension = '.xlsx'
1497
+ if file_type == "csv":
1498
+ extension = ".csv"
1499
+ elif file_type == "xlsx":
1500
+ extension = ".xlsx"
1084
1501
  else:
1085
1502
  raise ValueError("file_type must be either 'csv' or 'xlsx'")
1086
1503
 
@@ -1090,19 +1507,19 @@ class dataprocessing:
1090
1507
  if filename.endswith(extension):
1091
1508
  file_path = os.path.join(folder_path, filename)
1092
1509
  # Read the file into a DataFrame
1093
- if file_type == 'csv':
1510
+ if file_type == "csv":
1094
1511
  df = pd.read_csv(file_path)
1095
- elif file_type == 'xlsx':
1512
+ elif file_type == "xlsx":
1096
1513
  df = pd.read_excel(file_path)
1097
1514
  # Append the DataFrame to the list
1098
1515
  dataframes.append(df)
1099
1516
 
1100
1517
  # Concatenate all DataFrames into a single DataFrame
1101
1518
  combined_df = pd.concat(dataframes, ignore_index=True)
1102
-
1519
+
1103
1520
  return combined_df
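For example (the folder path is a placeholder):

    dp = dataprocessing()
    combined = dp.read_and_concatenate_files(r"C:\data\weekly_exports", file_type="xlsx")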
1104
-
1105
- def upgrade_outdated_packages(self, exclude_packages=['twine']):
1521
+
1522
+ def upgrade_outdated_packages(self, exclude_packages=["twine"]):
1106
1523
  """
1107
1524
  Upgrade all outdated Python packages except those specified in `exclude_packages`.
1108
1525
 
@@ -1113,32 +1530,49 @@ class dataprocessing:
1113
1530
  try:
1114
1531
  # Get all installed packages
1115
1532
  installed_packages_result = subprocess.run(
1116
- "pip list --format=json", shell=True, capture_output=True, text=True
1533
+ "pip list --format=json",
1534
+ check=False,
1535
+ shell=True,
1536
+ capture_output=True,
1537
+ text=True,
1117
1538
  )
1118
1539
  installed_packages = json.loads(installed_packages_result.stdout)
1119
1540
 
1120
1541
  # Get the list of outdated packages
1121
1542
  outdated_packages_result = subprocess.run(
1122
- "pip list --outdated --format=json", shell=True, capture_output=True, text=True
1543
+ "pip list --outdated --format=json",
1544
+ check=False,
1545
+ shell=True,
1546
+ capture_output=True,
1547
+ text=True,
1123
1548
  )
1124
1549
  outdated_packages = json.loads(outdated_packages_result.stdout)
1125
1550
 
1126
1551
  # Create a set of outdated package names for quick lookup
1127
- outdated_package_names = {pkg['name'] for pkg in outdated_packages}
1552
+ outdated_package_names = {pkg["name"] for pkg in outdated_packages}
1128
1553
 
1129
1554
  # Upgrade only outdated packages, excluding specified packages
1130
1555
  for package in installed_packages:
1131
- package_name = package['name']
1132
- if package_name in outdated_package_names and package_name not in exclude_packages:
1556
+ package_name = package["name"]
1557
+ if (
1558
+ package_name in outdated_package_names
1559
+ and package_name not in exclude_packages
1560
+ ):
1133
1561
  try:
1134
1562
  print(f"Upgrading package: {package_name}")
1135
1563
  upgrade_result = subprocess.run(
1136
- f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True
1564
+ f"pip install --upgrade {package_name}",
1565
+ check=False,
1566
+ shell=True,
1567
+ capture_output=True,
1568
+ text=True,
1137
1569
  )
1138
1570
  if upgrade_result.returncode == 0:
1139
1571
  print(f"Successfully upgraded {package_name}")
1140
1572
  else:
1141
- print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
1573
+ print(
1574
+ f"Failed to upgrade {package_name}: {upgrade_result.stderr}",
1575
+ )
1142
1576
  except Exception as e:
1143
1577
  print(f"An error occurred while upgrading {package_name}: {e}")
1144
1578
  elif package_name in exclude_packages:
@@ -1150,12 +1584,12 @@ class dataprocessing:
1150
1584
 
1151
1585
  def convert_mixed_formats_dates(self, df, column_name):
1152
1586
  # Convert initial dates to datetime with coercion to handle errors
1153
- df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
1587
+ df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
1154
1588
  df[column_name] = df[column_name].astype(str)
1155
1589
  corrected_dates = []
1156
-
1590
+
1157
1591
  for date_str in df[column_name]:
1158
- date_str = date_str.replace('-', '').replace('/', '')
1592
+ date_str = date_str.replace("-", "").replace("/", "")
1159
1593
  if len(date_str) == 8:
1160
1594
  year = date_str[:4]
1161
1595
  month = date_str[4:6]
@@ -1166,39 +1600,45 @@ class dataprocessing:
1166
1600
  else:
1167
1601
  corrected_date_str = f"{year}-{month}-{day}"
1168
1602
  # Convert to datetime
1169
- corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
1603
+ corrected_date = pd.to_datetime(corrected_date_str, errors="coerce")
1170
1604
  else:
1171
- corrected_date = pd.to_datetime(date_str, errors='coerce')
1172
-
1605
+ corrected_date = pd.to_datetime(date_str, errors="coerce")
1606
+
1173
1607
  corrected_dates.append(corrected_date)
1174
-
1608
+
1175
1609
  # Check length of the corrected_dates list
1176
1610
  if len(corrected_dates) != len(df):
1177
- raise ValueError("Length of corrected_dates does not match the original DataFrame")
1178
-
1611
+ raise ValueError(
1612
+ "Length of corrected_dates does not match the original DataFrame",
1613
+ )
1614
+
1179
1615
  # Assign the corrected dates back to the DataFrame
1180
1616
  df[column_name] = corrected_dates
1181
1617
  return df
1182
1618
 
1183
- def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
1619
+ def fill_weekly_date_range(self, df, date_column, freq="W-MON"):
1184
1620
  # Ensure the date column is in datetime format
1185
1621
  df[date_column] = pd.to_datetime(df[date_column])
1186
-
1622
+
1187
1623
  # Generate the full date range with the specified frequency
1188
- full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
1189
-
1624
+ full_date_range = pd.date_range(
1625
+ start=df[date_column].min(),
1626
+ end=df[date_column].max(),
1627
+ freq=freq,
1628
+ )
1629
+
1190
1630
  # Create a new dataframe with the full date range
1191
1631
  full_date_df = pd.DataFrame({date_column: full_date_range})
1192
-
1632
+
1193
1633
  # Merge the original dataframe with the new full date range dataframe
1194
- df_full = full_date_df.merge(df, on=date_column, how='left')
1195
-
1634
+ df_full = full_date_df.merge(df, on=date_column, how="left")
1635
+
1196
1636
  # Fill missing values with 0
1197
1637
  df_full.fillna(0, inplace=True)
1198
-
1638
+
1199
1639
  return df_full
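For instance, with an assumed weekly frame that is missing one week:

    dp = dataprocessing()
    df = pd.DataFrame({"week": pd.to_datetime(["2024-01-01", "2024-01-15"]), "spend": [10, 30]})
    full = dp.fill_weekly_date_range(df, "week")  # default freq='W-MON'
    # inserts 2024-01-08 with spend filled as 0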
1200
-
1201
- def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
1640
+
1641
+ def add_prefix_and_suffix(self, df, prefix="", suffix="", date_col=None):
1202
1642
  """
1203
1643
  Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.
1204
1644
 
@@ -1210,19 +1650,28 @@ class dataprocessing:
1210
1650
 
1211
1651
  Returns:
1212
1652
  pd.DataFrame: The DataFrame with updated column names.
1653
+
1213
1654
  """
1214
-
1215
1655
  # If there is no date column
1216
1656
  if date_col is None:
1217
1657
  # Add prefixes and suffixes to all columns
1218
1658
  df.columns = [prefix + col + suffix for col in df.columns]
1219
1659
  else:
1220
1660
  # Add prefixes and suffixes to all columns except the date column
1221
- df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]
1222
-
1661
+ df.columns = [
1662
+ prefix + col + suffix if col != date_col else col for col in df.columns
1663
+ ]
1664
+
1223
1665
  return df
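An illustrative call with assumed names:

    dp = dataprocessing()
    df = dp.add_prefix_and_suffix(df, prefix="med_", suffix="_gbp", date_col="OBS")
    # every column except 'OBS' becomes med_<name>_gbp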
1224
1666
 
1225
- def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
1667
+ def create_dummies(
1668
+ self,
1669
+ df,
1670
+ date_col=None,
1671
+ dummy_threshold=0,
1672
+ add_total_dummy_col="No",
1673
+ total_col_name="total",
1674
+ ):
1226
1675
  """
1227
1676
  Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
1228
1677
  Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
@@ -1236,13 +1685,15 @@ class dataprocessing:
1236
1685
 
1237
1686
  Returns:
1238
1687
  pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
1239
- """
1240
1688
 
1689
+ """
1241
1690
  # If there is no date column
1242
1691
  if date_col is None:
1243
- df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
1692
+ df = df.apply(
1693
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
1694
+ )
1244
1695
 
1245
- if add_total_dummy_col != 'No':
1696
+ if add_total_dummy_col != "No":
1246
1697
  # Find max value of rows
1247
1698
  df[total_col_name] = df.max(axis=1)
1248
1699
 
@@ -1250,18 +1701,25 @@ class dataprocessing:
1250
1701
  else:
1251
1702
  # Create dummies for all columns except the date column
1252
1703
  df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
1253
- lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
1704
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
1254
1705
  )
1255
1706
 
1256
- if add_total_dummy_col != 'No':
1707
+ if add_total_dummy_col != "No":
1257
1708
  # Find max value of rows
1258
1709
  df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)
1259
1710
 
1260
1711
  return df
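A sketch with an assumed date column:

    dp = dataprocessing()
    dummies = dp.create_dummies(df, date_col="OBS", dummy_threshold=0, add_total_dummy_col="Yes")
    # non-date cells become 1 where value > 0, else 0; 'total' holds the row-wise max of the dummies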
1261
1712
 
1262
- def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
1713
+ def replace_substrings(
1714
+ self,
1715
+ df,
1716
+ column,
1717
+ replacements,
1718
+ to_lower=False,
1719
+ new_column=None,
1720
+ ):
1263
1721
  """
1264
- Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1722
+ Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1265
1723
  Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.
1266
1724
 
1267
1725
  Args:
@@ -1273,6 +1731,7 @@ class dataprocessing:
1273
1731
 
1274
1732
  Returns:
1275
1733
  pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
1734
+
1276
1735
  """
1277
1736
  if new_column is not None:
1278
1737
  # Create a new column for replacements
@@ -1292,7 +1751,7 @@ class dataprocessing:
1292
1751
 
1293
1752
  return df
1294
1753
 
1295
- def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
1754
+ def add_total_column(self, df, exclude_col=None, total_col_name="Total"):
1296
1755
  """
1297
1756
  Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.
1298
1757
 
@@ -1303,17 +1762,27 @@ class dataprocessing:
1303
1762
 
1304
1763
  Returns:
1305
1764
  pd.DataFrame: The DataFrame with an added total column.
1765
+
1306
1766
  """
1307
1767
  if exclude_col and exclude_col in df.columns:
1308
1768
  # Ensure the column to exclude exists before dropping
1309
- df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1769
+ df[total_col_name] = df.drop(columns=[exclude_col], errors="ignore").sum(
1770
+ axis=1,
1771
+ )
1310
1772
  else:
1311
1773
  # Sum across all columns if no column is specified to exclude
1312
1774
  df[total_col_name] = df.sum(axis=1)
1313
-
1775
+
1314
1776
  return df
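For example, assuming the remaining columns are numeric:

    dp = dataprocessing()
    df = dp.add_total_column(df, exclude_col="OBS", total_col_name="Total Spend")
    # sums every column except 'OBS' into 'Total Spend'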
1315
1777
 
1316
- def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
1778
+ def apply_lookup_table_based_on_substring(
1779
+ self,
1780
+ df,
1781
+ column_name,
1782
+ category_dict,
1783
+ new_col_name="Category",
1784
+ other_label="Other",
1785
+ ):
1317
1786
  """
1318
1787
  Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
1319
1788
 
@@ -1326,6 +1795,7 @@ class dataprocessing:
1326
1795
 
1327
1796
  Returns:
1328
1797
  pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
1798
+
1329
1799
  """
1330
1800
 
1331
1801
  def categorize_text(text):
@@ -1336,11 +1806,14 @@ class dataprocessing:
1336
1806
  text (str): The text string to categorize.
1337
1807
 
1338
1808
  Returns:
1339
- str: The category assigned based on the first matching substring found in the text. If no
1809
+ str: The category assigned based on the first matching substring found in the text. If no
1340
1810
  matching substring is found, returns other_label.
1811
+
1341
1812
  """
1342
1813
  for key, category in category_dict.items():
1343
- if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
1814
+ if (
1815
+ key.lower() in text.lower()
1816
+ ): # Check if the substring is in the text (case-insensitive)
1344
1817
  return category
1345
1818
  return other_label # Default category if no match is found
1346
1819
 
@@ -1359,6 +1832,7 @@ class dataprocessing:
1359
1832
 
1360
1833
  Returns:
1361
1834
  tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
1835
+
1362
1836
  """
1363
1837
  # Ensure date columns are in datetime format
1364
1838
  df1[date_col] = pd.to_datetime(df1[date_col])
@@ -1373,29 +1847,43 @@ class dataprocessing:
1373
1847
  df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
1374
1848
 
1375
1849
  # Merge the DataFrames on the date column
1376
- merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
1850
+ merged_df = pd.merge(
1851
+ df1_overlap,
1852
+ df2_overlap,
1853
+ on=date_col,
1854
+ suffixes=("_df1", "_df2"),
1855
+ )
1377
1856
 
1378
1857
  # Get common columns, excluding the date column
1379
- common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
1858
+ common_cols = [
1859
+ col for col in df1.columns if col != date_col and col in df2.columns
1860
+ ]
1380
1861
 
1381
1862
  # Create a DataFrame for differences
1382
1863
  diff_df = pd.DataFrame({date_col: merged_df[date_col]})
1383
1864
 
1384
1865
  total_diff_list = []
1385
1866
  for col in common_cols:
1386
- diff_col = f'diff_{col}'
1387
- diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
1867
+ diff_col = f"diff_{col}"
1868
+ diff_df[diff_col] = (
1869
+ merged_df[f"{col}_df1"] - merged_df[f"{col}_df2"]
1870
+ ) # Corrected subtraction order
1388
1871
 
1389
1872
  # Sum differences for the column
1390
1873
  total_diff = diff_df[diff_col].sum()
1391
- total_diff_list.append({'Column': col, 'Total Difference': total_diff})
1874
+ total_diff_list.append({"Column": col, "Total Difference": total_diff})
1392
1875
 
1393
1876
  # Create summary DataFrame
1394
1877
  total_diff_df = pd.DataFrame(total_diff_list)
1395
1878
 
1396
1879
  return diff_df, total_diff_df
1397
1880
 
1398
- def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
1881
+ def week_commencing_2_week_commencing_conversion_isoweekday(
1882
+ self,
1883
+ df,
1884
+ date_col,
1885
+ week_commencing="mon",
1886
+ ):
1399
1887
  """
1400
1888
  Convert a DataFrame's date column so that each date is mapped back
1401
1889
  to the 'week_commencing' day of the *current ISO week*.
@@ -1403,7 +1891,7 @@ class dataprocessing:
1403
1891
  Args:
1404
1892
  df (pandas.DataFrame): The DataFrame with date-based data.
1405
1893
  date_col (str): The name of the date column.
1406
- week_commencing (str): The desired start of the week.
1894
+ week_commencing (str): The desired start of the week.
1407
1895
  ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
1408
1896
  Uses ISO day numbering (Mon=1, ..., Sun=7).
1409
1897
 
@@ -1411,9 +1899,18 @@ class dataprocessing:
1411
1899
  pandas.DataFrame: Original DataFrame with an extra column
1412
1900
  'week_start_<week_commencing>' containing the
1413
1901
  start-of-week date for each row.
1902
+
1414
1903
  """
1415
1904
  # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
1416
- iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
1905
+ iso_day_dict = {
1906
+ "mon": 1,
1907
+ "tue": 2,
1908
+ "wed": 3,
1909
+ "thur": 4,
1910
+ "fri": 5,
1911
+ "sat": 6,
1912
+ "sun": 7,
1913
+ }
1417
1914
 
1418
1915
  target_day = iso_day_dict[week_commencing]
1419
1916
 
@@ -1424,15 +1921,23 @@ class dataprocessing:
1424
1921
  # Apply the transformation
1425
1922
  new_col = f"week_start_{week_commencing}"
1426
1923
  df[new_col] = df[date_col].apply(map_to_week_start)
1427
-
1924
+
1428
1925
  return df
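A sketch (map_to_week_start is defined in the elided lines above; the behaviour follows the docstring):

    dp = dataprocessing()
    df = pd.DataFrame({"date": pd.to_datetime(["2024-01-03"])})  # a Wednesday
    df = dp.week_commencing_2_week_commencing_conversion_isoweekday(df, "date", week_commencing="mon")
    # df['week_start_mon'] == 2024-01-01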
1429
-
1430
- def seasonality_feature_extraction(self, df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False):
1926
+
1927
+ def seasonality_feature_extraction(
1928
+ self,
1929
+ df,
1930
+ kpi_var,
1931
+ n_features=10,
1932
+ test_size=0.1,
1933
+ random_state=42,
1934
+ shuffle=False,
1935
+ ):
1431
1936
  """
1432
1937
  1) Uses the provided dataframe (df), where:
1433
1938
  - df[kpi_var] (e.g. df['kpi_total_sales']) is the target (y).
1434
1939
  - df['OBS'] is a date or index column (excluded from features).
1435
-
1940
+
1436
1941
  2) Splits data into train/test using the specified test_size, random_state, and shuffle.
1437
1942
  3) Trains XGBoost and Random Forest on all features.
1438
1943
  4) Extracts the top n_features from each model.
@@ -1462,20 +1967,22 @@ class dataprocessing:
1462
1967
  - "combined_features": merged unique feature list
1463
1968
  - "performance": dictionary of performance metrics
1464
1969
  - "models": dictionary of fitted models
1970
+
1465
1971
  """
1466
1972
  # ---------------------------------------------------------------------
1467
1973
  # 1. Prepare your data (X, y)
1468
1974
  # ---------------------------------------------------------------------
1469
1975
  # Extract target and features
1470
1976
  y = df[kpi_var]
1471
- X = df.drop(columns=['OBS', kpi_var])
1977
+ X = df.drop(columns=["OBS", kpi_var])
1472
1978
 
1473
1979
  # Split into train/test
1474
1980
  X_train, X_test, y_train, y_test = train_test_split(
1475
- X, y,
1981
+ X,
1982
+ y,
1476
1983
  test_size=test_size,
1477
1984
  random_state=random_state,
1478
- shuffle=shuffle
1985
+ shuffle=shuffle,
1479
1986
  )
1480
1987
 
1481
1988
  # ---------------------------------------------------------------------
@@ -1488,16 +1995,13 @@ class dataprocessing:
1488
1995
  # (B) Get feature importances
1489
1996
  xgb_importances = xgb_model_full.feature_importances_
1490
1997
  xgb_feat_importance_df = (
1491
- pd.DataFrame({
1492
- 'feature': X.columns,
1493
- 'importance': xgb_importances
1494
- })
1495
- .sort_values('importance', ascending=False)
1998
+ pd.DataFrame({"feature": X.columns, "importance": xgb_importances})
1999
+ .sort_values("importance", ascending=False)
1496
2000
  .reset_index(drop=True)
1497
2001
  )
1498
2002
 
1499
2003
  # (C) Select top N features
1500
- top_features_xgb = xgb_feat_importance_df['feature'].head(n_features).tolist()
2004
+ top_features_xgb = xgb_feat_importance_df["feature"].head(n_features).tolist()
1501
2005
 
1502
2006
  # (D) Subset data to top N features
1503
2007
  X_train_xgb_topN = X_train[top_features_xgb]
@@ -1515,16 +2019,13 @@ class dataprocessing:
1515
2019
  # (B) Get feature importances
1516
2020
  rf_importances = rf_model_full.feature_importances_
1517
2021
  rf_feat_importance_df = (
1518
- pd.DataFrame({
1519
- 'feature': X.columns,
1520
- 'importance': rf_importances
1521
- })
1522
- .sort_values('importance', ascending=False)
2022
+ pd.DataFrame({"feature": X.columns, "importance": rf_importances})
2023
+ .sort_values("importance", ascending=False)
1523
2024
  .reset_index(drop=True)
1524
2025
  )
1525
2026
 
1526
2027
  # (C) Select top N features
1527
- top_features_rf = rf_feat_importance_df['feature'].head(n_features).tolist()
2028
+ top_features_rf = rf_feat_importance_df["feature"].head(n_features).tolist()
1528
2029
 
1529
2030
  # (D) Subset data to top N features
1530
2031
  X_train_rf_topN = X_train[top_features_rf]
@@ -1556,25 +2057,45 @@ class dataprocessing:
1556
2057
 
1557
2058
  return output
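A hedged sketch, assuming df holds an 'OBS' column plus numeric candidate features:

    dp = dataprocessing()
    res = dp.seasonality_feature_extraction(df, "kpi_total_sales", n_features=5)
    top = res["combined_features"]  # merged unique top features from XGBoost and Random Forest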
1558
2059
 
1559
- def quid_pr (self, df):
2060
+ def quid_pr(self, df):
1560
2061
  def convert_date(date_str):
1561
2062
  try:
1562
- return datetime.strptime(date_str, '%b %d, %Y')
2063
+ return datetime.strptime(date_str, "%b %d, %Y")
1563
2064
  except ValueError:
1564
2065
  return None # Return None if conversion fails
2066
+
1565
2067
  # Apply conversion to create new columns
1566
- df['Start Date'] = df['Earliest Published'].astype(str).apply(convert_date)
1567
- df['End Date'] = df['Latest Published'].astype(str).apply(convert_date)
1568
- df['Days Duration'] = (df['End Date'] - df['Start Date']).dt.days + 1 # Ensure inclusive range
1569
- df['Count per Day'] = df['Published Count'] / df['Days Duration'] # Calculate count per day
1570
- df['Social Engagement per Day'] = df['Social Engagement'] / df['Days Duration']
1571
- df['Week Start'] = df['Start Date'].apply(lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None)
1572
- count_df = df.groupby('Week Start')['Count per Day'].sum().reset_index()
1573
- total_engagement_per_company = df.groupby('Company (Primary Mention)')['Social Engagement'].sum().reset_index() # Caluclates Social Engagement across whole period
1574
- valid_companies = total_engagement_per_company[total_engagement_per_company['Social Engagement'] > 0]['Company (Primary Mention)'] # Filters out Companies with no Social Engagement
1575
- social_engagement_df = df[df['Company (Primary Mention)'].isin(valid_companies)].groupby(['Week Start', 'Company (Primary Mention)'])[
1576
- 'Social Engagement'
1577
- ].sum().reset_index()
1578
- total_social_engagement_df = df.groupby('Week Start')['Social Engagement per Day'].sum().reset_index()
1579
-
1580
- return count_df, total_social_engagement_df, social_engagement_df
2068
+ df["Start Date"] = df["Earliest Published"].astype(str).apply(convert_date)
2069
+ df["End Date"] = df["Latest Published"].astype(str).apply(convert_date)
2070
+ df["Days Duration"] = (
2071
+ df["End Date"] - df["Start Date"]
2072
+ ).dt.days + 1 # Ensure inclusive range
2073
+ df["Count per Day"] = (
2074
+ df["Published Count"] / df["Days Duration"]
2075
+ ) # Calculate count per day
2076
+ df["Social Engagement per Day"] = df["Social Engagement"] / df["Days Duration"]
2077
+ df["Week Start"] = df["Start Date"].apply(
2078
+ lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None,
2079
+ )
2080
+ count_df = df.groupby("Week Start")["Count per Day"].sum().reset_index()
2081
+ total_engagement_per_company = (
2082
+ df.groupby("Company (Primary Mention)")["Social Engagement"]
2083
+ .sum()
2084
+ .reset_index()
2085
+ ) # Calculates Social Engagement across whole period
2086
+ valid_companies = total_engagement_per_company[
2087
+ total_engagement_per_company["Social Engagement"] > 0
2088
+ ][
2089
+ "Company (Primary Mention)"
2090
+ ] # Filters out Companies with no Social Engagement
2091
+ social_engagement_df = (
2092
+ df[df["Company (Primary Mention)"].isin(valid_companies)]
2093
+ .groupby(["Week Start", "Company (Primary Mention)"])["Social Engagement"]
2094
+ .sum()
2095
+ .reset_index()
2096
+ )
2097
+ total_social_engagement_df = (
2098
+ df.groupby("Week Start")["Social Engagement per Day"].sum().reset_index()
2099
+ )
2100
+
2101
+ return count_df, total_social_engagement_df, social_engagement_df
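An illustrative call; quid_export stands for a Quid export with the columns used above ('Earliest Published', 'Latest Published', 'Published Count', 'Social Engagement', 'Company (Primary Mention)'):

    dp = dataprocessing()
    counts, total_eng, eng_by_co = dp.quid_pr(quid_export)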