imsciences 0.9.6.9__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -2
- imsciences/geo.py +173 -115
- imsciences/mmm.py +930 -409
- imsciences/pull.py +1952 -1154
- imsciences/unittesting.py +729 -478
- imsciences/vis.py +669 -126
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/METADATA +1 -1
- imsciences-1.0.1.dist-info/RECORD +12 -0
- imsciences-0.9.6.9.dist-info/RECORD +0 -12
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/WHEEL +0 -0
- {imsciences-0.9.6.9.dist-info → imsciences-1.0.1.dist-info}/top_level.txt +0 -0
imsciences/mmm.py
CHANGED
@@ -1,93 +1,152 @@
-import pandas as pd
 import calendar
+import json
 import os
-import numpy as np
-import re
-from datetime import datetime, timedelta
 import subprocess
-import
-
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
 import xgboost as xgb
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+

 class dataprocessing:
-
 def help(self):
-
 print("\n1. get_wd_levels")
-print(
+print(
+" - Description: Get the working directory with the option of moving up parents.",
+)
 print(" - Usage: get_wd_levels(levels)")
 print(" - Example: get_wd_levels(0)")

 print("\n2. aggregate_daily_to_wc_long")
-print(
-
-
+print(
+" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.",
+)
+print(
+" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')",
+)
+print(
+" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')",
+)

 print("\n3. convert_monthly_to_daily")
-print(
+print(
+" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.",
+)
 print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
 print(" - Example: convert_monthly_to_daily(df, 'date')")

 print("\n4. week_of_year_mapping")
-print(
+print(
+" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.",
+)
 print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
 print(" - Example: week_of_year_mapping(df, 'week', 'mon')")

 print("\n5. rename_cols")
-print(
+print(
+" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.",
+)
 print(" - Usage: rename_cols(df, name='ame_')")
 print(" - Example: rename_cols(df, 'ame_facebook')")

 print("\n6. merge_new_and_old")
-print(
-
-
+print(
+" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.",
+)
+print(
+" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')",
+)
+print(
+" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')",
+)

 print("\n7. merge_dataframes_on_column")
 print(" - Description: Merge a list of DataFrames on a common column.")
-print(
-
+print(
+" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')",
+)
+print(
+" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')",
+)

 print("\n8. merge_and_update_dfs")
-print(
+print(
+" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.",
+)
 print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
-print(
+print(
+" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')",
+)

 print("\n9. convert_us_to_uk_dates")
-print(
+print(
+" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.",
+)
 print(" - Usage: convert_us_to_uk_dates(df, date_col)")
 print(" - Example: convert_us_to_uk_dates(df, 'date')")

 print("\n10. combine_sheets")
-print(
+print(
+" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.",
+)
 print(" - Usage: combine_sheets(all_sheets)")
 print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")

 print("\n11. pivot_table")
-print(
-
-
+print(
+" - Description: Dynamically pivots a DataFrame based on specified columns.",
+)
+print(
+" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')",
+)
+print(
+" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)",
+)

 print("\n12. apply_lookup_table_for_columns")
-print(
-
-
+print(
+" - Description: Maps substrings in columns to new values based on a dictionary.",
+)
+print(
+" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')",
+)
+print(
+" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')",
+)

 print("\n13. aggregate_daily_to_wc_wide")
-print(
-
-
+print(
+" - Description: Aggregates daily data into weekly data and pivots it to wide format.",
+)
+print(
+" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)",
+)
+print(
+" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)",
+)

 print("\n14. merge_cols_with_seperator")
-print(
-
-
+print(
+" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.",
+)
+print(
+" - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')",
+)
+print(
+" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')",
+)

 print("\n15. check_sum_of_df_cols_are_equal")
-print(
+print(
+" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.",
+)
 print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
-print(
+print(
+" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')",
+)

 print("\n16. convert_2_df_cols_to_dict")
 print(" - Description: Creates a dictionary from two DataFrame columns.")
@@ -95,128 +154,229 @@ class dataprocessing:
 print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")

 print("\n17. create_FY_and_H_columns")
-print(
-
-
+print(
+" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.",
+)
+print(
+" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')",
+)
+print(
+" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')",
+)

 print("\n18. keyword_lookup_replacement")
-print(
-
-
+print(
+" - Description: Updates values in a column based on a lookup dictionary with conditional logic.",
+)
+print(
+" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')",
+)
+print(
+" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')",
+)

 print("\n19. create_new_version_of_col_using_LUT")
-print(
-
-
+print(
+" - Description: Creates a new column based on a lookup table applied to an existing column.",
+)
+print(
+" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')",
+)
+print(
+" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)",
+)

 print("\n20. convert_df_wide_2_long")
-print(
-
-
+print(
+" - Description: Converts a wide-format DataFrame into a long-format DataFrame.",
+)
+print(
+" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')",
+)
+print(
+" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')",
+)

 print("\n21. manually_edit_data")
-print(
-
-
+print(
+" - Description: Manually updates specified cells in a DataFrame based on filters.",
+)
+print(
+" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)",
+)
+print(
+" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')",
+)

 print("\n22. format_numbers_with_commas")
-print(
+print(
+" - Description: Formats numerical columns with commas and a specified number of decimal places.",
+)
 print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
 print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")

 print("\n23. filter_df_on_multiple_conditions")
-print(
+print(
+" - Description: Filters a DataFrame based on multiple column conditions.",
+)
 print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
-print(
+print(
+" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})",
+)

 print("\n24. read_and_concatenate_files")
-print(
+print(
+" - Description: Reads and concatenates files from a specified folder into a single DataFrame.",
+)
 print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
-print(
+print(
+" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')",
+)

 print("\n25. upgrade_outdated_packages")
-print(
+print(
+" - Description: Upgrades all outdated Python packages except specified ones.",
+)
 print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
-print(
+print(
+" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])",
+)

 print("\n26. convert_mixed_formats_dates")
-print(
+print(
+" - Description: Converts mixed-format date columns into standardized datetime format.",
+)
 print(" - Usage: convert_mixed_formats_dates(df, column_name)")
 print(" - Example: convert_mixed_formats_dates(df, 'date_col')")

 print("\n27. fill_weekly_date_range")
-print(
+print(
+" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.",
+)
 print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
 print(" - Example: fill_weekly_date_range(df, 'date_col')")

 print("\n28. add_prefix_and_suffix")
-print(
-
-
+print(
+" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.",
+)
+print(
+" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)",
+)
+print(
+" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')",
+)

 print("\n29. create_dummies")
-print(
-
-
+print(
+" - Description: Creates dummy variables for columns, with an option to add a total dummy column.",
+)
+print(
+" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')",
+)
+print(
+" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)",
+)

 print("\n30. replace_substrings")
-print(
-
-
+print(
+" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.",
+)
+print(
+" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)",
+)
+print(
+" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')",
+)

 print("\n31. add_total_column")
-print(
-
+print(
+" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.",
+)
+print(
+" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')",
+)
 print(" - Example: add_total_column(df, exclude_col='date_col')")

 print("\n32. apply_lookup_table_based_on_substring")
-print(
-
-
+print(
+" - Description: Categorizes text in a column using a lookup table based on substrings.",
+)
+print(
+" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')",
+)
+print(
+" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})",
+)

 print("\n33. compare_overlap")
-print(
+print(
+" - Description: Compares overlapping periods between two DataFrames and summarizes differences.",
+)
 print(" - Usage: compare_overlap(df1, df2, date_col)")
 print(" - Example: compare_overlap(df1, df2, 'date_col')")

 print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
-print(
-
-
-
+print(
+" - Description: Maps dates to the start of the current ISO week based on a specified weekday.",
+)
+print(
+" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')",
+)
+print(
+" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')",
+)
+
 print("\n35. seasonality_feature_extraction")
-print(
-
-
+print(
+" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.",
+)
+print(
+" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)",
+)
+print(
+" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)",
+)

 def get_wd_levels(self, levels):
 """
 Gets the current wd of whoever is working on it and gives the options to move the number of levels up.

-Parameters
+Parameters
+----------
 - data_frame: pandas DataFrame
 The input data frame.
 - num_rows_to_remove: int
 The number of levels to move up pathways.

-Returns
+Returns
+-------
 - Current wd
-"""

+"""
 directory = os.getcwd()
 for _ in range(levels):
 directory = os.path.dirname(directory)
 return directory
-
-def aggregate_daily_to_wc_long(
+
+def aggregate_daily_to_wc_long(
+self,
+df: pd.DataFrame,
+date_column: str,
+group_columns: list[str],
+sum_columns: list[str],
+wc: str = "sun",
+aggregation: str = "sum",
+) -> pd.DataFrame:
 """
-Aggregates daily data into weekly data, starting on a specified day of the week,
-and groups the data by additional specified columns. It aggregates specified numeric columns
-by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+Aggregates daily data into weekly data, starting on a specified day of the week,
+and groups the data by additional specified columns. It aggregates specified numeric columns
+by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
 The day column is renamed from 'Day' to 'OBS'.

-Parameters
+Parameters
+----------
 - df: pandas DataFrame
 The input DataFrame containing daily data.
 - date_column: string
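The help() text above documents the call signatures for the dataprocessing helpers. A minimal usage sketch, assembled only from the printed Usage/Example lines and not taken from the package itself (the instance name ims_proc and the daily-level DataFrame df are assumptions):

from imsciences.mmm import dataprocessing

ims_proc = dataprocessing()  # assumed instance name

# Weekly aggregation with Monday-commencing weeks, averaging the metrics
weekly = ims_proc.aggregate_daily_to_wc_long(
    df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average'
)

# Fill any missing week-commencing dates in the result
weekly = ims_proc.fill_weekly_date_range(weekly, 'OBS', freq='W-MON')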
@@ -230,18 +390,21 @@
 - aggregation: string, optional (default 'sum')
 Aggregation method, either 'sum', 'average', or 'count'.

-Returns
+Returns
+-------
 - pandas DataFrame
 A new DataFrame with weekly aggregated data. The index is reset,
-and columns represent the grouped and aggregated metrics. The DataFrame
-is in long format, with separate columns for each combination of
+and columns represent the grouped and aggregated metrics. The DataFrame
+is in long format, with separate columns for each combination of
 grouped metrics.
-"""

+"""
 # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
-days = {
+days = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
 if wc.lower() not in days:
-return print(
+return print(
+f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).",
+)

 start_day = days[wc.lower()]

@@ -252,26 +415,40 @@
 df_copy[date_column] = pd.to_datetime(df_copy[date_column])

 # Determine the start of each week
-df_copy[
+df_copy["week_start"] = df_copy[date_column].apply(
+lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7),
+)

 # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
 for col in sum_columns:
-df_copy[col] = pd.to_numeric(df_copy[col], errors=
+df_copy[col] = pd.to_numeric(df_copy[col], errors="coerce").fillna(0)

 # Group by the new week start column and additional columns, then aggregate the numeric columns
-if aggregation ==
-grouped =
-
-
+if aggregation == "average":
+grouped = (
+df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+.mean()
+.reset_index()
+)
+elif aggregation == "count":
+grouped = (
+df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+.count()
+.reset_index()
+)
 else: # Default to 'sum' if any other value is provided
-grouped =
+grouped = (
+df_copy.groupby(["week_start"] + group_columns)[sum_columns]
+.sum()
+.reset_index()
+)

 # Rename 'week_start' column to 'OBS'
-grouped = grouped.rename(columns={
+grouped = grouped.rename(columns={"week_start": "OBS"})

 return grouped
-
-def convert_monthly_to_daily(self, df, date_column, divide
+
+def convert_monthly_to_daily(self, df, date_column, divide=True):
 """
 Convert a DataFrame with monthly data to daily data.
 This function takes a DataFrame and a date column, then it expands each
@@ -282,7 +459,6 @@
 :param divide: boolean divide by the number of days in a month (default True)
 :return: A new DataFrame with daily data.
 """
-
 # Convert date_column to datetime
 df[date_column] = pd.to_datetime(df[date_column])

@@ -292,7 +468,10 @@
 # Iterate over each row in the DataFrame
 for _, row in df.iterrows():
 # Calculate the number of days in the month
-num_days = calendar.monthrange(
+num_days = calendar.monthrange(
+row[date_column].year,
+row[date_column].month,
+)[1]

 # Create a new record for each day of the month
 for day in range(1, num_days + 1):
@@ -304,32 +483,41 @@
 if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
 if divide is True:
 daily_row[col] = row[col] / num_days
-else:
+else:
 daily_row[col] = row[col]
 daily_records.append(daily_row)

 # Convert the list of daily records into a DataFrame
 daily_df = pd.DataFrame(daily_records)
-
+
 return daily_df
-
-def week_of_year_mapping(self,df, week_col, start_day_str):

+def week_of_year_mapping(self, df, week_col, start_day_str):
 # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
 day_mapping = {
-
+"mon": 1,
+"tue": 2,
+"wed": 3,
+"thu": 4,
+"fri": 5,
+"sat": 6,
+"sun": 7,
 }

 # Convert the day string to a number, or raise an error if not valid
 start_day = day_mapping.get(start_day_str.lower())
 if start_day is None:
-raise ValueError(
+raise ValueError(
+f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.",
+)

 # Function to convert week number to start date of the week
 def week_to_startdate(week_str, start_day):
-year, week = map(int, week_str.split(
+year, week = map(int, week_str.split("-W"))
 first_day_of_year = datetime(year, 1, 1)
-first_weekday_of_year =
+first_weekday_of_year = (
+first_day_of_year.weekday()
+) # Monday is 0 and Sunday is 6

 # Calculate days to adjust to the desired start day of the week
 days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
@@ -340,25 +528,38 @@
 return start_of_week

 # Apply the function to each row in the specified week column
-df[
+df["OBS"] = (
+df[week_col]
+.apply(lambda x: week_to_startdate(x, start_day))
+.dt.strftime("%d/%m/%Y")
+)
 return df
-
-def rename_cols(self, df, name
+
+def rename_cols(self, df, name="ame_"):
 new_columns = {}
 for col in df.columns:
-if col !=
+if col != "OBS":
 new_col_name = name + col.replace(" ", "_").lower()
 else:
 new_col_name = col
 new_columns[col] = new_col_name
 return df.rename(columns=new_columns)
-
-def merge_new_and_old(
+
+def merge_new_and_old(
+self,
+old_df,
+old_col,
+new_df,
+new_col,
+cutoff_date,
+date_col_name="OBS",
+):
 """
 Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
 Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.

-Parameters
+Parameters
+----------
 - old_df: pandas DataFrame
 The old DataFrame from which to take the numeric values up to the specified date.
 - old_col: str
@@ -372,11 +573,12 @@
 - date_col_name: str, optional (default 'OBS')
 The name of the date column in both DataFrames.

-Returns
+Returns
+-------
 - pandas DataFrame
 A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
-"""

+"""
 # Convert date columns in both dataframes to datetime for comparison
 old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
 new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
@@ -389,67 +591,93 @@
 new_values = new_df[new_df[date_col_name] > cutoff_date]

 # Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
-merged_df = pd.DataFrame(
-
-
-
+merged_df = pd.DataFrame(
+{
+"OBS": pd.concat(
+[old_values[date_col_name], new_values[date_col_name]],
+ignore_index=True,
+),
+new_col: pd.concat(
+[old_values[old_col], new_values[new_col]],
+ignore_index=True,
+),
+},
+)

 return merged_df
-
-def merge_dataframes_on_column(
+
+def merge_dataframes_on_column(
+self,
+dataframes,
+common_column="OBS",
+merge_how="outer",
+):
 """
 Merge a list of DataFrames on a common column.

-Parameters
+Parameters
+----------
 - dataframes: A list of DataFrames to merge.
 - common_column: The name of the common column to merge on.
 - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').

-Returns
+Returns
+-------
 - A merged DataFrame.
+
 """
 if not dataframes:
 return None
-
+
 merged_df = dataframes[0] # Start with the first DataFrame

 for df in dataframes[1:]:
 merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)

 # Check if the common column is of datetime dtype
-if merged_df[common_column].dtype ==
+if merged_df[common_column].dtype == "datetime64[ns]":
 merged_df[common_column] = pd.to_datetime(merged_df[common_column])
 merged_df = merged_df.sort_values(by=common_column)
 merged_df = merged_df.fillna(0)
-
+
 return merged_df
-
+
 def merge_and_update_dfs(self, df1, df2, key_column):
 """
 Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
 and returns a dataframe sorted by the key column.

-Parameters
+Parameters
+----------
 df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
 df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
 key_column (str): The name of the column to merge and sort by (e.g., 'OBS').

-Returns
+Returns
+-------
 DataFrame: The merged and updated dataframe.
-"""

+"""
 # Sort both DataFrames by the key column
 df1_sorted = df1.sort_values(by=key_column)
 df2_sorted = df2.sort_values(by=key_column)

 # Perform the full outer merge
-merged_df = pd.merge(
+merged_df = pd.merge(
+df1_sorted,
+df2_sorted,
+on=key_column,
+how="outer",
+suffixes=("", "_finalised"),
+)

 # Update with non-null values from df2
 for column in merged_df.columns:
-if column.endswith(
-original_column = column.replace(
-merged_df.loc[merged_df[column].notnull(), original_column] =
+if column.endswith("_finalised"):
+original_column = column.replace("_finalised", "")
+merged_df.loc[merged_df[column].notnull(), original_column] = (
+merged_df.loc[merged_df[column].notnull(), column]
+)
 merged_df.drop(column, axis=1, inplace=True)

 # Sort the merged DataFrame by the key column
@@ -459,25 +687,30 @@
 merged_df.fillna(0, inplace=True)

 return merged_df
-
+
 def convert_us_to_uk_dates(self, df, date_col):
 """
-Processes the date column of a DataFrame to remove hyphens and slashes,
+Processes the date column of a DataFrame to remove hyphens and slashes,
 and converts it to a datetime object.
-
-Parameters
+
+Parameters
+----------
 df (pd.DataFrame): The DataFrame containing the date column.
 date_col (str): The name of the date column.
-
-Returns
+
+Returns
+-------
 pd.DataFrame: The DataFrame with the processed date column.
+
 """
-df[date_col] = df[date_col].str.replace(r
+df[date_col] = df[date_col].str.replace(r"[-/]", "", regex=True)
 df[date_col] = pd.to_datetime(
-df[date_col].str.slice(0, 2)
-
-df[date_col].str.slice(
-
+df[date_col].str.slice(0, 2)
++ "/"
++ df[date_col].str.slice(2, 4)
++ "/"
++ df[date_col].str.slice(4, 8),
+format="%m/%d/%Y",
 )
 return df

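A minimal standalone sketch of the date-normalisation approach shown in the hunk above (strip hyphen/slash separators, re-assemble the digits as month/day/year, then parse). The sample column and values are invented for illustration only:

import pandas as pd

df = pd.DataFrame({"date": ["03/25/2024", "04-01-2024"]})  # assumed US-style inputs
s = df["date"].str.replace(r"[-/]", "", regex=True)
df["date"] = pd.to_datetime(
    s.str.slice(0, 2) + "/" + s.str.slice(2, 4) + "/" + s.str.slice(4, 8),
    format="%m/%d/%Y",
)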
@@ -486,21 +719,40 @@
 Combines multiple DataFrames from a dictionary into a single DataFrame.
 Adds a column 'SheetName' indicating the origin sheet of each row.

-Parameters
+Parameters
+----------
 all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.

-Returns
+Returns
+-------
 DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
+
 """
 combined_df = pd.DataFrame()

 for sheet_name, df in all_sheets.items():
-df[
+df["SheetName"] = sheet_name
 combined_df = pd.concat([combined_df, df], ignore_index=True)

 return combined_df
-
-def pivot_table(
+
+def pivot_table(
+self,
+df,
+index_col,
+columns,
+values_col,
+filters_dict=None,
+fill_value=0,
+aggfunc="sum",
+margins=False,
+margins_name="Total",
+datetime_trans_needed=True,
+date_format="%Y-%m-%d",
+reverse_header_order=False,
+fill_missing_weekly_dates=True,
+week_commencing="W-MON",
+):
 """
 Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns

@@ -521,6 +773,7 @@

 Returns:
 pandas.DataFrame: The pivot table specified
+
 """
 # Validate inputs
 if index_col not in df.columns:
@@ -544,7 +797,10 @@

 # Ensure index column is in datetime format if needed
 if datetime_trans_needed:
-df_filtered[index_col] = pd.to_datetime(
+df_filtered[index_col] = pd.to_datetime(
+df_filtered[index_col],
+dayfirst=True,
+)

 # Create the pivot table
 pivoted_df = df_filtered.pivot_table(
@@ -559,7 +815,9 @@
 # Handle column headers
 if isinstance(pivoted_df.columns, pd.MultiIndex):
 pivoted_df.columns = [
-"_".join(
+"_".join(
+reversed(map(str, col)) if reverse_header_order else map(str, col),
+)
 for col in pivoted_df.columns.values
 ]
 else:
@@ -570,7 +828,10 @@

 # Handle sorting and formatting of index column
 if datetime_trans_needed:
-pivoted_df[index_col] = pd.to_datetime(
+pivoted_df[index_col] = pd.to_datetime(
+pivoted_df[index_col],
+errors="coerce",
+)
 pivoted_df.sort_values(by=index_col, inplace=True)
 pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)

@@ -579,56 +840,75 @@

 # Fill missing weekly dates if specified
 if fill_missing_weekly_dates:
-pivoted_df = self.fill_weekly_date_range(
+pivoted_df = self.fill_weekly_date_range(
+pivoted_df,
+index_col,
+freq=week_commencing,
+)

 return pivoted_df

-def apply_lookup_table_for_columns(
+def apply_lookup_table_for_columns(
+df,
+col_names,
+to_find_dict,
+if_not_in_dict="Other",
+new_column_name="Mapping",
+):
 """
-Creates a new DataFrame column based on a look up table,
+Creates a new DataFrame column based on a look up table, using exact matches.

-Parameters
+Parameters
+----------
 df (pandas.DataFrame): The DataFrame containing the data.
-col_names (list of str):
-to_find_dict (dict):
-if_not_in_dict (str, optional):
-new_column_name (str, optional):
+col_names (list of str): List of column names to use for lookup. If more than one, values are merged with '|'.
+to_find_dict (dict): Lookup dictionary with exact keys to match.
+if_not_in_dict (str, optional): Value used if no match is found. Defaults to "Other".
+new_column_name (str, optional): Name of new output column. Defaults to "Mapping".

-Returns
-
-
+Returns
+-------
+pandas.DataFrame: DataFrame with a new column containing lookup results.

-
-regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())
-
+"""
 # Preprocess DataFrame if multiple columns
 if len(col_names) > 1:
-df["Merged"] = df[col_names].astype(str).
+df["Merged"] = df[col_names].astype(str).agg("|".join, axis=1)
 col_to_use = "Merged"
 else:
 col_to_use = col_names[0]

-#
-
-
-
-
-
+# Normalize case for matching
+lookup = {k.lower(): v for k, v in to_find_dict.items()}
+df[new_column_name] = (
+df[col_to_use].str.lower().map(lookup).fillna(if_not_in_dict)
+)
+
 # Drop intermediate column if created
 if len(col_names) > 1:
 df.drop(columns=["Merged"], inplace=True)

 return df

-def aggregate_daily_to_wc_wide(
+def aggregate_daily_to_wc_wide(
+self,
+df: pd.DataFrame,
+date_column: str,
+group_columns: list[str],
+sum_columns: list[str],
+wc: str = "sun",
+aggregation: str = "sum",
+include_totals: bool = False,
+) -> pd.DataFrame:
 """
-Aggregates daily data into weekly data, starting on a specified day of the week,
-and groups the data by additional specified columns. It aggregates specified numeric columns
-by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+Aggregates daily data into weekly data, starting on a specified day of the week,
+and groups the data by additional specified columns. It aggregates specified numeric columns
+by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
 The day column is renamed from 'Day' to 'OBS'.

-Parameters
+Parameters
+----------
 - df: pandas DataFrame
 The input DataFrame containing daily data.
 - date_column: string
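The reworked apply_lookup_table_for_columns above replaces the old word-boundary regex matching with a case-insensitive exact lookup. A small self-contained sketch of that mapping pattern; the sample column and dictionary are invented:

import pandas as pd

df = pd.DataFrame({"Metric": ["Spend", "Impressions", "Clicks"]})
to_find_dict = {"spend": "spd", "impressions": "imp"}

# Lowercase both sides, map exact keys, and fall back to a default label
lookup = {k.lower(): v for k, v in to_find_dict.items()}
df["Mapping"] = df["Metric"].str.lower().map(lookup).fillna("Other")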
@@ -644,26 +924,36 @@
 - include_totals: boolean, optional (default False)
 If True, include total columns for each sum_column.

-Returns
+Returns
+-------
 - pandas DataFrame
 A new DataFrame with weekly aggregated data. The index is reset,
-and columns represent the grouped and aggregated metrics. The DataFrame
-is in wide format, with separate columns for each combination of
+and columns represent the grouped and aggregated metrics. The DataFrame
+is in wide format, with separate columns for each combination of
 grouped metrics.
+
 """
-
-
-
+grouped = self.aggregate_daily_to_wc_long(
+df,
+date_column,
+group_columns,
+sum_columns,
+wc,
+aggregation,
+)
+
 # Pivot the data to wide format
 if group_columns:
-wide_df = grouped.pivot_table(
-
-
-
+wide_df = grouped.pivot_table(
+index="OBS",
+columns=group_columns,
+values=sum_columns,
+aggfunc="first",
+)
 # Flatten the multi-level column index and create combined column names
-wide_df.columns = [
+wide_df.columns = ["_".join(col).strip() for col in wide_df.columns.values]
 else:
-wide_df = grouped.set_index(
+wide_df = grouped.set_index("OBS")

 # Fill NaN values with 0
 wide_df = wide_df.fillna(0)
@@ -671,9 +961,11 @@
 # Adding total columns for each unique sum_column, if include_totals is True
 if include_totals:
 for col in sum_columns:
-total_column_name = f
+total_column_name = f"Total {col}"
 if group_columns:
-columns_to_sum = [
+columns_to_sum = [
+column for column in wide_df.columns if col in column
+]
 else:
 columns_to_sum = [col]
 wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
@@ -683,11 +975,20 @@

 return wide_df

-def merge_cols_with_seperator(
+def merge_cols_with_seperator(
+self,
+df,
+col_names,
+seperator="_",
+output_column_name="Merged",
+starting_prefix_str=None,
+ending_prefix_str=None,
+):
 """
 Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up

-Parameters
+Parameters
+----------
 df (pandas.DataFrame): Dataframe to make changes to.
 col_names (list): list of columm names ot merge.
 seperator (str, optional): Name of column outputted. Defaults to "_".
@@ -695,76 +996,99 @@
 starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
 ending_prefix_str (str, optional): string of optional text to be added after the merged column str value

-Raises
+Raises
+------
 ValueError: if more less than two column names are inputted in the list there is nothing to merge on

-Returns
+Returns
+-------
 pandas.DataFrame: DataFrame with additional merged column
+
 """
 # Specify more than one column must be entered
 if len(col_names) < 2:
 raise ValueError("2 or more columns must be specified to merge")
-
+
 # Create a new column with the merged columns
 df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)

-# Add string before
+# Add string before
 if starting_prefix_str is not None:
-df[output_column_name] = starting_prefix_str + df[
-
+df[output_column_name] = starting_prefix_str + df[
+output_column_name
+].astype(str)
+
 # Add string after
 if ending_prefix_str is not None:
-df[output_column_name] =
-
+df[output_column_name] = (
+df[output_column_name].astype(str) + ending_prefix_str
+)
+
 return df

-def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
+def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
 """
 Checks the sum of two different dataframe column or columns are equal

-Parameters
+Parameters
+----------
 df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
 df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
 cols_1 (list of str): Columns from first dataframe to sum.
 cols_2 (list of str): Columns from second dataframe to sum.

-Returns
+Returns
+-------
 Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
+
 """
 # Find the sum of both sets of columns
 df_1_sum = df_1[cols_1].sum().sum()
 df_2_sum = df_2[cols_2].sum().sum()
-
-# If the the two columns are
+
+# If the the two columns are
 if df_1_sum == df_2_sum:
 Answer = "They are equal"
 if df_1_sum != df_2_sum:
-Answer = "They are different by " + str(df_2_sum-df_1_sum)
-
-return Answer,df_1_sum,df_2_sum
-
+Answer = "They are different by " + str(df_2_sum - df_1_sum)
+
+return Answer, df_1_sum, df_2_sum
+
 def convert_2_df_cols_to_dict(self, df, key_col, value_col):
 """
 Create a dictionary mapping from two columns of a DataFrame.

-Parameters
+Parameters
+----------
 df (pd.DataFrame): The DataFrame containing the data.
 key_col (str): The column name to use as keys in the dictionary.
 value_col (str): The column name to use as values in the dictionary.

-Returns
+Returns
+-------
 dict: A dictionary with keys from 'key_col' and values from 'value_col'.
+
 """
 if key_col not in df or value_col not in df:
 raise ValueError("Specified columns are not in the DataFrame")

 return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
-
-def create_FY_and_H_columns(
+
+def create_FY_and_H_columns(
+self,
+df,
+index_col,
+start_date,
+starting_FY,
+short_format="No",
+half_years="No",
+combined_FY_and_H="No",
+):
 """
-Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
+Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year

-Parameters
+Parameters
+----------
 df (pandas.DataFrame): Dataframe to operate on.
 index_col (str): Name of the column to use for datetime
 start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
@@ -773,16 +1097,17 @@
 half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
 combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".

-Returns
+Returns
+-------
 pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
+
 """
-
 try:
-start_date = datetime.strptime(start_date,
+start_date = datetime.strptime(start_date, "%Y-%m-%d")
 except ValueError:
 print("Error: Date must be of format yyyy-mm-dd")
 return df
-
+
 df["OBS"] = pd.to_datetime(df[index_col])
 df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")

@@ -792,35 +1117,51 @@

 def calculate_FY_vectorized(date_series):
 years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
-fy =
+fy = "FY" + (start_year + years_since_start).astype(str)
 if short_format == "Yes":
-fy =
+fy = "FY" + fy.str[-2:]
 return fy

-df[
+df["FY"] = calculate_FY_vectorized(df[index_col])

 if half_years == "Yes" or combined_FY_and_H == "Yes":
+
 def calculate_half_year_vectorized(date_series):
-fy_years_since_start = (
-
-
-
+fy_years_since_start = (
+(date_series - start_date).dt.days / 364
+).astype(int)
+fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(
+years=1,
+)
+fy_end_of_h1 = (
+fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
+)
+half_year = np.where(date_series <= fy_end_of_h1, "H1", "H2")
 return half_year
-
-df[
-
+
+df["Half Years"] = calculate_half_year_vectorized(df[index_col])
+
 if combined_FY_and_H == "Yes":
-df[
+df["Financial Half Years"] = df["FY"] + " " + df["Half Years"]

 return df
-
-def keyword_lookup_replacement(
+
+def keyword_lookup_replacement(
+self,
+df,
+col,
+replacement_rows,
+cols_to_merge,
+replacement_lookup_dict,
+output_column_name="Updated Column",
+):
 """
 This function updates values in a specified column of the DataFrame based on a lookup dictionary.
 It first merges several columns into a new 'Merged' column, then uses this merged column to determine
 if replacements are needed based on the dictionary.

-Parameters
+Parameters
+----------
 df (pd.DataFrame): The DataFrame to process.
 col (str): The name of the column whose values are potentially replaced.
 replacement_rows (str): The specific value in 'col' to check for replacements.
@@ -828,65 +1169,102 @@ class dataprocessing:
|
|
|
828
1169
|
replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
|
|
829
1170
|
output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
|
|
830
1171
|
|
|
831
|
-
Returns
|
|
1172
|
+
Returns
|
|
1173
|
+
-------
|
|
832
1174
|
pd.DataFrame: The modified DataFrame with updated values in the specified column.
|
|
1175
|
+
|
|
833
1176
|
"""
|
|
834
1177
|
# Create a merged column from specified columns
|
|
835
|
-
df["Merged"] = df[cols_to_merge].apply(
|
|
836
|
-
|
|
1178
|
+
df["Merged"] = df[cols_to_merge].apply(
|
|
1179
|
+
lambda row: "|".join(row.values.astype(str)),
|
|
1180
|
+
axis=1,
|
|
1181
|
+
)
|
|
1182
|
+
|
|
837
1183
|
# Replace values in the specified column based on the lookup
|
|
838
1184
|
def replace_values(x):
|
|
839
1185
|
if x[col] == replacement_rows:
|
|
840
|
-
merged_value = x[
|
|
1186
|
+
merged_value = x["Merged"]
|
|
841
1187
|
if merged_value in replacement_lookup_dict:
|
|
842
1188
|
return replacement_lookup_dict[merged_value]
|
|
843
1189
|
return x[col]
|
|
844
|
-
|
|
1190
|
+
|
|
845
1191
|
# Apply replacement logic
|
|
846
1192
|
df[output_column_name] = df.apply(replace_values, axis=1)
|
|
847
|
-
|
|
1193
|
+
|
|
848
1194
|
# Drop the intermediate 'Merged' column
|
|
849
|
-
df.drop(columns=[
|
|
850
|
-
|
|
1195
|
+
df.drop(columns=["Merged"], inplace=True)
|
|
1196
|
+
|
|
851
1197
|
return df
|
|
852
1198
|
|
|
-    def create_new_version_of_col_using_LUT(
+    def create_new_version_of_col_using_LUT(
+        self,
+        df,
+        keys_col,
+        value_col,
+        dict_for_specific_changes,
+        new_col_name="New Version of Old Col",
+    ):
         """
-        Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
+        Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
         The lookup is based on a column in the dataframe. Can only input one column and output one new column.

-        Parameters
+        Parameters
+        ----------
         df (pandas.DataFrame): The DataFrame containing the data.
         keys_col (str): The name of the column which the LUT will be refercing to ouput a value.
         value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
         dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
         new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".

-        Returns
+        Returns
+        -------
         pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
+
         """
-
         # Extract columns to change using new dictionary
-        smaller_df = df[[keys_col,value_col]]
+        smaller_df = df[[keys_col, value_col]]

         # Use the new dictionary to create a new LUT
-        smaller_df_with_LUT = self.apply_lookup_table_for_columns(
-
+        smaller_df_with_LUT = self.apply_lookup_table_for_columns(
+            smaller_df,
+            [keys_col, value_col],
+            dict_for_specific_changes,
+        )
+
         # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
-        smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(
+        smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(
+            lambda x: x["Mapping"] if x["Mapping"] != "Other" else x[value_col],
+            axis=1,
+        )

         # Drop the extra unecessary cols
-        smaller_df_with_LUT.drop([keys_col,
-
+        smaller_df_with_LUT.drop([keys_col, "Mapping"], axis=1, inplace=True)
+
         # # Output dataframes as dictionary to be used in a LUT
-        new_dict = self.convert_2_df_cols_to_dict(
+        new_dict = self.convert_2_df_cols_to_dict(
+            smaller_df_with_LUT,
+            value_col,
+            "Updated Col",
+        )

         # # Use new dictionary to create a new version of an old column
-        df_final = self.apply_lookup_table_for_columns(
-
+        df_final = self.apply_lookup_table_for_columns(
+            df,
+            [keys_col],
+            new_dict,
+            "other",
+            new_col_name,
+        )
+
         return df_final
-
-    def convert_df_wide_2_long(
+
+    def convert_df_wide_2_long(
+        self,
+        df,
+        value_cols,
+        variable_col_name="Stacked",
+        value_col_name="Value",
+    ):
         """
         Changes a dataframe from wide to long format.

@@ -901,16 +1279,25 @@ class dataprocessing:

         Raises:
         ValueError: If the number of columns to depivot is less than 2.
+
         """
         # Check length of value_cols is greater than 1
         if len(value_cols) < 2:
             raise ValueError("Number of inputs in list must be greater than 1")

         # Find the columns that are not to be depivoted into one column
-        id_vars = [
+        id_vars = [
+            col for col in df.columns if col not in value_cols
+        ]  # Preserve column order in the DataFrame

         # Melt all columns chosen into one column
-        df_final = pd.melt(
+        df_final = pd.melt(
+            df,
+            id_vars=id_vars,
+            value_vars=value_cols,
+            var_name=variable_col_name,
+            value_name=value_col_name,
+        )

         # Sort column order to match expected output
         ordered_columns = id_vars + [variable_col_name, value_col_name]
@@ -918,7 +1305,19 @@ class dataprocessing:

         return df_final

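A minimal sketch of the reformatted convert_df_wide_2_long call, which wraps pd.melt (not part of the diff; the columns below are invented and the dataprocessing class from this module is assumed to be in scope):

import pandas as pd

ims = dataprocessing()
wide = pd.DataFrame(
    {
        "OBS": ["2024-01-01", "2024-01-08"],
        "tv_spend": [100, 120],
        "radio_spend": [40, 35],
    }
)
# "OBS" is kept as an identifier; the two spend columns are stacked into
# a single "Stacked"/"Value" pair, mirroring pd.melt with id_vars preserved.
long = ims.convert_df_wide_2_long(wide, ["tv_spend", "radio_spend"])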
-    def manually_edit_data(
+    def manually_edit_data(
+        self,
+        df,
+        filters_dict,
+        col_to_change,
+        new_value,
+        change_in_existing_df_col="No",
+        new_col_to_change_name="New",
+        manual_edit_col_name=None,
+        add_notes="No",
+        existing_note_col_name=None,
+        note=None,
+    ):
         """
         Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe

@@ -941,31 +1340,44 @@ class dataprocessing:

         Returns:
         pandas.DataFrame: Dataframe with manual changes added
+
         """
-
         # Raise type error if more than one col is supported
         if isinstance(col_to_change, list):
             raise TypeError("Col to change must be specified as a string, not a list")

         # Raises value error if input is invalid for change_in_existing_df_col
         if change_in_existing_df_col not in ["Yes", "No"]:
-            raise ValueError(
+            raise ValueError(
+                "Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']",
+            )

         # Raises value error if input is invalid for add_notes_col
         if add_notes not in ["Yes", "No"]:
-            raise ValueError(
+            raise ValueError(
+                "Invalid input value for add_notes. Allowed values are: ['Yes', 'No']",
+            )

         # Validate filters_dict format
         for col, cond in filters_dict.items():
             if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
-                raise ValueError(
+                raise ValueError(
+                    f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'",
+                )

         # Create the filtered df by applying the conditions
         df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

         # Create a new column to add the changes if desired, else edit in the current chosen column
-        col_to_update =
-
+        col_to_update = (
+            col_to_change
+            if change_in_existing_df_col == "Yes"
+            else new_col_to_change_name
+        )
+        if (
+            change_in_existing_df_col == "No"
+            and new_col_to_change_name not in df.columns
+        ):
             df = df.copy()
             df[new_col_to_change_name] = df[col_to_change]

@@ -977,19 +1389,19 @@ class dataprocessing:
             if manual_edit_col_name not in df.columns:
                 df[manual_edit_col_name] = 0
             df.loc[df_filtered.index, manual_edit_col_name] = 1
-        elif not manual_edit_col_name and
-            df[
-            df.loc[df_filtered.index,
+        elif not manual_edit_col_name and "Manual Changes" not in df.columns:
+            df["Manual Changes"] = 0
+            df.loc[df_filtered.index, "Manual Changes"] = 1

         # Add note if desired in new column or an existing column
         if add_notes == "Yes":
-            note_col = existing_note_col_name if existing_note_col_name else
+            note_col = existing_note_col_name if existing_note_col_name else "Notes"
             if note_col not in df.columns:
                 df[note_col] = None
             df.loc[df_filtered.index, note_col] = note

         return df
-
+
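A hedged usage sketch for manually_edit_data (not part of the diff; column names and values are invented, the dataprocessing class from this module is assumed to be in scope, and the actual cell assignment of new_value happens in lines outside the hunks shown here):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({"market": ["uk", "uk", "de"], "spend": [50, 200, 75]})
# Filter conditions use the "operator value" string format validated above;
# string values are quoted inside the condition, numbers are left bare.
filters = {"market": "== 'uk'", "spend": "> 100"}
edited = ims.manually_edit_data(
    df,
    filters,
    col_to_change="spend",
    new_value=0,
    change_in_existing_df_col="No",
    new_col_to_change_name="spend_adjusted",
    add_notes="Yes",
    note="UK spend above 100 zeroed for a what-if scenario",
)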
-
+
     def format_numbers_with_commas(self, df, decimal_length_chosen=2):
         """
         Converts data in numerical format into numbers with commas and a chosen decimal place length.
@@ -1000,24 +1412,26 @@ class dataprocessing:

         Returns:
         pandas.DataFrame: The DataFrame with the chosen updated format.
+
         """
+
         def format_number_with_commas(x, decimal_length=decimal_length_chosen):
             if pd.isna(x):  # Preserve None/NaN values
                 return pd.NA  # Explicitly normalize to pd.NA
-
+            if isinstance(x, (int, float)):
                 if decimal_length is not None:
                     format_str = f"{{:,.{decimal_length}f}}"
                     return format_str.format(x)
-
-
-            else:
-                return x  # Return unchanged if not a number
+                return f"{x:,}"
+            return x  # Return unchanged if not a number

         # Apply formatting column by column
-        formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
+        formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
+            value=pd.NA,
+        )

         return formatted_df
-
+
     def filter_df_on_multiple_conditions(self, df, filters_dict):
         """
         Filter a dataframe based on mulitple conditions
@@ -1028,59 +1442,62 @@ class dataprocessing:

         Returns:
         pandas.DatFrame: Filtered Da
+
         """
         mask = pd.Series(True, index=df.index)
         for col, cond in filters_dict.items():
             cond = cond.strip()
             operator, value = cond.split(maxsplit=1)
-
+
             # If value is a string condition make sure to check if there are new lines
             if "'" in value:
                 value = value.strip().strip("'\"")
             # If not a string e.g. datetime or number condition you need to transform the string into a value
             else:
-                value = eval(value)
+                value = eval(value)

             if operator == "==":
-                temp_mask =
+                temp_mask = df[col] == value
             elif operator == "!=":
-                temp_mask =
+                temp_mask = df[col] != value
             elif operator == ">=":
-                temp_mask =
+                temp_mask = df[col] >= value
             elif operator == "<=":
-                temp_mask =
+                temp_mask = df[col] <= value
             elif operator == ">":
-                temp_mask =
+                temp_mask = df[col] > value
             elif operator == "<":
-                temp_mask =
+                temp_mask = df[col] < value
             mask &= temp_mask

         # Create the filtered df by applying the conditions
         df_filtered = df[mask]
-
+
         return df_filtered
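A minimal sketch of the filter_df_on_multiple_conditions condition grammar shown above (not part of the diff; data and column names are invented, the dataprocessing class from this module is assumed to be in scope):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({"region": ["uk", "uk", "fr"], "clicks": [120, 80, 200]})
# Each condition is a single "operator value" string; quoted values are kept
# as strings, unquoted values are evaluated (numbers, dates, etc.).
filters = {"region": "== 'uk'", "clicks": ">= 100"}
uk_high_clicks = ims.filter_df_on_multiple_conditions(df, filters)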
-
-    def read_and_concatenate_files(self, folder_path, file_type=
+
+    def read_and_concatenate_files(self, folder_path, file_type="csv"):
         """
-        Reads all files of a specified type (CSV or XLSX) from a given folder
+        Reads all files of a specified type (CSV or XLSX) from a given folder
         and concatenates them into a single DataFrame.
-
-        Parameters
+
+        Parameters
+        ----------
         folder_path (str): The path to the folder containing the files.
         file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
-
-        Returns
+
+        Returns
+        -------
         pd.DataFrame: A DataFrame containing the concatenated data from all files.
+
         """
-
         # Initialize an empty list to hold dataframes
         dataframes = []

         # Define file extension based on file_type
-        if file_type ==
-            extension =
-        elif file_type ==
-            extension =
+        if file_type == "csv":
+            extension = ".csv"
+        elif file_type == "xlsx":
+            extension = ".xlsx"
         else:
             raise ValueError("file_type must be either 'csv' or 'xlsx'")

@@ -1090,19 +1507,19 @@ class dataprocessing:
             if filename.endswith(extension):
                 file_path = os.path.join(folder_path, filename)
                 # Read the file into a DataFrame
-                if file_type ==
+                if file_type == "csv":
                     df = pd.read_csv(file_path)
-                elif file_type ==
+                elif file_type == "xlsx":
                     df = pd.read_excel(file_path)
                 # Append the DataFrame to the list
                 dataframes.append(df)

         # Concatenate all DataFrames into a single DataFrame
         combined_df = pd.concat(dataframes, ignore_index=True)
-
+
         return combined_df
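A short usage sketch for read_and_concatenate_files (not part of the diff; the folder path is hypothetical, and the dataprocessing class from this module is assumed to be in scope):

ims = dataprocessing()
# Every *.csv file in the hypothetical folder is read and stacked into one
# DataFrame with a fresh integer index.
combined = ims.read_and_concatenate_files("data/weekly_exports", file_type="csv")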
-
-    def upgrade_outdated_packages(self, exclude_packages=[
+
+    def upgrade_outdated_packages(self, exclude_packages=["twine"]):
         """
         Upgrade all outdated Python packages except those specified in `exclude_packages`.

@@ -1113,32 +1530,49 @@ class dataprocessing:
         try:
             # Get all installed packages
             installed_packages_result = subprocess.run(
-                "pip list --format=json",
+                "pip list --format=json",
+                check=False,
+                shell=True,
+                capture_output=True,
+                text=True,
             )
             installed_packages = json.loads(installed_packages_result.stdout)

             # Get the list of outdated packages
             outdated_packages_result = subprocess.run(
-                "pip list --outdated --format=json",
+                "pip list --outdated --format=json",
+                check=False,
+                shell=True,
+                capture_output=True,
+                text=True,
             )
             outdated_packages = json.loads(outdated_packages_result.stdout)

             # Create a set of outdated package names for quick lookup
-            outdated_package_names = {pkg[
+            outdated_package_names = {pkg["name"] for pkg in outdated_packages}

             # Upgrade only outdated packages, excluding specified packages
             for package in installed_packages:
-                package_name = package[
-                if
+                package_name = package["name"]
+                if (
+                    package_name in outdated_package_names
+                    and package_name not in exclude_packages
+                ):
                     try:
                         print(f"Upgrading package: {package_name}")
                         upgrade_result = subprocess.run(
-                            f"pip install --upgrade {package_name}",
+                            f"pip install --upgrade {package_name}",
+                            check=False,
+                            shell=True,
+                            capture_output=True,
+                            text=True,
                         )
                         if upgrade_result.returncode == 0:
                             print(f"Successfully upgraded {package_name}")
                         else:
-                            print(
+                            print(
+                                f"Failed to upgrade {package_name}: {upgrade_result.stderr}",
+                            )
                     except Exception as e:
                         print(f"An error occurred while upgrading {package_name}: {e}")
                 elif package_name in exclude_packages:
@@ -1150,12 +1584,12 @@ class dataprocessing:

     def convert_mixed_formats_dates(self, df, column_name):
         # Convert initial dates to datetime with coercion to handle errors
-        df[column_name] = pd.to_datetime(df[column_name], errors=
+        df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
         df[column_name] = df[column_name].astype(str)
         corrected_dates = []
-
+
         for date_str in df[column_name]:
-            date_str = date_str.replace(
+            date_str = date_str.replace("-", "").replace("/", "")
             if len(date_str) == 8:
                 year = date_str[:4]
                 month = date_str[4:6]
@@ -1166,39 +1600,45 @@ class dataprocessing:
                 else:
                     corrected_date_str = f"{year}-{month}-{day}"
                 # Convert to datetime
-                corrected_date = pd.to_datetime(corrected_date_str, errors=
+                corrected_date = pd.to_datetime(corrected_date_str, errors="coerce")
             else:
-                corrected_date = pd.to_datetime(date_str, errors=
-
+                corrected_date = pd.to_datetime(date_str, errors="coerce")
+
             corrected_dates.append(corrected_date)
-
+
         # Check length of the corrected_dates list
         if len(corrected_dates) != len(df):
-            raise ValueError(
-
+            raise ValueError(
+                "Length of corrected_dates does not match the original DataFrame",
+            )
+
         # Assign the corrected dates back to the DataFrame
         df[column_name] = corrected_dates
         return df

-    def fill_weekly_date_range(self, df, date_column, freq=
+    def fill_weekly_date_range(self, df, date_column, freq="W-MON"):
         # Ensure the date column is in datetime format
         df[date_column] = pd.to_datetime(df[date_column])
-
+
         # Generate the full date range with the specified frequency
-        full_date_range = pd.date_range(
-
+        full_date_range = pd.date_range(
+            start=df[date_column].min(),
+            end=df[date_column].max(),
+            freq=freq,
+        )
+
         # Create a new dataframe with the full date range
         full_date_df = pd.DataFrame({date_column: full_date_range})
-
+
         # Merge the original dataframe with the new full date range dataframe
-        df_full = full_date_df.merge(df, on=date_column, how=
-
+        df_full = full_date_df.merge(df, on=date_column, how="left")
+
         # Fill missing values with 0
         df_full.fillna(0, inplace=True)
-
+
         return df_full
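A minimal sketch of fill_weekly_date_range (not part of the diff; dates and column names are invented, the dataprocessing class from this module is assumed to be in scope):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-15"], "spend": [100, 80]})
# Missing Mondays between the min and max date (here 2024-01-08) are inserted
# and their metric values filled with 0 ("W-MON" = weekly, anchored on Mondays).
filled = ims.fill_weekly_date_range(df, "OBS", freq="W-MON")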
-
-    def add_prefix_and_suffix(self, df, prefix=
+
+    def add_prefix_and_suffix(self, df, prefix="", suffix="", date_col=None):
         """
         Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.

@@ -1210,19 +1650,28 @@ class dataprocessing:

         Returns:
         pd.DataFrame: The DataFrame with updated column names.
+
         """
-
         # If there is no date column
         if date_col is None:
             # Add prefixes and suffixes to all columns
             df.columns = [prefix + col + suffix for col in df.columns]
         else:
             # Add prefixes and suffixes to all columns except the date column
-            df.columns = [
-
+            df.columns = [
+                prefix + col + suffix if col != date_col else col for col in df.columns
+            ]
+
         return df

-    def create_dummies(
+    def create_dummies(
+        self,
+        df,
+        date_col=None,
+        dummy_threshold=0,
+        add_total_dummy_col="No",
+        total_col_name="total",
+    ):
         """
         Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
         Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
@@ -1236,13 +1685,15 @@ class dataprocessing:

         Returns:
         pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
-        """

+        """
         # If there is no date column
         if date_col is None:
-            df = df.apply(
+            df = df.apply(
+                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
+            )

-            if add_total_dummy_col !=
+            if add_total_dummy_col != "No":
                 # Find max value of rows
                 df[total_col_name] = df.max(axis=1)

@@ -1250,18 +1701,25 @@ class dataprocessing:
         else:
             # Create dummies for all columns except the date column
             df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
-                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
             )

-            if add_total_dummy_col !=
+            if add_total_dummy_col != "No":
                 # Find max value of rows
                 df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)

         return df

-    def replace_substrings(
+    def replace_substrings(
+        self,
+        df,
+        column,
+        replacements,
+        to_lower=False,
+        new_column=None,
+    ):
         """
-        Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
+        Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
         Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.

         Args:
@@ -1273,6 +1731,7 @@ class dataprocessing:

         Returns:
         pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
+
         """
         if new_column is not None:
             # Create a new column for replacements
@@ -1292,7 +1751,7 @@ class dataprocessing:

         return df

-    def add_total_column(self, df, exclude_col=None, total_col_name=
+    def add_total_column(self, df, exclude_col=None, total_col_name="Total"):
         """
         Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.

@@ -1303,17 +1762,27 @@ class dataprocessing:

         Returns:
         pd.DataFrame: The DataFrame with an added total column.
+
         """
         if exclude_col and exclude_col in df.columns:
             # Ensure the column to exclude exists before dropping
-            df[total_col_name] = df.drop(columns=[exclude_col], errors=
+            df[total_col_name] = df.drop(columns=[exclude_col], errors="ignore").sum(
+                axis=1,
+            )
         else:
             # Sum across all columns if no column is specified to exclude
             df[total_col_name] = df.sum(axis=1)
-
+
         return df

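A minimal sketch of create_dummies with a date column excluded (not part of the diff; data is invented, the dataprocessing class from this module is assumed to be in scope):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame(
    {
        "OBS": ["2024-01-01", "2024-01-08"],
        "promo_spend": [0, 250],
        "tv_spend": [500, 0],
    }
)
# Every non-date value above the threshold becomes 1, everything else 0; the
# optional total column flags rows where any channel was active.
dummies = ims.create_dummies(
    df,
    date_col="OBS",
    dummy_threshold=0,
    add_total_dummy_col="Yes",
    total_col_name="any_activity",
)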
-    def apply_lookup_table_based_on_substring(
+    def apply_lookup_table_based_on_substring(
+        self,
+        df,
+        column_name,
+        category_dict,
+        new_col_name="Category",
+        other_label="Other",
+    ):
         """
         Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.

@@ -1326,6 +1795,7 @@ class dataprocessing:

         Returns:
         pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
+
         """

         def categorize_text(text):
@@ -1336,11 +1806,14 @@ class dataprocessing:
             text (str): The text string to categorize.

             Returns:
-            str: The category assigned based on the first matching substring found in the text. If no
+            str: The category assigned based on the first matching substring found in the text. If no
             matching substring is found, returns other_name.
+
             """
             for key, category in category_dict.items():
-                if
+                if (
+                    key.lower() in text.lower()
+                ):  # Check if the substring is in the text (case-insensitive)
                     return category
             return other_label  # Default category if no match is found

@@ -1359,6 +1832,7 @@ class dataprocessing:

         Returns:
         tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
+
         """
         # Ensure date columns are in datetime format
         df1[date_col] = pd.to_datetime(df1[date_col])
@@ -1373,29 +1847,43 @@ class dataprocessing:
         df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]

         # Merge the DataFrames on the date column
-        merged_df = pd.merge(
+        merged_df = pd.merge(
+            df1_overlap,
+            df2_overlap,
+            on=date_col,
+            suffixes=("_df1", "_df2"),
+        )

         # Get common columns, excluding the date column
-        common_cols = [
+        common_cols = [
+            col for col in df1.columns if col != date_col and col in df2.columns
+        ]

         # Create a DataFrame for differences
         diff_df = pd.DataFrame({date_col: merged_df[date_col]})

         total_diff_list = []
         for col in common_cols:
-            diff_col = f
-            diff_df[diff_col] =
+            diff_col = f"diff_{col}"
+            diff_df[diff_col] = (
+                merged_df[f"{col}_df1"] - merged_df[f"{col}_df2"]
+            )  # Corrected subtraction order

             # Sum differences for the column
             total_diff = diff_df[diff_col].sum()
-            total_diff_list.append({
+            total_diff_list.append({"Column": col, "Total Difference": total_diff})

         # Create summary DataFrame
         total_diff_df = pd.DataFrame(total_diff_list)

         return diff_df, total_diff_df

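A minimal sketch of apply_lookup_table_based_on_substring (not part of the diff; campaign names and categories are invented, the dataprocessing class from this module is assumed to be in scope, and the application of categorize_text to the column follows the docstring above):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame(
    {"campaign_name": ["Summer_Search_UK", "Brand_Video_2024", "Unknown_Test"]}
)
# The first dictionary key whose substring appears in the text (case-insensitive)
# wins; anything unmatched falls back to the other_label value.
categories = {"search": "Paid Search", "video": "Online Video"}
df = ims.apply_lookup_table_based_on_substring(
    df, "campaign_name", categories, new_col_name="Channel"
)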
-    def week_commencing_2_week_commencing_conversion_isoweekday(
+    def week_commencing_2_week_commencing_conversion_isoweekday(
+        self,
+        df,
+        date_col,
+        week_commencing="mon",
+    ):
         """
         Convert a DataFrame's date column so that each date is mapped back
         to the 'week_commencing' day of the *current ISO week*.
@@ -1403,7 +1891,7 @@ class dataprocessing:
         Args:
         df (pandas.DataFrame): The DataFrame with date-based data.
         date_col (str): The name of the date column.
-        week_commencing (str): The desired start of the week.
+        week_commencing (str): The desired start of the week.
         ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
         Uses ISO day numbering (Mon=1, ..., Sun=7).

@@ -1411,9 +1899,18 @@ class dataprocessing:
         pandas.DataFrame: Original DataFrame with an extra column
         'week_start_<week_commencing>' containing the
         start-of-week date for each row.
+
         """
         # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
-        iso_day_dict = {
+        iso_day_dict = {
+            "mon": 1,
+            "tue": 2,
+            "wed": 3,
+            "thur": 4,
+            "fri": 5,
+            "sat": 6,
+            "sun": 7,
+        }

         target_day = iso_day_dict[week_commencing]

@@ -1424,15 +1921,23 @@ class dataprocessing:
         # Apply the transformation
         new_col = f"week_start_{week_commencing}"
         df[new_col] = df[date_col].apply(map_to_week_start)
-
+
         return df
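A minimal sketch of the week-commencing conversion (not part of the diff; dates are invented, the dataprocessing class from this module is assumed to be in scope, and the mapping behaviour is as described in the docstring above):

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-03", "2024-01-06"])})
# Both dates fall in the ISO week starting Monday 2024-01-01, so the added
# "week_start_mon" column should hold 2024-01-01 for each row.
df = ims.week_commencing_2_week_commencing_conversion_isoweekday(
    df, "OBS", week_commencing="mon"
)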
-
-    def seasonality_feature_extraction(
+
+    def seasonality_feature_extraction(
+        self,
+        df,
+        kpi_var,
+        n_features=10,
+        test_size=0.1,
+        random_state=42,
+        shuffle=False,
+    ):
         """
         1) Uses the provided dataframe (df), where:
         - df['kpi_total_sales'] is the target (y).
         - df['OBS'] is a date or index column (excluded from features).
-
+
         2) Splits data into train/test using the specified test_size, random_state, and shuffle.
         3) Trains XGBoost and Random Forest on all features.
         4) Extracts the top n_features from each model.
@@ -1462,20 +1967,22 @@ class dataprocessing:
         - "combined_features": merged unique feature list
         - "performance": dictionary of performance metrics
         - "models": dictionary of fitted models
+
         """
         # ---------------------------------------------------------------------
         # 1. Prepare your data (X, y)
         # ---------------------------------------------------------------------
         # Extract target and features
         y = df[kpi_var]
-        X = df.drop(columns=[
+        X = df.drop(columns=["OBS", kpi_var])

         # Split into train/test
         X_train, X_test, y_train, y_test = train_test_split(
-            X,
+            X,
+            y,
             test_size=test_size,
             random_state=random_state,
-            shuffle=shuffle
+            shuffle=shuffle,
         )

         # ---------------------------------------------------------------------
@@ -1488,16 +1995,13 @@ class dataprocessing:
         # (B) Get feature importances
         xgb_importances = xgb_model_full.feature_importances_
         xgb_feat_importance_df = (
-            pd.DataFrame({
-
-                'importance': xgb_importances
-            })
-            .sort_values('importance', ascending=False)
+            pd.DataFrame({"feature": X.columns, "importance": xgb_importances})
+            .sort_values("importance", ascending=False)
             .reset_index(drop=True)
         )

         # (C) Select top N features
-        top_features_xgb = xgb_feat_importance_df[
+        top_features_xgb = xgb_feat_importance_df["feature"].head(n_features).tolist()

         # (D) Subset data to top N features
         X_train_xgb_topN = X_train[top_features_xgb]
@@ -1515,16 +2019,13 @@ class dataprocessing:
         # (B) Get feature importances
         rf_importances = rf_model_full.feature_importances_
         rf_feat_importance_df = (
-            pd.DataFrame({
-
-                'importance': rf_importances
-            })
-            .sort_values('importance', ascending=False)
+            pd.DataFrame({"feature": X.columns, "importance": rf_importances})
+            .sort_values("importance", ascending=False)
             .reset_index(drop=True)
         )

         # (C) Select top N features
-        top_features_rf = rf_feat_importance_df[
+        top_features_rf = rf_feat_importance_df["feature"].head(n_features).tolist()

         # (D) Subset data to top N features
         X_train_rf_topN = X_train[top_features_rf]
@@ -1556,25 +2057,45 @@ class dataprocessing:

         return output

-    def quid_pr
+    def quid_pr(self, df):
         def convert_date(date_str):
             try:
-                return datetime.strptime(date_str,
+                return datetime.strptime(date_str, "%b %d, %Y")
             except ValueError:
                 return None  # Return None if conversion fails
+
         # Apply conversion to create new columns
-        df[
-        df[
-        df[
-
-
-        df[
-
-
-
-
-
-
-
-
-
+        df["Start Date"] = df["Earliest Published"].astype(str).apply(convert_date)
+        df["End Date"] = df["Latest Published"].astype(str).apply(convert_date)
+        df["Days Duration"] = (
+            df["End Date"] - df["Start Date"]
+        ).dt.days + 1  # Ensure inclusive range
+        df["Count per Day"] = (
+            df["Published Count"] / df["Days Duration"]
+        )  # Calculate count per day
+        df["Social Engagement per Day"] = df["Social Engagement"] / df["Days Duration"]
+        df["Week Start"] = df["Start Date"].apply(
+            lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None,
+        )
+        count_df = df.groupby("Week Start")["Count per Day"].sum().reset_index()
+        total_engagement_per_company = (
+            df.groupby("Company (Primary Mention)")["Social Engagement"]
+            .sum()
+            .reset_index()
+        )  # Caluclates Social Engagement across whole period
+        valid_companies = total_engagement_per_company[
+            total_engagement_per_company["Social Engagement"] > 0
+        ][
+            "Company (Primary Mention)"
+        ]  # Filters out Companies with no Social Engagement
+        social_engagement_df = (
+            df[df["Company (Primary Mention)"].isin(valid_companies)]
+            .groupby(["Week Start", "Company (Primary Mention)"])["Social Engagement"]
+            .sum()
+            .reset_index()
+        )
+        total_social_engagement_df = (
+            df.groupby("Week Start")["Social Engagement per Day"].sum().reset_index()
+        )
+
+        return count_df, total_social_engagement_df, social_engagement_df
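A hedged usage sketch for seasonality_feature_extraction (not part of the diff; the synthetic data below is invented, the dataprocessing class from this module is assumed to be in scope, the method requires an "OBS" column as shown above, and the "combined_features" key follows the return description in the docstring):

import numpy as np
import pandas as pd

ims = dataprocessing()
rng = np.random.default_rng(0)
dates = pd.date_range("2022-01-03", periods=104, freq="W-MON")
df = pd.DataFrame(
    {
        "OBS": dates,
        "kpi_total_sales": rng.normal(1000, 50, size=104),
        "seas_jan": (dates.month == 1).astype(int),  # simple seasonal flag
        "trend": np.arange(104),
    }
)
result = ims.seasonality_feature_extraction(df, "kpi_total_sales", n_features=2)
print(result["combined_features"])  # merged top features from XGBoost and Random Forest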