imsciences 0.9.7.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +2 -2
- imsciences/geo.py +173 -115
- imsciences/mmm.py +921 -395
- imsciences/pull.py +1952 -1154
- imsciences/unittesting.py +729 -478
- imsciences/vis.py +669 -126
- {imsciences-0.9.7.0.dist-info → imsciences-1.0.1.dist-info}/METADATA +1 -1
- imsciences-1.0.1.dist-info/RECORD +12 -0
- {imsciences-0.9.7.0.dist-info → imsciences-1.0.1.dist-info}/WHEEL +1 -1
- imsciences-0.9.7.0.dist-info/RECORD +0 -12
- {imsciences-0.9.7.0.dist-info → imsciences-1.0.1.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.7.0.dist-info → imsciences-1.0.1.dist-info}/PKG-INFO-TomG-HP-290722 +0 -0
- {imsciences-0.9.7.0.dist-info → imsciences-1.0.1.dist-info}/top_level.txt +0 -0
imsciences/mmm.py
CHANGED
|
@@ -1,93 +1,152 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
1
|
import calendar
|
|
2
|
+
import json
|
|
3
3
|
import os
|
|
4
|
-
import numpy as np
|
|
5
|
-
import re
|
|
6
|
-
from datetime import datetime, timedelta
|
|
7
4
|
import subprocess
|
|
8
|
-
import
|
|
9
|
-
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
10
9
|
import xgboost as xgb
|
|
11
10
|
from sklearn.ensemble import RandomForestRegressor
|
|
11
|
+
from sklearn.model_selection import train_test_split
|
|
12
|
+
|
|
12
13
|
|
|
13
14
|
class dataprocessing:
|
|
14
|
-
|
|
15
15
|
def help(self):
|
|
16
|
-
|
|
17
16
|
print("\n1. get_wd_levels")
|
|
18
|
-
print(
|
|
17
|
+
print(
|
|
18
|
+
" - Description: Get the working directory with the option of moving up parents.",
|
|
19
|
+
)
|
|
19
20
|
print(" - Usage: get_wd_levels(levels)")
|
|
20
21
|
print(" - Example: get_wd_levels(0)")
|
|
21
22
|
|
|
22
23
|
print("\n2. aggregate_daily_to_wc_long")
|
|
23
|
-
print(
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
print(
|
|
25
|
+
" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.",
|
|
26
|
+
)
|
|
27
|
+
print(
|
|
28
|
+
" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')",
|
|
29
|
+
)
|
|
30
|
+
print(
|
|
31
|
+
" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')",
|
|
32
|
+
)
|
|
26
33
|
|
|
27
34
|
print("\n3. convert_monthly_to_daily")
|
|
28
|
-
print(
|
|
35
|
+
print(
|
|
36
|
+
" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.",
|
|
37
|
+
)
|
|
29
38
|
print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
|
|
30
39
|
print(" - Example: convert_monthly_to_daily(df, 'date')")
|
|
31
40
|
|
|
32
41
|
print("\n4. week_of_year_mapping")
|
|
33
|
-
print(
|
|
42
|
+
print(
|
|
43
|
+
" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.",
|
|
44
|
+
)
|
|
34
45
|
print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
|
|
35
46
|
print(" - Example: week_of_year_mapping(df, 'week', 'mon')")
|
|
36
47
|
|
|
37
48
|
print("\n5. rename_cols")
|
|
38
|
-
print(
|
|
49
|
+
print(
|
|
50
|
+
" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.",
|
|
51
|
+
)
|
|
39
52
|
print(" - Usage: rename_cols(df, name='ame_')")
|
|
40
53
|
print(" - Example: rename_cols(df, 'ame_facebook')")
|
|
41
54
|
|
|
42
55
|
print("\n6. merge_new_and_old")
|
|
43
|
-
print(
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
print(
|
|
57
|
+
" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.",
|
|
58
|
+
)
|
|
59
|
+
print(
|
|
60
|
+
" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')",
|
|
61
|
+
)
|
|
62
|
+
print(
|
|
63
|
+
" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')",
|
|
64
|
+
)
|
|
46
65
|
|
|
47
66
|
print("\n7. merge_dataframes_on_column")
|
|
48
67
|
print(" - Description: Merge a list of DataFrames on a common column.")
|
|
49
|
-
print(
|
|
50
|
-
|
|
68
|
+
print(
|
|
69
|
+
" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')",
|
|
70
|
+
)
|
|
71
|
+
print(
|
|
72
|
+
" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')",
|
|
73
|
+
)
|
|
51
74
|
|
|
52
75
|
print("\n8. merge_and_update_dfs")
|
|
53
|
-
print(
|
|
76
|
+
print(
|
|
77
|
+
" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.",
|
|
78
|
+
)
|
|
54
79
|
print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
|
|
55
|
-
print(
|
|
80
|
+
print(
|
|
81
|
+
" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')",
|
|
82
|
+
)
|
|
56
83
|
|
|
57
84
|
print("\n9. convert_us_to_uk_dates")
|
|
58
|
-
print(
|
|
85
|
+
print(
|
|
86
|
+
" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.",
|
|
87
|
+
)
|
|
59
88
|
print(" - Usage: convert_us_to_uk_dates(df, date_col)")
|
|
60
89
|
print(" - Example: convert_us_to_uk_dates(df, 'date')")
|
|
61
90
|
|
|
62
91
|
print("\n10. combine_sheets")
|
|
63
|
-
print(
|
|
92
|
+
print(
|
|
93
|
+
" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.",
|
|
94
|
+
)
|
|
64
95
|
print(" - Usage: combine_sheets(all_sheets)")
|
|
65
96
|
print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
|
|
66
97
|
|
|
67
98
|
print("\n11. pivot_table")
|
|
68
|
-
print(
|
|
69
|
-
|
|
70
|
-
|
|
99
|
+
print(
|
|
100
|
+
" - Description: Dynamically pivots a DataFrame based on specified columns.",
|
|
101
|
+
)
|
|
102
|
+
print(
|
|
103
|
+
" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')",
|
|
104
|
+
)
|
|
105
|
+
print(
|
|
106
|
+
" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)",
|
|
107
|
+
)
|
|
71
108
|
|
|
72
109
|
print("\n12. apply_lookup_table_for_columns")
|
|
73
|
-
print(
|
|
74
|
-
|
|
75
|
-
|
|
110
|
+
print(
|
|
111
|
+
" - Description: Maps substrings in columns to new values based on a dictionary.",
|
|
112
|
+
)
|
|
113
|
+
print(
|
|
114
|
+
" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')",
|
|
115
|
+
)
|
|
116
|
+
print(
|
|
117
|
+
" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')",
|
|
118
|
+
)
|
|
76
119
|
|
|
77
120
|
print("\n13. aggregate_daily_to_wc_wide")
|
|
78
|
-
print(
|
|
79
|
-
|
|
80
|
-
|
|
121
|
+
print(
|
|
122
|
+
" - Description: Aggregates daily data into weekly data and pivots it to wide format.",
|
|
123
|
+
)
|
|
124
|
+
print(
|
|
125
|
+
" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)",
|
|
126
|
+
)
|
|
127
|
+
print(
|
|
128
|
+
" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)",
|
|
129
|
+
)
|
|
81
130
|
|
|
82
131
|
print("\n14. merge_cols_with_seperator")
|
|
83
|
-
print(
|
|
84
|
-
|
|
85
|
-
|
|
132
|
+
print(
|
|
133
|
+
" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.",
|
|
134
|
+
)
|
|
135
|
+
print(
|
|
136
|
+
" - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')",
|
|
137
|
+
)
|
|
138
|
+
print(
|
|
139
|
+
" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')",
|
|
140
|
+
)
|
|
86
141
|
|
|
87
142
|
print("\n15. check_sum_of_df_cols_are_equal")
|
|
88
|
-
print(
|
|
143
|
+
print(
|
|
144
|
+
" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.",
|
|
145
|
+
)
|
|
89
146
|
print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
|
|
90
|
-
print(
|
|
147
|
+
print(
|
|
148
|
+
" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')",
|
|
149
|
+
)
|
|
91
150
|
|
|
92
151
|
print("\n16. convert_2_df_cols_to_dict")
|
|
93
152
|
print(" - Description: Creates a dictionary from two DataFrame columns.")
|
|
@@ -95,128 +154,229 @@ class dataprocessing:
|
|
|
95
154
|
print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
|
|
96
155
|
|
|
97
156
|
print("\n17. create_FY_and_H_columns")
|
|
98
|
-
print(
|
|
99
|
-
|
|
100
|
-
|
|
157
|
+
print(
|
|
158
|
+
" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.",
|
|
159
|
+
)
|
|
160
|
+
print(
|
|
161
|
+
" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')",
|
|
162
|
+
)
|
|
163
|
+
print(
|
|
164
|
+
" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')",
|
|
165
|
+
)
|
|
101
166
|
|
|
102
167
|
print("\n18. keyword_lookup_replacement")
|
|
103
|
-
print(
|
|
104
|
-
|
|
105
|
-
|
|
168
|
+
print(
|
|
169
|
+
" - Description: Updates values in a column based on a lookup dictionary with conditional logic.",
|
|
170
|
+
)
|
|
171
|
+
print(
|
|
172
|
+
" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')",
|
|
173
|
+
)
|
|
174
|
+
print(
|
|
175
|
+
" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')",
|
|
176
|
+
)
|
|
106
177
|
|
|
107
178
|
print("\n19. create_new_version_of_col_using_LUT")
|
|
108
|
-
print(
|
|
109
|
-
|
|
110
|
-
|
|
179
|
+
print(
|
|
180
|
+
" - Description: Creates a new column based on a lookup table applied to an existing column.",
|
|
181
|
+
)
|
|
182
|
+
print(
|
|
183
|
+
" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')",
|
|
184
|
+
)
|
|
185
|
+
print(
|
|
186
|
+
" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)",
|
|
187
|
+
)
|
|
111
188
|
|
|
112
189
|
print("\n20. convert_df_wide_2_long")
|
|
113
|
-
print(
|
|
114
|
-
|
|
115
|
-
|
|
190
|
+
print(
|
|
191
|
+
" - Description: Converts a wide-format DataFrame into a long-format DataFrame.",
|
|
192
|
+
)
|
|
193
|
+
print(
|
|
194
|
+
" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')",
|
|
195
|
+
)
|
|
196
|
+
print(
|
|
197
|
+
" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')",
|
|
198
|
+
)
|
|
116
199
|
|
|
117
200
|
print("\n21. manually_edit_data")
|
|
118
|
-
print(
|
|
119
|
-
|
|
120
|
-
|
|
201
|
+
print(
|
|
202
|
+
" - Description: Manually updates specified cells in a DataFrame based on filters.",
|
|
203
|
+
)
|
|
204
|
+
print(
|
|
205
|
+
" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)",
|
|
206
|
+
)
|
|
207
|
+
print(
|
|
208
|
+
" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')",
|
|
209
|
+
)
|
|
121
210
|
|
|
122
211
|
print("\n22. format_numbers_with_commas")
|
|
123
|
-
print(
|
|
212
|
+
print(
|
|
213
|
+
" - Description: Formats numerical columns with commas and a specified number of decimal places.",
|
|
214
|
+
)
|
|
124
215
|
print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
|
|
125
216
|
print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")
|
|
126
217
|
|
|
127
218
|
print("\n23. filter_df_on_multiple_conditions")
|
|
128
|
-
print(
|
|
219
|
+
print(
|
|
220
|
+
" - Description: Filters a DataFrame based on multiple column conditions.",
|
|
221
|
+
)
|
|
129
222
|
print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
|
|
130
|
-
print(
|
|
223
|
+
print(
|
|
224
|
+
" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': '== 'val''})",
|
|
225
|
+
)
|
|
131
226
|
|
|
132
227
|
print("\n24. read_and_concatenate_files")
|
|
133
|
-
print(
|
|
228
|
+
print(
|
|
229
|
+
" - Description: Reads and concatenates files from a specified folder into a single DataFrame.",
|
|
230
|
+
)
|
|
134
231
|
print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
|
|
135
|
-
print(
|
|
232
|
+
print(
|
|
233
|
+
" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')",
|
|
234
|
+
)
|
|
136
235
|
|
|
137
236
|
print("\n25. upgrade_outdated_packages")
|
|
138
|
-
print(
|
|
237
|
+
print(
|
|
238
|
+
" - Description: Upgrades all outdated Python packages except specified ones.",
|
|
239
|
+
)
|
|
139
240
|
print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
|
|
140
|
-
print(
|
|
241
|
+
print(
|
|
242
|
+
" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])",
|
|
243
|
+
)
|
|
141
244
|
|
|
142
245
|
print("\n26. convert_mixed_formats_dates")
|
|
143
|
-
print(
|
|
246
|
+
print(
|
|
247
|
+
" - Description: Converts mixed-format date columns into standardized datetime format.",
|
|
248
|
+
)
|
|
144
249
|
print(" - Usage: convert_mixed_formats_dates(df, column_name)")
|
|
145
250
|
print(" - Example: convert_mixed_formats_dates(df, 'date_col')")
|
|
146
251
|
|
|
147
252
|
print("\n27. fill_weekly_date_range")
|
|
148
|
-
print(
|
|
253
|
+
print(
|
|
254
|
+
" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.",
|
|
255
|
+
)
|
|
149
256
|
print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
|
|
150
257
|
print(" - Example: fill_weekly_date_range(df, 'date_col')")
|
|
151
258
|
|
|
152
259
|
print("\n28. add_prefix_and_suffix")
|
|
153
|
-
print(
|
|
154
|
-
|
|
155
|
-
|
|
260
|
+
print(
|
|
261
|
+
" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.",
|
|
262
|
+
)
|
|
263
|
+
print(
|
|
264
|
+
" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)",
|
|
265
|
+
)
|
|
266
|
+
print(
|
|
267
|
+
" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')",
|
|
268
|
+
)
|
|
156
269
|
|
|
157
270
|
print("\n29. create_dummies")
|
|
158
|
-
print(
|
|
159
|
-
|
|
160
|
-
|
|
271
|
+
print(
|
|
272
|
+
" - Description: Creates dummy variables for columns, with an option to add a total dummy column.",
|
|
273
|
+
)
|
|
274
|
+
print(
|
|
275
|
+
" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')",
|
|
276
|
+
)
|
|
277
|
+
print(
|
|
278
|
+
" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)",
|
|
279
|
+
)
|
|
161
280
|
|
|
162
281
|
print("\n30. replace_substrings")
|
|
163
|
-
print(
|
|
164
|
-
|
|
165
|
-
|
|
282
|
+
print(
|
|
283
|
+
" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.",
|
|
284
|
+
)
|
|
285
|
+
print(
|
|
286
|
+
" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)",
|
|
287
|
+
)
|
|
288
|
+
print(
|
|
289
|
+
" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')",
|
|
290
|
+
)
|
|
166
291
|
|
|
167
292
|
print("\n31. add_total_column")
|
|
168
|
-
print(
|
|
169
|
-
|
|
293
|
+
print(
|
|
294
|
+
" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.",
|
|
295
|
+
)
|
|
296
|
+
print(
|
|
297
|
+
" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')",
|
|
298
|
+
)
|
|
170
299
|
print(" - Example: add_total_column(df, exclude_col='date_col')")
|
|
171
300
|
|
|
172
301
|
print("\n32. apply_lookup_table_based_on_substring")
|
|
173
|
-
print(
|
|
174
|
-
|
|
175
|
-
|
|
302
|
+
print(
|
|
303
|
+
" - Description: Categorizes text in a column using a lookup table based on substrings.",
|
|
304
|
+
)
|
|
305
|
+
print(
|
|
306
|
+
" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')",
|
|
307
|
+
)
|
|
308
|
+
print(
|
|
309
|
+
" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})",
|
|
310
|
+
)
|
|
176
311
|
|
|
177
312
|
print("\n33. compare_overlap")
|
|
178
|
-
print(
|
|
313
|
+
print(
|
|
314
|
+
" - Description: Compares overlapping periods between two DataFrames and summarizes differences.",
|
|
315
|
+
)
|
|
179
316
|
print(" - Usage: compare_overlap(df1, df2, date_col)")
|
|
180
317
|
print(" - Example: compare_overlap(df1, df2, 'date_col')")
|
|
181
318
|
|
|
182
319
|
print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
|
|
183
|
-
print(
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
320
|
+
print(
|
|
321
|
+
" - Description: Maps dates to the start of the current ISO week based on a specified weekday.",
|
|
322
|
+
)
|
|
323
|
+
print(
|
|
324
|
+
" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')",
|
|
325
|
+
)
|
|
326
|
+
print(
|
|
327
|
+
" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')",
|
|
328
|
+
)
|
|
329
|
+
|
|
187
330
|
print("\n35. seasonality_feature_extraction")
|
|
188
|
-
print(
|
|
189
|
-
|
|
190
|
-
|
|
331
|
+
print(
|
|
332
|
+
" - Description: Splits data into train/test sets, trains XGBoost and Random Forest on all features, extracts top features based on feature importance, merges them, optionally retrains models on top and combined features, and returns a dict of results.",
|
|
333
|
+
)
|
|
334
|
+
print(
|
|
335
|
+
" - Usage: seasonality_feature_extraction(df, kpi_var, n_features=10, test_size=0.1, random_state=42, shuffle=False)",
|
|
336
|
+
)
|
|
337
|
+
print(
|
|
338
|
+
" - Example: seasonality_feature_extraction(df, 'kpi_total_sales', n_features=5, test_size=0.2, random_state=123, shuffle=True)",
|
|
339
|
+
)
|
|
191
340
|
|
|
192
341
|
def get_wd_levels(self, levels):
|
|
193
342
|
"""
|
|
194
343
|
Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
|
|
195
344
|
|
|
196
|
-
Parameters
|
|
345
|
+
Parameters
|
|
346
|
+
----------
|
|
197
347
|
- data_frame: pandas DataFrame
|
|
198
348
|
The input data frame.
|
|
199
349
|
- num_rows_to_remove: int
|
|
200
350
|
The number of levels to move up pathways.
|
|
201
351
|
|
|
202
|
-
Returns
|
|
352
|
+
Returns
|
|
353
|
+
-------
|
|
203
354
|
- Current wd
|
|
204
|
-
"""
|
|
205
355
|
|
|
356
|
+
"""
|
|
206
357
|
directory = os.getcwd()
|
|
207
358
|
for _ in range(levels):
|
|
208
359
|
directory = os.path.dirname(directory)
|
|
209
360
|
return directory
|
|
210
|
-
|
|
211
|
-
def aggregate_daily_to_wc_long(
|
|
361
|
+
|
|
362
|
+
def aggregate_daily_to_wc_long(
|
|
363
|
+
self,
|
|
364
|
+
df: pd.DataFrame,
|
|
365
|
+
date_column: str,
|
|
366
|
+
group_columns: list[str],
|
|
367
|
+
sum_columns: list[str],
|
|
368
|
+
wc: str = "sun",
|
|
369
|
+
aggregation: str = "sum",
|
|
370
|
+
) -> pd.DataFrame:
|
|
212
371
|
"""
|
|
213
|
-
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
214
|
-
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
215
|
-
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
216
|
-
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
372
|
+
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
373
|
+
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
374
|
+
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
375
|
+
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
217
376
|
The day column is renamed from 'Day' to 'OBS'.
|
|
218
377
|
|
|
219
|
-
Parameters
|
|
378
|
+
Parameters
|
|
379
|
+
----------
|
|
220
380
|
- df: pandas DataFrame
|
|
221
381
|
The input DataFrame containing daily data.
|
|
222
382
|
- date_column: string
|
|
@@ -230,18 +390,21 @@ class dataprocessing:
|
|
|
230
390
|
- aggregation: string, optional (default 'sum')
|
|
231
391
|
Aggregation method, either 'sum', 'average', or 'count'.
|
|
232
392
|
|
|
233
|
-
Returns
|
|
393
|
+
Returns
|
|
394
|
+
-------
|
|
234
395
|
- pandas DataFrame
|
|
235
396
|
A new DataFrame with weekly aggregated data. The index is reset,
|
|
236
|
-
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
237
|
-
is in long format, with separate columns for each combination of
|
|
397
|
+
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
398
|
+
is in long format, with separate columns for each combination of
|
|
238
399
|
grouped metrics.
|
|
239
|
-
"""
|
|
240
400
|
|
|
401
|
+
"""
|
|
241
402
|
# Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
|
|
242
|
-
days = {
|
|
403
|
+
days = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
243
404
|
if wc.lower() not in days:
|
|
244
|
-
return print(
|
|
405
|
+
return print(
|
|
406
|
+
f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).",
|
|
407
|
+
)
|
|
245
408
|
|
|
246
409
|
start_day = days[wc.lower()]
|
|
247
410
|
|
|
@@ -252,26 +415,40 @@ class dataprocessing:
|
|
|
252
415
|
df_copy[date_column] = pd.to_datetime(df_copy[date_column])
|
|
253
416
|
|
|
254
417
|
# Determine the start of each week
|
|
255
|
-
df_copy[
|
|
418
|
+
df_copy["week_start"] = df_copy[date_column].apply(
|
|
419
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7),
|
|
420
|
+
)
|
|
256
421
|
|
|
257
422
|
# Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
|
|
258
423
|
for col in sum_columns:
|
|
259
|
-
df_copy[col] = pd.to_numeric(df_copy[col], errors=
|
|
424
|
+
df_copy[col] = pd.to_numeric(df_copy[col], errors="coerce").fillna(0)
|
|
260
425
|
|
|
261
426
|
# Group by the new week start column and additional columns, then aggregate the numeric columns
|
|
262
|
-
if aggregation ==
|
|
263
|
-
grouped =
|
|
264
|
-
|
|
265
|
-
|
|
427
|
+
if aggregation == "average":
|
|
428
|
+
grouped = (
|
|
429
|
+
df_copy.groupby(["week_start"] + group_columns)[sum_columns]
|
|
430
|
+
.mean()
|
|
431
|
+
.reset_index()
|
|
432
|
+
)
|
|
433
|
+
elif aggregation == "count":
|
|
434
|
+
grouped = (
|
|
435
|
+
df_copy.groupby(["week_start"] + group_columns)[sum_columns]
|
|
436
|
+
.count()
|
|
437
|
+
.reset_index()
|
|
438
|
+
)
|
|
266
439
|
else: # Default to 'sum' if any other value is provided
|
|
267
|
-
grouped =
|
|
440
|
+
grouped = (
|
|
441
|
+
df_copy.groupby(["week_start"] + group_columns)[sum_columns]
|
|
442
|
+
.sum()
|
|
443
|
+
.reset_index()
|
|
444
|
+
)
|
|
268
445
|
|
|
269
446
|
# Rename 'week_start' column to 'OBS'
|
|
270
|
-
grouped = grouped.rename(columns={
|
|
447
|
+
grouped = grouped.rename(columns={"week_start": "OBS"})
|
|
271
448
|
|
|
272
449
|
return grouped
|
|
273
|
-
|
|
274
|
-
def convert_monthly_to_daily(self, df, date_column, divide
|
|
450
|
+
|
|
451
|
+
def convert_monthly_to_daily(self, df, date_column, divide=True):
|
|
275
452
|
"""
|
|
276
453
|
Convert a DataFrame with monthly data to daily data.
|
|
277
454
|
This function takes a DataFrame and a date column, then it expands each
|
|
@@ -282,7 +459,6 @@ class dataprocessing:
|
|
|
282
459
|
:param divide: boolean divide by the number of days in a month (default True)
|
|
283
460
|
:return: A new DataFrame with daily data.
|
|
284
461
|
"""
|
|
285
|
-
|
|
286
462
|
# Convert date_column to datetime
|
|
287
463
|
df[date_column] = pd.to_datetime(df[date_column])
|
|
288
464
|
|
|
@@ -292,7 +468,10 @@ class dataprocessing:
|
|
|
292
468
|
# Iterate over each row in the DataFrame
|
|
293
469
|
for _, row in df.iterrows():
|
|
294
470
|
# Calculate the number of days in the month
|
|
295
|
-
num_days = calendar.monthrange(
|
|
471
|
+
num_days = calendar.monthrange(
|
|
472
|
+
row[date_column].year,
|
|
473
|
+
row[date_column].month,
|
|
474
|
+
)[1]
|
|
296
475
|
|
|
297
476
|
# Create a new record for each day of the month
|
|
298
477
|
for day in range(1, num_days + 1):
|
|
@@ -304,32 +483,41 @@ class dataprocessing:
|
|
|
304
483
|
if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
|
|
305
484
|
if divide is True:
|
|
306
485
|
daily_row[col] = row[col] / num_days
|
|
307
|
-
else:
|
|
486
|
+
else:
|
|
308
487
|
daily_row[col] = row[col]
|
|
309
488
|
daily_records.append(daily_row)
|
|
310
489
|
|
|
311
490
|
# Convert the list of daily records into a DataFrame
|
|
312
491
|
daily_df = pd.DataFrame(daily_records)
|
|
313
|
-
|
|
492
|
+
|
|
314
493
|
return daily_df
|
|
315
|
-
|
|
316
|
-
def week_of_year_mapping(self,df, week_col, start_day_str):
|
|
317
494
|
|
|
495
|
+
def week_of_year_mapping(self, df, week_col, start_day_str):
|
|
318
496
|
# Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
|
|
319
497
|
day_mapping = {
|
|
320
|
-
|
|
498
|
+
"mon": 1,
|
|
499
|
+
"tue": 2,
|
|
500
|
+
"wed": 3,
|
|
501
|
+
"thu": 4,
|
|
502
|
+
"fri": 5,
|
|
503
|
+
"sat": 6,
|
|
504
|
+
"sun": 7,
|
|
321
505
|
}
|
|
322
506
|
|
|
323
507
|
# Convert the day string to a number, or raise an error if not valid
|
|
324
508
|
start_day = day_mapping.get(start_day_str.lower())
|
|
325
509
|
if start_day is None:
|
|
326
|
-
raise ValueError(
|
|
510
|
+
raise ValueError(
|
|
511
|
+
f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.",
|
|
512
|
+
)
|
|
327
513
|
|
|
328
514
|
# Function to convert week number to start date of the week
|
|
329
515
|
def week_to_startdate(week_str, start_day):
|
|
330
|
-
year, week = map(int, week_str.split(
|
|
516
|
+
year, week = map(int, week_str.split("-W"))
|
|
331
517
|
first_day_of_year = datetime(year, 1, 1)
|
|
332
|
-
first_weekday_of_year =
|
|
518
|
+
first_weekday_of_year = (
|
|
519
|
+
first_day_of_year.weekday()
|
|
520
|
+
) # Monday is 0 and Sunday is 6
|
|
333
521
|
|
|
334
522
|
# Calculate days to adjust to the desired start day of the week
|
|
335
523
|
days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
|
|
@@ -340,25 +528,38 @@ class dataprocessing:
|
|
|
340
528
|
return start_of_week
|
|
341
529
|
|
|
342
530
|
# Apply the function to each row in the specified week column
|
|
343
|
-
df[
|
|
531
|
+
df["OBS"] = (
|
|
532
|
+
df[week_col]
|
|
533
|
+
.apply(lambda x: week_to_startdate(x, start_day))
|
|
534
|
+
.dt.strftime("%d/%m/%Y")
|
|
535
|
+
)
|
|
344
536
|
return df
|
|
345
|
-
|
|
346
|
-
def rename_cols(self, df, name
|
|
537
|
+
|
|
538
|
+
def rename_cols(self, df, name="ame_"):
|
|
347
539
|
new_columns = {}
|
|
348
540
|
for col in df.columns:
|
|
349
|
-
if col !=
|
|
541
|
+
if col != "OBS":
|
|
350
542
|
new_col_name = name + col.replace(" ", "_").lower()
|
|
351
543
|
else:
|
|
352
544
|
new_col_name = col
|
|
353
545
|
new_columns[col] = new_col_name
|
|
354
546
|
return df.rename(columns=new_columns)
|
|
355
|
-
|
|
356
|
-
def merge_new_and_old(
|
|
547
|
+
|
|
548
|
+
def merge_new_and_old(
|
|
549
|
+
self,
|
|
550
|
+
old_df,
|
|
551
|
+
old_col,
|
|
552
|
+
new_df,
|
|
553
|
+
new_col,
|
|
554
|
+
cutoff_date,
|
|
555
|
+
date_col_name="OBS",
|
|
556
|
+
):
|
|
357
557
|
"""
|
|
358
558
|
Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
|
|
359
559
|
Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
|
|
360
560
|
|
|
361
|
-
Parameters
|
|
561
|
+
Parameters
|
|
562
|
+
----------
|
|
362
563
|
- old_df: pandas DataFrame
|
|
363
564
|
The old DataFrame from which to take the numeric values up to the specified date.
|
|
364
565
|
- old_col: str
|
|
@@ -372,11 +573,12 @@ class dataprocessing:
|
|
|
372
573
|
- date_col_name: str, optional (default 'OBS')
|
|
373
574
|
The name of the date column in both DataFrames.
|
|
374
575
|
|
|
375
|
-
Returns
|
|
576
|
+
Returns
|
|
577
|
+
-------
|
|
376
578
|
- pandas DataFrame
|
|
377
579
|
A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
|
|
378
|
-
"""
|
|
379
580
|
|
|
581
|
+
"""
|
|
380
582
|
# Convert date columns in both dataframes to datetime for comparison
|
|
381
583
|
old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
|
|
382
584
|
new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
|
|
@@ -389,67 +591,93 @@ class dataprocessing:
|
|
|
389
591
|
new_values = new_df[new_df[date_col_name] > cutoff_date]
|
|
390
592
|
|
|
391
593
|
# Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
|
|
392
|
-
merged_df = pd.DataFrame(
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
594
|
+
merged_df = pd.DataFrame(
|
|
595
|
+
{
|
|
596
|
+
"OBS": pd.concat(
|
|
597
|
+
[old_values[date_col_name], new_values[date_col_name]],
|
|
598
|
+
ignore_index=True,
|
|
599
|
+
),
|
|
600
|
+
new_col: pd.concat(
|
|
601
|
+
[old_values[old_col], new_values[new_col]],
|
|
602
|
+
ignore_index=True,
|
|
603
|
+
),
|
|
604
|
+
},
|
|
605
|
+
)
|
|
396
606
|
|
|
397
607
|
return merged_df
|
|
398
|
-
|
|
399
|
-
def merge_dataframes_on_column(
|
|
608
|
+
|
|
609
|
+
def merge_dataframes_on_column(
|
|
610
|
+
self,
|
|
611
|
+
dataframes,
|
|
612
|
+
common_column="OBS",
|
|
613
|
+
merge_how="outer",
|
|
614
|
+
):
|
|
400
615
|
"""
|
|
401
616
|
Merge a list of DataFrames on a common column.
|
|
402
617
|
|
|
403
|
-
Parameters
|
|
618
|
+
Parameters
|
|
619
|
+
----------
|
|
404
620
|
- dataframes: A list of DataFrames to merge.
|
|
405
621
|
- common_column: The name of the common column to merge on.
|
|
406
622
|
- merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').
|
|
407
623
|
|
|
408
|
-
Returns
|
|
624
|
+
Returns
|
|
625
|
+
-------
|
|
409
626
|
- A merged DataFrame.
|
|
627
|
+
|
|
410
628
|
"""
|
|
411
629
|
if not dataframes:
|
|
412
630
|
return None
|
|
413
|
-
|
|
631
|
+
|
|
414
632
|
merged_df = dataframes[0] # Start with the first DataFrame
|
|
415
633
|
|
|
416
634
|
for df in dataframes[1:]:
|
|
417
635
|
merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)
|
|
418
636
|
|
|
419
637
|
# Check if the common column is of datetime dtype
|
|
420
|
-
if merged_df[common_column].dtype ==
|
|
638
|
+
if merged_df[common_column].dtype == "datetime64[ns]":
|
|
421
639
|
merged_df[common_column] = pd.to_datetime(merged_df[common_column])
|
|
422
640
|
merged_df = merged_df.sort_values(by=common_column)
|
|
423
641
|
merged_df = merged_df.fillna(0)
|
|
424
|
-
|
|
642
|
+
|
|
425
643
|
return merged_df
|
|
426
|
-
|
|
644
|
+
|
|
427
645
|
def merge_and_update_dfs(self, df1, df2, key_column):
|
|
428
646
|
"""
|
|
429
647
|
Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
|
|
430
648
|
and returns a dataframe sorted by the key column.
|
|
431
649
|
|
|
432
|
-
Parameters
|
|
650
|
+
Parameters
|
|
651
|
+
----------
|
|
433
652
|
df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
|
|
434
653
|
df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
|
|
435
654
|
key_column (str): The name of the column to merge and sort by (e.g., 'OBS').
|
|
436
655
|
|
|
437
|
-
Returns
|
|
656
|
+
Returns
|
|
657
|
+
-------
|
|
438
658
|
DataFrame: The merged and updated dataframe.
|
|
439
|
-
"""
|
|
440
659
|
|
|
660
|
+
"""
|
|
441
661
|
# Sort both DataFrames by the key column
|
|
442
662
|
df1_sorted = df1.sort_values(by=key_column)
|
|
443
663
|
df2_sorted = df2.sort_values(by=key_column)
|
|
444
664
|
|
|
445
665
|
# Perform the full outer merge
|
|
446
|
-
merged_df = pd.merge(
|
|
666
|
+
merged_df = pd.merge(
|
|
667
|
+
df1_sorted,
|
|
668
|
+
df2_sorted,
|
|
669
|
+
on=key_column,
|
|
670
|
+
how="outer",
|
|
671
|
+
suffixes=("", "_finalised"),
|
|
672
|
+
)
|
|
447
673
|
|
|
448
674
|
# Update with non-null values from df2
|
|
449
675
|
for column in merged_df.columns:
|
|
450
|
-
if column.endswith(
|
|
451
|
-
original_column = column.replace(
|
|
452
|
-
merged_df.loc[merged_df[column].notnull(), original_column] =
|
|
676
|
+
if column.endswith("_finalised"):
|
|
677
|
+
original_column = column.replace("_finalised", "")
|
|
678
|
+
merged_df.loc[merged_df[column].notnull(), original_column] = (
|
|
679
|
+
merged_df.loc[merged_df[column].notnull(), column]
|
|
680
|
+
)
|
|
453
681
|
merged_df.drop(column, axis=1, inplace=True)
|
|
454
682
|
|
|
455
683
|
# Sort the merged DataFrame by the key column
|
|
@@ -459,25 +687,30 @@ class dataprocessing:
|
|
|
459
687
|
merged_df.fillna(0, inplace=True)
|
|
460
688
|
|
|
461
689
|
return merged_df
|
|
462
|
-
|
|
690
|
+
|
|
463
691
|
def convert_us_to_uk_dates(self, df, date_col):
|
|
464
692
|
"""
|
|
465
|
-
Processes the date column of a DataFrame to remove hyphens and slashes,
|
|
693
|
+
Processes the date column of a DataFrame to remove hyphens and slashes,
|
|
466
694
|
and converts it to a datetime object.
|
|
467
|
-
|
|
468
|
-
Parameters
|
|
695
|
+
|
|
696
|
+
Parameters
|
|
697
|
+
----------
|
|
469
698
|
df (pd.DataFrame): The DataFrame containing the date column.
|
|
470
699
|
date_col (str): The name of the date column.
|
|
471
|
-
|
|
472
|
-
Returns
|
|
700
|
+
|
|
701
|
+
Returns
|
|
702
|
+
-------
|
|
473
703
|
pd.DataFrame: The DataFrame with the processed date column.
|
|
704
|
+
|
|
474
705
|
"""
|
|
475
|
-
df[date_col] = df[date_col].str.replace(r
|
|
706
|
+
df[date_col] = df[date_col].str.replace(r"[-/]", "", regex=True)
|
|
476
707
|
df[date_col] = pd.to_datetime(
|
|
477
|
-
df[date_col].str.slice(0, 2)
|
|
478
|
-
|
|
479
|
-
df[date_col].str.slice(
|
|
480
|
-
|
|
708
|
+
df[date_col].str.slice(0, 2)
|
|
709
|
+
+ "/"
|
|
710
|
+
+ df[date_col].str.slice(2, 4)
|
|
711
|
+
+ "/"
|
|
712
|
+
+ df[date_col].str.slice(4, 8),
|
|
713
|
+
format="%m/%d/%Y",
|
|
481
714
|
)
|
|
482
715
|
return df
|
|
483
716
|
|
|
@@ -486,21 +719,40 @@ class dataprocessing:
|
|
|
486
719
|
Combines multiple DataFrames from a dictionary into a single DataFrame.
|
|
487
720
|
Adds a column 'SheetName' indicating the origin sheet of each row.
|
|
488
721
|
|
|
489
|
-
Parameters
|
|
722
|
+
Parameters
|
|
723
|
+
----------
|
|
490
724
|
all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.
|
|
491
725
|
|
|
492
|
-
Returns
|
|
726
|
+
Returns
|
|
727
|
+
-------
|
|
493
728
|
DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
|
|
729
|
+
|
|
494
730
|
"""
|
|
495
731
|
combined_df = pd.DataFrame()
|
|
496
732
|
|
|
497
733
|
for sheet_name, df in all_sheets.items():
|
|
498
|
-
df[
|
|
734
|
+
df["SheetName"] = sheet_name
|
|
499
735
|
combined_df = pd.concat([combined_df, df], ignore_index=True)
|
|
500
736
|
|
|
501
737
|
return combined_df
|
|
502
|
-
|
|
503
|
-
def pivot_table(
|
|
738
|
+
|
|
739
|
+
def pivot_table(
|
|
740
|
+
self,
|
|
741
|
+
df,
|
|
742
|
+
index_col,
|
|
743
|
+
columns,
|
|
744
|
+
values_col,
|
|
745
|
+
filters_dict=None,
|
|
746
|
+
fill_value=0,
|
|
747
|
+
aggfunc="sum",
|
|
748
|
+
margins=False,
|
|
749
|
+
margins_name="Total",
|
|
750
|
+
datetime_trans_needed=True,
|
|
751
|
+
date_format="%Y-%m-%d",
|
|
752
|
+
reverse_header_order=False,
|
|
753
|
+
fill_missing_weekly_dates=True,
|
|
754
|
+
week_commencing="W-MON",
|
|
755
|
+
):
|
|
504
756
|
"""
|
|
505
757
|
Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
|
|
506
758
|
|
|
@@ -521,6 +773,7 @@ class dataprocessing:
|
|
|
521
773
|
|
|
522
774
|
Returns:
|
|
523
775
|
pandas.DataFrame: The pivot table specified
|
|
776
|
+
|
|
524
777
|
"""
|
|
525
778
|
# Validate inputs
|
|
526
779
|
if index_col not in df.columns:
|
|
@@ -544,7 +797,10 @@ class dataprocessing:
|
|
|
544
797
|
|
|
545
798
|
# Ensure index column is in datetime format if needed
|
|
546
799
|
if datetime_trans_needed:
|
|
547
|
-
df_filtered[index_col] = pd.to_datetime(
|
|
800
|
+
df_filtered[index_col] = pd.to_datetime(
|
|
801
|
+
df_filtered[index_col],
|
|
802
|
+
dayfirst=True,
|
|
803
|
+
)
|
|
548
804
|
|
|
549
805
|
# Create the pivot table
|
|
550
806
|
pivoted_df = df_filtered.pivot_table(
|
|
@@ -559,7 +815,9 @@ class dataprocessing:
|
|
|
559
815
|
# Handle column headers
|
|
560
816
|
if isinstance(pivoted_df.columns, pd.MultiIndex):
|
|
561
817
|
pivoted_df.columns = [
|
|
562
|
-
"_".join(
|
|
818
|
+
"_".join(
|
|
819
|
+
reversed(map(str, col)) if reverse_header_order else map(str, col),
|
|
820
|
+
)
|
|
563
821
|
for col in pivoted_df.columns.values
|
|
564
822
|
]
|
|
565
823
|
else:
|
|
@@ -570,7 +828,10 @@ class dataprocessing:
|
|
|
570
828
|
|
|
571
829
|
# Handle sorting and formatting of index column
|
|
572
830
|
if datetime_trans_needed:
|
|
573
|
-
pivoted_df[index_col] = pd.to_datetime(
|
|
831
|
+
pivoted_df[index_col] = pd.to_datetime(
|
|
832
|
+
pivoted_df[index_col],
|
|
833
|
+
errors="coerce",
|
|
834
|
+
)
|
|
574
835
|
pivoted_df.sort_values(by=index_col, inplace=True)
|
|
575
836
|
pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
|
|
576
837
|
|
|
@@ -579,35 +840,49 @@ class dataprocessing:
|
|
|
579
840
|
|
|
580
841
|
# Fill missing weekly dates if specified
|
|
581
842
|
if fill_missing_weekly_dates:
|
|
582
|
-
pivoted_df = self.fill_weekly_date_range(
|
|
843
|
+
pivoted_df = self.fill_weekly_date_range(
|
|
844
|
+
pivoted_df,
|
|
845
|
+
index_col,
|
|
846
|
+
freq=week_commencing,
|
|
847
|
+
)
|
|
583
848
|
|
|
584
849
|
return pivoted_df
|
|
585
850
|
|
|
586
|
-
def apply_lookup_table_for_columns(
|
|
851
|
+
def apply_lookup_table_for_columns(
|
|
852
|
+
df,
|
|
853
|
+
col_names,
|
|
854
|
+
to_find_dict,
|
|
855
|
+
if_not_in_dict="Other",
|
|
856
|
+
new_column_name="Mapping",
|
|
857
|
+
):
|
|
587
858
|
"""
|
|
588
859
|
Creates a new DataFrame column based on a look up table, using exact matches.
|
|
589
860
|
|
|
590
|
-
Parameters
|
|
861
|
+
Parameters
|
|
862
|
+
----------
|
|
591
863
|
df (pandas.DataFrame): The DataFrame containing the data.
|
|
592
864
|
col_names (list of str): List of column names to use for lookup. If more than one, values are merged with '|'.
|
|
593
865
|
to_find_dict (dict): Lookup dictionary with exact keys to match.
|
|
594
866
|
if_not_in_dict (str, optional): Value used if no match is found. Defaults to "Other".
|
|
595
867
|
new_column_name (str, optional): Name of new output column. Defaults to "Mapping".
|
|
596
868
|
|
|
597
|
-
Returns
|
|
869
|
+
Returns
|
|
870
|
+
-------
|
|
598
871
|
pandas.DataFrame: DataFrame with a new column containing lookup results.
|
|
599
|
-
"""
|
|
600
872
|
|
|
873
|
+
"""
|
|
601
874
|
# Preprocess DataFrame if multiple columns
|
|
602
875
|
if len(col_names) > 1:
|
|
603
|
-
df["Merged"] = df[col_names].astype(str).agg(
|
|
876
|
+
df["Merged"] = df[col_names].astype(str).agg("|".join, axis=1)
|
|
604
877
|
col_to_use = "Merged"
|
|
605
878
|
else:
|
|
606
879
|
col_to_use = col_names[0]
|
|
607
880
|
|
|
608
881
|
# Normalize case for matching
|
|
609
882
|
lookup = {k.lower(): v for k, v in to_find_dict.items()}
|
|
610
|
-
df[new_column_name] =
|
|
883
|
+
df[new_column_name] = (
|
|
884
|
+
df[col_to_use].str.lower().map(lookup).fillna(if_not_in_dict)
|
|
885
|
+
)
|
|
611
886
|
|
|
612
887
|
# Drop intermediate column if created
|
|
613
888
|
if len(col_names) > 1:
|
|
@@ -615,15 +890,25 @@ class dataprocessing:
|
|
|
615
890
|
|
|
616
891
|
return df
|
|
617
892
|
|
|
618
|
-
def aggregate_daily_to_wc_wide(
|
|
893
|
+
def aggregate_daily_to_wc_wide(
|
|
894
|
+
self,
|
|
895
|
+
df: pd.DataFrame,
|
|
896
|
+
date_column: str,
|
|
897
|
+
group_columns: list[str],
|
|
898
|
+
sum_columns: list[str],
|
|
899
|
+
wc: str = "sun",
|
|
900
|
+
aggregation: str = "sum",
|
|
901
|
+
include_totals: bool = False,
|
|
902
|
+
) -> pd.DataFrame:
|
|
619
903
|
"""
|
|
620
|
-
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
621
|
-
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
622
|
-
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
623
|
-
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
904
|
+
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
905
|
+
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
906
|
+
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
907
|
+
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
624
908
|
The day column is renamed from 'Day' to 'OBS'.
|
|
625
909
|
|
|
626
|
-
Parameters
|
|
910
|
+
Parameters
|
|
911
|
+
----------
|
|
627
912
|
- df: pandas DataFrame
|
|
628
913
|
The input DataFrame containing daily data.
|
|
629
914
|
- date_column: string
|
|
@@ -639,26 +924,36 @@ class dataprocessing:
|
|
|
639
924
|
- include_totals: boolean, optional (default False)
|
|
640
925
|
If True, include total columns for each sum_column.
|
|
641
926
|
|
|
642
|
-
Returns
|
|
927
|
+
Returns
|
|
928
|
+
-------
|
|
643
929
|
- pandas DataFrame
|
|
644
930
|
A new DataFrame with weekly aggregated data. The index is reset,
|
|
645
|
-
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
646
|
-
is in wide format, with separate columns for each combination of
|
|
931
|
+
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
932
|
+
is in wide format, with separate columns for each combination of
|
|
647
933
|
grouped metrics.
|
|
934
|
+
|
|
648
935
|
"""
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
936
|
+
grouped = self.aggregate_daily_to_wc_long(
|
|
937
|
+
df,
|
|
938
|
+
date_column,
|
|
939
|
+
group_columns,
|
|
940
|
+
sum_columns,
|
|
941
|
+
wc,
|
|
942
|
+
aggregation,
|
|
943
|
+
)
|
|
944
|
+
|
|
652
945
|
# Pivot the data to wide format
|
|
653
946
|
if group_columns:
|
|
654
|
-
wide_df = grouped.pivot_table(
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
947
|
+
wide_df = grouped.pivot_table(
|
|
948
|
+
index="OBS",
|
|
949
|
+
columns=group_columns,
|
|
950
|
+
values=sum_columns,
|
|
951
|
+
aggfunc="first",
|
|
952
|
+
)
|
|
658
953
|
# Flatten the multi-level column index and create combined column names
|
|
659
|
-
wide_df.columns = [
|
|
954
|
+
wide_df.columns = ["_".join(col).strip() for col in wide_df.columns.values]
|
|
660
955
|
else:
|
|
661
|
-
wide_df = grouped.set_index(
|
|
956
|
+
wide_df = grouped.set_index("OBS")
|
|
662
957
|
|
|
663
958
|
# Fill NaN values with 0
|
|
664
959
|
wide_df = wide_df.fillna(0)
|
|
@@ -666,9 +961,11 @@ class dataprocessing:
|
|
|
666
961
|
# Adding total columns for each unique sum_column, if include_totals is True
|
|
667
962
|
if include_totals:
|
|
668
963
|
for col in sum_columns:
|
|
669
|
-
total_column_name = f
|
|
964
|
+
total_column_name = f"Total {col}"
|
|
670
965
|
if group_columns:
|
|
671
|
-
columns_to_sum = [
|
|
966
|
+
columns_to_sum = [
|
|
967
|
+
column for column in wide_df.columns if col in column
|
|
968
|
+
]
|
|
672
969
|
else:
|
|
673
970
|
columns_to_sum = [col]
|
|
674
971
|
wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
|
|
@@ -678,11 +975,20 @@ class dataprocessing:
|
|
|
678
975
|
|
|
679
976
|
return wide_df
|
|
680
977
|
|
|
681
|
-
def merge_cols_with_seperator(
|
|
978
|
+
def merge_cols_with_seperator(
|
|
979
|
+
self,
|
|
980
|
+
df,
|
|
981
|
+
col_names,
|
|
982
|
+
seperator="_",
|
|
983
|
+
output_column_name="Merged",
|
|
984
|
+
starting_prefix_str=None,
|
|
985
|
+
ending_prefix_str=None,
|
|
986
|
+
):
|
|
682
987
|
"""
|
|
683
988
|
Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up
|
|
684
989
|
|
|
685
|
-
Parameters
|
|
990
|
+
Parameters
|
|
991
|
+
----------
|
|
686
992
|
df (pandas.DataFrame): Dataframe to make changes to.
|
|
687
993
|
col_names (list): list of columm names ot merge.
|
|
688
994
|
seperator (str, optional): Name of column outputted. Defaults to "_".
|
|
@@ -690,76 +996,99 @@ class dataprocessing:
|
|
|
690
996
|
starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
|
|
691
997
|
ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
|
|
692
998
|
|
|
693
|
-
Raises
|
|
999
|
+
Raises
|
|
1000
|
+
------
|
|
694
1001
|
ValueError: if more less than two column names are inputted in the list there is nothing to merge on
|
|
695
1002
|
|
|
696
|
-
Returns
|
|
1003
|
+
Returns
|
|
1004
|
+
-------
|
|
697
1005
|
pandas.DataFrame: DataFrame with additional merged column
|
|
1006
|
+
|
|
698
1007
|
"""
|
|
699
1008
|
# Specify more than one column must be entered
|
|
700
1009
|
if len(col_names) < 2:
|
|
701
1010
|
raise ValueError("2 or more columns must be specified to merge")
|
|
702
|
-
|
|
1011
|
+
|
|
703
1012
|
# Create a new column with the merged columns
|
|
704
1013
|
df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
|
|
705
1014
|
|
|
706
|
-
# Add string before
|
|
1015
|
+
# Add string before
|
|
707
1016
|
if starting_prefix_str is not None:
|
|
708
|
-
df[output_column_name] = starting_prefix_str + df[
|
|
709
|
-
|
|
1017
|
+
df[output_column_name] = starting_prefix_str + df[
|
|
1018
|
+
output_column_name
|
|
1019
|
+
].astype(str)
|
|
1020
|
+
|
|
710
1021
|
# Add string after
|
|
711
1022
|
if ending_prefix_str is not None:
|
|
712
|
-
df[output_column_name] =
|
|
713
|
-
|
|
1023
|
+
df[output_column_name] = (
|
|
1024
|
+
df[output_column_name].astype(str) + ending_prefix_str
|
|
1025
|
+
)
|
|
1026
|
+
|
|
714
1027
|
return df
|
|
715
1028
|
|
|
716
|
-
def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
|
|
1029
|
+
def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
|
|
717
1030
|
"""
|
|
718
1031
|
Checks the sum of two different dataframe column or columns are equal
|
|
719
1032
|
|
|
720
|
-
Parameters
|
|
1033
|
+
Parameters
|
|
1034
|
+
----------
|
|
721
1035
|
df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
|
|
722
1036
|
df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
|
|
723
1037
|
cols_1 (list of str): Columns from first dataframe to sum.
|
|
724
1038
|
cols_2 (list of str): Columns from second dataframe to sum.
|
|
725
1039
|
|
|
726
|
-
Returns
|
|
1040
|
+
Returns
|
|
1041
|
+
-------
|
|
727
1042
|
Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
|
|
1043
|
+
|
|
728
1044
|
"""
|
|
729
1045
|
# Find the sum of both sets of columns
|
|
730
1046
|
df_1_sum = df_1[cols_1].sum().sum()
|
|
731
1047
|
df_2_sum = df_2[cols_2].sum().sum()
|
|
732
|
-
|
|
733
|
-
# If the the two columns are
|
|
1048
|
+
|
|
1049
|
+
# If the the two columns are
|
|
734
1050
|
if df_1_sum == df_2_sum:
|
|
735
1051
|
Answer = "They are equal"
|
|
736
1052
|
if df_1_sum != df_2_sum:
|
|
737
|
-
Answer = "They are different by " + str(df_2_sum-df_1_sum)
|
|
738
|
-
|
|
739
|
-
return Answer,df_1_sum,df_2_sum
|
|
740
|
-
|
|
1053
|
+
Answer = "They are different by " + str(df_2_sum - df_1_sum)
|
|
1054
|
+
|
|
1055
|
+
return Answer, df_1_sum, df_2_sum
|
|
1056
|
+
|
|
741
1057
|
def convert_2_df_cols_to_dict(self, df, key_col, value_col):
|
|
742
1058
|
"""
|
|
743
1059
|
Create a dictionary mapping from two columns of a DataFrame.
|
|
744
1060
|
|
|
745
|
-
Parameters
|
|
1061
|
+
Parameters
|
|
1062
|
+
----------
|
|
746
1063
|
df (pd.DataFrame): The DataFrame containing the data.
|
|
747
1064
|
key_col (str): The column name to use as keys in the dictionary.
|
|
748
1065
|
value_col (str): The column name to use as values in the dictionary.
|
|
749
1066
|
|
|
750
|
-
Returns
|
|
1067
|
+
Returns
|
|
1068
|
+
-------
|
|
751
1069
|
dict: A dictionary with keys from 'key_col' and values from 'value_col'.
|
|
1070
|
+
|
|
752
1071
|
"""
|
|
753
1072
|
if key_col not in df or value_col not in df:
|
|
754
1073
|
raise ValueError("Specified columns are not in the DataFrame")
|
|
755
1074
|
|
|
756
1075
|
return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
|
|
757
|
-
|
|
758
|
-
def create_FY_and_H_columns(
|
|
1076
|
+
|
|
1077
|
+
def create_FY_and_H_columns(
|
|
1078
|
+
self,
|
|
1079
|
+
df,
|
|
1080
|
+
index_col,
|
|
1081
|
+
start_date,
|
|
1082
|
+
starting_FY,
|
|
1083
|
+
short_format="No",
|
|
1084
|
+
half_years="No",
|
|
1085
|
+
combined_FY_and_H="No",
|
|
1086
|
+
):
|
|
759
1087
|
"""
|
|
760
|
-
Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
|
|
1088
|
+
Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
|
|
761
1089
|
|
|
762
|
-
Parameters
|
|
1090
|
+
Parameters
|
|
1091
|
+
----------
|
|
763
1092
|
df (pandas.DataFrame): Dataframe to operate on.
|
|
764
1093
|
index_col (str): Name of the column to use for datetime
|
|
765
1094
|
start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
|
|
@@ -768,16 +1097,17 @@ class dataprocessing:
|
|
|
768
1097
|
half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
|
|
769
1098
|
combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".
|
|
770
1099
|
|
|
771
|
-
Returns
|
|
1100
|
+
Returns
|
|
1101
|
+
-------
|
|
772
1102
|
pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
|
|
1103
|
+
|
|
773
1104
|
"""
|
|
774
|
-
|
|
775
1105
|
try:
|
|
776
|
-
start_date = datetime.strptime(start_date,
|
|
1106
|
+
start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
|
777
1107
|
except ValueError:
|
|
778
1108
|
print("Error: Date must be of format yyyy-mm-dd")
|
|
779
1109
|
return df
|
|
780
|
-
|
|
1110
|
+
|
|
781
1111
|
df["OBS"] = pd.to_datetime(df[index_col])
|
|
782
1112
|
df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
|
|
783
1113
|
|
|
@@ -787,35 +1117,51 @@ class dataprocessing:
|
|
|
787
1117
|
|
|
788
1118
|
def calculate_FY_vectorized(date_series):
|
|
789
1119
|
years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
|
|
790
|
-
fy =
|
|
1120
|
+
fy = "FY" + (start_year + years_since_start).astype(str)
|
|
791
1121
|
if short_format == "Yes":
|
|
792
|
-
fy =
|
|
1122
|
+
fy = "FY" + fy.str[-2:]
|
|
793
1123
|
return fy
|
|
794
1124
|
|
|
795
|
-
df[
|
|
1125
|
+
df["FY"] = calculate_FY_vectorized(df[index_col])
|
|
796
1126
|
|
|
797
1127
|
if half_years == "Yes" or combined_FY_and_H == "Yes":
|
|
1128
|
+
|
|
798
1129
|
def calculate_half_year_vectorized(date_series):
|
|
799
|
-
fy_years_since_start = (
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
1130
|
+
fy_years_since_start = (
|
|
1131
|
+
(date_series - start_date).dt.days / 364
|
|
1132
|
+
).astype(int)
|
|
1133
|
+
fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(
|
|
1134
|
+
years=1,
|
|
1135
|
+
)
|
|
1136
|
+
fy_end_of_h1 = (
|
|
1137
|
+
fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
|
|
1138
|
+
)
|
|
1139
|
+
half_year = np.where(date_series <= fy_end_of_h1, "H1", "H2")
|
|
803
1140
|
return half_year
|
|
804
|
-
|
|
805
|
-
df[
|
|
806
|
-
|
|
1141
|
+
|
|
1142
|
+
df["Half Years"] = calculate_half_year_vectorized(df[index_col])
|
|
1143
|
+
|
|
807
1144
|
if combined_FY_and_H == "Yes":
|
|
808
|
-
df[
|
|
1145
|
+
df["Financial Half Years"] = df["FY"] + " " + df["Half Years"]
|
|
809
1146
|
|
|
810
1147
|
return df
|
|
811
|
-
|
|
812
|
-
def keyword_lookup_replacement(
|
|
1148
|
+
|
|
1149
|
+
def keyword_lookup_replacement(
|
|
1150
|
+
self,
|
|
1151
|
+
df,
|
|
1152
|
+
col,
|
|
1153
|
+
replacement_rows,
|
|
1154
|
+
cols_to_merge,
|
|
1155
|
+
replacement_lookup_dict,
|
|
1156
|
+
output_column_name="Updated Column",
|
|
1157
|
+
):
|
|
813
1158
|
"""
|
|
814
1159
|
This function updates values in a specified column of the DataFrame based on a lookup dictionary.
|
|
815
1160
|
It first merges several columns into a new 'Merged' column, then uses this merged column to determine
|
|
816
1161
|
if replacements are needed based on the dictionary.
|
|
817
1162
|
|
|
818
|
-
Parameters
|
|
1163
|
+
Parameters
|
|
1164
|
+
----------
|
|
819
1165
|
df (pd.DataFrame): The DataFrame to process.
|
|
820
1166
|
col (str): The name of the column whose values are potentially replaced.
|
|
821
1167
|
replacement_rows (str): The specific value in 'col' to check for replacements.
|
|
@@ -823,65 +1169,102 @@ class dataprocessing:
|
|
|
823
1169
|
replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
|
|
824
1170
|
output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
|
|
825
1171
|
|
|
826
|
-
Returns
|
|
1172
|
+
Returns
|
|
1173
|
+
-------
|
|
827
1174
|
pd.DataFrame: The modified DataFrame with updated values in the specified column.
|
|
1175
|
+
|
|
828
1176
|
"""
|
|
829
1177
|
# Create a merged column from specified columns
|
|
830
|
-
df["Merged"] = df[cols_to_merge].apply(
|
|
831
|
-
|
|
1178
|
+
df["Merged"] = df[cols_to_merge].apply(
|
|
1179
|
+
lambda row: "|".join(row.values.astype(str)),
|
|
1180
|
+
axis=1,
|
|
1181
|
+
)
|
|
1182
|
+
|
|
832
1183
|
# Replace values in the specified column based on the lookup
|
|
833
1184
|
def replace_values(x):
|
|
834
1185
|
if x[col] == replacement_rows:
|
|
835
|
-
merged_value = x[
|
|
1186
|
+
merged_value = x["Merged"]
|
|
836
1187
|
if merged_value in replacement_lookup_dict:
|
|
837
1188
|
return replacement_lookup_dict[merged_value]
|
|
838
1189
|
return x[col]
|
|
839
|
-
|
|
1190
|
+
|
|
840
1191
|
# Apply replacement logic
|
|
841
1192
|
df[output_column_name] = df.apply(replace_values, axis=1)
|
|
842
|
-
|
|
1193
|
+
|
|
843
1194
|
# Drop the intermediate 'Merged' column
|
|
844
|
-
df.drop(columns=[
|
|
845
|
-
|
|
1195
|
+
df.drop(columns=["Merged"], inplace=True)
|
|
1196
|
+
|
|
846
1197
|
return df
|
|
847
1198
|
|
|
848
|
-
def create_new_version_of_col_using_LUT(
|
|
1199
|
+
def create_new_version_of_col_using_LUT(
|
|
1200
|
+
self,
|
|
1201
|
+
df,
|
|
1202
|
+
keys_col,
|
|
1203
|
+
value_col,
|
|
1204
|
+
dict_for_specific_changes,
|
|
1205
|
+
new_col_name="New Version of Old Col",
|
|
1206
|
+
):
|
|
849
1207
|
"""
|
|
850
|
-
Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
|
|
1208
|
+
Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
|
|
851
1209
|
The lookup is based on a column in the dataframe. Can only input one column and output one new column.
|
|
852
1210
|
|
|
853
|
-
Parameters
|
|
1211
|
+
Parameters
|
|
1212
|
+
----------
|
|
854
1213
|
df (pandas.DataFrame): The DataFrame containing the data.
|
|
855
1214
|
keys_col (str): The name of the column which the LUT will be refercing to ouput a value.
|
|
856
1215
|
value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
|
|
857
1216
|
dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
|
|
858
1217
|
new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
|
|
859
1218
|
|
|
860
|
-
Returns
|
|
1219
|
+
Returns
|
|
1220
|
+
-------
|
|
861
1221
|
pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
|
|
1222
|
+
|
|
862
1223
|
"""
|
|
863
|
-
|
|
864
1224
|
# Extract columns to change using new dictionary
|
|
865
|
-
smaller_df = df[[keys_col,value_col]]
|
|
1225
|
+
smaller_df = df[[keys_col, value_col]]
|
|
866
1226
|
|
|
867
1227
|
# Use the new dictionary to create a new LUT
|
|
868
|
-
smaller_df_with_LUT = self.apply_lookup_table_for_columns(
|
|
869
|
-
|
|
1228
|
+
smaller_df_with_LUT = self.apply_lookup_table_for_columns(
|
|
1229
|
+
smaller_df,
|
|
1230
|
+
[keys_col, value_col],
|
|
1231
|
+
dict_for_specific_changes,
|
|
1232
|
+
)
|
|
1233
|
+
|
|
870
1234
|
# In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
|
|
871
|
-
smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(
|
|
1235
|
+
smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(
|
|
1236
|
+
lambda x: x["Mapping"] if x["Mapping"] != "Other" else x[value_col],
|
|
1237
|
+
axis=1,
|
|
1238
|
+
)
|
|
872
1239
|
|
|
873
1240
|
# Drop the extra unecessary cols
|
|
874
|
-
smaller_df_with_LUT.drop([keys_col,
|
|
875
|
-
|
|
1241
|
+
smaller_df_with_LUT.drop([keys_col, "Mapping"], axis=1, inplace=True)
|
|
1242
|
+
|
|
876
1243
|
# # Output dataframes as dictionary to be used in a LUT
|
|
877
|
-
new_dict = self.convert_2_df_cols_to_dict(
|
|
1244
|
+
new_dict = self.convert_2_df_cols_to_dict(
|
|
1245
|
+
smaller_df_with_LUT,
|
|
1246
|
+
value_col,
|
|
1247
|
+
"Updated Col",
|
|
1248
|
+
)
|
|
878
1249
|
|
|
879
1250
|
# # Use new dictionary to create a new version of an old column
|
|
880
|
-
df_final = self.apply_lookup_table_for_columns(
|
|
881
|
-
|
|
1251
|
+
df_final = self.apply_lookup_table_for_columns(
|
|
1252
|
+
df,
|
|
1253
|
+
[keys_col],
|
|
1254
|
+
new_dict,
|
|
1255
|
+
"other",
|
|
1256
|
+
new_col_name,
|
|
1257
|
+
)
|
|
1258
|
+
|
|
882
1259
|
return df_final
|
|
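A minimal sketch of how the new create_new_version_of_col_using_LUT signature could be called; it assumes the helper methods it delegates to (apply_lookup_table_for_columns, convert_2_df_cols_to_dict) behave as the code above describes, and the column names and LUT values are invented:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    df = pd.DataFrame({  # illustrative data
        "campaign_id": ["c1", "c2"],
        "campaign_name": ["Brand", "Generic"],
    })
    df = ims.create_new_version_of_col_using_LUT(
        df,
        "campaign_id",
        "campaign_name",
        {"c2": "Generic - Paid Search"},  # keys not listed here keep campaign_name
        new_col_name="campaign_name_clean",
    )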
-
-def convert_df_wide_2_long(
+
+def convert_df_wide_2_long(
+self,
+df,
+value_cols,
+variable_col_name="Stacked",
+value_col_name="Value",
+):
"""
Changes a dataframe from wide to long format.

@@ -896,16 +1279,25 @@ class dataprocessing:

Raises:
ValueError: If the number of columns to depivot is less than 2.
+
"""
# Check length of value_cols is greater than 1
if len(value_cols) < 2:
raise ValueError("Number of inputs in list must be greater than 1")

# Find the columns that are not to be depivoted into one column
-id_vars = [
+id_vars = [
+col for col in df.columns if col not in value_cols
+] # Preserve column order in the DataFrame

# Melt all columns chosen into one column
-df_final = pd.melt(
+df_final = pd.melt(
+df,
+id_vars=id_vars,
+value_vars=value_cols,
+var_name=variable_col_name,
+value_name=value_col_name,
+)

# Sort column order to match expected output
ordered_columns = id_vars + [variable_col_name, value_col_name]
@@ -913,7 +1305,19 @@ class dataprocessing:

return df_final

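A hedged sketch of the wide-to-long call; at least two value columns are required, and the frame below is invented:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    wide = pd.DataFrame({  # illustrative data
        "OBS": ["2024-01-01", "2024-01-08"],
        "tv_spend": [100.0, 120.0],
        "radio_spend": [50.0, 40.0],
    })
    long = ims.convert_df_wide_2_long(
        wide,
        value_cols=["tv_spend", "radio_spend"],
        variable_col_name="Channel",
        value_col_name="Spend",
    )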
-def manually_edit_data(
+def manually_edit_data(
+self,
+df,
+filters_dict,
+col_to_change,
+new_value,
+change_in_existing_df_col="No",
+new_col_to_change_name="New",
+manual_edit_col_name=None,
+add_notes="No",
+existing_note_col_name=None,
+note=None,
+):
"""
Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe

@@ -936,31 +1340,44 @@ class dataprocessing:

Returns:
pandas.DataFrame: Dataframe with manual changes added
+
"""
-
# Raise type error if more than one col is supported
if isinstance(col_to_change, list):
raise TypeError("Col to change must be specified as a string, not a list")

# Raises value error if input is invalid for change_in_existing_df_col
if change_in_existing_df_col not in ["Yes", "No"]:
-raise ValueError(
+raise ValueError(
+"Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']",
+)

# Raises value error if input is invalid for add_notes_col
if add_notes not in ["Yes", "No"]:
-raise ValueError(
+raise ValueError(
+"Invalid input value for add_notes. Allowed values are: ['Yes', 'No']",
+)

# Validate filters_dict format
for col, cond in filters_dict.items():
if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
-raise ValueError(
+raise ValueError(
+f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'",
+)

# Create the filtered df by applying the conditions
df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

# Create a new column to add the changes if desired, else edit in the current chosen column
-col_to_update =
-
+col_to_update = (
+col_to_change
+if change_in_existing_df_col == "Yes"
+else new_col_to_change_name
+)
+if (
+change_in_existing_df_col == "No"
+and new_col_to_change_name not in df.columns
+):
df = df.copy()
df[new_col_to_change_name] = df[col_to_change]

@@ -972,19 +1389,19 @@ class dataprocessing:
if manual_edit_col_name not in df.columns:
df[manual_edit_col_name] = 0
df.loc[df_filtered.index, manual_edit_col_name] = 1
-elif not manual_edit_col_name and
-df[
-df.loc[df_filtered.index,
+elif not manual_edit_col_name and "Manual Changes" not in df.columns:
+df["Manual Changes"] = 0
+df.loc[df_filtered.index, "Manual Changes"] = 1

# Add note if desired in new column or an existing column
if add_notes == "Yes":
-note_col = existing_note_col_name if existing_note_col_name else
+note_col = existing_note_col_name if existing_note_col_name else "Notes"
if note_col not in df.columns:
df[note_col] = None
df.loc[df_filtered.index, note_col] = note

return df
-
+
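A minimal sketch of the new call shape, mainly to show the "operator value" strings that filters_dict expects; the data, filters and note text are assumptions:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    df = pd.DataFrame({"channel": ["TV", "Radio"], "spend": [120.0, 50.0]})  # illustrative data
    df = ims.manually_edit_data(
        df,
        filters_dict={"channel": "== 'TV'", "spend": ">= 100"},  # "operator value" strings
        col_to_change="spend",
        new_value=0,
        add_notes="Yes",
        note="Spend zeroed for illustration",
    )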
def format_numbers_with_commas(self, df, decimal_length_chosen=2):
"""
Converts data in numerical format into numbers with commas and a chosen decimal place length.
@@ -995,24 +1412,26 @@ class dataprocessing:

Returns:
pandas.DataFrame: The DataFrame with the chosen updated format.
+
"""
+
def format_number_with_commas(x, decimal_length=decimal_length_chosen):
if pd.isna(x): # Preserve None/NaN values
return pd.NA # Explicitly normalize to pd.NA
-
+if isinstance(x, (int, float)):
if decimal_length is not None:
format_str = f"{{:,.{decimal_length}f}}"
return format_str.format(x)
-
-
-else:
-return x # Return unchanged if not a number
+return f"{x:,}"
+return x # Return unchanged if not a number

# Apply formatting column by column
-formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
+formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(
+value=pd.NA,
+)

return formatted_df
-
+
def filter_df_on_multiple_conditions(self, df, filters_dict):
"""
Filter a dataframe based on mulitple conditions
@@ -1023,59 +1442,62 @@ class dataprocessing:

Returns:
pandas.DatFrame: Filtered Da
+
"""
mask = pd.Series(True, index=df.index)
for col, cond in filters_dict.items():
cond = cond.strip()
operator, value = cond.split(maxsplit=1)
-
+
# If value is a string condition make sure to check if there are new lines
if "'" in value:
value = value.strip().strip("'\"")
# If not a string e.g. datetime or number condition you need to transform the string into a value
else:
-value = eval(value)
+value = eval(value)

if operator == "==":
-temp_mask =
+temp_mask = df[col] == value
elif operator == "!=":
-temp_mask =
+temp_mask = df[col] != value
elif operator == ">=":
-temp_mask =
+temp_mask = df[col] >= value
elif operator == "<=":
-temp_mask =
+temp_mask = df[col] <= value
elif operator == ">":
-temp_mask =
+temp_mask = df[col] > value
elif operator == "<":
-temp_mask =
+temp_mask = df[col] < value
mask &= temp_mask

# Create the filtered df by applying the conditions
df_filtered = df[mask]
-
+
return df_filtered
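For the same "operator value" convention, a small sketch of filtering directly (string values quoted, numeric values bare); the frame is invented:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    df = pd.DataFrame({"platform": ["meta", "tiktok"], "cost": [120.0, 80.0]})  # illustrative data
    filtered = ims.filter_df_on_multiple_conditions(
        df,
        {"platform": "== 'meta'", "cost": "> 100"},
    )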
-
-def read_and_concatenate_files(self, folder_path, file_type=
+
+def read_and_concatenate_files(self, folder_path, file_type="csv"):
"""
-Reads all files of a specified type (CSV or XLSX) from a given folder
+Reads all files of a specified type (CSV or XLSX) from a given folder
and concatenates them into a single DataFrame.
-
-Parameters
+
+Parameters
+----------
folder_path (str): The path to the folder containing the files.
file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
-
-Returns
+
+Returns
+-------
pd.DataFrame: A DataFrame containing the concatenated data from all files.
+
"""
-
# Initialize an empty list to hold dataframes
dataframes = []

# Define file extension based on file_type
-if file_type ==
-extension =
-elif file_type ==
-extension =
+if file_type == "csv":
+extension = ".csv"
+elif file_type == "xlsx":
+extension = ".xlsx"
else:
raise ValueError("file_type must be either 'csv' or 'xlsx'")

@@ -1085,19 +1507,19 @@ class dataprocessing:
if filename.endswith(extension):
file_path = os.path.join(folder_path, filename)
# Read the file into a DataFrame
-if file_type ==
+if file_type == "csv":
df = pd.read_csv(file_path)
-elif file_type ==
+elif file_type == "xlsx":
df = pd.read_excel(file_path)
# Append the DataFrame to the list
dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)
-
+
return combined_df
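A sketch of the concatenation helper under its new default; the folder path is a placeholder, not a path shipped with the package:

    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    combined = ims.read_and_concatenate_files("./exports/weekly", file_type="csv")  # path is illustrative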
-
-def upgrade_outdated_packages(self, exclude_packages=[
+
+def upgrade_outdated_packages(self, exclude_packages=["twine"]):
"""
Upgrade all outdated Python packages except those specified in `exclude_packages`.

@@ -1108,32 +1530,49 @@ class dataprocessing:
try:
# Get all installed packages
installed_packages_result = subprocess.run(
-"pip list --format=json",
+"pip list --format=json",
+check=False,
+shell=True,
+capture_output=True,
+text=True,
)
installed_packages = json.loads(installed_packages_result.stdout)

# Get the list of outdated packages
outdated_packages_result = subprocess.run(
-"pip list --outdated --format=json",
+"pip list --outdated --format=json",
+check=False,
+shell=True,
+capture_output=True,
+text=True,
)
outdated_packages = json.loads(outdated_packages_result.stdout)

# Create a set of outdated package names for quick lookup
-outdated_package_names = {pkg[
+outdated_package_names = {pkg["name"] for pkg in outdated_packages}

# Upgrade only outdated packages, excluding specified packages
for package in installed_packages:
-package_name = package[
-if
+package_name = package["name"]
+if (
+package_name in outdated_package_names
+and package_name not in exclude_packages
+):
try:
print(f"Upgrading package: {package_name}")
upgrade_result = subprocess.run(
-f"pip install --upgrade {package_name}",
+f"pip install --upgrade {package_name}",
+check=False,
+shell=True,
+capture_output=True,
+text=True,
)
if upgrade_result.returncode == 0:
print(f"Successfully upgraded {package_name}")
else:
-print(
+print(
+f"Failed to upgrade {package_name}: {upgrade_result.stderr}",
+)
except Exception as e:
print(f"An error occurred while upgrading {package_name}: {e}")
elif package_name in exclude_packages:
@@ -1145,12 +1584,12 @@ class dataprocessing:

def convert_mixed_formats_dates(self, df, column_name):
# Convert initial dates to datetime with coercion to handle errors
-df[column_name] = pd.to_datetime(df[column_name], errors=
+df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
df[column_name] = df[column_name].astype(str)
corrected_dates = []
-
+
for date_str in df[column_name]:
-date_str = date_str.replace(
+date_str = date_str.replace("-", "").replace("/", "")
if len(date_str) == 8:
year = date_str[:4]
month = date_str[4:6]
@@ -1161,39 +1600,45 @@ class dataprocessing:
else:
corrected_date_str = f"{year}-{month}-{day}"
# Convert to datetime
-corrected_date = pd.to_datetime(corrected_date_str, errors=
+corrected_date = pd.to_datetime(corrected_date_str, errors="coerce")
else:
-corrected_date = pd.to_datetime(date_str, errors=
-
+corrected_date = pd.to_datetime(date_str, errors="coerce")
+
corrected_dates.append(corrected_date)
-
+
# Check length of the corrected_dates list
if len(corrected_dates) != len(df):
-raise ValueError(
-
+raise ValueError(
+"Length of corrected_dates does not match the original DataFrame",
+)
+
# Assign the corrected dates back to the DataFrame
df[column_name] = corrected_dates
return df

-def fill_weekly_date_range(self, df, date_column, freq=
+def fill_weekly_date_range(self, df, date_column, freq="W-MON"):
# Ensure the date column is in datetime format
df[date_column] = pd.to_datetime(df[date_column])
-
+
# Generate the full date range with the specified frequency
-full_date_range = pd.date_range(
-
+full_date_range = pd.date_range(
+start=df[date_column].min(),
+end=df[date_column].max(),
+freq=freq,
+)
+
# Create a new dataframe with the full date range
full_date_df = pd.DataFrame({date_column: full_date_range})
-
+
# Merge the original dataframe with the new full date range dataframe
-df_full = full_date_df.merge(df, on=date_column, how=
-
+df_full = full_date_df.merge(df, on=date_column, how="left")
+
# Fill missing values with 0
df_full.fillna(0, inplace=True)
-
+
return df_full
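A sketch of the weekly gap-filling call; the dates below are chosen to be Mondays so they line up with the default "W-MON" frequency, and the data is invented:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    weekly = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-15"], "spend": [100.0, 80.0]})  # illustrative data
    filled = ims.fill_weekly_date_range(weekly, "OBS", freq="W-MON")
    # The missing week (2024-01-08) is inserted with its numeric values filled as 0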
-
-def add_prefix_and_suffix(self, df, prefix=
+
+def add_prefix_and_suffix(self, df, prefix="", suffix="", date_col=None):
"""
Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.

@@ -1205,19 +1650,28 @@ class dataprocessing:

Returns:
pd.DataFrame: The DataFrame with updated column names.
+
"""
-
# If there is no date column
if date_col is None:
# Add prefixes and suffixes to all columns
df.columns = [prefix + col + suffix for col in df.columns]
else:
# Add prefixes and suffixes to all columns except the date column
-df.columns = [
-
+df.columns = [
+prefix + col + suffix if col != date_col else col for col in df.columns
+]
+
return df

-def create_dummies(
+def create_dummies(
+self,
+df,
+date_col=None,
+dummy_threshold=0,
+add_total_dummy_col="No",
+total_col_name="total",
+):
"""
Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
@@ -1231,13 +1685,15 @@ class dataprocessing:

Returns:
pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
-"""

+"""
# If there is no date column
if date_col is None:
-df = df.apply(
+df = df.apply(
+lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
+)

-if add_total_dummy_col !=
+if add_total_dummy_col != "No":
# Find max value of rows
df[total_col_name] = df.max(axis=1)

@@ -1245,18 +1701,25 @@ class dataprocessing:
else:
# Create dummies for all columns except the date column
df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
-lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
+lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0),
)

-if add_total_dummy_col !=
+if add_total_dummy_col != "No":
# Find max value of rows
df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)

return df

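A sketch of the dummy conversion with a protected date column; the frame and threshold are assumptions:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    media = pd.DataFrame({"OBS": ["2024-01-01"], "tv_spend": [250.0], "radio_spend": [0.0]})  # illustrative data
    dummies = ims.create_dummies(
        media,
        date_col="OBS",
        dummy_threshold=0,
        add_total_dummy_col="Yes",  # adds a "total" column holding the row-wise max
    )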
-def replace_substrings(
+def replace_substrings(
+self,
+df,
+column,
+replacements,
+to_lower=False,
+new_column=None,
+):
"""
-Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
+Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.

Args:
@@ -1268,6 +1731,7 @@ class dataprocessing:

Returns:
pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
+
"""
if new_column is not None:
# Create a new column for replacements
@@ -1287,7 +1751,7 @@ class dataprocessing:

return df

-def add_total_column(self, df, exclude_col=None, total_col_name=
+def add_total_column(self, df, exclude_col=None, total_col_name="Total"):
"""
Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.

@@ -1298,17 +1762,27 @@ class dataprocessing:

Returns:
pd.DataFrame: The DataFrame with an added total column.
+
"""
if exclude_col and exclude_col in df.columns:
# Ensure the column to exclude exists before dropping
-df[total_col_name] = df.drop(columns=[exclude_col], errors=
+df[total_col_name] = df.drop(columns=[exclude_col], errors="ignore").sum(
+axis=1,
+)
else:
# Sum across all columns if no column is specified to exclude
df[total_col_name] = df.sum(axis=1)
-
+
return df

-def apply_lookup_table_based_on_substring(
+def apply_lookup_table_based_on_substring(
+self,
+df,
+column_name,
+category_dict,
+new_col_name="Category",
+other_label="Other",
+):
"""
Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.

@@ -1321,6 +1795,7 @@ class dataprocessing:

Returns:
pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
+
"""

def categorize_text(text):
@@ -1331,11 +1806,14 @@ class dataprocessing:
text (str): The text string to categorize.

Returns:
-str: The category assigned based on the first matching substring found in the text. If no
+str: The category assigned based on the first matching substring found in the text. If no
matching substring is found, returns other_name.
+
"""
for key, category in category_dict.items():
-if
+if (
+key.lower() in text.lower()
+): # Check if the substring is in the text (case-insensitive)
return category
return other_label # Default category if no match is found

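A sketch of substring categorisation; the campaign strings and category mapping are invented:

    import pandas as pd
    from imsciences.mmm import dataprocessing  # assumed import path

    ims = dataprocessing()
    spend = pd.DataFrame({"campaign": ["Summer Sale - Meta", "Brand TV Q1"]})  # illustrative data
    spend = ims.apply_lookup_table_based_on_substring(
        spend,
        column_name="campaign",
        category_dict={"meta": "Social", "tv": "TV"},  # matched case-insensitively
        new_col_name="Channel",
    )
    # Rows with no matching substring fall back to other_label ("Other")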
@@ -1354,6 +1832,7 @@ class dataprocessing:

Returns:
tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
+
"""
# Ensure date columns are in datetime format
df1[date_col] = pd.to_datetime(df1[date_col])
@@ -1368,29 +1847,43 @@ class dataprocessing:
df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]

# Merge the DataFrames on the date column
-merged_df = pd.merge(
+merged_df = pd.merge(
+df1_overlap,
+df2_overlap,
+on=date_col,
+suffixes=("_df1", "_df2"),
+)

# Get common columns, excluding the date column
-common_cols = [
+common_cols = [
+col for col in df1.columns if col != date_col and col in df2.columns
+]

# Create a DataFrame for differences
diff_df = pd.DataFrame({date_col: merged_df[date_col]})

total_diff_list = []
for col in common_cols:
-diff_col = f
-diff_df[diff_col] =
+diff_col = f"diff_{col}"
+diff_df[diff_col] = (
+merged_df[f"{col}_df1"] - merged_df[f"{col}_df2"]
+) # Corrected subtraction order

# Sum differences for the column
total_diff = diff_df[diff_col].sum()
-total_diff_list.append({
+total_diff_list.append({"Column": col, "Total Difference": total_diff})

# Create summary DataFrame
total_diff_df = pd.DataFrame(total_diff_list)

return diff_df, total_diff_df

-def week_commencing_2_week_commencing_conversion_isoweekday(
+def week_commencing_2_week_commencing_conversion_isoweekday(
+self,
+df,
+date_col,
+week_commencing="mon",
+):
"""
Convert a DataFrame's date column so that each date is mapped back
to the 'week_commencing' day of the *current ISO week*.
@@ -1398,7 +1891,7 @@ class dataprocessing:
Args:
df (pandas.DataFrame): The DataFrame with date-based data.
date_col (str): The name of the date column.
-week_commencing (str): The desired start of the week.
+week_commencing (str): The desired start of the week.
('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
Uses ISO day numbering (Mon=1, ..., Sun=7).

@@ -1406,9 +1899,18 @@ class dataprocessing:
pandas.DataFrame: Original DataFrame with an extra column
'week_start_<week_commencing>' containing the
start-of-week date for each row.
+
"""
# ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
-iso_day_dict = {
+iso_day_dict = {
+"mon": 1,
+"tue": 2,
+"wed": 3,
+"thur": 4,
+"fri": 5,
+"sat": 6,
+"sun": 7,
+}

target_day = iso_day_dict[week_commencing]

@@ -1419,15 +1921,23 @@ class dataprocessing:
# Apply the transformation
new_col = f"week_start_{week_commencing}"
df[new_col] = df[date_col].apply(map_to_week_start)
-
+
return df
-
-def seasonality_feature_extraction(
+
+def seasonality_feature_extraction(
+self,
+df,
+kpi_var,
+n_features=10,
+test_size=0.1,
+random_state=42,
+shuffle=False,
+):
"""
1) Uses the provided dataframe (df), where:
- df['kpi_total_sales'] is the target (y).
- df['OBS'] is a date or index column (excluded from features).
-
+
2) Splits data into train/test using the specified test_size, random_state, and shuffle.
3) Trains XGBoost and Random Forest on all features.
4) Extracts the top n_features from each model.
@@ -1457,20 +1967,22 @@ class dataprocessing:
- "combined_features": merged unique feature list
- "performance": dictionary of performance metrics
- "models": dictionary of fitted models
+
"""
# ---------------------------------------------------------------------
# 1. Prepare your data (X, y)
# ---------------------------------------------------------------------
# Extract target and features
y = df[kpi_var]
-X = df.drop(columns=[
+X = df.drop(columns=["OBS", kpi_var])

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
-X,
+X,
+y,
test_size=test_size,
random_state=random_state,
-shuffle=shuffle
+shuffle=shuffle,
)

# ---------------------------------------------------------------------
@@ -1483,16 +1995,13 @@ class dataprocessing:
# (B) Get feature importances
xgb_importances = xgb_model_full.feature_importances_
xgb_feat_importance_df = (
-pd.DataFrame({
-
-'importance': xgb_importances
-})
-.sort_values('importance', ascending=False)
+pd.DataFrame({"feature": X.columns, "importance": xgb_importances})
+.sort_values("importance", ascending=False)
.reset_index(drop=True)
)

# (C) Select top N features
-top_features_xgb = xgb_feat_importance_df[
+top_features_xgb = xgb_feat_importance_df["feature"].head(n_features).tolist()

# (D) Subset data to top N features
X_train_xgb_topN = X_train[top_features_xgb]
@@ -1510,16 +2019,13 @@ class dataprocessing:
# (B) Get feature importances
rf_importances = rf_model_full.feature_importances_
rf_feat_importance_df = (
-pd.DataFrame({
-
-'importance': rf_importances
-})
-.sort_values('importance', ascending=False)
+pd.DataFrame({"feature": X.columns, "importance": rf_importances})
+.sort_values("importance", ascending=False)
.reset_index(drop=True)
)

# (C) Select top N features
-top_features_rf = rf_feat_importance_df[
+top_features_rf = rf_feat_importance_df["feature"].head(n_features).tolist()

# (D) Subset data to top N features
X_train_rf_topN = X_train[top_features_rf]
@@ -1551,25 +2057,45 @@ class dataprocessing:

return output

|
-
def quid_pr
|
|
2060
|
+
def quid_pr(self, df):
|
|
1555
2061
|
def convert_date(date_str):
|
|
1556
2062
|
try:
|
|
1557
|
-
return datetime.strptime(date_str,
|
|
2063
|
+
return datetime.strptime(date_str, "%b %d, %Y")
|
|
1558
2064
|
except ValueError:
|
|
1559
2065
|
return None # Return None if conversion fails
|
|
2066
|
+
|
|
1560
2067
|
# Apply conversion to create new columns
|
|
1561
|
-
df[
|
|
1562
|
-
df[
|
|
1563
|
-
df[
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
df[
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
2068
|
+
df["Start Date"] = df["Earliest Published"].astype(str).apply(convert_date)
|
|
2069
|
+
df["End Date"] = df["Latest Published"].astype(str).apply(convert_date)
|
|
2070
|
+
df["Days Duration"] = (
|
|
2071
|
+
df["End Date"] - df["Start Date"]
|
|
2072
|
+
).dt.days + 1 # Ensure inclusive range
|
|
2073
|
+
df["Count per Day"] = (
|
|
2074
|
+
df["Published Count"] / df["Days Duration"]
|
|
2075
|
+
) # Calculate count per day
|
|
2076
|
+
df["Social Engagement per Day"] = df["Social Engagement"] / df["Days Duration"]
|
|
2077
|
+
df["Week Start"] = df["Start Date"].apply(
|
|
2078
|
+
lambda x: x - timedelta(days=x.weekday()) if pd.notnull(x) else None,
|
|
2079
|
+
)
|
|
2080
|
+
count_df = df.groupby("Week Start")["Count per Day"].sum().reset_index()
|
|
2081
|
+
total_engagement_per_company = (
|
|
2082
|
+
df.groupby("Company (Primary Mention)")["Social Engagement"]
|
|
2083
|
+
.sum()
|
|
2084
|
+
.reset_index()
|
|
2085
|
+
) # Caluclates Social Engagement across whole period
|
|
2086
|
+
valid_companies = total_engagement_per_company[
|
|
2087
|
+
total_engagement_per_company["Social Engagement"] > 0
|
|
2088
|
+
][
|
|
2089
|
+
"Company (Primary Mention)"
|
|
2090
|
+
] # Filters out Companies with no Social Engagement
|
|
2091
|
+
social_engagement_df = (
|
|
2092
|
+
df[df["Company (Primary Mention)"].isin(valid_companies)]
|
|
2093
|
+
.groupby(["Week Start", "Company (Primary Mention)"])["Social Engagement"]
|
|
2094
|
+
.sum()
|
|
2095
|
+
.reset_index()
|
|
2096
|
+
)
|
|
2097
|
+
total_social_engagement_df = (
|
|
2098
|
+
df.groupby("Week Start")["Social Engagement per Day"].sum().reset_index()
|
|
2099
|
+
)
|
|
2100
|
+
|
|
2101
|
+
return count_df, total_social_engagement_df, social_engagement_df
|