imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/mmm.py ADDED
@@ -0,0 +1,1415 @@
+ import pandas as pd
+ import calendar
+ import os
+ import numpy as np
+ import re
+ from datetime import datetime, timedelta
+ import subprocess
+ import json
+
+ class dataprocessing:
+
+     def help(self):
+
+         print("\n1. get_wd_levels")
+         print(" - Description: Get the working directory with the option of moving up parents.")
+         print(" - Usage: get_wd_levels(levels)")
+         print(" - Example: get_wd_levels(0)")
+
+         print("\n2. aggregate_daily_to_wc_long")
+         print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
+         print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
+         print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")
+
+         print("\n3. convert_monthly_to_daily")
+         print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
+         print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
+         print(" - Example: convert_monthly_to_daily(df, 'date')")
+
+         print("\n4. week_of_year_mapping")
+         print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
+         print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
+         print(" - Example: week_of_year_mapping(df, 'week', 'mon')")
+
+         print("\n5. rename_cols")
+         print(" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.")
+         print(" - Usage: rename_cols(df, name='ame_')")
+         print(" - Example: rename_cols(df, 'ame_facebook')")
+
+         print("\n6. merge_new_and_old")
+         print(" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.")
+         print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
+         print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")
+
+         print("\n7. merge_dataframes_on_column")
+         print(" - Description: Merge a list of DataFrames on a common column.")
+         print(" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')")
+         print(" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')")
+
+         print("\n8. merge_and_update_dfs")
+         print(" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.")
+         print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
+         print(" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")
+
+         print("\n9. convert_us_to_uk_dates")
+         print(" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.")
+         print(" - Usage: convert_us_to_uk_dates(df, date_col)")
+         print(" - Example: convert_us_to_uk_dates(df, 'date')")
+
+         print("\n10. combine_sheets")
+         print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
+         print(" - Usage: combine_sheets(all_sheets)")
+         print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
+
+         print("\n11. pivot_table")
+         print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
+         print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, date_format='%Y-%m-%d', reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')")
+         print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)")
+
+         print("\n12. apply_lookup_table_for_columns")
+         print(" - Description: Maps substrings in columns to new values based on a dictionary.")
+         print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
+         print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
+
+         print("\n13. aggregate_daily_to_wc_wide")
+         print(" - Description: Aggregates daily data into weekly data and pivots it to wide format.")
+         print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)")
+         print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)")
+
+         print("\n14. merge_cols_with_seperator")
+         print(" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.")
+         print(" - Usage: merge_cols_with_seperator(df, col_names, seperator='_', output_column_name='Merged')")
+         print(" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], seperator='|', output_column_name='Merged Columns')")
+
+         print("\n15. check_sum_of_df_cols_are_equal")
+         print(" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.")
+         print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
+         print(" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')")
+
+         print("\n16. convert_2_df_cols_to_dict")
+         print(" - Description: Creates a dictionary from two DataFrame columns.")
+         print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
+         print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
+
+         print("\n17. create_FY_and_H_columns")
+         print(" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.")
+         print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')")
+         print(" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')")
+
+         print("\n18. keyword_lookup_replacement")
+         print(" - Description: Updates values in a column based on a lookup dictionary with conditional logic.")
+         print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')")
+         print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')")
+
+         print("\n19. create_new_version_of_col_using_LUT")
+         print(" - Description: Creates a new column based on a lookup table applied to an existing column.")
+         print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
+         print(" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)")
+
+         print("\n20. convert_df_wide_2_long")
+         print(" - Description: Converts a wide-format DataFrame into a long-format DataFrame.")
+         print(" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')")
+         print(" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')")
+
+         print("\n21. manually_edit_data")
+         print(" - Description: Manually updates specified cells in a DataFrame based on filters.")
+         print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
+         print(" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')")
+
+         print("\n22. format_numbers_with_commas")
+         print(" - Description: Formats numerical columns with commas and a specified number of decimal places.")
+         print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
+         print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")
+
+         print("\n23. filter_df_on_multiple_conditions")
+         print(" - Description: Filters a DataFrame based on multiple column conditions.")
+         print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
+         print(" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': \"== 'val'\"})")
+
+         print("\n24. read_and_concatenate_files")
+         print(" - Description: Reads and concatenates files from a specified folder into a single DataFrame.")
+         print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
+         print(" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')")
+
+         print("\n25. upgrade_outdated_packages")
+         print(" - Description: Upgrades all outdated Python packages except specified ones.")
+         print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
+         print(" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])")
+
+         print("\n26. convert_mixed_formats_dates")
+         print(" - Description: Converts mixed-format date columns into standardized datetime format.")
+         print(" - Usage: convert_mixed_formats_dates(df, column_name)")
+         print(" - Example: convert_mixed_formats_dates(df, 'date_col')")
+
+         print("\n27. fill_weekly_date_range")
+         print(" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.")
+         print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
+         print(" - Example: fill_weekly_date_range(df, 'date_col')")
+
+         print("\n28. add_prefix_and_suffix")
+         print(" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.")
+         print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
+         print(" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')")
+
+         print("\n29. create_dummies")
+         print(" - Description: Creates dummy variables for columns, with an option to add a total dummy column.")
+         print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
+         print(" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)")
+
+         print("\n30. replace_substrings")
+         print(" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.")
+         print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
+         print(" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')")
+
+         print("\n31. add_total_column")
+         print(" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.")
+         print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
+         print(" - Example: add_total_column(df, exclude_col='date_col')")
+
+         print("\n32. apply_lookup_table_based_on_substring")
+         print(" - Description: Categorizes text in a column using a lookup table based on substrings.")
+         print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
+         print(" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})")
+
+         print("\n33. compare_overlap")
+         print(" - Description: Compares overlapping periods between two DataFrames and summarizes differences.")
+         print(" - Usage: compare_overlap(df1, df2, date_col)")
+         print(" - Example: compare_overlap(df1, df2, 'date_col')")
+
+         print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
+         print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
+         print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
+         print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")
+
+     def get_wd_levels(self, levels):
+         """
+         Gets the current working directory and gives the option of moving up a number of parent levels.
+
+         Parameters:
+         - levels: int
+             The number of levels to move up the directory path.
+
+         Returns:
+         - The resulting working directory path.
+         """
+
+         directory = os.getcwd()
+         for _ in range(levels):
+             directory = os.path.dirname(directory)
+         return directory
+
+     def aggregate_daily_to_wc_long(self, df: pd.DataFrame, date_column: str, group_columns: list[str], sum_columns: list[str], wc: str = 'sun', aggregation: str = 'sum') -> pd.DataFrame:
+         """
+         Aggregates daily data into weekly data, starting on a specified day of the week,
+         and groups the data by additional specified columns. It aggregates specified numeric
+         columns by summing, averaging, or counting them. NaN values are replaced with 0
+         and the index is reset. The week start column is named 'OBS'.
+
+         Parameters:
+         - df: pandas DataFrame
+             The input DataFrame containing daily data.
+         - date_column: string
+             The name of the column in the DataFrame that contains date information.
+         - group_columns: list of strings
+             Additional column names to group by along with the weekly grouping.
+         - sum_columns: list of strings
+             Numeric column names to be aggregated during aggregation.
+         - wc: string
+             The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
+         - aggregation: string, optional (default 'sum')
+             Aggregation method, either 'sum', 'average', or 'count'.
+
+         Returns:
+         - pandas DataFrame
+             A new DataFrame with weekly aggregated data. The index is reset,
+             and columns represent the grouped and aggregated metrics. The DataFrame
+             is in long format, with one row per week and group combination.
+         """
+
+         # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
+         days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
+         if wc.lower() not in days:
+             raise ValueError(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")
+
+         start_day = days[wc.lower()]
+
+         # Make a copy of the DataFrame
+         df_copy = df.copy()
+
+         # Convert the date column to datetime
+         df_copy[date_column] = pd.to_datetime(df_copy[date_column])
+
+         # Determine the start of each week
+         df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))
+
+         # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
+         for col in sum_columns:
+             df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
+
+         # Group by the new week start column and additional columns, then aggregate the numeric columns
+         if aggregation == 'average':
+             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
+         elif aggregation == 'count':
+             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
+         else:  # Default to 'sum' if any other value is provided
+             grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()
+
+         # Rename 'week_start' column to 'OBS'
+         grouped = grouped.rename(columns={'week_start': 'OBS'})
+
+         return grouped
+
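+     # Illustrative usage sketch (comments only; `df` and its 'date', 'platform'
+     # and 'cost' columns are assumed example data, not part of the package):
+     #
+     #   dp = dataprocessing()
+     #   weekly = dp.aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost'], wc='mon')
+     #   # -> one row per ('OBS' week commencing Monday, platform) with summed cost
+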
+     def convert_monthly_to_daily(self, df, date_column, divide=True):
+         """
+         Convert a DataFrame with monthly data to daily data.
+         This function takes a DataFrame and a date column, then it expands each
+         monthly record into daily records by dividing the numeric values by the number of days in that month.
+
+         :param df: DataFrame with monthly data.
+         :param date_column: The name of the column containing the date.
+         :param divide: whether to divide each numeric value by the number of days in the month (default True).
+         :return: A new DataFrame with daily data.
+         """
+
+         # Convert date_column to datetime
+         df[date_column] = pd.to_datetime(df[date_column])
+
+         # Initialize an empty list to hold the daily records
+         daily_records = []
+
+         # Iterate over each row in the DataFrame
+         for _, row in df.iterrows():
+             # Calculate the number of days in the month
+             num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]
+
+             # Create a new record for each day of the month
+             for day in range(1, num_days + 1):
+                 daily_row = row.copy()
+                 daily_row[date_column] = row[date_column].replace(day=day)
+
+                 # Divide each numeric value by the number of days in the month
+                 for col in df.columns:
+                     if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
+                         if divide is True:
+                             daily_row[col] = row[col] / num_days
+                         else:
+                             daily_row[col] = row[col]
+                 daily_records.append(daily_row)
+
+         # Convert the list of daily records into a DataFrame
+         daily_df = pd.DataFrame(daily_records)
+
+         return daily_df
+
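+     # Illustrative usage sketch (comments only; the example frame is assumed data):
+     #
+     #   monthly = pd.DataFrame({'date': ['2023-01-01', '2023-02-01'], 'spend': [310.0, 280.0]})
+     #   daily = dataprocessing().convert_monthly_to_daily(monthly, 'date')
+     #   # January expands to 31 rows of 10.0 each; February to 28 rows of 10.0
+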
+     def week_of_year_mapping(self, df, week_col, start_day_str):
+         """
+         Converts a week column in 'yyyy-Www' format into a week commencing date column
+         'OBS' (formatted dd/mm/yyyy), with weeks starting on the given day.
+         """
+
+         # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
+         day_mapping = {
+             'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
+         }
+
+         # Convert the day string to a number, or raise an error if not valid
+         start_day = day_mapping.get(start_day_str.lower())
+         if start_day is None:
+             raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")
+
+         # Function to convert week number to start date of the week
+         def week_to_startdate(week_str, start_day):
+             year, week = map(int, week_str.split('-W'))
+             first_day_of_year = datetime(year, 1, 1)
+             first_weekday_of_year = first_day_of_year.weekday()  # Monday is 0 and Sunday is 6
+
+             # Calculate days to adjust to the desired start day of the week
+             days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
+             start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)
+
+             # Calculate the start of the desired week
+             start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
+             return start_of_week
+
+         # Apply the function to each row in the specified week column
+         df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
+         return df
+
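+     # Illustrative usage sketch (comments only; the 'week' values are assumed data):
+     #
+     #   df = pd.DataFrame({'week': ['2023-W01', '2023-W02']})
+     #   df = dataprocessing().week_of_year_mapping(df, 'week', 'mon')
+     #   # adds an 'OBS' column of week commencing dates, e.g. '2023-W01' -> '02/01/2023'
+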
+     def rename_cols(self, df, name='ame_'):
+         new_columns = {}
+         for col in df.columns:
+             if col != 'OBS':
+                 new_col_name = name + col.replace(" ", "_").lower()
+             else:
+                 new_col_name = col
+             new_columns[col] = new_col_name
+         return df.rename(columns=new_columns)
+
+     def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
+         """
+         Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
+         Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
+
+         Parameters:
+         - old_df: pandas DataFrame
+             The old DataFrame from which to take the numeric values up to the specified date.
+         - old_col: str
+             The name of the numeric column in the old DataFrame whose values are to be taken.
+         - new_df: pandas DataFrame
+             The new DataFrame from which to take the numeric values from the specified date onwards.
+         - new_col: str
+             The name of the numeric column in the new DataFrame whose values are to be taken.
+         - cutoff_date: str
+             The cut-off date in 'YYYY-MM-DD' format to split the data between the two DataFrames.
+         - date_col_name: str, optional (default 'OBS')
+             The name of the date column in both DataFrames.
+
+         Returns:
+         - pandas DataFrame
+             A new DataFrame with two columns: 'OBS' and a column named after 'new_col' containing merged numeric values.
+         """
+
+         # Convert date columns in both dataframes to datetime for comparison
+         old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
+         new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
+
+         # Convert the cutoff date string to datetime
+         cutoff_date = pd.to_datetime(cutoff_date)
+
+         # Split old and new dataframes based on the cutoff date
+         old_values = old_df[old_df[date_col_name] <= cutoff_date]
+         new_values = new_df[new_df[date_col_name] > cutoff_date]
+
+         # Create a new DataFrame with two columns: 'OBS' and a column named after 'new_col'
+         merged_df = pd.DataFrame({
+             'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
+             new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
+         })
+
+         return merged_df
+
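+     # Illustrative usage sketch (comments only; df_old and df_new are assumed inputs):
+     #
+     #   spliced = dataprocessing().merge_new_and_old(df_old, 'sales_old', df_new, 'sales', '2023-01-15')
+     #   # rows up to and including 2023-01-15 come from df_old['sales_old'],
+     #   # later rows from df_new['sales'], combined into one 'sales' column keyed by 'OBS'
+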
+     def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
+         """
+         Merge a list of DataFrames on a common column.
+
+         Parameters:
+         - dataframes: A list of DataFrames to merge.
+         - common_column: The name of the common column to merge on.
+         - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').
+
+         Returns:
+         - A merged DataFrame.
+         """
+         if not dataframes:
+             return None
+
+         merged_df = dataframes[0]  # Start with the first DataFrame
+
+         for df in dataframes[1:]:
+             merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)
+
+         # Check if the common column is of datetime dtype and sort by it if so
+         if merged_df[common_column].dtype == 'datetime64[ns]':
+             merged_df[common_column] = pd.to_datetime(merged_df[common_column])
+             merged_df = merged_df.sort_values(by=common_column)
+         merged_df = merged_df.fillna(0)
+
+         return merged_df
+
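+     # Illustrative usage sketch (comments only; df1/df2/df3 are assumed inputs
+     # sharing an 'OBS' column):
+     #
+     #   merged = dataprocessing().merge_dataframes_on_column([df1, df2, df3], common_column='OBS')
+     #   # outer-joins on 'OBS', sorts by date when 'OBS' is datetime, and fills gaps with 0
+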
+     def merge_and_update_dfs(self, df1, df2, key_column):
+         """
+         Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
+         and returns a dataframe sorted by the key column.
+
+         Parameters:
+         df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
+         df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
+         key_column (str): The name of the column to merge and sort by (e.g., 'OBS').
+
+         Returns:
+         DataFrame: The merged and updated dataframe.
+         """
+
+         # Sort both DataFrames by the key column
+         df1_sorted = df1.sort_values(by=key_column)
+         df2_sorted = df2.sort_values(by=key_column)
+
+         # Perform the full outer merge
+         merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))
+
+         # Update with non-null values from df2
+         for column in merged_df.columns:
+             if column.endswith('_finalised'):
+                 original_column = column.replace('_finalised', '')
+                 merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
+                 merged_df.drop(column, axis=1, inplace=True)
+
+         # Sort the merged DataFrame by the key column
+         merged_df.sort_values(by=key_column, inplace=True)
+
+         # Handle null values (optional, can be adjusted as needed)
+         merged_df.fillna(0, inplace=True)
+
+         return merged_df
+
+     def convert_us_to_uk_dates(self, df, date_col):
+         """
+         Processes the date column of a DataFrame to remove hyphens and slashes,
+         and converts it to a datetime object.
+
+         Parameters:
+         df (pd.DataFrame): The DataFrame containing the date column.
+         date_col (str): The name of the date column.
+
+         Returns:
+         pd.DataFrame: The DataFrame with the processed date column.
+         """
+         df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
+         df[date_col] = pd.to_datetime(
+             df[date_col].str.slice(0, 2) + '/' +
+             df[date_col].str.slice(2, 4) + '/' +
+             df[date_col].str.slice(4, 8),
+             format='%m/%d/%Y'
+         )
+         return df
+
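+     # Illustrative usage sketch (comments only; the date strings are assumed data):
+     #
+     #   df = pd.DataFrame({'date': ['01/31/2023', '02-28-2023']})
+     #   df = dataprocessing().convert_us_to_uk_dates(df, 'date')
+     #   # both rows parse as month-first dates -> Timestamps 2023-01-31 and 2023-02-28
+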
+     def combine_sheets(self, all_sheets):
+         """
+         Combines multiple DataFrames from a dictionary into a single DataFrame.
+         Adds a column 'SheetName' indicating the origin sheet of each row.
+
+         Parameters:
+         all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.
+
+         Returns:
+         DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
+         """
+         combined_df = pd.DataFrame()
+
+         for sheet_name, df in all_sheets.items():
+             df['SheetName'] = sheet_name
+             combined_df = pd.concat([combined_df, df], ignore_index=True)
+
+         return combined_df
+
+     def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
+         """
+         Provides the ability to create pivot tables, filtering the data to get to the data you want and then pivoting on certain columns.
+
+         Args:
+             df (pandas.DataFrame): The DataFrame containing the data.
+             index_col (str): Name of the column for your pivot table to index on.
+             columns (str): Name of the columns for your pivot table.
+             values_col (str): Name of the values column for your pivot table.
+             filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None.
+             fill_value (int, optional): The value to replace nan with. Defaults to 0.
+             aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
+             margins (bool, optional): Whether the pivot table needs a totals row and column. Defaults to False.
+             margins_name (str, optional): The name of the totals columns. Defaults to "Total".
+             datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to True.
+             date_format (str, optional): Format applied to the index column after pivoting. Defaults to "%Y-%m-%d".
+             reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
+             fill_missing_weekly_dates (bool, optional): Fills in any weekly missing dates. Defaults to False.
+             week_commencing (str, optional): Frequency used when filling in missing weeks. Defaults to 'W-MON'.
+
+         Returns:
+             pandas.DataFrame: The pivot table specified.
+         """
+
+         # Validate inputs
+         if index_col not in df.columns:
+             raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
+         if columns not in df.columns:
+             raise ValueError(f"columns '{columns}' not found in DataFrame.")
+         if values_col not in df.columns:
+             raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
+
+         # Apply filters if provided
+         if filters_dict:
+             df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
+         else:
+             df_filtered = df.copy()
+
+         # Ensure index column is in datetime format if needed
+         if datetime_trans_needed:
+             df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
+
+         # Create the pivot table
+         pivoted_df = df_filtered.pivot_table(
+             index=index_col,
+             columns=columns,
+             values=values_col,
+             aggfunc=aggfunc,
+             margins=margins,
+             margins_name=margins_name,
+         )
+
+         # Handle column headers (map objects are not reversible, so materialise a list first)
+         if isinstance(pivoted_df.columns, pd.MultiIndex):
+             pivoted_df.columns = [
+                 "_".join(list(map(str, col))[::-1] if reverse_header_order else map(str, col))
+                 for col in pivoted_df.columns.values
+             ]
+         else:
+             pivoted_df.columns = pivoted_df.columns.map(str)
+
+         # Reset the index
+         pivoted_df.reset_index(inplace=True)
+
+         # Handle sorting and formatting of index column
+         if datetime_trans_needed:
+             pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
+             pivoted_df.sort_values(by=index_col, inplace=True)
+             pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
+
+         # Fill missing values
+         pivoted_df.fillna(fill_value, inplace=True)
+
+         # Fill missing weekly dates if specified
+         if fill_missing_weekly_dates:
+             pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
+
+         return pivoted_df
+
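+     # Illustrative usage sketch (comments only; the column names are assumed data):
+     #
+     #   pivoted = dataprocessing().pivot_table(df, 'OBS', 'Channel Short Names', 'Value',
+     #                                          filters_dict={'Master Include': '== 1'},
+     #                                          fill_value=0)
+     #   # one row per 'OBS' date, one column per channel, summed 'Value' in the cells
+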
+     def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
+         """
+         Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
+
+         Parameters:
+         df (pandas.DataFrame): The DataFrame containing the data.
+         col_names (list of str): these are the columns which are used for the lookup. One column or several columns can be inputted as a list, provided there is a merged column to lookup on. If there are multiple columns to look up on then a merged column must be inputted as the key of the dictionary of format e.g. col1|col2|col3
+         to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
+         if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
+         new_column_name (str, optional): name of new column. Defaults to "Mapping".
+
+         Returns:
+         pandas.DataFrame: DataFrame with a new column containing the look up table results.
+         """
+
+         # Create regex pattern with word boundaries from the dictionary
+         regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())
+
+         # Preprocess DataFrame if multiple columns
+         if len(col_names) > 1:
+             df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
+             col_to_use = "Merged"
+         else:
+             col_to_use = col_names[0]
+
+         # Extract the first match using the regex pattern
+         matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
+
+         # Map the matches to the corresponding values in the dictionary
+         df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
+
+         # Drop intermediate column if created
+         if len(col_names) > 1:
+             df.drop(columns=["Merged"], inplace=True)
+
+         return df
+
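+     # Illustrative usage sketch (comments only; the campaign strings are assumed data):
+     #
+     #   df = pd.DataFrame({'campaign': ['Brand spend Q1', 'Generic clicks Q1']})
+     #   lut = {'spend': 'Spend', 'clicks': 'Clicks'}
+     #   df = dataprocessing().apply_lookup_table_for_columns(df, ['campaign'], lut)
+     #   # 'Mapping' becomes ['Spend', 'Clicks']; unmatched rows would get 'Other'
+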
+     def aggregate_daily_to_wc_wide(self, df: pd.DataFrame, date_column: str, group_columns: list[str], sum_columns: list[str], wc: str = 'sun', aggregation: str = 'sum', include_totals: bool = False) -> pd.DataFrame:
+         """
+         Aggregates daily data into weekly data, starting on a specified day of the week,
+         and groups the data by additional specified columns. It aggregates specified numeric columns
+         by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
+         of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
+         The week start column is named 'OBS'.
+
+         Parameters:
+         - df: pandas DataFrame
+             The input DataFrame containing daily data.
+         - date_column: string
+             The name of the column in the DataFrame that contains date information.
+         - group_columns: list of strings
+             Additional column names to group by along with the weekly grouping.
+         - sum_columns: list of strings
+             Numeric column names to be aggregated during aggregation.
+         - wc: string
+             The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
+         - aggregation: string, optional (default 'sum')
+             Aggregation method, either 'sum', 'average', or 'count'.
+         - include_totals: boolean, optional (default False)
+             If True, include total columns for each sum_column.
+
+         Returns:
+         - pandas DataFrame
+             A new DataFrame with weekly aggregated data. The index is reset,
+             and columns represent the grouped and aggregated metrics. The DataFrame
+             is in wide format, with separate columns for each combination of
+             grouped metrics.
+         """
+
+         grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
+
+         # Pivot the data to wide format
+         if group_columns:
+             wide_df = grouped.pivot_table(index='OBS',
+                                           columns=group_columns,
+                                           values=sum_columns,
+                                           aggfunc='first')
+             # Flatten the multi-level column index and create combined column names
+             wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
+         else:
+             wide_df = grouped.set_index('OBS')
+
+         # Fill NaN values with 0
+         wide_df = wide_df.fillna(0)
+
+         # Adding total columns for each unique sum_column, if include_totals is True
+         if include_totals:
+             for col in sum_columns:
+                 total_column_name = f'Total {col}'
+                 if group_columns:
+                     columns_to_sum = [column for column in wide_df.columns if col in column]
+                 else:
+                     columns_to_sum = [col]
+                 wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
+
+         # Reset the index of the final DataFrame
+         wide_df = wide_df.reset_index()
+
+         return wide_df
+
+     def merge_cols_with_seperator(self, df, col_names, seperator='_', output_column_name="Merged", starting_prefix_str=None, ending_prefix_str=None):
+         """
+         Creates a new column in the dataframe that merges two or more columns together with a separator, possibly to be used for a look up table where multiple columns are being looked up.
+
+         Parameters:
+         df (pandas.DataFrame): Dataframe to make changes to.
+         col_names (list): list of column names to merge.
+         seperator (str, optional): Separator placed between the merged values. Defaults to "_".
+         output_column_name (str, optional): Name of column outputted. Defaults to "Merged".
+         starting_prefix_str (str, optional): string of optional text to be added before the merged column str value.
+         ending_prefix_str (str, optional): string of optional text to be added after the merged column str value.
+
+         Raises:
+             ValueError: if fewer than two column names are inputted in the list there is nothing to merge on.
+
+         Returns:
+         pandas.DataFrame: DataFrame with additional merged column.
+         """
+         # Specify more than one column must be entered
+         if len(col_names) < 2:
+             raise ValueError("2 or more columns must be specified to merge")
+
+         # Create a new column with the merged columns
+         df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
+
+         # Add string before
+         if starting_prefix_str is not None:
+             df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
+
+         # Add string after
+         if ending_prefix_str is not None:
+             df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
+
+         return df
+
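+     # Illustrative usage sketch (comments only; the column names are assumed data):
+     #
+     #   df = dataprocessing().merge_cols_with_seperator(df, ['Campaign', 'Product'],
+     #                                                   seperator='|', output_column_name='Merged Columns')
+     #   # e.g. Campaign='Summer', Product='Shoes' -> 'Summer|Shoes'
+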
+     def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
+         """
+         Checks whether the sums of columns from two different DataFrames are equal.
+
+         Parameters:
+         df_1 (pandas.DataFrame): First dataframe whose columns are to be summed.
+         df_2 (pandas.DataFrame): Second dataframe whose columns are to be summed.
+         cols_1 (list of str): Columns from first dataframe to sum.
+         cols_2 (list of str): Columns from second dataframe to sum.
+
+         Returns:
+         Tuple: Answer is a string stating whether the sums are equal (and the difference if not), df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe.
+         """
+         # Find the sum of both sets of columns
+         df_1_sum = df_1[cols_1].sum().sum()
+         df_2_sum = df_2[cols_2].sum().sum()
+
+         # Compare the two sums
+         if df_1_sum == df_2_sum:
+             Answer = "They are equal"
+         else:
+             Answer = "They are different by " + str(df_2_sum - df_1_sum)
+
+         return Answer, df_1_sum, df_2_sum
+
+     def convert_2_df_cols_to_dict(self, df, key_col, value_col):
+         """
+         Create a dictionary mapping from two columns of a DataFrame.
+
+         Parameters:
+         df (pd.DataFrame): The DataFrame containing the data.
+         key_col (str): The column name to use as keys in the dictionary.
+         value_col (str): The column name to use as values in the dictionary.
+
+         Returns:
+         dict: A dictionary with keys from 'key_col' and values from 'value_col'.
+         """
+         if key_col not in df or value_col not in df:
+             raise ValueError("Specified columns are not in the DataFrame")
+
+         return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
+
+     def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY, short_format="No", half_years="No", combined_FY_and_H="No"):
+         """
+         Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half Years, based on the start date of the first full financial year.
+
+         Parameters:
+         df (pandas.DataFrame): Dataframe to operate on.
+         index_col (str): Name of the column to use for datetime.
+         start_date (str): String used to specify the start date of the FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-01.
+         starting_FY (str): String used to specify which FY the start date refers to, needs to be formatted LONG e.g. FY2021.
+         short_format (str, optional): String used to specify if short format is desired (e.g. FY21) or if long format is desired (e.g. FY2021). Defaults to "No".
+         half_years (str, optional): String used to specify if a half year column is desired. Defaults to "No".
+         combined_FY_and_H (str, optional): String used to specify if a combined half year and FY column is desired. Defaults to "No".
+
+         Returns:
+         pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
+         """
+
+         try:
+             start_date = datetime.strptime(start_date, '%Y-%m-%d')
+         except ValueError:
+             print("Error: Date must be of format yyyy-mm-dd")
+             return df
+
+         df["OBS"] = pd.to_datetime(df[index_col])
+         df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
+
+         df[index_col] = pd.to_datetime(df[index_col])
+
+         start_year = int(starting_FY[2:])
+
+         def calculate_FY_vectorized(date_series):
+             years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
+             fy = 'FY' + (start_year + years_since_start).astype(str)
+             if short_format == "Yes":
+                 fy = 'FY' + fy.str[-2:]
+             return fy
+
+         df['FY'] = calculate_FY_vectorized(df[index_col])
+
+         if half_years == "Yes" or combined_FY_and_H == "Yes":
+             def calculate_half_year_vectorized(date_series):
+                 fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
+                 fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
+                 fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
+                 half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
+                 return half_year
+
+             df['Half Years'] = calculate_half_year_vectorized(df[index_col])
+
+             if combined_FY_and_H == "Yes":
+                 df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
+
+         return df
+
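+     # Illustrative usage sketch (comments only; assumes weekly rows in a 'Week' column):
+     #
+     #   df = dataprocessing().create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023',
+     #                                                 half_years='Yes', combined_FY_and_H='Yes')
+     #   # adds 'FY' (e.g. 'FY2023'), 'Half Years' ('H1'/'H2') and 'Financial Half Years'
+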
+     def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
+         """
+         This function updates values in a specified column of the DataFrame based on a lookup dictionary.
+         It first merges several columns into a new 'Merged' column, then uses this merged column to determine
+         if replacements are needed based on the dictionary.
+
+         Parameters:
+         df (pd.DataFrame): The DataFrame to process.
+         col (str): The name of the column whose values are potentially replaced.
+         replacement_rows (str): The specific value in 'col' to check for replacements.
+         cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
+         replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
+         output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
+
+         Returns:
+         pd.DataFrame: The modified DataFrame with updated values in the specified column.
+         """
+         # Create a merged column from specified columns
+         df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
+
+         # Replace values in the specified column based on the lookup
+         def replace_values(x):
+             if x[col] == replacement_rows:
+                 merged_value = x['Merged']
+                 if merged_value in replacement_lookup_dict:
+                     return replacement_lookup_dict[merged_value]
+             return x[col]
+
+         # Apply replacement logic
+         df[output_column_name] = df.apply(replace_values, axis=1)
+
+         # Drop the intermediate 'Merged' column
+         df.drop(columns=['Merged'], inplace=True)
+
+         return df
+
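+     # Illustrative usage sketch (comments only; columns and lookup values are assumed data):
+     #
+     #   lut = {'Paid Search Generic|Shopping': 'Paid Shopping'}
+     #   df = dataprocessing().keyword_lookup_replacement(df, 'channel', 'Paid Search Generic',
+     #                                                    ['channel', 'segment'], lut,
+     #                                                    output_column_name='Channel New')
+     #   # rows where channel == 'Paid Search Generic' and 'channel|segment' matches a key
+     #   # take the mapped value; all other rows keep their original channel
+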
+     def create_new_version_of_col_using_LUT(self, df, keys_col, value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
+         """
+         Creates a new column in a dataframe, which takes an old column and uses a lookup table to change values in the new column to reflect the lookup table.
+         The lookup is based on a column in the dataframe. Can only input one column and output one new column.
+
+         Parameters:
+         df (pandas.DataFrame): The DataFrame containing the data.
+         keys_col (str): The name of the column which the LUT will be referencing to output a value.
+         value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
+         dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
+         new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
+
+         Returns:
+         pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
+         """
+
+         # Extract columns to change using new dictionary
+         smaller_df = df[[keys_col, value_col]]
+
+         # Use the new dictionary to create a new LUT
+         smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df, [keys_col, value_col], dict_for_specific_changes)
+
+         # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
+         smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col], axis=1)
+
+         # Drop the extra unnecessary cols
+         smaller_df_with_LUT.drop([keys_col, 'Mapping'], axis=1, inplace=True)
+
+         # Output dataframes as dictionary to be used in a LUT
+         new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT, value_col, "Updated Col")
+
+         # Use new dictionary to create a new version of an old column
+         df_final = self.apply_lookup_table_for_columns(df, [keys_col], new_dict, "other", new_col_name)
+
+         return df_final
+
+     def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
+         """
+         Changes a dataframe from wide to long format.
+
+         Args:
+             df (pandas.DataFrame): The DataFrame containing the data.
+             value_cols (list of str): List of column names to transform from several columns into one.
+             variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
+             value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.
+
+         Returns:
+             pandas.DataFrame: DataFrame transformed from wide to long format.
+
+         Raises:
+             ValueError: If the number of columns to depivot is less than 2.
+         """
+         # Check length of value_cols is greater than 1
+         if len(value_cols) < 2:
+             raise ValueError("Number of inputs in list must be greater than 1")
+
+         # Find the columns that are not to be depivoted into one column
+         id_vars = [col for col in df.columns if col not in value_cols]  # Preserve column order in the DataFrame
+
+         # Melt all columns chosen into one column
+         df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
+
+         # Sort column order to match expected output
+         ordered_columns = id_vars + [variable_col_name, value_col_name]
+         df_final = df_final[ordered_columns]
+
+         return df_final
+
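+     # Illustrative usage sketch (comments only; column names are assumed data):
+     #
+     #   wide = pd.DataFrame({'OBS': ['2023-01-02'], 'tv': [100], 'radio': [40]})
+     #   long = dataprocessing().convert_df_wide_2_long(wide, ['tv', 'radio'],
+     #                                                  variable_col_name='Channel', value_col_name='Spend')
+     #   # two rows: ('2023-01-02', 'tv', 100) and ('2023-01-02', 'radio', 40)
+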
+     def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
+         """
+         Allows any cell in a DataFrame to be updated manually by applying filters and choosing a column to edit.
+
+         Args:
+             df (pandas.DataFrame): The DataFrame containing the data.
+             filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell.
+             col_to_change (str): String name of column to edit.
+             new_value (any): Value of new input for cell.
+             change_in_existing_df_col (str, optional): Input of Yes or No to describe whether to make the change in an existing column. Defaults to "No".
+             new_col_to_change_name (str, optional): Name of the new column to copy the column being edited into and to make the change in. Defaults to 'New'.
+             manual_edit_col_name (str, optional): Name of the current manual edits column, if one is not specified it will be created. Defaults to None.
+             add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
+             existing_note_col_name (str, optional): If there is an existing notes column this can be specified. Defaults to None.
+             note (str, optional): The string of the note to be added to the column. Defaults to None.
+
+         Raises:
+             TypeError: col_to_change must be a single column name given as a string, not a list.
+             ValueError: You can only input the values of "Yes" or "No" for whether to make the change in an existing column.
+             ValueError: You can only input the values of "Yes" or "No" for whether to make a new notes column.
+
+         Returns:
+             pandas.DataFrame: Dataframe with manual changes added.
+         """
+
+         # Raise a TypeError if more than one column is supplied
+         if isinstance(col_to_change, list):
+             raise TypeError("Col to change must be specified as a string, not a list")
+
+         # Raise a ValueError if input is invalid for change_in_existing_df_col
+         if change_in_existing_df_col not in ["Yes", "No"]:
+             raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
+
+         # Raise a ValueError if input is invalid for add_notes
+         if add_notes not in ["Yes", "No"]:
+             raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
+
+         # Validate filters_dict format
+         for col, cond in filters_dict.items():
+             if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
+                 raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
+
+         # Create the filtered df by applying the conditions
+         df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
+
+         # Create a new column to add the changes if desired, else edit in the current chosen column
+         col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
+         if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
+             df = df.copy()
+             df[new_col_to_change_name] = df[col_to_change]
+
+         # Update the new cell in the chosen column
+         df.loc[df_filtered.index, col_to_update] = new_value
+
+         # Add in manual edit column if desired or specify where one already is
+         if manual_edit_col_name:
+             if manual_edit_col_name not in df.columns:
+                 df[manual_edit_col_name] = 0
+             df.loc[df_filtered.index, manual_edit_col_name] = 1
+         else:
+             if 'Manual Changes' not in df.columns:
+                 df['Manual Changes'] = 0
+             df.loc[df_filtered.index, 'Manual Changes'] = 1
+
+         # Add note if desired in new column or an existing column
+         if add_notes == "Yes":
+             note_col = existing_note_col_name if existing_note_col_name else 'Notes'
+             if note_col not in df.columns:
+                 df[note_col] = None
+             df.loc[df_filtered.index, note_col] = note
+
+         return df
+
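+     # Illustrative usage sketch (comments only; filters and values are assumed data):
+     #
+     #   df = dataprocessing().manually_edit_data(df, {'OBS': "== '2023-01-02'"}, 'cost', 0,
+     #                                            change_in_existing_df_col='Yes',
+     #                                            add_notes='Yes', note='Outlier zeroed')
+     #   # sets cost to 0 on the filtered row, flags it in 'Manual Changes' and records the note
+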
+     def format_numbers_with_commas(self, df, decimal_length_chosen=2):
+         """
+         Converts data in numerical format into numbers with commas and a chosen decimal place length.
+
+         Args:
+             df (pandas.DataFrame): The DataFrame containing the data.
+             decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
+
+         Returns:
+             pandas.DataFrame: The DataFrame with the chosen updated format.
+         """
+         def format_number_with_commas(x, decimal_length=decimal_length_chosen):
+             if pd.isna(x):  # Preserve None/NaN values
+                 return pd.NA  # Explicitly normalize to pd.NA
+             elif isinstance(x, (int, float)):
+                 if decimal_length is not None:
+                     format_str = f"{{:,.{decimal_length}f}}"
+                     return format_str.format(x)
+                 else:
+                     return f"{x:,}"
+             else:
+                 return x  # Return unchanged if not a number
+
+         # Apply formatting column by column
+         formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
+
+         return formatted_df
+
+     def filter_df_on_multiple_conditions(self, df, filters_dict):
+         """
+         Filter a dataframe based on multiple conditions.
+
+         Args:
+             df (pandas.DataFrame): Dataframe to filter on.
+             filters_dict (dict): Dictionary mapping column names to string conditions of the form 'operator value'.
+
+         Returns:
+             pandas.DataFrame: Filtered DataFrame.
+         """
+         mask = pd.Series(True, index=df.index)
+         for col, cond in filters_dict.items():
+             cond = cond.strip()
+             operator, value = cond.split(maxsplit=1)
+
+             # If the value is a quoted string, strip the quotes
+             if "'" in value:
+                 value = value.strip().strip("'\"")
+             # If not a string, e.g. a datetime or number condition, evaluate the literal
+             else:
+                 value = eval(value)
+
+             if operator == "==":
+                 temp_mask = (df[col] == value)
+             elif operator == "!=":
+                 temp_mask = (df[col] != value)
+             elif operator == ">=":
+                 temp_mask = (df[col] >= value)
+             elif operator == "<=":
+                 temp_mask = (df[col] <= value)
+             elif operator == ">":
+                 temp_mask = (df[col] > value)
+             elif operator == "<":
+                 temp_mask = (df[col] < value)
+             else:
+                 raise ValueError(f"Unsupported operator '{operator}' in condition for column '{col}'")
+             mask &= temp_mask
+
+         # Create the filtered df by applying the conditions
+         df_filtered = df[mask]
+
+         return df_filtered
+
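+     # Illustrative usage sketch (comments only; columns are assumed data):
+     #
+     #   subset = dataprocessing().filter_df_on_multiple_conditions(
+     #       df, {'cost': '>= 5', 'channel': "== 'tv'"})
+     #   # keeps rows where cost >= 5 AND channel == 'tv'
+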
+     def read_and_concatenate_files(self, folder_path, file_type='csv'):
+         """
+         Reads all files of a specified type (CSV or XLSX) from a given folder
+         and concatenates them into a single DataFrame.
+
+         Parameters:
+         folder_path (str): The path to the folder containing the files.
+         file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
+
+         Returns:
+         pd.DataFrame: A DataFrame containing the concatenated data from all files.
+         """
+
+         # Initialize an empty list to hold dataframes
+         dataframes = []
+
+         # Define file extension based on file_type
+         if file_type == 'csv':
+             extension = '.csv'
+         elif file_type == 'xlsx':
+             extension = '.xlsx'
+         else:
+             raise ValueError("file_type must be either 'csv' or 'xlsx'")
+
+         # Loop through all files in the folder
+         for filename in os.listdir(folder_path):
+             # Check if the file has the correct extension
+             if filename.endswith(extension):
+                 file_path = os.path.join(folder_path, filename)
+                 # Read the file into a DataFrame
+                 if file_type == 'csv':
+                     df = pd.read_csv(file_path)
+                 elif file_type == 'xlsx':
+                     df = pd.read_excel(file_path)
+                 # Append the DataFrame to the list
+                 dataframes.append(df)
+
+         # Concatenate all DataFrames into a single DataFrame
+         combined_df = pd.concat(dataframes, ignore_index=True)
+
+         return combined_df
+
+ def upgrade_outdated_packages(exclude_packages=['twine']):
1093
+ """
1094
+ Upgrade all outdated Python packages except those specified in `exclude_packages`.
1095
+
1096
+ :param exclude_packages: List of package names to exclude from the upgrade process.
1097
+ """
1098
+ exclude_packages = set(exclude_packages or [])
1099
+
1100
+ try:
1101
+ # Get all installed packages
1102
+ installed_packages_result = subprocess.run(
1103
+ "pip list --format=json", shell=True, capture_output=True, text=True
1104
+ )
1105
+ installed_packages = json.loads(installed_packages_result.stdout)
1106
+
1107
+ # Get the list of outdated packages
1108
+ outdated_packages_result = subprocess.run(
1109
+ "pip list --outdated --format=json", shell=True, capture_output=True, text=True
1110
+ )
1111
+ outdated_packages = json.loads(outdated_packages_result.stdout)
1112
+
1113
+ # Create a set of outdated package names for quick lookup
1114
+ outdated_package_names = {pkg['name'] for pkg in outdated_packages}
1115
+
1116
+ # Upgrade only outdated packages, excluding specified packages
1117
+ for package in installed_packages:
1118
+ package_name = package['name']
1119
+ if package_name in outdated_package_names and package_name not in exclude_packages:
1120
+ try:
1121
+ print(f"Upgrading package: {package_name}")
1122
+ upgrade_result = subprocess.run(
1123
+ f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True
1124
+ )
1125
+ if upgrade_result.returncode == 0:
1126
+ print(f"Successfully upgraded {package_name}")
1127
+ else:
1128
+ print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
1129
+ except Exception as e:
1130
+ print(f"An error occurred while upgrading {package_name}: {e}")
1131
+ elif package_name in exclude_packages:
1132
+ print(f"Skipping package: {package_name} (excluded)")
1133
+ else:
1134
+ print(f"{package_name} is already up to date or not outdated")
1135
+ except Exception as e:
1136
+ print(f"An error occurred during the upgrade process: {e}")
1137
+
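+ # Illustrative usage sketch (the package names are examples only):
+ # dp.upgrade_outdated_packages(exclude_packages=['twine', 'pip'])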
1138
+ def convert_mixed_formats_dates(self, df, column_name):
+ """
+ Normalize a column of mixed US/UK date strings to datetime.
+
+ Values are first parsed month-first (the pandas default for ambiguous
+ strings); any parsed date whose day part is 12 or less is assumed to
+ have been day-first in the source, so its month and day are swapped.
+
+ Args:
+ df (pd.DataFrame): The DataFrame containing the date column.
+ column_name (str): The name of the column to normalize.
+
+ Returns:
+ pd.DataFrame: The DataFrame with the column converted to datetime.
+ """
+ # Parse to datetime, coercing unparseable values to NaT
+ df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
1141
+ df[column_name] = df[column_name].astype(str) # NaT becomes 'NaT'; re-coerced to NaT below
1142
+ corrected_dates = []
1143
+
1144
+ for date_str in df[column_name]:
1145
+ date_str = date_str.replace('-', '').replace('/', '')
1146
+ if len(date_str) == 8:
1147
+ year = date_str[:4]
1148
+ month = date_str[4:6]
1149
+ day = date_str[6:8]
1150
+ if int(day) <= 12:
1151
+ # Day part is ambiguous: assume the source was day-first and swap
1152
+ corrected_date_str = f"{year}-{day}-{month}"
1153
+ else:
1154
+ corrected_date_str = f"{year}-{month}-{day}"
1155
+ # Convert to datetime
1156
+ corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
1157
+ else:
1158
+ corrected_date = pd.to_datetime(date_str, errors='coerce')
1159
+
1160
+ corrected_dates.append(corrected_date)
1161
+
1162
+ # Check length of the corrected_dates list
1163
+ if len(corrected_dates) != len(df):
1164
+ raise ValueError("Length of corrected_dates does not match the original DataFrame")
1165
+
1166
+ # Assign the corrected dates back to the DataFrame
1167
+ df[column_name] = corrected_dates
1168
+ return df
1169
+
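+ # Illustrative usage sketch (hypothetical data): '07/04/2023' parses
+ # month-first as July 4, then the swap restores the UK reading, 7 April.
+ # df = pd.DataFrame({'date': ['07/04/2023', '2023-04-25']})
+ # df = dp.convert_mixed_formats_dates(df, 'date')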
1170
+ def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
+ """
+ Reindex weekly data onto a complete date range, filling gaps with 0.
+
+ Args:
+ df (pd.DataFrame): The DataFrame with a weekly date column.
+ date_column (str): The name of the date column.
+ freq (str, optional): Pandas week-start frequency. Default is 'W-MON'.
+
+ Returns:
+ pd.DataFrame: The DataFrame with every week present and gaps filled with 0.
+ """
1171
+ # Ensure the date column is in datetime format
1172
+ df[date_column] = pd.to_datetime(df[date_column])
1173
+
1174
+ # Generate the full date range with the specified frequency
1175
+ full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
1176
+
1177
+ # Create a new dataframe with the full date range
1178
+ full_date_df = pd.DataFrame({date_column: full_date_range})
1179
+
1180
+ # Merge the original dataframe with the new full date range dataframe
1181
+ df_full = full_date_df.merge(df, on=date_column, how='left')
1182
+
1183
+ # Fill missing values with 0
1184
+ df_full.fillna(0, inplace=True)
1185
+
1186
+ return df_full
1187
+
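+ # Illustrative usage sketch (hypothetical weekly data with one missing week):
+ # df = pd.DataFrame({'OBS': ['2024-01-01', '2024-01-15'], 'spend': [100, 200]})
+ # df = dp.fill_weekly_date_range(df, 'OBS', freq='W-MON')
+ # # the missing week of 2024-01-08 is inserted with spend 0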
1188
+ def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
1189
+ """
1190
+ Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.
1191
+
1192
+ Args:
1193
+ df (pd.DataFrame): The DataFrame whose column names will be modified.
1194
+ prefix (str, optional): The prefix to add to each column name. Default is an empty string.
1195
+ suffix (str, optional): The suffix to add to each column name. Default is an empty string.
1196
+ date_col (str, optional): The name of the column to exclude from adding prefix and suffix, typically a date column. Default is None.
1197
+
1198
+ Returns:
1199
+ pd.DataFrame: The DataFrame with updated column names.
1200
+ """
1201
+
1202
+ # If there is no date column
1203
+ if date_col is None:
1204
+ # Add prefixes and suffixes to all columns
1205
+ df.columns = [prefix + col + suffix for col in df.columns]
1206
+ else:
1207
+ # Add prefixes and suffixes to all columns except the date column
1208
+ df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]
1209
+
1210
+ return df
1211
+
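+ # Illustrative usage sketch (hypothetical column names):
+ # df = dp.add_prefix_and_suffix(df, prefix='med_', suffix='_spend', date_col='OBS')
+ # # 'tv' becomes 'med_tv_spend'; 'OBS' is left unchanged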
1212
+ def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
1213
+ """
1214
+ Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
1215
+ Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
1216
+
1217
+ Args:
1218
+ df (pd.DataFrame): The DataFrame to process.
1219
+ date_col (str, optional): The column name to exclude from the dummy conversion, typically a date column. Default is None.
1220
+ dummy_threshold (int, optional): The threshold value; values greater than this become 1, others become 0. Default is 0.
1221
+ add_total_dummy_col (str, optional): If set to any value other than 'No', adds a column that contains the max value (1 or 0) for each row. Default is 'No'.
1222
+ total_col_name (str, optional): The name of the total column to add if add_total_dummy_col is not 'No'. Default is 'total'.
1223
+
1224
+ Returns:
1225
+ pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
1226
+ """
1227
+
1228
+ # If there is no date column
1229
+ if date_col is None:
1230
+ df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
1231
+
1232
+ if add_total_dummy_col != 'No':
1233
+ # Find max value of rows
1234
+ df[total_col_name] = df.max(axis=1)
1235
+
1236
+ # If there is a date column
1237
+ else:
1238
+ # Create dummies for all columns except the date column
1239
+ df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
1240
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
1241
+ )
1242
+
1243
+ if add_total_dummy_col != 'No':
1244
+ # Find max value of rows
1245
+ df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)
1246
+
1247
+ return df
1248
+
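+ # Illustrative usage sketch (hypothetical spend columns): flag weeks with
+ # any activity and add a combined flag column.
+ # df = dp.create_dummies(df, date_col='OBS', dummy_threshold=0,
+ # add_total_dummy_col='Yes', total_col_name='any_spend')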
1249
+ def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
1250
+ """
1251
+ Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1252
+ Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.
1253
+
1254
+ Args:
1255
+ df (pd.DataFrame): The DataFrame containing the column to modify.
1256
+ column (str): The column name where the replacements will be made.
1257
+ replacements (dict): A dictionary where keys are substrings to replace and values are the replacement strings.
1258
+ to_lower (bool, optional): If True, the column values will be converted to lowercase before applying replacements. Default is False.
1259
+ new_column (str, optional): If provided, the replacements will be applied to this new column. If None, the existing column will be modified. Default is None.
1260
+
1261
+ Returns:
1262
+ pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
1263
+ """
1264
+ if new_column is not None:
1265
+ # Create a new column for replacements
1266
+ df[new_column] = df[column]
1267
+ temp_column = new_column
1268
+ else:
1269
+ # Modify the existing column
1270
+ temp_column = column
1271
+
1272
+ # Optionally convert to lowercase
1273
+ if to_lower:
1274
+ df[temp_column] = df[temp_column].str.lower()
1275
+
1276
+ # Apply substring replacements
1277
+ for old, new in replacements.items():
1278
+ df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
1279
+
1280
+ return df
1281
+
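+ # Illustrative usage sketch (hypothetical campaign labels):
+ # df = dp.replace_substrings(df, 'campaign', {'fb': 'facebook', 'yt': 'youtube'},
+ # to_lower=True, new_column='campaign_clean')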
1282
+ def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
1283
+ """
1284
+ Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.
1285
+
1286
+ Args:
1287
+ df (pd.DataFrame): The DataFrame to modify.
1288
+ exclude_col (str, optional): The column name to exclude from the sum. Default is None.
1289
+ total_col_name (str, optional): The name of the new total column. Default is 'Total'.
1290
+
1291
+ Returns:
1292
+ pd.DataFrame: The DataFrame with an added total column.
1293
+ """
1294
+ if exclude_col and exclude_col in df.columns:
1295
+ # Ensure the column to exclude exists before dropping
1296
+ df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1297
+ else:
1298
+ # Sum across all columns if no column is specified to exclude
1299
+ df[total_col_name] = df.sum(axis=1)
1300
+
1301
+ return df
1302
+
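+ # Illustrative usage sketch (assumes all non-date columns are numeric):
+ # df = dp.add_total_column(df, exclude_col='OBS', total_col_name='Total')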
1303
+ def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
1304
+ """
1305
+ Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
1306
+
1307
+ Args:
1308
+ df (pd.DataFrame): The DataFrame containing the column to categorize.
1309
+ column_name (str): The name of the column in the DataFrame that contains the text data to categorize.
1310
+ category_dict (dict): A dictionary where keys are substrings to search for in the text and values are the categories to assign when a substring is found.
1311
+ new_col_name (str, optional): The name of the new column to be created in the DataFrame, which will hold the resulting categories. Default is 'Category'.
1312
+ other_label (str, optional): The category assigned when no substring from the dictionary is found in the text. Default is 'Other'.
1313
+
1314
+ Returns:
1315
+ pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
1316
+ """
1317
+
1318
+ def categorize_text(text):
1319
+ """
1320
+ Assigns a category to a single text string based on the presence of substrings from a dictionary.
1321
+
1322
+ Args:
1323
+ text (str): The text string to categorize.
1324
+
1325
+ Returns:
1326
+ str: The category assigned based on the first matching substring found in the text. If no
1327
+ matching substring is found, returns other_label.
1328
+ """
1329
+ for key, category in category_dict.items():
1330
+ if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
1331
+ return category
1332
+ return other_label # Default category if no match is found
1333
+
1334
+ # Apply the categorize_text function to each element in the specified column
1335
+ df[new_col_name] = df[column_name].apply(categorize_text)
1336
+ return df
1337
+
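+ # Illustrative usage sketch (hypothetical lookup table):
+ # lookup = {'facebook': 'Social', 'google': 'Search'}
+ # df = dp.apply_lookup_table_based_on_substring(df, 'campaign', lookup,
+ # new_col_name='Channel', other_label='Other')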
1338
+ def compare_overlap(self, df1, df2, date_col):
1339
+ """
1340
+ Compare overlapping periods between two DataFrames and provide a summary of total differences.
1341
+
1342
+ Args:
1343
+ df1 (pandas.DataFrame): First DataFrame containing date-based data.
1344
+ df2 (pandas.DataFrame): Second DataFrame containing date-based data.
1345
+ date_col (str): The name of the date column used for aligning data.
1346
+
1347
+ Returns:
1348
+ tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
1349
+ """
1350
+ # Ensure date columns are in datetime format
1351
+ df1[date_col] = pd.to_datetime(df1[date_col])
1352
+ df2[date_col] = pd.to_datetime(df2[date_col])
1353
+
1354
+ # Determine the overlap period
1355
+ start_date = max(df1[date_col].min(), df2[date_col].min())
1356
+ end_date = min(df1[date_col].max(), df2[date_col].max())
1357
+
1358
+ # Filter DataFrames to the overlapping period
1359
+ df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
1360
+ df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
1361
+
1362
+ # Merge the DataFrames on the date column
1363
+ merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
1364
+
1365
+ # Get common columns, excluding the date column
1366
+ common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
1367
+
1368
+ # Create a DataFrame for differences
1369
+ diff_df = pd.DataFrame({date_col: merged_df[date_col]})
1370
+
1371
+ total_diff_list = []
1372
+ for col in common_cols:
1373
+ diff_col = f'diff_{col}'
1374
+ diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # df1 minus df2
1375
+
1376
+ # Sum differences for the column
1377
+ total_diff = diff_df[diff_col].sum()
1378
+ total_diff_list.append({'Column': col, 'Total Difference': total_diff})
1379
+
1380
+ # Create summary DataFrame
1381
+ total_diff_df = pd.DataFrame(total_diff_list)
1382
+
1383
+ return diff_df, total_diff_df
1384
+
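+ # Illustrative usage sketch (two hypothetical extracts of the same feed):
+ # diffs, summary = dp.compare_overlap(old_df, new_df, 'OBS')
+ # print(summary) # total df1-minus-df2 difference per shared column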
1385
+ def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
1386
+ """
1387
+ Convert a DataFrame's date column so that each date is mapped back
1388
+ to the 'week_commencing' day of the *current ISO week*.
1389
+
1390
+ Args:
1391
+ df (pandas.DataFrame): The DataFrame with date-based data.
1392
+ date_col (str): The name of the date column.
1393
+ week_commencing (str): The desired start of the week.
1394
+ One of 'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun' (note 'thur', not 'thu').
1395
+ Uses ISO day numbering (Mon=1, ..., Sun=7).
1396
+
1397
+ Returns:
1398
+ pandas.DataFrame: Original DataFrame with an extra column
1399
+ 'week_start_<week_commencing>' containing the
1400
+ start-of-week date for each row.
1401
+ """
1402
+ # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
1403
+ iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
1404
+
1405
+ if week_commencing not in iso_day_dict:
+ raise ValueError(f"week_commencing must be one of {list(iso_day_dict)}")
+ target_day = iso_day_dict[week_commencing]
1406
+
1407
+ def map_to_week_start(date_val):
1408
+ delta = (date_val.isoweekday() - target_day) % 7
1409
+ return date_val - pd.Timedelta(days=delta)
1410
+
1411
+ # Apply the transformation
1412
+ new_col = f"week_start_{week_commencing}"
1413
+ df[new_col] = df[date_col].apply(map_to_week_start)
1414
+
1415
+ return df
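+
+ # Illustrative usage sketch: map each date back to its Monday week start.
+ # df = dp.week_commencing_2_week_commencing_conversion_isoweekday(df, 'date', 'mon')
+ # # adds a 'week_start_mon' column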