imsciences 0.5.4.7__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +4 -1
- imsciences/datafunctions-IMS-24Ltp-3.py +2711 -0
- imsciences/datafunctions.py +2842 -170
- imsciences/datapull.py +374 -0
- imsciences/geo.py +195 -0
- imsciences/mmm.py +1415 -0
- imsciences/pull.py +1483 -0
- imsciences/unittesting.py +1064 -0
- imsciences/vis.py +196 -0
- imsciences-0.9.3.dist-info/LICENSE.txt +21 -0
- imsciences-0.9.3.dist-info/METADATA +330 -0
- imsciences-0.9.3.dist-info/PKG-INFO-IMS-24Ltp-3 +24 -0
- imsciences-0.9.3.dist-info/RECORD +22 -0
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/WHEEL +1 -1
- imsciences-0.5.4.7.dist-info/METADATA +0 -95
- imsciences-0.5.4.7.dist-info/RECORD +0 -13
- {imsciences-0.5.4.7.dist-info → imsciences-0.9.3.dist-info}/top_level.txt +0 -0
imsciences/mmm.py
ADDED
@@ -0,0 +1,1415 @@
import pandas as pd
import calendar
import os
import numpy as np
import re
from datetime import datetime, timedelta
import subprocess
import json

class dataprocessing:

    def help(self):

        print("\n1. get_wd_levels")
        print(" - Description: Get the working directory with the option of moving up parents.")
        print(" - Usage: get_wd_levels(levels)")
        print(" - Example: get_wd_levels(0)")

        print("\n2. aggregate_daily_to_wc_long")
        print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
        print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
        print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")

        print("\n3. convert_monthly_to_daily")
        print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
        print(" - Usage: convert_monthly_to_daily(df, date_column, divide=True)")
        print(" - Example: convert_monthly_to_daily(df, 'date')")

        print("\n4. week_of_year_mapping")
        print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
        print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
        print(" - Example: week_of_year_mapping(df, 'week', 'mon')")

        print("\n5. rename_cols")
        print(" - Description: Renames columns in a pandas DataFrame with a specified prefix or format.")
        print(" - Usage: rename_cols(df, name='ame_')")
        print(" - Example: rename_cols(df, 'ame_facebook')")

        print("\n6. merge_new_and_old")
        print(" - Description: Creates a new DataFrame by merging old and new dataframes based on a cutoff date.")
        print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
        print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")

        print("\n7. merge_dataframes_on_column")
        print(" - Description: Merge a list of DataFrames on a common column.")
        print(" - Usage: merge_dataframes_on_column(dataframes, common_column='OBS', merge_how='outer')")
        print(" - Example: merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')")

        print("\n8. merge_and_update_dfs")
        print(" - Description: Merges two dataframes, updating columns from the second dataframe where values are available.")
        print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
        print(" - Example: merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")

        print("\n9. convert_us_to_uk_dates")
        print(" - Description: Convert a DataFrame column with mixed US and UK date formats to datetime.")
        print(" - Usage: convert_us_to_uk_dates(df, date_col)")
        print(" - Example: convert_us_to_uk_dates(df, 'date')")

        print("\n10. combine_sheets")
        print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
        print(" - Usage: combine_sheets(all_sheets)")
        print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")

        print("\n11. pivot_table")
        print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
        print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc='sum', margins=False, margins_name='Total', datetime_trans_needed=True, reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing='W-MON')")
        print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value', filters_dict={'Master Include': ' == 1'}, fill_value=0)")

        print("\n12. apply_lookup_table_for_columns")
        print(" - Description: Maps substrings in columns to new values based on a dictionary.")
        print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
        print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend': 'spd'}, if_not_in_dict='Other', new_column_name='Metrics Short')")

        print("\n13. aggregate_daily_to_wc_wide")
        print(" - Description: Aggregates daily data into weekly data and pivots it to wide format.")
        print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc='sun', aggregation='sum', include_totals=False)")
        print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions'], 'mon', 'average', True)")

        print("\n14. merge_cols_with_seperator")
        print(" - Description: Merges multiple columns in a DataFrame into one column with a specified separator.")
        print(" - Usage: merge_cols_with_seperator(df, col_names, separator='_', output_column_name='Merged')")
        print(" - Example: merge_cols_with_seperator(df, ['Campaign', 'Product'], separator='|', output_column_name='Merged Columns')")

        print("\n15. check_sum_of_df_cols_are_equal")
        print(" - Description: Checks if the sum of two columns in two DataFrames are equal and provides the difference.")
        print(" - Usage: check_sum_of_df_cols_are_equal(df_1, df_2, cols_1, cols_2)")
        print(" - Example: check_sum_of_df_cols_are_equal(df_1, df_2, 'Media Cost', 'Spend')")

        print("\n16. convert_2_df_cols_to_dict")
        print(" - Description: Creates a dictionary from two DataFrame columns.")
        print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
        print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")

        print("\n17. create_FY_and_H_columns")
        print(" - Description: Adds financial year and half-year columns to a DataFrame based on a start date.")
        print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY, short_format='No', half_years='No', combined_FY_and_H='No')")
        print(" - Example: create_FY_and_H_columns(df, 'Week', '2022-10-03', 'FY2023', short_format='Yes')")

        print("\n18. keyword_lookup_replacement")
        print(" - Description: Updates values in a column based on a lookup dictionary with conditional logic.")
        print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name='Updated Column')")
        print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel', 'segment'], lookup_dict, output_column_name='Channel New')")

        print("\n19. create_new_version_of_col_using_LUT")
        print(" - Description: Creates a new column based on a lookup table applied to an existing column.")
        print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col, value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
        print(" - Example: create_new_version_of_col_using_LUT(df, 'Campaign Name', 'Campaign Type', lookup_dict)")

        print("\n20. convert_df_wide_2_long")
        print(" - Description: Converts a wide-format DataFrame into a long-format DataFrame.")
        print(" - Usage: convert_df_wide_2_long(df, value_cols, variable_col_name='Stacked', value_col_name='Value')")
        print(" - Example: convert_df_wide_2_long(df, ['col1', 'col2'], variable_col_name='Var', value_col_name='Val')")

        print("\n21. manually_edit_data")
        print(" - Description: Manually updates specified cells in a DataFrame based on filters.")
        print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
        print(" - Example: manually_edit_data(df, {'col1': '== 1'}, 'col2', 'new_val', add_notes='Yes', note='Manual Update')")

        print("\n22. format_numbers_with_commas")
        print(" - Description: Formats numerical columns with commas and a specified number of decimal places.")
        print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
        print(" - Example: format_numbers_with_commas(df, decimal_length_chosen=1)")

        print("\n23. filter_df_on_multiple_conditions")
        print(" - Description: Filters a DataFrame based on multiple column conditions.")
        print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
        print(" - Example: filter_df_on_multiple_conditions(df, {'col1': '>= 5', 'col2': \"== 'val'\"})")

        print("\n24. read_and_concatenate_files")
        print(" - Description: Reads and concatenates files from a specified folder into a single DataFrame.")
        print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
        print(" - Example: read_and_concatenate_files('/path/to/files', file_type='xlsx')")

        print("\n25. upgrade_outdated_packages")
        print(" - Description: Upgrades all outdated Python packages except specified ones.")
        print(" - Usage: upgrade_outdated_packages(exclude_packages=['twine'])")
        print(" - Example: upgrade_outdated_packages(exclude_packages=['pip', 'setuptools'])")

        print("\n26. convert_mixed_formats_dates")
        print(" - Description: Converts mixed-format date columns into standardized datetime format.")
        print(" - Usage: convert_mixed_formats_dates(df, column_name)")
        print(" - Example: convert_mixed_formats_dates(df, 'date_col')")

        print("\n27. fill_weekly_date_range")
        print(" - Description: Fills in missing weekly dates in a DataFrame with a specified frequency.")
        print(" - Usage: fill_weekly_date_range(df, date_column, freq='W-MON')")
        print(" - Example: fill_weekly_date_range(df, 'date_col')")

        print("\n28. add_prefix_and_suffix")
        print(" - Description: Adds prefixes and/or suffixes to column names, with an option to exclude a date column.")
        print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
        print(" - Example: add_prefix_and_suffix(df, prefix='pre_', suffix='_suf', date_col='date_col')")

        print("\n29. create_dummies")
        print(" - Description: Creates dummy variables for columns, with an option to add a total dummy column.")
        print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
        print(" - Example: create_dummies(df, date_col='date_col', dummy_threshold=1)")

        print("\n30. replace_substrings")
        print(" - Description: Replaces substrings in a column based on a dictionary, with options for case conversion and new column creation.")
        print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
        print(" - Example: replace_substrings(df, 'text_col', {'old': 'new'}, to_lower=True, new_column='updated_text')")

        print("\n31. add_total_column")
        print(" - Description: Adds a total column to a DataFrame by summing values across columns, optionally excluding one.")
        print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
        print(" - Example: add_total_column(df, exclude_col='date_col')")

        print("\n32. apply_lookup_table_based_on_substring")
        print(" - Description: Categorizes text in a column using a lookup table based on substrings.")
        print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
        print(" - Example: apply_lookup_table_based_on_substring(df, 'text_col', {'sub1': 'cat1', 'sub2': 'cat2'})")

        print("\n33. compare_overlap")
        print(" - Description: Compares overlapping periods between two DataFrames and summarizes differences.")
        print(" - Usage: compare_overlap(df1, df2, date_col)")
        print(" - Example: compare_overlap(df1, df2, 'date_col')")

        print("\n34. week_commencing_2_week_commencing_conversion_isoweekday")
        print(" - Description: Maps dates to the start of the current ISO week based on a specified weekday.")
        print(" - Usage: week_commencing_2_week_commencing_conversion_isoweekday(df, date_col, week_commencing='mon')")
        print(" - Example: week_commencing_2_week_commencing_conversion_isoweekday(df, 'date_col', week_commencing='fri')")

    def get_wd_levels(self, levels):
        """
        Gets the current working directory of whoever is working on it and gives the option to move the specified number of levels up.

        Parameters:
        - levels: int
            The number of parent levels to move up the directory tree.

        Returns:
        - The resulting working directory path.
        """

        directory = os.getcwd()
        for _ in range(levels):
            directory = os.path.dirname(directory)
        return directory

    def aggregate_daily_to_wc_long(self, df: pd.DataFrame, date_column: str, group_columns: list[str], sum_columns: list[str], wc: str = 'sun', aggregation: str = 'sum') -> pd.DataFrame:
        """
        Aggregates daily data into weekly data, starting on a specified day of the week,
        and groups the data by additional specified columns. It aggregates specified numeric columns
        by summing, averaging, or counting them. NaN values are replaced with 0 and the index is reset.
        The week-start column is renamed to 'OBS'.

        Parameters:
        - df: pandas DataFrame
            The input DataFrame containing daily data.
        - date_column: string
            The name of the column in the DataFrame that contains date information.
        - group_columns: list of strings
            Additional column names to group by along with the weekly grouping.
        - sum_columns: list of strings
            Numeric column names to be aggregated during aggregation.
        - wc: string
            The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
        - aggregation: string, optional (default 'sum')
            Aggregation method, either 'sum', 'average', or 'count'.

        Returns:
        - pandas DataFrame
            A new DataFrame with weekly aggregated data. The index is reset,
            and columns represent the grouped and aggregated metrics. The DataFrame
            is in long format, with separate columns for each combination of
            grouped metrics.
        """

        # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
        days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
        if wc.lower() not in days:
            raise ValueError(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")

        start_day = days[wc.lower()]

        # Make a copy of the DataFrame
        df_copy = df.copy()

        # Convert the date column to datetime
        df_copy[date_column] = pd.to_datetime(df_copy[date_column])

        # Determine the start of each week
        df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))

        # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
        for col in sum_columns:
            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)

        # Group by the new week start column and additional columns, then aggregate the numeric columns
        if aggregation == 'average':
            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
        elif aggregation == 'count':
            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
        else:  # Default to 'sum' if any other value is provided
            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()

        # Rename 'week_start' column to 'OBS'
        grouped = grouped.rename(columns={'week_start': 'OBS'})

        return grouped
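
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # A minimal call pattern for aggregate_daily_to_wc_long; the frame and
    # column names below ('date', 'platform', 'cost') are hypothetical.
    #
    #   ims = dataprocessing()
    #   daily = pd.DataFrame({
    #       'date': pd.date_range('2024-01-01', periods=14, freq='D'),
    #       'platform': ['meta', 'google'] * 7,
    #       'cost': [10.0] * 14,
    #   })
    #   weekly = ims.aggregate_daily_to_wc_long(daily, 'date', ['platform'], ['cost'], wc='mon')
    #   # -> one row per ('OBS', 'platform') pair, 'cost' summed per week commencing Monday.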

    def convert_monthly_to_daily(self, df, date_column, divide=True):
        """
        Convert a DataFrame with monthly data to daily data.
        This function takes a DataFrame and a date column, then it expands each
        monthly record into daily records by dividing the numeric values by the number of days in that month.

        :param df: DataFrame with monthly data.
        :param date_column: The name of the column containing the date.
        :param divide: boolean, divide by the number of days in a month (default True).
        :return: A new DataFrame with daily data.
        """

        # Convert date_column to datetime
        df[date_column] = pd.to_datetime(df[date_column])

        # Initialize an empty list to hold the daily records
        daily_records = []

        # Iterate over each row in the DataFrame
        for _, row in df.iterrows():
            # Calculate the number of days in the month
            num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]

            # Create a new record for each day of the month
            for day in range(1, num_days + 1):
                daily_row = row.copy()
                daily_row[date_column] = row[date_column].replace(day=day)

                # Divide each numeric value by the number of days in the month
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
                        if divide is True:
                            daily_row[col] = row[col] / num_days
                        else:
                            daily_row[col] = row[col]
                daily_records.append(daily_row)

        # Convert the list of daily records into a DataFrame
        daily_df = pd.DataFrame(daily_records)

        return daily_df
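
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Expanding a two-month frame to daily rows, dividing 'spend' across each
    # month's days; the data is made up.
    #
    #   ims = dataprocessing()
    #   monthly = pd.DataFrame({'date': ['2024-01-01', '2024-02-01'], 'spend': [310.0, 290.0]})
    #   daily = ims.convert_monthly_to_daily(monthly, 'date')
    #   # January rows carry 310/31 = 10.0 per day; February rows carry
    #   # 290/29 = 10.0 per day (2024 is a leap year).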

    def week_of_year_mapping(self, df, week_col, start_day_str):

        # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
        day_mapping = {
            'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
        }

        # Convert the day string to a number, or raise an error if not valid
        start_day = day_mapping.get(start_day_str.lower())
        if start_day is None:
            raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")

        # Function to convert week number to start date of the week
        def week_to_startdate(week_str, start_day):
            year, week = map(int, week_str.split('-W'))
            first_day_of_year = datetime(year, 1, 1)
            first_weekday_of_year = first_day_of_year.weekday()  # Monday is 0 and Sunday is 6

            # Calculate days to adjust to the desired start day of the week
            days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
            start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)

            # Calculate the start of the desired week
            start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
            return start_of_week

        # Apply the function to each row in the specified week column
        df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
        return df
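
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Mapping 'yyyy-Www' labels to week-commencing dates; note the function
    # writes 'OBS' as dd/mm/yyyy strings. The data is made up.
    #
    #   ims = dataprocessing()
    #   wk = pd.DataFrame({'week': ['2024-W01', '2024-W02'], 'sales': [100, 120]})
    #   wk = ims.week_of_year_mapping(wk, 'week', 'mon')
    #   # wk['OBS'] -> ['01/01/2024', '08/01/2024'] (1 Jan 2024 is a Monday)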

    def rename_cols(self, df, name='ame_'):
        new_columns = {}
        for col in df.columns:
            if col != 'OBS':
                new_col_name = name + col.replace(" ", "_").lower()
            else:
                new_col_name = col
            new_columns[col] = new_col_name
        return df.rename(columns=new_columns)

    def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
        """
        Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
        Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.

        Parameters:
        - old_df: pandas DataFrame
            The old DataFrame from which to take the numeric values up to the specified date.
        - old_col: str
            The name of the numeric column in the old DataFrame whose values are to be taken.
        - new_df: pandas DataFrame
            The new DataFrame from which to take the numeric values from the specified date onwards.
        - new_col: str
            The name of the numeric column in the new DataFrame whose values are to be taken.
        - cutoff_date: str
            The cut-off date in 'YYYY-MM-DD' format to split the data between the two DataFrames.
        - date_col_name: str, optional (default 'OBS')
            The name of the date column in both DataFrames.

        Returns:
        - pandas DataFrame
            A new DataFrame with two columns: 'OBS' and a column named after 'new_col' containing merged numeric values.
        """

        # Convert date columns in both dataframes to datetime for comparison
        old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
        new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])

        # Convert the cutoff date string to datetime
        cutoff_date = pd.to_datetime(cutoff_date)

        # Split old and new dataframes based on the cutoff date
        old_values = old_df[old_df[date_col_name] <= cutoff_date]
        new_values = new_df[new_df[date_col_name] > cutoff_date]

        # Create a new DataFrame with two columns: 'OBS' and a column named after 'new_col'
        merged_df = pd.DataFrame({
            'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
            new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
        })

        return merged_df
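
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Splicing a restated series onto an old one at a cutoff; 'spend_v1' and
    # 'spend_v2' are hypothetical column names. Rows on or before the cutoff
    # come from df_old, later rows from df_new.
    #
    #   ims = dataprocessing()
    #   spliced = ims.merge_new_and_old(df_old, 'spend_v1', df_new, 'spend_v2', '2023-01-15')
    #   # spliced has columns 'OBS' and 'spend_v2'.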

    def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
        """
        Merge a list of DataFrames on a common column.

        Parameters:
        - dataframes: A list of DataFrames to merge.
        - common_column: The name of the common column to merge on.
        - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').

        Returns:
        - A merged DataFrame.
        """
        if not dataframes:
            return None

        merged_df = dataframes[0]  # Start with the first DataFrame

        for df in dataframes[1:]:
            merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)

        # Check if the common column is of datetime dtype, and if so sort by it
        if merged_df[common_column].dtype == 'datetime64[ns]':
            merged_df[common_column] = pd.to_datetime(merged_df[common_column])
            merged_df = merged_df.sort_values(by=common_column)
        merged_df = merged_df.fillna(0)

        return merged_df

    def merge_and_update_dfs(self, df1, df2, key_column):
        """
        Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
        and returns a dataframe sorted by the key column.

        Parameters:
        df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
        df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
        key_column (str): The name of the column to merge and sort by (e.g., 'OBS').

        Returns:
        DataFrame: The merged and updated dataframe.
        """

        # Sort both DataFrames by the key column
        df1_sorted = df1.sort_values(by=key_column)
        df2_sorted = df2.sort_values(by=key_column)

        # Perform the full outer merge
        merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))

        # Update with non-null values from df2
        for column in merged_df.columns:
            if column.endswith('_finalised'):
                original_column = column.replace('_finalised', '')
                merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
                merged_df.drop(column, axis=1, inplace=True)

        # Sort the merged DataFrame by the key column
        merged_df.sort_values(by=key_column, inplace=True)

        # Handle null values (optional, can be adjusted as needed)
        merged_df.fillna(0, inplace=True)

        return merged_df
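
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Outer-merging two weekly frames on 'OBS', letting the second frame's
    # non-null values win; the frame names are hypothetical.
    #
    #   ims = dataprocessing()
    #   updated = ims.merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')
    #   # Columns shared by both frames now carry finalised_meta's values wherever
    #   # finalised_meta had data; remaining gaps are filled with 0.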

    def convert_us_to_uk_dates(self, df, date_col):
        """
        Processes the date column of a DataFrame by stripping hyphens and slashes,
        then parsing the digits in US (month-first) order into datetime values.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the date column.
        date_col (str): The name of the date column.

        Returns:
        pd.DataFrame: The DataFrame with the processed date column.
        """
        df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
        df[date_col] = pd.to_datetime(
            df[date_col].str.slice(0, 2) + '/' +
            df[date_col].str.slice(2, 4) + '/' +
            df[date_col].str.slice(4, 8),
            format='%m/%d/%Y'
        )
        return df
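
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # The digits are reassembled and parsed month-first ('%m/%d/%Y'), so both
    # separators below resolve the same way; the data is made up.
    #
    #   ims = dataprocessing()
    #   d = pd.DataFrame({'date': ['03/04/2024', '12-25-2023']})
    #   d = ims.convert_us_to_uk_dates(d, 'date')
    #   # d['date'] -> [2024-03-04, 2023-12-25]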

    def combine_sheets(self, all_sheets):
        """
        Combines multiple DataFrames from a dictionary into a single DataFrame.
        Adds a column 'SheetName' indicating the origin sheet of each row.

        Parameters:
        all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.

        Returns:
        DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
        """
        combined_df = pd.DataFrame()

        for sheet_name, df in all_sheets.items():
            df['SheetName'] = sheet_name
            combined_df = pd.concat([combined_df, df], ignore_index=True)

        return combined_df

    def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
        """
        Provides the ability to create pivot tables, filtering the data down to the rows you want and then pivoting on certain columns.

        Args:
            df (pandas.DataFrame): The DataFrame containing the data.
            index_col (str): Name of the column for your pivot table to index on.
            columns (str): Name of the column whose values become the pivot table's columns.
            values_col (str): Name of the values column for your pivot table.
            filters_dict (dict, optional): Dictionary of conditions for the boolean mask, i.e. what to filter your df on before pivoting. Defaults to None.
            fill_value (int, optional): The value to replace NaN with. Defaults to 0.
            aggfunc (str, optional): The method on which to aggregate the values column. Defaults to "sum".
            margins (bool, optional): Whether the pivot table needs total rows and columns. Defaults to False.
            margins_name (str, optional): The name of the totals columns. Defaults to "Total".
            datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to True.
            date_format (str, optional): Output format for the index column when datetime_trans_needed is True. Defaults to "%Y-%m-%d".
            reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
            fill_missing_weekly_dates (bool, optional): Fills in any missing weekly dates. Defaults to False.
            week_commencing (str, optional): Frequency used when filling in missing weeks. Defaults to "W-MON".

        Returns:
            pandas.DataFrame: The pivot table specified.
        """

        # Validate inputs
        if index_col not in df.columns:
            raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
        if columns not in df.columns:
            raise ValueError(f"columns '{columns}' not found in DataFrame.")
        if values_col not in df.columns:
            raise ValueError(f"values_col '{values_col}' not found in DataFrame.")

        # Apply filters if provided
        if filters_dict:
            df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
        else:
            df_filtered = df.copy()

        # Ensure index column is in datetime format if needed
        if datetime_trans_needed:
            df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)

        # Create the pivot table
        pivoted_df = df_filtered.pivot_table(
            index=index_col,
            columns=columns,
            values=values_col,
            aggfunc=aggfunc,
            margins=margins,
            margins_name=margins_name,
        )

        # Flatten column headers (reversed() needs a sequence, so materialise the parts first)
        if isinstance(pivoted_df.columns, pd.MultiIndex):
            pivoted_df.columns = [
                "_".join(reversed([str(c) for c in col]) if reverse_header_order else [str(c) for c in col])
                for col in pivoted_df.columns.values
            ]
        else:
            pivoted_df.columns = pivoted_df.columns.map(str)

        # Reset the index
        pivoted_df.reset_index(inplace=True)

        # Handle sorting and formatting of index column
        if datetime_trans_needed:
            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
            pivoted_df.sort_values(by=index_col, inplace=True)
            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)

        # Fill missing values
        pivoted_df.fillna(fill_value, inplace=True)

        # Fill missing weekly dates if specified
        if fill_missing_weekly_dates:
            pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)

        return pivoted_df
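
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Pivoting a long-format frame of weekly values by channel, filtering rows
    # first via filters_dict; the column names are hypothetical.
    #
    #   ims = dataprocessing()
    #   wide = ims.pivot_table(df, 'OBS', 'Channel Short Names', 'Value',
    #                          filters_dict={'Master Include': '== 1'},
    #                          fill_value=0, fill_missing_weekly_dates=True)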

    def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
        """
        Creates a new DataFrame column based on a lookup table (a dictionary of substrings to class mappings), possibly keyed on several columns at once.

        Parameters:
            df (pandas.DataFrame): The DataFrame containing the data.
            col_names (list of str): The columns used for the lookup. One column or several columns can be passed as a list. If there are multiple columns to look up on, the dictionary keys must be the merged values in the format col1|col2|col3.
            to_find_dict (dict): Your lookup table, where keys are the values being looked up and values are the resulting mappings.
            if_not_in_dict (str, optional): Default value if no substring matches are found in the lookup dictionary. Defaults to "Other".
            new_column_name (str, optional): Name of the new column. Defaults to "Mapping".

        Returns:
            pandas.DataFrame: DataFrame with a new column containing the lookup results.
        """

        # Create regex pattern with word boundaries from the dictionary
        regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())

        # Preprocess DataFrame if multiple columns
        if len(col_names) > 1:
            df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
            col_to_use = "Merged"
        else:
            col_to_use = col_names[0]

        # Extract the first match using the regex pattern
        matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)

        # Map the matches to the corresponding values in the dictionary
        df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)

        # Drop intermediate column if created
        if len(col_names) > 1:
            df.drop(columns=["Merged"], inplace=True)

        return df
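
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Substring lookup with word boundaries; rows matching no key fall back to
    # if_not_in_dict. The data below is made up.
    #
    #   ims = dataprocessing()
    #   camp = pd.DataFrame({'Campaign': ['Brand spend Q1', 'Generic clicks Q1']})
    #   camp = ims.apply_lookup_table_for_columns(camp, ['Campaign'],
    #                                             {'spend': 'spd', 'clicks': 'clk'},
    #                                             new_column_name='Metrics Short')
    #   # camp['Metrics Short'] -> ['spd', 'clk']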

    def aggregate_daily_to_wc_wide(self, df: pd.DataFrame, date_column: str, group_columns: list[str], sum_columns: list[str], wc: str = 'sun', aggregation: str = 'sum', include_totals: bool = False) -> pd.DataFrame:
        """
        Aggregates daily data into weekly data, starting on a specified day of the week,
        and groups the data by additional specified columns. It aggregates specified numeric columns
        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
        The week-start column is renamed to 'OBS'.

        Parameters:
        - df: pandas DataFrame
            The input DataFrame containing daily data.
        - date_column: string
            The name of the column in the DataFrame that contains date information.
        - group_columns: list of strings
            Additional column names to group by along with the weekly grouping.
        - sum_columns: list of strings
            Numeric column names to be aggregated during aggregation.
        - wc: string
            The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
        - aggregation: string, optional (default 'sum')
            Aggregation method, either 'sum', 'average', or 'count'.
        - include_totals: boolean, optional (default False)
            If True, include total columns for each sum_column.

        Returns:
        - pandas DataFrame
            A new DataFrame with weekly aggregated data. The index is reset,
            and columns represent the grouped and aggregated metrics. The DataFrame
            is in wide format, with separate columns for each combination of
            grouped metrics.
        """

        grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)

        # Pivot the data to wide format
        if group_columns:
            wide_df = grouped.pivot_table(index='OBS',
                                          columns=group_columns,
                                          values=sum_columns,
                                          aggfunc='first')
            # Flatten the multi-level column index and create combined column names
            wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
        else:
            wide_df = grouped.set_index('OBS')

        # Fill NaN values with 0
        wide_df = wide_df.fillna(0)

        # Adding total columns for each unique sum_column, if include_totals is True
        if include_totals:
            for col in sum_columns:
                total_column_name = f'Total {col}'
                if group_columns:
                    columns_to_sum = [column for column in wide_df.columns if col in column]
                else:
                    columns_to_sum = [col]
                wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)

        # Reset the index of the final DataFrame
        wide_df = wide_df.reset_index()

        return wide_df

    def merge_cols_with_seperator(self, df, col_names, seperator='_', output_column_name="Merged", starting_prefix_str=None, ending_prefix_str=None):
        """
        Creates a new column in the dataframe that merges two or more columns together with a separator, e.g. for a lookup table keyed on multiple columns.

        Parameters:
            df (pandas.DataFrame): Dataframe to make changes to.
            col_names (list): List of column names to merge.
            seperator (str, optional): Separator placed between the merged values. Defaults to "_".
            output_column_name (str, optional): Name of the column outputted. Defaults to "Merged".
            starting_prefix_str (str, optional): Optional text to be added before the merged string value.
            ending_prefix_str (str, optional): Optional text to be added after the merged string value.

        Raises:
            ValueError: If fewer than two column names are given, there is nothing to merge.

        Returns:
            pandas.DataFrame: DataFrame with the additional merged column.
        """
        # More than one column must be entered
        if len(col_names) < 2:
            raise ValueError("2 or more columns must be specified to merge")

        # Create a new column with the merged columns
        df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)

        # Add string before
        if starting_prefix_str is not None:
            df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)

        # Add string after
        if ending_prefix_str is not None:
            df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str

        return df

    def check_sum_of_df_cols_are_equal(self, df_1, df_2, cols_1, cols_2):
        """
        Checks whether the sums of two different DataFrames' columns are equal.

        Parameters:
            df_1 (pandas.DataFrame): First dataframe whose columns are to be summed.
            df_2 (pandas.DataFrame): Second dataframe whose columns are to be summed.
            cols_1 (list of str): Columns from the first dataframe to sum.
            cols_2 (list of str): Columns from the second dataframe to sum.

        Returns:
            Tuple: answer is a string stating whether the sums are equal (and the difference if not), df_1_sum is the sum of the column(s) in the first dataframe, df_2_sum is the sum of the column(s) in the second dataframe.
        """
        # Find the sum of both sets of columns
        df_1_sum = df_1[cols_1].sum().sum()
        df_2_sum = df_2[cols_2].sum().sum()

        # Compare the two sums
        if df_1_sum == df_2_sum:
            answer = "They are equal"
        else:
            answer = "They are different by " + str(df_2_sum - df_1_sum)

        return answer, df_1_sum, df_2_sum

    def convert_2_df_cols_to_dict(self, df, key_col, value_col):
        """
        Create a dictionary mapping from two columns of a DataFrame.

        Parameters:
            df (pd.DataFrame): The DataFrame containing the data.
            key_col (str): The column name to use as keys in the dictionary.
            value_col (str): The column name to use as values in the dictionary.

        Returns:
            dict: A dictionary with keys from 'key_col' and values from 'value_col'.
        """
        if key_col not in df or value_col not in df:
            raise ValueError("Specified columns are not in the DataFrame")

        return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}

    def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY, short_format="No", half_years="No", combined_FY_and_H="No"):
        """
        Creates new DataFrame columns containing the company's financial year, half years and financial half years, based on the start date of the first full financial year.

        Parameters:
            df (pandas.DataFrame): Dataframe to operate on.
            index_col (str): Name of the column to use for datetime.
            start_date (str): The start date of the specified FY, in "yyyy-mm-dd" format, e.g. 2021-11-01.
            starting_FY (str): The FY the start date refers to, formatted long, e.g. FY2021.
            short_format (str, optional): "Yes" if short format (e.g. FY21) is desired rather than long format (e.g. FY2021). Defaults to "No".
            half_years (str, optional): "Yes" if a half-year column is desired. Defaults to "No".
            combined_FY_and_H (str, optional): "Yes" if a combined FY and half-year column is desired. Defaults to "No".

        Returns:
            pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half-year column and a combined FY half-year column.
        """

        try:
            start_date = datetime.strptime(start_date, '%Y-%m-%d')
        except ValueError:
            print("Error: Date must be of format yyyy-mm-dd")
            return df

        df["OBS"] = pd.to_datetime(df[index_col])
        df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")

        df[index_col] = pd.to_datetime(df[index_col])

        start_year = int(starting_FY[2:])

        def calculate_FY_vectorized(date_series):
            # Treat each FY as 364 days (52 whole weeks)
            years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
            fy = 'FY' + (start_year + years_since_start).astype(str)
            if short_format == "Yes":
                fy = 'FY' + fy.str[-2:]
            return fy

        df['FY'] = calculate_FY_vectorized(df[index_col])

        if half_years == "Yes" or combined_FY_and_H == "Yes":
            def calculate_half_year_vectorized(date_series):
                fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
                fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
                fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
                half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
                return half_year

            df['Half Years'] = calculate_half_year_vectorized(df[index_col])

            if combined_FY_and_H == "Yes":
                df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']

        return df
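
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Tagging weeks with FY and half-year labels, assuming FY2023 starts on
    # 2022-10-03; note each FY is treated as 364 days (52 whole weeks), with
    # H1 as the first 26 weeks.
    #
    #   ims = dataprocessing()
    #   wks = pd.DataFrame({'Week': pd.date_range('2022-10-03', periods=60, freq='W-MON')})
    #   wks = ims.create_FY_and_H_columns(wks, 'Week', '2022-10-03', 'FY2023',
    #                                     half_years='Yes', combined_FY_and_H='Yes')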

    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
        """
        This function updates values in a specified column of the DataFrame based on a lookup dictionary.
        It first merges several columns into a new 'Merged' column, then uses this merged column to determine
        if replacements are needed based on the dictionary.

        Parameters:
            df (pd.DataFrame): The DataFrame to process.
            col (str): The name of the column whose values are potentially replaced.
            replacement_rows (str): The specific value in 'col' to check for replacements.
            cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
            replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
            output_column_name (str, optional): Name of the column outputted. Defaults to "Updated Column".

        Returns:
            pd.DataFrame: The modified DataFrame with updated values in the specified column.
        """
        # Create a merged column from specified columns
        df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)

        # Replace values in the specified column based on the lookup
        def replace_values(x):
            if x[col] == replacement_rows:
                merged_value = x['Merged']
                if merged_value in replacement_lookup_dict:
                    return replacement_lookup_dict[merged_value]
            return x[col]

        # Apply replacement logic
        df[output_column_name] = df.apply(replace_values, axis=1)

        # Drop the intermediate 'Merged' column
        df.drop(columns=['Merged'], inplace=True)

        return df
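
    # --- Editor's sketch (illustrative, not part of the released file) ---
    # Only rows where df['channel'] equals 'Paid Search Generic' are re-labelled,
    # keyed on the merged 'channel|segment' string; the lookup below is made up.
    #
    #   ims = dataprocessing()
    #   lookup = {'Paid Search Generic|Shopping': 'PPC Shopping'}
    #   df2 = ims.keyword_lookup_replacement(df, 'channel', 'Paid Search Generic',
    #                                        ['channel', 'segment'], lookup,
    #                                        output_column_name='Channel New')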

    def create_new_version_of_col_using_LUT(self, df, keys_col, value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
        """
        Creates a new column in a dataframe, which takes an old column and uses a lookup table to change values in the new column to reflect the lookup table.
        The lookup is based on a column in the dataframe. Can only input one column and output one new column.

        Parameters:
            df (pandas.DataFrame): The DataFrame containing the data.
            keys_col (str): The name of the column which the LUT will be referencing to output a value.
            value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
            dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
            new_col_name (str, optional): The name of the new column being generated. Defaults to "New Version of Old Col".

        Returns:
            pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
        """

        # Extract columns to change using new dictionary
        smaller_df = df[[keys_col, value_col]]

        # Use the new dictionary to create a new LUT
        smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df, [keys_col, value_col], dict_for_specific_changes)

        # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
        smaller_df_with_LUT["Updated Col"] = smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col], axis=1)

        # Drop the extra unnecessary cols
        smaller_df_with_LUT.drop([keys_col, 'Mapping'], axis=1, inplace=True)

        # Output the dataframe columns as a dictionary to be used as a LUT
        new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT, value_col, "Updated Col")

        # Use the new dictionary to create a new version of the old column
        df_final = self.apply_lookup_table_for_columns(df, [keys_col], new_dict, "other", new_col_name)

        return df_final

    def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
        """
        Changes a dataframe from wide to long format.

        Args:
            df (pandas.DataFrame): The DataFrame containing the data.
            value_cols (list of str, or str if only one): List of column names to transform from several columns into one.
            variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
            value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.

        Returns:
            pandas.DataFrame: DataFrame transformed from wide to long format.

        Raises:
            ValueError: If the number of columns to depivot is less than 2.
        """
        # Check length of value_cols is greater than 1
        if len(value_cols) < 2:
            raise ValueError("Number of inputs in list must be greater than 1")

        # Find the columns that are not to be depivoted into one column
        id_vars = [col for col in df.columns if col not in value_cols]  # Preserve column order in the DataFrame

        # Melt all chosen columns into one column
        df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)

        # Sort column order to match expected output
        ordered_columns = id_vars + [variable_col_name, value_col_name]
        df_final = df_final[ordered_columns]

        return df_final

    def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
        """
        Allows any cell in a dataframe to be updated manually, by applying filters and choosing a column to edit.

        Args:
            df (pandas.DataFrame): The DataFrame containing the data.
            filters_dict (dict): Dictionary of conditions for the boolean mask, i.e. what to filter your df on to get to your chosen cell.
            col_to_change (str): String name of the column to edit.
            new_value (any): Value of the new input for the cell.
            change_in_existing_df_col (str, optional): "Yes" or "No": whether to make the change in an existing column. Defaults to "No".
            new_col_to_change_name (str, optional): Name of the new column to copy the edited column into and to make the change in. Defaults to 'New'.
            manual_edit_col_name (str, optional): Name of the current manual-edits column; if one is not specified it will be created. Defaults to None.
            add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
            existing_note_col_name (str, optional): If there is an existing notes column this can be specified. Defaults to None.
            note (str, optional): The string of the note to be added to the column. Defaults to None.

        Raises:
            TypeError: The column to change can only be specified as one column, as it is a string not a list.
            ValueError: Only the values "Yes" or "No" are allowed for whether to make the change in an existing column.
            ValueError: Only the values "Yes" or "No" are allowed for whether to make a new notes column.

        Returns:
            pandas.DataFrame: Dataframe with manual changes added.
        """

        # Raise a TypeError if more than one column is supplied
        if isinstance(col_to_change, list):
            raise TypeError("Col to change must be specified as a string, not a list")

        # Raise a ValueError if the input is invalid for change_in_existing_df_col
        if change_in_existing_df_col not in ["Yes", "No"]:
            raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")

        # Raise a ValueError if the input is invalid for add_notes
        if add_notes not in ["Yes", "No"]:
            raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

        # Validate filters_dict format
        for col, cond in filters_dict.items():
            if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
                raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")

        # Create the filtered df by applying the conditions
        df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

        # Create a new column to add the changes in if desired, else edit the current chosen column
        col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
        if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
            df = df.copy()
            df[new_col_to_change_name] = df[col_to_change]

        # Update the chosen cell in the chosen column
        df.loc[df_filtered.index, col_to_update] = new_value

        # Flag the edited rows in the specified manual-edits column, or in a default 'Manual Changes' column
        if manual_edit_col_name:
            if manual_edit_col_name not in df.columns:
                df[manual_edit_col_name] = 0
            df.loc[df_filtered.index, manual_edit_col_name] = 1
        else:
            if 'Manual Changes' not in df.columns:
                df['Manual Changes'] = 0
            df.loc[df_filtered.index, 'Manual Changes'] = 1

        # Add a note if desired, in a new column or an existing one
        if add_notes == "Yes":
            note_col = existing_note_col_name if existing_note_col_name else 'Notes'
            if note_col not in df.columns:
                df[note_col] = None
            df.loc[df_filtered.index, note_col] = note

        return df
|
|
979
|
+
|
|
980
|
+
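    # Usage sketch (illustrative only). The method's signature begins before this
    # excerpt, so the name `manual_edit` below is a hypothetical stand-in:
    #
    #   dp = dataprocessing()
    #   df = dp.manual_edit(df,
    #                       filters_dict={'Channel': "== 'TV'", 'Spend': '> 1000'},
    #                       col_to_change='Spend',
    #                       new_value=0,
    #                       add_notes='Yes',
    #                       note='Outlier zeroed out')
    #
    # With the defaults this zeroes out Spend for TV rows above 1000 in a copy
    # column 'New', flags them in 'Manual Changes', and records the reason in 'Notes'.
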
    def format_numbers_with_commas(self, df, decimal_length_chosen=2):
        """
        Converts numerical data into comma-separated strings with a chosen number of decimal places.

        Args:
            df (pandas.DataFrame): The DataFrame containing the data.
            decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.

        Returns:
            pandas.DataFrame: The DataFrame with the chosen formatting applied.
        """
        def format_number_with_commas(x, decimal_length=decimal_length_chosen):
            if pd.isna(x):  # Preserve None/NaN values
                return pd.NA  # Explicitly normalize to pd.NA
            elif isinstance(x, (int, float)):
                if decimal_length is not None:
                    format_str = f"{{:,.{decimal_length}f}}"
                    return format_str.format(x)
                else:
                    return f"{x:,}"
            else:
                return x  # Return unchanged if not a number

        # Apply formatting column by column; the mapper already normalizes missing values to pd.NA
        formatted_df = df.apply(lambda col: col.map(format_number_with_commas))

        return formatted_df

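    # Usage sketch (illustrative; assumes numeric columns):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'Spend': [1234.5, None], 'Clicks': [1000000, 2500]})
    #   dp.format_numbers_with_commas(df, decimal_length_chosen=1)
    #   # Spend -> '1,234.5', <NA>; Clicks -> '1,000,000.0', '2,500.0'
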
    def filter_df_on_multiple_conditions(self, df, filters_dict):
        """
        Filter a DataFrame based on multiple conditions.

        Args:
            df (pandas.DataFrame): DataFrame to filter.
            filters_dict (dict): Dictionary mapping column names to condition strings of the form 'operator value', e.g. {"Spend": "> 1000"}.

        Returns:
            pandas.DataFrame: Filtered DataFrame.
        """
        mask = pd.Series(True, index=df.index)
        for col, cond in filters_dict.items():
            cond = cond.strip()
            operator, value = cond.split(maxsplit=1)

            # If the value is a quoted string condition, strip the quotes
            if "'" in value:
                value = value.strip().strip("'\"")
            # Otherwise (e.g. a datetime or number condition) evaluate the string into a value.
            # Note: eval trusts the condition string, so only pass conditions you control.
            else:
                value = eval(value)

            if operator == "==":
                temp_mask = (df[col] == value)
            elif operator == "!=":
                temp_mask = (df[col] != value)
            elif operator == ">=":
                temp_mask = (df[col] >= value)
            elif operator == "<=":
                temp_mask = (df[col] <= value)
            elif operator == ">":
                temp_mask = (df[col] > value)
            elif operator == "<":
                temp_mask = (df[col] < value)
            else:
                raise ValueError(f"Unsupported operator '{operator}' in condition '{cond}'")
            mask &= temp_mask

        # Create the filtered df by applying the conditions
        df_filtered = df[mask]

        return df_filtered

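    # Usage sketch (illustrative): keep rows where Channel is 'TV' and Spend > 1000.
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'Channel': ['TV', 'Radio', 'TV'], 'Spend': [500, 2000, 3000]})
    #   dp.filter_df_on_multiple_conditions(df, {'Channel': "== 'TV'", 'Spend': '> 1000'})
    #   # -> returns only the third row
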
    def read_and_concatenate_files(self, folder_path, file_type='csv'):
        """
        Reads all files of a specified type (CSV or XLSX) from a given folder
        and concatenates them into a single DataFrame.

        Parameters:
            folder_path (str): The path to the folder containing the files.
            file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.

        Returns:
            pd.DataFrame: A DataFrame containing the concatenated data from all files.
        """

        # Initialize an empty list to hold dataframes
        dataframes = []

        # Define file extension based on file_type
        if file_type == 'csv':
            extension = '.csv'
        elif file_type == 'xlsx':
            extension = '.xlsx'
        else:
            raise ValueError("file_type must be either 'csv' or 'xlsx'")

        # Loop through all files in the folder
        for filename in os.listdir(folder_path):
            # Check if the file has the correct extension
            if filename.endswith(extension):
                file_path = os.path.join(folder_path, filename)
                # Read the file into a DataFrame
                if file_type == 'csv':
                    df = pd.read_csv(file_path)
                elif file_type == 'xlsx':
                    df = pd.read_excel(file_path)
                # Append the DataFrame to the list
                dataframes.append(df)

        # Guard against an empty folder: pd.concat raises on an empty list
        if not dataframes:
            raise ValueError(f"No {extension} files found in {folder_path}")

        # Concatenate all DataFrames into a single DataFrame
        combined_df = pd.concat(dataframes, ignore_index=True)

        return combined_df

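    # Usage sketch (illustrative; 'data/monthly_exports' is a hypothetical folder):
    #
    #   dp = dataprocessing()
    #   combined = dp.read_and_concatenate_files('data/monthly_exports', file_type='csv')
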
    def upgrade_outdated_packages(self, exclude_packages=('twine',)):
        """
        Upgrade all outdated Python packages except those specified in `exclude_packages`.

        :param exclude_packages: Iterable of package names to exclude from the upgrade process. Defaults to ('twine',).
        """
        exclude_packages = set(exclude_packages or [])

        try:
            # Get all installed packages
            installed_packages_result = subprocess.run(
                "pip list --format=json", shell=True, capture_output=True, text=True
            )
            installed_packages = json.loads(installed_packages_result.stdout)

            # Get the list of outdated packages
            outdated_packages_result = subprocess.run(
                "pip list --outdated --format=json", shell=True, capture_output=True, text=True
            )
            outdated_packages = json.loads(outdated_packages_result.stdout)

            # Create a set of outdated package names for quick lookup
            outdated_package_names = {pkg['name'] for pkg in outdated_packages}

            # Upgrade only outdated packages, excluding specified packages
            for package in installed_packages:
                package_name = package['name']
                if package_name in outdated_package_names and package_name not in exclude_packages:
                    try:
                        print(f"Upgrading package: {package_name}")
                        upgrade_result = subprocess.run(
                            f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True
                        )
                        if upgrade_result.returncode == 0:
                            print(f"Successfully upgraded {package_name}")
                        else:
                            print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
                    except Exception as e:
                        print(f"An error occurred while upgrading {package_name}: {e}")
                elif package_name in exclude_packages:
                    print(f"Skipping package: {package_name} (excluded)")
                else:
                    print(f"{package_name} is already up to date")
        except Exception as e:
            print(f"An error occurred during the upgrade process: {e}")

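    # Usage sketch (illustrative; shells out to pip, so run it in an environment you control):
    #
    #   dp = dataprocessing()
    #   dp.upgrade_outdated_packages(exclude_packages=('twine', 'pip'))
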
    def convert_mixed_formats_dates(self, df, column_name):
        """
        Normalizes a column of mixed-format dates into datetime.

        Note: for 8-digit dates, if the parsed day is 12 or less the day and month
        are swapped. This is a heuristic for repairing month/day-first mixups, and
        it will also swap genuinely ambiguous dates, so check the output.

        Args:
            df (pandas.DataFrame): The DataFrame containing the data.
            column_name (str): The name of the date column to normalize.

        Returns:
            pandas.DataFrame: The DataFrame with the normalized date column.
        """
        # Convert initial dates to datetime with coercion to handle errors
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
        df[column_name] = df[column_name].astype(str)
        corrected_dates = []

        for date_str in df[column_name]:
            date_str = date_str.replace('-', '').replace('/', '')
            if len(date_str) == 8:
                year = date_str[:4]
                month = date_str[4:6]
                day = date_str[6:8]
                if int(day) <= 12:
                    # Swap month and day
                    corrected_date_str = f"{year}-{day}-{month}"
                else:
                    corrected_date_str = f"{year}-{month}-{day}"
                # Convert to datetime
                corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
            else:
                corrected_date = pd.to_datetime(date_str, errors='coerce')

            corrected_dates.append(corrected_date)

        # Check length of the corrected_dates list
        if len(corrected_dates) != len(df):
            raise ValueError("Length of corrected_dates does not match the original DataFrame")

        # Assign the corrected dates back to the DataFrame
        df[column_name] = corrected_dates
        return df

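    # Usage sketch (illustrative): '2024-05-03' parses to digits 20240503; day 03 is
    # <= 12, so the heuristic swaps it, while '2024-05-20' is left alone.
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'date': ['2024-05-03', '2024-05-20']})
    #   dp.convert_mixed_formats_dates(df, 'date')
    #   # -> [2024-03-05, 2024-05-20]
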
    def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
        """
        Reindexes a weekly DataFrame onto a complete date range so that missing
        weeks appear as rows, with missing values filled with 0.

        Args:
            df (pandas.DataFrame): The DataFrame with weekly data.
            date_column (str): The name of the date column.
            freq (str, optional): Pandas frequency string for the week start. Defaults to 'W-MON'.

        Returns:
            pandas.DataFrame: The DataFrame with a complete weekly date range.
        """
        # Ensure the date column is in datetime format
        df[date_column] = pd.to_datetime(df[date_column])

        # Generate the full date range with the specified frequency
        full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)

        # Create a new dataframe with the full date range
        full_date_df = pd.DataFrame({date_column: full_date_range})

        # Merge the original dataframe with the new full date range dataframe
        df_full = full_date_df.merge(df, on=date_column, how='left')

        # Fill missing values with 0
        df_full.fillna(0, inplace=True)

        return df_full

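    # Usage sketch (illustrative): the missing week is inserted and filled with 0.
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'OBS': ['2024-01-01', '2024-01-15'], 'Spend': [100, 300]})
    #   dp.fill_weekly_date_range(df, 'OBS')
    #   # -> rows for 2024-01-01, 2024-01-08 (Spend 0.0), 2024-01-15
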
    def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
        """
        Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.

        Args:
            df (pd.DataFrame): The DataFrame whose column names will be modified.
            prefix (str, optional): The prefix to add to each column name. Default is an empty string.
            suffix (str, optional): The suffix to add to each column name. Default is an empty string.
            date_col (str, optional): The name of the column to exclude from adding prefix and suffix, typically a date column. Default is None.

        Returns:
            pd.DataFrame: The DataFrame with updated column names.
        """

        # If there is no date column
        if date_col is None:
            # Add prefixes and suffixes to all columns
            df.columns = [prefix + col + suffix for col in df.columns]
        else:
            # Add prefixes and suffixes to all columns except the date column
            df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]

        return df

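    # Usage sketch (illustrative):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'OBS': ['2024-01-01'], 'spend': [100]})
    #   dp.add_prefix_and_suffix(df, prefix='media_', suffix='_gbp', date_col='OBS')
    #   # -> columns: ['OBS', 'media_spend_gbp']
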
    def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
        """
        Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
        Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.

        Args:
            df (pd.DataFrame): The DataFrame to process.
            date_col (str, optional): The column name to exclude from the dummy conversion, typically a date column. Default is None.
            dummy_threshold (int, optional): The threshold value; values greater than this become 1, others become 0. Default is 0.
            add_total_dummy_col (str, optional): If set to any value other than 'No', adds a column that contains the max value (1 or 0) for each row. Default is 'No'.
            total_col_name (str, optional): The name of the total column to add if add_total_dummy_col is not 'No'. Default is 'total'.

        Returns:
            pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
        """

        # If there is no date column
        if date_col is None:
            df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))

            if add_total_dummy_col != 'No':
                # Find max value of rows
                df[total_col_name] = df.max(axis=1)

        # If there is a date column
        else:
            # Create dummies for all columns except the date column
            df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
                lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
            )

            if add_total_dummy_col != 'No':
                # Find max value of rows
                df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)

        return df

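    # Usage sketch (illustrative):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'OBS': ['2024-01-01', '2024-01-08'], 'tv': [0, 500], 'radio': [250, 0]})
    #   dp.create_dummies(df, date_col='OBS', add_total_dummy_col='Yes')
    #   # -> tv: [0, 1], radio: [1, 0], total: [1, 1]
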
    def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
        """
        Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
        Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.

        Args:
            df (pd.DataFrame): The DataFrame containing the column to modify.
            column (str): The column name where the replacements will be made.
            replacements (dict): A dictionary where keys are substrings to replace and values are the replacement strings.
            to_lower (bool, optional): If True, the column values will be converted to lowercase before applying replacements. Default is False.
            new_column (str, optional): If provided, the replacements will be applied to this new column. If None, the existing column will be modified. Default is None.

        Returns:
            pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
        """
        if new_column is not None:
            # Create a new column for replacements
            df[new_column] = df[column]
            temp_column = new_column
        else:
            # Modify the existing column
            temp_column = column

        # Optionally convert to lowercase
        if to_lower:
            df[temp_column] = df[temp_column].str.lower()

        # Apply substring replacements
        for old, new in replacements.items():
            df[temp_column] = df[temp_column].str.replace(old, new, regex=False)

        return df

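    # Usage sketch (illustrative):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'campaign': ['FB_Brand', 'YT_Performance']})
    #   dp.replace_substrings(df, 'campaign', {'fb': 'facebook', 'yt': 'youtube'},
    #                         to_lower=True, new_column='channel')
    #   # -> channel: ['facebook_brand', 'youtube_performance']
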
    def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
        """
        Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.

        Args:
            df (pd.DataFrame): The DataFrame to modify.
            exclude_col (str, optional): The column name to exclude from the sum. Default is None.
            total_col_name (str, optional): The name of the new total column. Default is 'Total'.

        Returns:
            pd.DataFrame: The DataFrame with an added total column.
        """
        if exclude_col and exclude_col in df.columns:
            # Ensure the column to exclude exists before dropping
            df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
        else:
            # Sum across all columns if no column is specified to exclude
            df[total_col_name] = df.sum(axis=1)

        return df

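    # Usage sketch (illustrative; columns other than the excluded one should be numeric):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'OBS': ['2024-01-01'], 'tv': [100], 'radio': [50]})
    #   dp.add_total_column(df, exclude_col='OBS')
    #   # -> Total: [150]
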
    def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
        """
        Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.

        Args:
            df (pd.DataFrame): The DataFrame containing the column to categorize.
            column_name (str): The name of the column in the DataFrame that contains the text data to categorize.
            category_dict (dict): A dictionary where keys are substrings to search for in the text and values are the categories to assign when a substring is found.
            new_col_name (str, optional): The name of the new column to be created in the DataFrame, which will hold the resulting categories. Default is 'Category'.
            other_label (str, optional): The category assigned when no substring from the dictionary is found in the cell. Default is 'Other'.

        Returns:
            pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
        """

        def categorize_text(text):
            """
            Assigns a category to a single text string based on the presence of substrings from a dictionary.

            Args:
                text (str): The text string to categorize.

            Returns:
                str: The category assigned based on the first matching substring found in the text.
                     If no matching substring is found, returns other_label.
            """
            if not isinstance(text, str):
                return other_label  # Non-string values (e.g. NaN) cannot contain a substring
            for key, category in category_dict.items():
                if key.lower() in text.lower():  # Check if the substring is in the text (case-insensitive)
                    return category
            return other_label  # Default category if no match is found

        # Apply the categorize_text function to each element in the specified column
        df[new_col_name] = df[column_name].apply(categorize_text)
        return df

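    # Usage sketch (illustrative):
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'campaign': ['Facebook_Brand', 'Search_Generic', 'Cinema']})
    #   dp.apply_lookup_table_based_on_substring(
    #       df, 'campaign', {'facebook': 'Social', 'search': 'PPC'})
    #   # -> Category: ['Social', 'PPC', 'Other']
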
    def compare_overlap(self, df1, df2, date_col):
        """
        Compare overlapping periods between two DataFrames and provide a summary of total differences.

        Args:
            df1 (pandas.DataFrame): First DataFrame containing date-based data.
            df2 (pandas.DataFrame): Second DataFrame containing date-based data.
            date_col (str): The name of the date column used for aligning data.

        Returns:
            tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
        """
        # Ensure date columns are in datetime format
        df1[date_col] = pd.to_datetime(df1[date_col])
        df2[date_col] = pd.to_datetime(df2[date_col])

        # Determine the overlap period
        start_date = max(df1[date_col].min(), df2[date_col].min())
        end_date = min(df1[date_col].max(), df2[date_col].max())

        # Filter DataFrames to the overlapping period
        df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
        df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]

        # Merge the DataFrames on the date column
        merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))

        # Get common columns, excluding the date column
        common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]

        # Create a DataFrame for differences
        diff_df = pd.DataFrame({date_col: merged_df[date_col]})

        total_diff_list = []
        for col in common_cols:
            diff_col = f'diff_{col}'
            diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2']  # df1 minus df2

            # Sum differences for the column
            total_diff = diff_df[diff_col].sum()
            total_diff_list.append({'Column': col, 'Total Difference': total_diff})

        # Create summary DataFrame
        total_diff_df = pd.DataFrame(total_diff_list)

        return diff_df, total_diff_df

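    # Usage sketch (illustrative): compare an old and a refreshed pull over their overlap.
    #
    #   dp = dataprocessing()
    #   old = pd.DataFrame({'OBS': ['2024-01-01', '2024-01-08'], 'spend': [100, 200]})
    #   new = pd.DataFrame({'OBS': ['2024-01-08', '2024-01-15'], 'spend': [250, 300]})
    #   diffs, summary = dp.compare_overlap(old, new, 'OBS')
    #   # diffs: diff_spend = -50 for 2024-01-08; summary totals -50 for 'spend'
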
    def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
        """
        Convert a DataFrame's date column so that each date is mapped back
        to the 'week_commencing' day of the *current ISO week*.

        Args:
            df (pandas.DataFrame): The DataFrame with date-based data.
            date_col (str): The name of the date column (must be datetime-like).
            week_commencing (str): The desired start of the week
                ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
                Uses ISO day numbering (Mon=1, ..., Sun=7).

        Returns:
            pandas.DataFrame: Original DataFrame with an extra column
                'week_start_<week_commencing>' containing the
                start-of-week date for each row.
        """
        # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
        iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}

        target_day = iso_day_dict[week_commencing]

        def map_to_week_start(date_val):
            # Step back by the number of days between the date and the target weekday
            delta = (date_val.isoweekday() - target_day) % 7
            return date_val - pd.Timedelta(days=delta)

        # Apply the transformation
        new_col = f"week_start_{week_commencing}"
        df[new_col] = df[date_col].apply(map_to_week_start)

        return df
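    # Usage sketch (illustrative): 2024-01-10 is a Wednesday, so with
    # week_commencing='mon' it maps back to Monday 2024-01-08.
    #
    #   dp = dataprocessing()
    #   df = pd.DataFrame({'OBS': pd.to_datetime(['2024-01-10'])})
    #   dp.week_commencing_2_week_commencing_conversion_isoweekday(df, 'OBS', 'mon')
    #   # -> week_start_mon: [2024-01-08]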