imsciences 0.9.5.9__py3-none-any.whl → 0.9.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2711 +0,0 @@
1
- import pandas as pd
2
- import calendar
3
- import os
4
- import plotly.express as px
5
- import plotly.graph_objs as go
6
- import numpy as np
7
- import datetime
8
- import re
9
- import pandas as pd
10
- from fredapi import Fred
11
- import time
12
- from datetime import datetime,timedelta
13
- from cif import cif
14
- from io import StringIO
15
- import urllib
16
- import requests_cache
17
- import urllib.request
18
- import requests
19
- from geopy.geocoders import Nominatim
20
- import subprocess
21
- import json
22
-
23
- class dataprocessing:
24
-
25
- def help(self):
26
- print("This is the help section. The functions in the package are as follows:")
27
-
28
- print("\n1. get_wd_levels")
29
- print(" - Description: Get the working directory with the option of moving up parents.")
30
- print(" - Usage: get_wd_levels(levels)")
31
- print(" - Example: get_wd_levels(0)")
32
-
33
- print("\n2. remove_rows")
34
- print(" - Description: Removes a specified number of rows from a pandas DataFrame.")
35
- print(" - Usage: remove_rows(data_frame, num_rows_to_remove)")
36
- print(" - Example: remove_rows(df, 2)")
37
-
38
- print("\n3. aggregate_daily_to_wc_long")
39
- print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
40
- print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
41
- print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")
42
-
43
- print("\n4. convert_monthly_to_daily")
44
- print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
45
- print(" - Usage: convert_monthly_to_daily(df, date_column, divide)")
46
- print(" - Example: convert_monthly_to_daily(df, 'date')")
47
-
48
- print("\n5. plot_two")
49
- print(" - Description: Plots specified columns from two different DataFrames using a shared date column. Useful for comparing data.")
50
- print(" - Usage: plot_two(df1, col1, df2, col2, date_column, same_axis=True)")
51
- print(" - Example: plot_two(df1, 'cost', df2, 'cost', 'obs', True)")
52
-
53
- print("\n6. remove_nan_rows")
54
- print(" - Description: Removes rows from a DataFrame where the specified column has NaN values.")
55
- print(" - Usage: remove_nan_rows(df, col_to_remove_rows)")
56
- print(" - Example: remove_nan_rows(df, 'date')")
57
-
58
- print("\n7. filter_rows")
59
- print(" - Description: Filters the DataFrame based on whether the values in a specified column are in a provided list.")
60
- print(" - Usage: filter_rows(df, col_to_filter, list_of_filters)")
61
- print(" - Example: filter_rows(df, 'country', ['UK', 'IE'])")
62
-
63
- print("\n8. plot_one")
64
- print(" - Description: Plots a specified column from a DataFrame.")
65
- print(" - Usage: plot_one(df1, col1, date_column)")
66
- print(" - Example: plot_one(df, 'Spend', 'OBS')")
67
-
68
- print("\n9. week_of_year_mapping")
69
- print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
70
- print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
71
- print(" - Example: week_of_year_mapping(df, 'week', 'mon')")
72
-
73
- print("\n10. exclude_rows")
74
- print(" - Description: Removes rows from a DataFrame based on whether the values in a specified column are not in a provided list.")
75
- print(" - Usage: exclude_rows(df, col_to_filter, list_of_filters)")
76
- print(" - Example: exclude_rows(df, 'week', ['2022-W20', '2022-W21'])")
77
-
78
- print("\n11. rename_cols")
79
- print(" - Description: Renames columns in a pandas DataFrame.")
80
- print(" - Usage: rename_cols(df, name)")
81
- print(" - Example: rename_cols(df, 'ame_facebook'")
82
-
83
- print("\n12. merge_new_and_old")
84
- print(" - Description: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.")
85
- print(" - Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.")
86
- print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
87
- print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")
88
-
89
- print("\n13. merge_dataframes_on_date")
90
- print(" - Description: Merge a list of DataFrames on a common column.")
91
- print(" - Usage: merge_dataframes_on_date(dataframes, common_column='OBS', merge_how='outer')")
92
- print(" - Example: merge_dataframes_on_date([df1, df2, df3], common_column='OBS', merge_how='outer')")
93
-
94
- print("\n14. merge_and_update_dfs")
95
- print(" - Description: Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available, and returns a dataframe sorted by the key column.")
96
- print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
97
- print(" - Example: merged_dataframe = merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")
98
-
99
- print("\n15. convert_us_to_uk_dates")
100
- print(" - Description: Convert a DataFrame column with mixed date formats to datetime.")
101
- print(" - Usage: convert_us_to_uk_dates(df, date_col)")
102
- print(" - Example: convert_us_to_uk_dates(df, 'date')")
103
-
104
- print("\n16. combine_sheets")
105
- print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
106
- print(" - Usage: combine_sheets(all_sheets)")
107
- print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
108
-
109
- print("\n17. pivot_table")
110
- print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
111
- print(" - Usage: pivot_table(df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
112
- print(" - Example: pivot_table(df, {'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, 'OBS', 'Channel Short Names', 'Value', fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
113
-
114
- print("\n18. apply_lookup_table_for_columns")
115
- print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
116
- print(" - Usage: classify_within_column(df, col_names, to_find_dict, if_not_in_country_dict='Other'), new_column_name='Mapping'")
117
- print(" - Example: classify_within_column(df, ['campaign type','media type'], {'France Paid Social FB|paid social': 'facebook','France Paid Social TW|paid social': 'twitter'}, 'other','mapping')")
118
-
119
- print("\n19. aggregate_daily_to_wc_wide")
120
- print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
121
- print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)")
122
- print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)")
123
-
124
- print("\n20. merge_cols_with_seperator")
125
- print(" - Description: Merge multiple columns in a dataframe into 1 column with a seperator '_'.Can be used if multiple columns are needed for a LUT.")
126
- print(" - Usage: merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = 'Merged',starting_prefix_str=None,ending_prefix_str=None)")
127
- print(" - Example: merge_cols_with_seperator(df, ['Campaign','Product'],seperator='|','Merged Columns',starting_prefix_str='start_',ending_prefix_str='_end')")
128
-
129
- print("\n21. check_sum_of_df_cols_are_equal")
130
- print(" - Description: Checks if the sum of two columns in two dataframes are the same, and provides the sums of each column and the difference between them.")
131
- print(" - Usage: check_sum_of_df_cols_are_equal(df_1,df_2,cols_1,cols_2)")
132
- print(" - Example: check_sum_of_df_cols_are_equal(df_1,df_2,'Media Cost','Spend')")
133
-
134
- print("\n22. convert_2_df_cols_to_dict")
135
- print(" - Description: Can be used to create an LUT. Creates a dictionary using two columns in a dataframe.")
136
- print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
137
- print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
138
-
139
- print("\n23. create_FY_and_H_columns")
140
- print(" - Description: Used to create a financial year, half year, and financial half year column.")
141
- print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY,short_format='No',half_years='No',combined_FY_and_H='No')")
142
- print(" - Example: create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023',short_format='Yes',half_years='Yes',combined_FY_and_H='Yes')")
143
-
144
- print("\n24. keyword_lookup_replacement")
145
- print(" - Description: Essentially provides an if statement with a xlookup if a value is something. Updates certain chosen values in a specified column of the DataFrame based on a lookup dictionary.")
146
- print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name='Updated Column')")
147
- print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel','segment','product'], qlik_dict_for_channel,output_column_name='Channel New')")
148
-
149
- print("\n25. create_new_version_of_col_using_LUT")
150
- print(" - Description: Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table. The lookup is based on a column in the dataframe.")
151
- print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col,value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
152
- print(" - Example: keyword_lookup_replacement(df, '*Campaign Name','Campaign Type',search_campaign_name_retag_lut,'Campaign Name New')")
153
-
154
- print("\n26. convert_df_wide_2_long")
155
- print(" - Description: Changes a dataframe from wide to long format.")
156
- print(" - Usage: convert_df_wide_2_long(df,value_cols,variable_col_name='Stacked',value_col_name='Value')")
157
- print(" - Example: keyword_lookup_replacement(df, ['Media Cost','Impressions','Clicks'],variable_col_name='Metric')")
158
-
159
- print("\n27. manually_edit_data")
160
- print(" - Description: Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe.")
161
- print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
162
- print(" - Example: keyword_lookup_replacement(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''},'Master Include',1,change_in_existing_df_col = 'Yes',new_col_to_change_name = 'Master Include',manual_edit_col_name = 'Manual Changes')")
163
-
164
- print("\n28. format_numbers_with_commas")
165
- print(" - Description: Converts data in numerical format into numbers with commas and a chosen decimal place length.")
166
- print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
167
- print(" - Example: format_numbers_with_commas(df,1)")
168
-
169
- print("\n29. filter_df_on_multiple_conditions")
170
- print(" - Description: Filters dataframe on multiple conditions, which come in the form of a dictionary.")
171
- print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
172
- print(" - Example: filter_df_on_multiple_conditions(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''})")
173
-
174
- print("\n30. read_and_concatenate_files")
175
- print(" - Description: Read and Concatinate all files of one type in a folder.")
176
- print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
177
- print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
178
-
179
- print("\n31. remove zero values")
180
- print(" - Description: Remove zero values in a specified column.")
181
- print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
182
- print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
183
-
184
- print("\n32. upgrade all packages")
185
- print(" - Description: Upgrades all packages.")
186
- print(" - Usage: upgrade_outdated_packages()")
187
- print(" - Example: upgrade_outdated_packages()")
188
-
189
- print("\n33. Convert Mixed Formats Dates")
190
- print(" - Description: Convert a mix of US and UK dates to datetime.")
191
- print(" - Usage: convert_mixed_formats_dates(df, datecol)")
192
- print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
193
-
194
- print("\n34. Fill Weekly Missing Dates")
195
- print(" - Description: Fill in any missing weeks with 0.")
196
- print(" - Usage: fill_weekly_date_range(self, df, date_column, freq)")
197
- print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
198
-
199
- def get_wd_levels(self, levels):
200
- """
201
- Gets the current working directory and gives the option of moving up a specified number of parent levels.
202
-
203
- Parameters:
204
- - levels: int
205
- The number of parent directory levels to move up.
206
- A value of 0 returns the current working directory;
207
- each additional level moves one directory further up.
208
-
209
- Returns:
210
- - str: The resulting directory path.
211
- """
212
-
213
- directory = os.getcwd()
214
- for _ in range(levels):
215
- directory = os.path.dirname(directory)
216
- return directory
217
-
218
- def remove_rows(self, data_frame, num_rows_to_remove):
219
- """
220
- Removes the specified number of rows from the given data frame, including the top row containing column names.
221
- The next row will be treated as the new set of column headings.
222
-
223
- Parameters:
224
- - data_frame: pandas DataFrame
225
- The input data frame.
226
- - num_rows_to_remove: int
227
- The number of rows to remove from the data frame, starting from the original header.
228
-
229
- Returns:
230
- - pandas DataFrame
231
- The modified data frame with rows removed and new column headings.
232
-
233
- Raises:
234
- - TypeError: If num_rows_to_remove is not an integer.
235
- - ValueError: If num_rows_to_remove is negative or exceeds the total number of rows.
236
- """
237
-
238
- if not isinstance(num_rows_to_remove, int):
239
- raise TypeError("num_rows_to_remove must be an integer")
240
-
241
- if num_rows_to_remove < 0 or num_rows_to_remove >= len(data_frame):
242
- raise ValueError("Number of rows to remove must be non-negative and less than the total number of rows in the data frame.")
243
-
244
- if num_rows_to_remove == 0:
245
- return data_frame
246
-
247
- new_header = data_frame.iloc[num_rows_to_remove - 1]
248
- modified_data_frame = data_frame[num_rows_to_remove:]
249
- modified_data_frame.columns = new_header
250
-
251
- return modified_data_frame
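A minimal usage sketch for remove_rows (illustrative only; assumes the dataprocessing class above is in scope, with a made-up export whose real headers sit two rows down):

import pandas as pd

dp = dataprocessing()
raw = pd.DataFrame([["report", ""], ["date", "spend"], ["2024-01-01", 100]])
clean = dp.remove_rows(raw, 2)  # the row just above the remaining data becomes the new header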
252
-
253
- def aggregate_daily_to_wc_long(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum') -> pd.DataFrame:
254
- """
255
- Aggregates daily data into weekly data, starting on a specified day of the week,
256
- and groups the data by additional specified columns. It aggregates specified numeric columns
257
- by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
258
- of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
259
- The week-commencing date column is output as 'OBS'.
260
-
261
- Parameters:
262
- - df: pandas DataFrame
263
- The input DataFrame containing daily data.
264
- - date_column: string
265
- The name of the column in the DataFrame that contains date information.
266
- - group_columns: list of strings
267
- Additional column names to group by along with the weekly grouping.
268
- - sum_columns: list of strings
269
- Numeric column names to be aggregated during aggregation.
270
- - wc: string
271
- The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
272
- - aggregation: string, optional (default 'sum')
273
- Aggregation method, either 'sum', 'average', or 'count'.
274
-
275
- Returns:
276
- - pandas DataFrame
277
- A new DataFrame with weekly aggregated data. The index is reset,
278
- and columns represent the grouped and aggregated metrics. The DataFrame
279
- is in long format, with separate columns for each combination of
280
- grouped metrics.
281
- """
282
-
283
- # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
284
- days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
285
- if wc.lower() not in days:
286
- return print(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")
287
-
288
- start_day = days[wc.lower()]
289
-
290
- # Make a copy of the DataFrame
291
- df_copy = df.copy()
292
-
293
- # Convert the date column to datetime
294
- df_copy[date_column] = pd.to_datetime(df_copy[date_column])
295
-
296
- # Determine the start of each week
297
- df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))
298
-
299
- # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
300
- for col in sum_columns:
301
- df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
302
-
303
- # Group by the new week start column and additional columns, then aggregate the numeric columns
304
- if aggregation == 'average':
305
- grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
306
- elif aggregation == 'count':
307
- grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
308
- else: # Default to 'sum' if any other value is provided
309
- grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()
310
-
311
- # Rename 'week_start' column to 'OBS'
312
- grouped = grouped.rename(columns={'week_start': 'OBS'})
313
-
314
- return grouped
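A sketch of how aggregate_daily_to_wc_long might be called, using made-up daily spend data and the class above instantiated as dp:

import pandas as pd

dp = dataprocessing()
daily = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-02", "2024-01-08"],
    "platform": ["facebook", "facebook", "tiktok"],
    "cost": [10.0, 5.0, 7.0],
})
weekly = dp.aggregate_daily_to_wc_long(daily, "date", ["platform"], ["cost"], wc="mon")
# One row per (week commencing, platform): Mon 1 Jan facebook 15.0, Mon 8 Jan tiktok 7.0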
315
-
316
- def convert_monthly_to_daily(self, df, date_column, divide = True):
317
- """
318
- Convert a DataFrame with monthly data to daily data.
319
- This function takes a DataFrame and a date column, then it expands each
320
- monthly record into daily records by dividing the numeric values by the number of days in that month.
321
-
322
- :param df: DataFrame with monthly data.
323
- :param date_column: The name of the column containing the date.
324
- :param divide: boolean divide by the number of days in a month (default True)
325
- :return: A new DataFrame with daily data.
326
- """
327
-
328
- # Convert date_column to datetime
329
- df[date_column] = pd.to_datetime(df[date_column])
330
-
331
- # Initialize an empty list to hold the daily records
332
- daily_records = []
333
-
334
- # Iterate over each row in the DataFrame
335
- for _, row in df.iterrows():
336
- # Calculate the number of days in the month
337
- num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]
338
-
339
- # Create a new record for each day of the month
340
- for day in range(1, num_days + 1):
341
- daily_row = row.copy()
342
- daily_row[date_column] = row[date_column].replace(day=day)
343
-
344
- # Divide each numeric value by the number of days in the month
345
- for col in df.columns:
346
- if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
347
- if divide == True:
348
- daily_row[col] = row[col] / num_days
349
- else:
350
- daily_row[col] = row[col]
351
- daily_records.append(daily_row)
352
-
353
- # Convert the list of daily records into a DataFrame
354
- daily_df = pd.DataFrame(daily_records)
355
-
356
- return daily_df
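As an illustration (hypothetical figures), a single monthly row expands to one row per calendar day, with values divided by the number of days unless divide=False:

import pandas as pd

dp = dataprocessing()
monthly = pd.DataFrame({"date": ["2024-01-01"], "spend": [3100.0]})
daily = dp.convert_monthly_to_daily(monthly, "date")
# 31 rows for January 2024, each with spend 3100 / 31 = 100.0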
357
-
358
- def plot_two(self, df1, col1, df2, col2, date_column, same_axis=True):
359
- """
360
- Plots specified columns from two different dataframes with both different and the same lengths,
361
- using a specified date column as the X-axis, and charting on either the same or separate y axes.
362
-
363
- :param df1: First DataFrame
364
- :param col1: Column name from the first DataFrame
365
- :param df2: Second DataFrame
366
- :param col2: Column name from the second DataFrame
367
- :param date_column: The name of the date column to use for the X-axis
368
- :param same_axis: If True, plot both traces on the same y-axis; otherwise, use separate y-axes.
369
- :return: Plotly figure
370
- """
371
- # Ensure date columns are datetime
372
- df1[date_column] = pd.to_datetime(df1[date_column])
373
- df2[date_column] = pd.to_datetime(df2[date_column])
374
-
375
- # Create traces for the first and second dataframes
376
- trace1 = go.Scatter(x=df1[date_column], y=df1[col1], mode='lines', name=col1, yaxis='y1')
377
-
378
- if same_axis:
379
- trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y1')
380
- else:
381
- trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y2')
382
-
383
- # Define layout for the plot
384
- layout = go.Layout(
385
- title="",
386
- xaxis=dict(title="OBS", showline=True, linecolor='black'),
387
- yaxis=dict(title="", showline=True, linecolor='black', rangemode='tozero'),
388
- yaxis2=dict(title="", overlaying='y', side='right', showline=True, linecolor='black', rangemode='tozero'),
389
- showlegend=True,
390
- plot_bgcolor='white' # Set the plot background color to white
391
- )
392
-
393
- # Create the figure with the defined layout and traces
394
- fig = go.Figure(data=[trace1, trace2], layout=layout)
395
-
396
- return fig
397
-
398
- def remove_nan_rows(self, df, col_to_remove_rows):
399
- # This line drops rows where the specified column has NaN values
400
- return df.dropna(subset=[col_to_remove_rows])
401
-
402
- def filter_rows(self, df, col_to_filter, list_of_filters):
403
- # This line filters the DataFrame based on whether the values in the specified column are in the list_of_filters
404
- return df[df[col_to_filter].isin(list_of_filters)]
405
-
406
- def plot_one(self, df1, col1, date_column):
407
- """
408
- Plots specified column from a DataFrame with white background and black axes,
409
- using a specified date column as the X-axis.
410
-
411
- :param df1: DataFrame
412
- :param col1: Column name from the DataFrame
413
- :param date_column: The name of the date column to use for the X-axis
414
- """
415
-
416
- # Check if columns exist in the DataFrame
417
- if col1 not in df1.columns or date_column not in df1.columns:
418
- raise ValueError("Column not found in DataFrame")
419
-
420
- # Check if the date column is in datetime format, if not convert it
421
- if not pd.api.types.is_datetime64_any_dtype(df1[date_column]):
422
- df1[date_column] = pd.to_datetime(df1[date_column])
423
-
424
- # Plotting using Plotly Express
425
- fig = px.line(df1, x=date_column, y=col1)
426
-
427
- # Update layout for white background and black axes lines, and setting y-axis to start at 0
428
- fig.update_layout(
429
- plot_bgcolor='white',
430
- xaxis=dict(
431
- showline=True,
432
- linecolor='black'
433
- ),
434
- yaxis=dict(
435
- showline=True,
436
- linecolor='black',
437
- rangemode='tozero' # Setting Y-axis to start at 0 if suitable
438
- )
439
- )
440
-
441
- return fig
442
-
443
- def week_of_year_mapping(self,df, week_col, start_day_str):
444
-
445
- # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
446
- day_mapping = {
447
- 'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
448
- }
449
-
450
- # Convert the day string to a number, or raise an error if not valid
451
- start_day = day_mapping.get(start_day_str.lower())
452
- if start_day is None:
453
- raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")
454
-
455
- # Function to convert week number to start date of the week
456
- def week_to_startdate(week_str, start_day):
457
- year, week = map(int, week_str.split('-W'))
458
- first_day_of_year = datetime(year, 1, 1)
459
- first_weekday_of_year = first_day_of_year.weekday() # Monday is 0 and Sunday is 6
460
-
461
- # Calculate days to adjust to the desired start day of the week
462
- days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
463
- start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)
464
-
465
- # Calculate the start of the desired week
466
- start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
467
- return start_of_week
468
-
469
- # Apply the function to each row in the specified week column
470
- df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
471
- return df
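A small sketch (made-up data) showing the week column being mapped to a week-commencing 'OBS' column:

import pandas as pd

dp = dataprocessing()
weeks = pd.DataFrame({"week": ["2023-W01", "2023-W02"], "sales": [10, 12]})
mapped = dp.week_of_year_mapping(weeks, "week", "mon")
# Adds 'OBS' with the Monday commencing each week, formatted dd/mm/YYYY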
472
-
473
- def exclude_rows(self, df, col_to_filter, list_of_filters):
474
- # This line filters the DataFrame based on whether the values in the specified column are not in the list_of_filters
475
- return df[~df[col_to_filter].isin(list_of_filters)]
476
-
477
- def rename_cols(self, df, name = 'ame_'):
478
- new_columns = {}
479
- for col in df.columns:
480
- if col != 'OBS':
481
- new_col_name = name + col.replace(" ", "_").lower()
482
- else:
483
- new_col_name = col
484
- new_columns[col] = new_col_name
485
- return df.rename(columns=new_columns)
486
-
487
- def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
488
- """
489
- Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
490
- Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
491
-
492
- Parameters:
493
- - old_df: pandas DataFrame
494
- The old DataFrame from which to take the numeric values up to the specified date.
495
- - old_col: str
496
- The name of the numeric column in the old DataFrame whose values are to be taken.
497
- - new_df: pandas DataFrame
498
- The new DataFrame from which to take the numeric values from the specified date onwards.
499
- - new_col: str
500
- The name of the numeric column in the new DataFrame whose values are to be taken.
501
- - cutoff_date: str
502
- The cut-off date in 'YYYY-MM-DD' format to split the data between the two DataFrames.
503
- - date_col_name: str, optional (default 'OBS')
504
- The name of the date column in both DataFrames.
505
-
506
- Returns:
507
- - pandas DataFrame
508
- A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
509
- """
510
-
511
- # Convert date columns in both dataframes to datetime for comparison
512
- old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
513
- new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
514
-
515
- # Convert the cutoff date string to datetime
516
- cutoff_date = pd.to_datetime(cutoff_date)
517
-
518
- # Split old and new dataframes based on the cutoff date
519
- old_values = old_df[old_df[date_col_name] <= cutoff_date]
520
- new_values = new_df[new_df[date_col_name] > cutoff_date]
521
-
522
- # Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
523
- merged_df = pd.DataFrame({
524
- 'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
525
- new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
526
- })
527
-
528
- return merged_df
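A sketch of splicing an old and a new series at a cutoff date (illustrative column names):

import pandas as pd

dp = dataprocessing()
old = pd.DataFrame({"OBS": ["2023-01-01", "2023-01-08"], "old_cost": [5.0, 6.0]})
new = pd.DataFrame({"OBS": ["2023-01-08", "2023-01-15"], "new_cost": [7.0, 8.0]})
spliced = dp.merge_new_and_old(old, "old_cost", new, "new_cost", "2023-01-08")
# old_cost is kept up to and including 2023-01-08; new_cost is used after the cutoff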
529
-
530
- def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
531
- """
532
- Merge a list of DataFrames on a common column.
533
-
534
- Parameters:
535
- - dataframes: A list of DataFrames to merge.
536
- - common_column: The name of the common column to merge on.
537
- - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').
538
-
539
- Returns:
540
- - A merged DataFrame.
541
- """
542
- if not dataframes:
543
- return None
544
-
545
- merged_df = dataframes[0] # Start with the first DataFrame
546
-
547
- for df in dataframes[1:]:
548
- merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)
549
-
550
- # Check if the common column is of datetime dtype
551
- if merged_df[common_column].dtype == 'datetime64[ns]':
552
- merged_df[common_column] = pd.to_datetime(merged_df[common_column])
553
- merged_df = merged_df.sort_values(by=common_column)
554
- merged_df = merged_df.fillna(0)
555
-
556
- return merged_df
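For example (hypothetical weekly spend tables), a datetime OBS column triggers the sort-and-fill branch:

import pandas as pd

dp = dataprocessing()
fb = pd.DataFrame({"OBS": pd.to_datetime(["2023-01-01", "2023-01-08"]), "fb_cost": [1.0, 2.0]})
tv = pd.DataFrame({"OBS": pd.to_datetime(["2023-01-08", "2023-01-15"]), "tv_cost": [3.0, 4.0]})
combined = dp.merge_dataframes_on_column([fb, tv])
# Outer join on OBS; because OBS is datetime the result is sorted and missing values become 0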
557
-
558
- def merge_and_update_dfs(self, df1, df2, key_column):
559
- """
560
- Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
561
- and returns a dataframe sorted by the key column.
562
-
563
- Parameters:
564
- df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
565
- df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
566
- key_column (str): The name of the column to merge and sort by (e.g., 'OBS').
567
-
568
- Returns:
569
- DataFrame: The merged and updated dataframe.
570
- """
571
-
572
- # Sort both DataFrames by the key column
573
- df1_sorted = df1.sort_values(by=key_column)
574
- df2_sorted = df2.sort_values(by=key_column)
575
-
576
- # Perform the full outer merge
577
- merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))
578
-
579
- # Update with non-null values from df2
580
- for column in merged_df.columns:
581
- if column.endswith('_finalised'):
582
- original_column = column.replace('_finalised', '')
583
- merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
584
- merged_df.drop(column, axis=1, inplace=True)
585
-
586
- # Sort the merged DataFrame by the key column
587
- merged_df.sort_values(by=key_column, inplace=True)
588
-
589
- # Handle null values (optional, can be adjusted as needed)
590
- merged_df.fillna(0, inplace=True)
591
-
592
- return merged_df
593
-
594
- def convert_us_to_uk_dates(self, df, date_col):
595
- """
596
- Processes the date column of a DataFrame to remove hyphens and slashes,
597
- and converts it to a datetime object.
598
-
599
- Parameters:
600
- df (pd.DataFrame): The DataFrame containing the date column.
601
- date_col (str): The name of the date column.
602
-
603
- Returns:
604
- pd.DataFrame: The DataFrame with the processed date column.
605
- """
606
- df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
607
- df[date_col] = pd.to_datetime(
608
- df[date_col].str.slice(0, 2) + '/' +
609
- df[date_col].str.slice(2, 4) + '/' +
610
- df[date_col].str.slice(4, 8),
611
- format='%m/%d/%Y'
612
- )
613
- return df
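A brief sketch with mixed separators (made-up dates); the digits are re-read as mm/dd/yyyy:

import pandas as pd

dp = dataprocessing()
us = pd.DataFrame({"date": ["01/31/2023", "02-28-2023"]})
uk = dp.convert_us_to_uk_dates(us, "date")
# Both rows become proper datetimes: 2023-01-31 and 2023-02-28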
614
-
615
- def combine_sheets(self, all_sheets):
616
- """
617
- Combines multiple DataFrames from a dictionary into a single DataFrame.
618
- Adds a column 'SheetName' indicating the origin sheet of each row.
619
-
620
- Parameters:
621
- all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.
622
-
623
- Returns:
624
- DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
625
- """
626
- combined_df = pd.DataFrame()
627
-
628
- for sheet_name, df in all_sheets.items():
629
- df['SheetName'] = sheet_name
630
- combined_df = pd.concat([combined_df, df], ignore_index=True)
631
-
632
- return combined_df
633
-
634
- def pivot_table(self, df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name="Total",datetime_trans_needed=True):
635
- """
636
- Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
637
-
638
- Args:
639
- df (pandas.DataFrame): The DataFrame containing the data.
640
- filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
641
- index_col (str): Name of Column for your pivot table to index on
642
- columns (str): Name of Columns for your pivot table.
643
- values_col (str): Name of Values Columns for your pivot table.
644
- fill_value (int, optional): The value to replace nan with. Defaults to 0.
645
- aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
646
- margins (bool, optional): Whether the pivot table needs a totals row and column. Defaults to False.
647
- margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
648
- datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to True.
649
-
650
- Returns:
651
- pandas.DataFrame: The pivot table specified
652
- """
653
-
654
- # Create the filtered df by applying the conditions
655
- df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
656
-
657
- # Ensure OBS is in datetime format for proper sorting
658
- df_filtered = df_filtered.copy()
659
-
660
- # If datetime transformation is needed
661
- if datetime_trans_needed is True:
662
- df_filtered.loc[:,index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
663
-
664
- # Create the pivot table
665
- pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc,margins=margins,margins_name=margins_name)
666
-
667
- # Handling MultiIndex columns if present, making them a flat structure
668
- if isinstance(pivoted_df.columns, pd.MultiIndex):
669
- pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
670
- else:
671
- pivoted_df.columns = pivoted_df.columns.map(str)
672
-
673
- # Reset the pivot before returning
674
- pivoted_df = pivoted_df.reset_index()
675
-
676
- # Sort by OBS from oldest to newest
677
- if datetime_trans_needed is True:
678
- # pivoted_df = pivoted_df.reset_index()
679
- pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
680
- pivoted_df = pivoted_df.sort_values(by=index_col)
681
-
682
- # Convert OBS back to a string in YYYY-MM-DD format for display purposes
683
- pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
684
-
685
- # Set index back to date column
686
- # pivoted_df.set_index(index_col,inplace=True)
687
-
688
- # Fill in any NaNs
689
- pivoted_df = pivoted_df.fillna(fill_value)
690
-
691
- return pivoted_df
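A small sketch (invented channel data) of filtering then pivoting into one column per channel:

import pandas as pd

dp = dataprocessing()
long_df = pd.DataFrame({
    "OBS": ["01/01/2023", "01/01/2023", "08/01/2023"],
    "Channel": ["TV", "Search", "TV"],
    "Value": [100, 50, 80],
    "Include": [1, 1, 1],
})
wide = dp.pivot_table(long_df, {"Include": " == 1"}, "OBS", "Channel", "Value")
# Rows with Include == 1 are pivoted: one 'Search' and one 'TV' column, indexed by OBS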
692
-
693
- def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
694
- """
695
- Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
696
-
697
- Parameters:
698
- df (pandas.DataFrame): The DataFrame containing the data.
699
- col_names (list of str): the columns used for the lookup, given as a list. One or several columns can be supplied; if several are supplied, the keys of to_find_dict must be the column values joined with a '|' separator, e.g. col1|col2|col3.
700
- to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
701
- if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
702
- new_column_name (str, optional): name of new column. Defaults to "Mapping".
703
-
704
- Returns:
705
- pandas.DataFrame: DataFrame with a new column containing the look up table results.
706
- """
707
-
708
- # Create regex pattern from the dictionary keys
709
- regex_pattern = "|".join(re.escape(key) for key in to_find_dict.keys())
710
-
711
- # Preprocess DataFrame if multiple columns
712
- if len(col_names) > 1:
713
- df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
714
- col_to_use = "Merged"
715
- else:
716
- col_to_use = col_names[0]
717
-
718
- # Extract the first match using the regex pattern
719
- matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
720
-
721
- # Map the matches to the corresponding values in the dictionary
722
- df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
723
-
724
- # Drop intermediate column if created
725
- if len(col_names) > 1:
726
- df.drop(columns=["Merged"], inplace=True)
727
-
728
- return df
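An illustrative lookup over two columns (hypothetical campaign data); keys that span both columns are joined with '|':

import pandas as pd

dp = dataprocessing()
df = pd.DataFrame({"campaign": ["FR Paid Social FB", "FR Paid Search Brand"],
                   "media type": ["paid social", "paid search"]})
lut = {"Paid Social FB|paid social": "facebook", "Paid Search": "google"}
mapped = dp.apply_lookup_table_for_columns(df, ["campaign", "media type"], lut)
# 'Mapping' becomes 'facebook' and 'google'; unmatched rows would fall back to 'Other'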
729
-
730
- def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
731
- """
732
- Aggregates daily data into weekly data, starting on a specified day of the week,
733
- and groups the data by additional specified columns. It aggregates specified numeric columns
734
- by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
735
- of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
736
- The week-commencing date column is output as 'OBS'.
737
-
738
- Parameters:
739
- - df: pandas DataFrame
740
- The input DataFrame containing daily data.
741
- - date_column: string
742
- The name of the column in the DataFrame that contains date information.
743
- - group_columns: list of strings
744
- Additional column names to group by along with the weekly grouping.
745
- - sum_columns: list of strings
746
- Numeric column names to be aggregated during aggregation.
747
- - wc: string
748
- The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
749
- - aggregation: string, optional (default 'sum')
750
- Aggregation method, either 'sum', 'average', or 'count'.
751
- - include_totals: boolean, optional (default False)
752
- If True, include total columns for each sum_column.
753
-
754
- Returns:
755
- - pandas DataFrame
756
- A new DataFrame with weekly aggregated data. The index is reset,
757
- and columns represent the grouped and aggregated metrics. The DataFrame
758
- is in wide format, with separate columns for each combination of
759
- grouped metrics.
760
- """
761
-
762
- grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
763
-
764
- # Pivot the data to wide format
765
- if group_columns:
766
- wide_df = grouped.pivot_table(index='OBS',
767
- columns=group_columns,
768
- values=sum_columns,
769
- aggfunc='first')
770
- # Flatten the multi-level column index and create combined column names
771
- wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
772
- else:
773
- wide_df = grouped.set_index('OBS')
774
-
775
- # Fill NaN values with 0
776
- wide_df = wide_df.fillna(0)
777
-
778
- # Adding total columns for each unique sum_column, if include_totals is True
779
- if include_totals:
780
- for col in sum_columns:
781
- total_column_name = f'Total {col}'
782
- if group_columns:
783
- columns_to_sum = [column for column in wide_df.columns if col in column]
784
- else:
785
- columns_to_sum = [col]
786
- wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
787
-
788
- # Reset the index of the final DataFrame
789
- wide_df = wide_df.reset_index()
790
-
791
- return wide_df
792
-
793
- def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
794
- """
795
- Creates a new column in the dataframe that merges two or more columns together with a separator (default "_"), e.g. to build a key for a lookup table that spans multiple columns
796
-
797
- Parameters:
798
- df (pandas.DataFrame): Dataframe to make changes to.
799
- col_names (list): list of column names to merge.
800
- seperator (str, optional): Separator string placed between the merged values. Defaults to "_".
801
- output_column_name (str, optional): Name of column outputted. Defaults to "Merged".
802
- starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
803
- ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
804
-
805
- Raises:
806
- ValueError: if fewer than two column names are provided there is nothing to merge on
807
-
808
- Returns:
809
- pandas.DataFrame: DataFrame with additional merged column
810
- """
811
- # Specify more than one column must be entered
812
- if len(col_names) < 2:
813
- raise ValueError("2 or more columns must be specified to merge")
814
-
815
- # Create a new column with the merged columns
816
- df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
817
-
818
- # Add string before
819
- if starting_prefix_str is not None:
820
- df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
821
-
822
- # Add string after
823
- if ending_prefix_str is not None:
824
- df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
825
-
826
- return df
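A quick sketch of building a multi-column lookup key (invented column names):

import pandas as pd

dp = dataprocessing()
df = pd.DataFrame({"Campaign": ["Summer"], "Product": ["Shoes"]})
df = dp.merge_cols_with_seperator(df, ["Campaign", "Product"], seperator="|",
                                  output_column_name="Lookup Key")
# Adds a 'Lookup Key' column containing 'Summer|Shoes'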
827
-
828
- def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
829
- """
830
- Checks the sum of two different dataframe column or columns are equal
831
-
832
- Parameters:
833
- df_1 (pandas.DataFrame): First dataframe containing columns to be summed.
834
- df_2 (pandas.DataFrame): Second dataframe containing columns to be summed.
835
- cols_1 (list of str): Columns from first dataframe to sum.
836
- cols_2 (list of str): Columns from second dataframe to sum.
837
-
838
- Returns:
839
- Tuple: Answer is a message stating whether the sums are equal (and the difference if not), df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
840
- """
841
- # Find the sum of both sets of columns
842
- df_1_sum = df_1[cols_1].sum().sum()
843
- df_2_sum = df_2[cols_2].sum().sum()
844
-
845
- # Compare the two sums and report any difference
846
- if df_1_sum == df_2_sum:
847
- Answer = "They are equal"
848
- if df_1_sum != df_2_sum:
849
- Answer = "They are different by " + str(df_2_sum-df_1_sum)
850
-
851
- return Answer,df_1_sum,df_2_sum
852
-
853
- def convert_2_df_cols_to_dict(self, df, key_col, value_col):
854
- """
855
- Create a dictionary mapping from two columns of a DataFrame.
856
-
857
- Parameters:
858
- df (pd.DataFrame): The DataFrame containing the data.
859
- key_col (str): The column name to use as keys in the dictionary.
860
- value_col (str): The column name to use as values in the dictionary.
861
-
862
- Returns:
863
- dict: A dictionary with keys from 'key_col' and values from 'value_col'.
864
- """
865
- if key_col not in df or value_col not in df:
866
- raise ValueError("Specified columns are not in the DataFrame")
867
-
868
- return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
869
-
870
- def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
871
- """
872
- Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
873
-
874
- Parameters:
875
- df (pandas.DataFrame): Dataframe to operate on.
876
- index_col (str): Name of the column to use for datetime
877
- start_date (str): String used to specify the start date of the FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-29
878
- starting_FY (str): String used to specify which FY the start date refers to, needs to be formatted LONG e.g. FY2021
879
- short_format (str, optional): String used to specify if short format is desired (e.g. FY21) or if long format is desired (e.g. FY2021). Defaults to "No".
880
- half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
881
- combined_FY_and_H (str, optional): String used to specify if a combined half year and FY column is desired. Defaults to "No".
882
-
883
- Returns:
884
- pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
885
- """
886
-
887
- try:
888
- start_date = datetime.strptime(start_date, '%Y-%m-%d')
889
- except ValueError:
890
- print("Error: Date must be of format yyyy-mm-dd")
891
- return df
892
-
893
- df["OBS"] = pd.to_datetime(df[index_col])
894
- df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
895
-
896
- df[index_col] = pd.to_datetime(df[index_col])
897
-
898
- start_year = int(starting_FY[2:])
899
-
900
- def calculate_FY_vectorized(date_series):
901
- years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
902
- fy = 'FY' + (start_year + years_since_start).astype(str)
903
- if short_format == "Yes":
904
- fy = 'FY' + fy.str[-2:]
905
- return fy
906
-
907
- df['FY'] = calculate_FY_vectorized(df[index_col])
908
-
909
- if half_years == "Yes" or combined_FY_and_H == "Yes":
910
- def calculate_half_year_vectorized(date_series):
911
- fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
912
- fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
913
- fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
914
- half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
915
- return half_year
916
-
917
- df['Half Years'] = calculate_half_year_vectorized(df[index_col])
918
-
919
- if combined_FY_and_H == "Yes":
920
- df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
921
-
922
- return df
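A minimal sketch (assumed FY start of 3 Oct 2022) showing the basic FY column; the half-year columns follow the same pattern via half_years='Yes':

import pandas as pd

dp = dataprocessing()
weeks = pd.DataFrame({"Week": pd.date_range("2022-10-03", periods=3, freq="W-MON")})
fy = dp.create_FY_and_H_columns(weeks, "Week", "2022-10-03", "FY2023")
# Every week in the first financial year is labelled 'FY2023' in the new 'FY' column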
923
-
924
- def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
925
- """
926
- This function updates values in a specified column of the DataFrame based on a lookup dictionary.
927
- It first merges several columns into a new 'Merged' column, then uses this merged column to determine
928
- if replacements are needed based on the dictionary.
929
-
930
- Parameters:
931
- df (pd.DataFrame): The DataFrame to process.
932
- col (str): The name of the column whose values are potentially replaced.
933
- replacement_rows (str): The specific value in 'col' to check for replacements.
934
- cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
935
- replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
936
- output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
937
-
938
- Returns:
939
- pd.DataFrame: The modified DataFrame with updated values in the specified column.
940
- """
941
- df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
942
-
943
- def replace_values(x):
944
- if x[col] == replacement_rows:
945
- merged_value = x['Merged']
946
- if merged_value in replacement_lookup_dict:
947
- return replacement_lookup_dict[merged_value]
948
- return x[col]
949
-
950
- df[output_column_name] = df.apply(replace_values, axis=1)
951
-
952
- return df
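An illustrative conditional replacement (made-up channel data): only rows matching the chosen value are looked up, everything else is carried through unchanged:

import pandas as pd

dp = dataprocessing()
df = pd.DataFrame({"channel": ["Paid Search Generic", "Paid Social"],
                   "segment": ["Brand", "Retention"]})
lut = {"Paid Search Generic|Brand": "Paid Search Brand"}
out = dp.keyword_lookup_replacement(df, "channel", "Paid Search Generic",
                                    ["channel", "segment"], lut,
                                    output_column_name="Channel New")
# 'Channel New' is 'Paid Search Brand' for the first row and 'Paid Social' for the second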
953
-
954
- def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
955
- """
956
- Creates a new column in a dataframe, which takes an old column and uses a lookup table to change values in the new column to reflect the lookup table.
957
- The lookup is based on a column in the dataframe. Can only input one column and output one new column.
958
-
959
- Parameters:
960
- df (pandas.DataFrame): The DataFrame containing the data.
961
- keys_col (str): The name of the column which the LUT will reference to output a value.
962
- value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
963
- dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
964
- new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
965
-
966
- Returns:
967
- pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
968
- """
969
-
970
- # Extract columns to change using new dictionary
971
- smaller_df = df[[keys_col,value_col]]
972
-
973
- # Use the new dictionary to create a new LUT
974
- smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
975
-
976
- # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
977
- smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
978
-
979
- # Drop the extra unecessary cols
980
- smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
981
-
982
- # # Output dataframes as dictionary to be used in a LUT
983
- new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
984
-
985
- # # Use new dictionary to create a new version of an old column
986
- df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
987
-
988
- return df_final
989
-
990
- def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
991
- """
992
- Changes a dataframe from wide to long format.
993
-
994
- Args:
995
- df (pandas.DataFrame): The DataFrame containing the data.
996
- value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
997
- variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
998
- value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
999
-
1000
- Returns:
1001
- pandas.DataFrame: Returns the dataframe transformed from wide to long.
1002
-
1003
- Raises:
1004
- ValueError: If the number of column names to be depivoted is less than 2, then this function is not necessary.
1005
- """
1006
-
1007
- # Check length of value cols is greater than 1
1008
- if len(value_cols) < 2:
1009
- raise ValueError("Number of inputs in list must be greater than 1")
1010
-
1011
- # Find the columns that are not to be depivoted into one column
1012
- id_vars = list(set(df.columns.tolist()) - set(value_cols))
1013
-
1014
- # Melt all columns chosen into one column
1015
- df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
1016
-
1017
- return df_final
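A short sketch of stacking two metric columns into long format (invented figures):

import pandas as pd

dp = dataprocessing()
wide = pd.DataFrame({"OBS": ["2023-01-01"], "Spend": [100], "Clicks": [40]})
long_df = dp.convert_df_wide_2_long(wide, ["Spend", "Clicks"], variable_col_name="Metric")
# Two rows, one per metric, with the numbers stacked in the 'Value' column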
1018
-
1019
- def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
1020
- """
1021
- Allows any cell in the dataframe to be manually updated by applying filters and choosing a column to edit
1022
-
1023
- Args:
1024
- df (pandas.DataFrame): The DataFrame containing the data.
1025
- filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
1026
- col_to_change (str): String name of column to edit
1027
- new_value (any): Value of new input for cell
1028
- change_in_existing_df_col (str, optional): Input of Yes or No to describe whether to make the change in an existing column. Defaults to "No".
1029
- new_col_to_change_name (str, optional): Name of the new column to copy the column being edited into and to make the change in. Defaults to 'New'.
1030
- manual_edit_col_name (str, optional): Name of the current manual edits column, if one is not specified it will be created. Defaults to None.
1031
- add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
1032
- existing_note_col_name (str, optional): If there is an existing notes column this can be specified. Defaults to None.
1033
- note (str, optional): The string of the note to be added to the column. Defaults to None.
1034
-
1035
- Raises:
1036
- TypeError: The column for the column to change can only be specified as one column as it is a string not a list
1037
- ValueError: You can only input the values of "Yes" or "No" for whether to make the change in existing column
1038
- ValueError: You can only input the values of "Yes" or "No" for whether to make a new notes column
1039
-
1040
- Returns:
1041
- pandas.DataFrame: Dataframe with manual changes added
1042
- """
1043
- # Raise a TypeError if more than one column is supplied
1044
- if isinstance(col_to_change, list):
1045
- raise TypeError("Col to change must be specified as a string, not a list")
1046
-
1047
- # Raises value error if input is invalid for change_in_existing_df_col
1048
- if change_in_existing_df_col not in ["Yes", "No"]:
1049
- raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
1050
-
1051
- # Raises value error if input is invalid for add_notes_col
1052
- if add_notes not in ["Yes", "No"]:
1053
- raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
1054
-
1055
- # Create the filtered df by applying the conditions
1056
- df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
1057
-
1058
- # Create a new column to add the changes if desired, else edit in the current chosen column
1059
- col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
1060
- if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
1061
- df = df.copy()
1062
- df[new_col_to_change_name] = df[col_to_change]
1063
-
1064
- # Update the new cell in the chosen column
1065
- df.loc[df_filtered.index, col_to_update] = new_value
1066
-
1067
- # Add in manual edit column if desired or specify where one already is
1068
- if manual_edit_col_name:
1069
- if manual_edit_col_name not in df.columns:
1070
- df[manual_edit_col_name] = 0
1071
- df.loc[df_filtered.index, manual_edit_col_name] = 1
1072
- elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
1073
- df['Manual Changes'] = 0
1074
- df.loc[df_filtered.index, 'Manual Changes'] = 1
1075
-
1076
- # Add note if desired in new column or an existing column
1077
- if add_notes == "Yes":
1078
- note_col = existing_note_col_name if existing_note_col_name else 'Notes'
1079
- if note_col not in df.columns:
1080
- df[note_col] = None
1081
- df.loc[df_filtered.index, note_col] = note
1082
-
1083
- return df
1084
-
1085
- def format_numbers_with_commas(self, df, decimal_length_chosen=2):
1086
- """
1087
- Converts data in numerical format into numbers with commas and a chosen decimal place length
1088
-
1089
- Args:
1090
- df (pandas.DataFrame): The DataFrame containing the data.
1091
- decimal_length_chosen (int, optional): Number of decimal places to display. Defaults to 2.
1092
-
1093
- Returns:
1094
- pandas.DataFrame: The dataframe with the chosen updated format
1095
- """
1096
- def format_number_with_commas(x, decimal_length=decimal_length_chosen):
1097
- if isinstance(x, (int, float)):
1098
- if decimal_length is not None:
1099
- format_str = "{:,.{}f}".format(x, decimal_length)
1100
- formatted_number = format_str.format(x)
1101
- else:
1102
- formatted_number = "{:,}".format(x)
1103
- return formatted_number
1104
- else:
1105
- return x # Return unchanged if not a number
1106
-
1107
-
1108
- # Apply the function across several columns using applymap()
1109
- formatted_df = df.applymap(format_number_with_commas)
1110
-
1111
- return formatted_df
1112
-
1113
- def filter_df_on_multiple_conditions(self, df, filters_dict):
1114
- """
1115
- Filter a dataframe based on multiple conditions
1116
-
1117
- Args:
1118
- df (pandas.DataFrame): Dataframe to filter on
1119
- filters_dict (dict): Dictionary with strings as conditions
1120
-
1121
- Returns:
1122
- pandas.DataFrame: Filtered DataFrame
1123
- """
1124
- mask = pd.Series(True, index=df.index)
1125
- for col, cond in filters_dict.items():
1126
- cond = cond.strip()
1127
- operator, value = cond.split(maxsplit=1)
1128
-
1129
- # If the value is a quoted string literal, strip the surrounding quotes and whitespace
1130
- if "'" in value:
1131
- value = value.strip().strip("'\"")
1132
- # Otherwise (e.g. a number or datetime expression) evaluate the string into a Python value
1133
- else:
1134
- value = eval(value)
1135
-
1136
- if operator == "==":
1137
- temp_mask = (df[col] == value)
1138
- elif operator == "!=":
1139
- temp_mask = (df[col] != value)
1140
- elif operator == ">=":
1141
- temp_mask = (df[col] >= value)
1142
- elif operator == "<=":
1143
- temp_mask = (df[col] <= value)
1144
- elif operator == ">":
1145
- temp_mask = (df[col] > value)
1146
- elif operator == "<":
1147
- temp_mask = (df[col] < value)
1148
- mask &= temp_mask
1149
-
1150
- # Create the filtered df by applying the conditions
1151
- df_filtered = df[mask]
1152
-
1153
- return df_filtered
1154
-
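A short sketch of the condition-string format expected by filter_df_on_multiple_conditions (column names and values are illustrative): each dictionary value is "operator value", with string literals quoted and numeric literals left bare so they can be evaluated.

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({
    "country": ["UK", "UK", "IE"],
    "cost": [50, 150, 200],
})
# Keep rows where country == 'UK' AND cost > 100
filters = {"country": "== 'UK'", "cost": "> 100"}
filtered = ims.filter_df_on_multiple_conditions(df, filters)
print(filtered)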
1155
- def read_and_concatenate_files(self, folder_path, file_type='csv'):
1156
- """
1157
- Reads all files of a specified type (CSV or XLSX) from a given folder
1158
- and concatenates them into a single DataFrame.
1159
-
1160
- Parameters:
1161
- folder_path (str): The path to the folder containing the files.
1162
- file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
1163
-
1164
- Returns:
1165
- pd.DataFrame: A DataFrame containing the concatenated data from all files.
1166
- """
1167
-
1168
- # Initialize an empty list to hold dataframes
1169
- dataframes = []
1170
-
1171
- # Define file extension based on file_type
1172
- if file_type == 'csv':
1173
- extension = '.csv'
1174
- elif file_type == 'xlsx':
1175
- extension = '.xlsx'
1176
- else:
1177
- raise ValueError("file_type must be either 'csv' or 'xlsx'")
1178
-
1179
- # Loop through all files in the folder
1180
- for filename in os.listdir(folder_path):
1181
- # Check if the file has the correct extension
1182
- if filename.endswith(extension):
1183
- file_path = os.path.join(folder_path, filename)
1184
- # Read the file into a DataFrame
1185
- if file_type == 'csv':
1186
- df = pd.read_csv(file_path)
1187
- elif file_type == 'xlsx':
1188
- df = pd.read_excel(file_path)
1189
- # Append the DataFrame to the list
1190
- dataframes.append(df)
1191
-
1192
- # Concatenate all DataFrames into a single DataFrame
1193
- combined_df = pd.concat(dataframes, ignore_index=True)
1194
-
1195
- return combined_df
1196
-
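A usage sketch for read_and_concatenate_files (the folder path is hypothetical); all files in the folder are stacked row-wise, so they are expected to share a schema.

ims = dataprocessing()
# Reads every .csv in the folder and concatenates them into one DataFrame.
combined = ims.read_and_concatenate_files("./exports/weekly_spend", file_type="csv")
print(combined.shape)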
1197
- def remove_zero_values(self, data_frame, column_to_filter):
1198
- """
1199
- Removes rows with zero values in the given column
1200
-
1201
- Parameters:
1202
- data_frame - input data frame
1203
- column_to_filter - a column to filter out zero values from
1204
-
1205
- Returns:
1206
- Pandas data frame with zero-valued rows removed
1207
- """
1208
-
1209
- # Remove rows where the given column equals zero
1210
-
1211
- return data_frame.loc[~(data_frame[column_to_filter] ==0)]
1212
-
1213
- def upgrade_outdated_packages(self):
1214
- try:
1215
- # Get all installed packages
1216
- installed_packages_result = subprocess.run("pip list --format=json", shell=True, capture_output=True, text=True)
1217
- installed_packages = json.loads(installed_packages_result.stdout)
1218
-
1219
- # Get the list of outdated packages
1220
- outdated_packages_result = subprocess.run("pip list --outdated --format=json", shell=True, capture_output=True, text=True)
1221
- outdated_packages = json.loads(outdated_packages_result.stdout)
1222
-
1223
- # Create a set of outdated package names for quick lookup
1224
- outdated_package_names = {pkg['name'] for pkg in outdated_packages}
1225
-
1226
- # Upgrade only outdated packages
1227
- for package in installed_packages:
1228
- package_name = package['name']
1229
- if package_name in outdated_package_names:
1230
- try:
1231
- print(f"Upgrading package: {package_name}")
1232
- upgrade_result = subprocess.run(f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True)
1233
- if upgrade_result.returncode == 0:
1234
- print(f"Successfully upgraded {package_name}")
1235
- else:
1236
- print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
1237
- except Exception as e:
1238
- print(f"An error occurred while upgrading {package_name}: {e}")
1239
- else:
1240
- print(f"{package_name} is already up to date")
1241
- except Exception as e:
1242
- print(f"An error occurred during the upgrade process: {e}")
1243
-
1244
- def convert_mixed_formats_dates(self, df, column_name):
1245
- # Convert initial dates to datetime with coercion to handle errors
1246
- df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
1247
- df[column_name] = df[column_name].astype(str)
1248
- corrected_dates = []
1249
-
1250
- for date_str in df[column_name]:
1251
- date_str = date_str.replace('-', '').replace('/', '')
1252
- if len(date_str) == 8:
1253
- year = date_str[:4]
1254
- month = date_str[4:6]
1255
- day = date_str[6:8]
1256
- if int(day) <= 12:
1257
- # Swap month and day
1258
- corrected_date_str = f"{year}-{day}-{month}"
1259
- else:
1260
- corrected_date_str = f"{year}-{month}-{day}"
1261
- # Convert to datetime
1262
- corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
1263
- else:
1264
- corrected_date = pd.to_datetime(date_str, errors='coerce')
1265
-
1266
- corrected_dates.append(corrected_date)
1267
-
1268
- # Check length of the corrected_dates list
1269
- if len(corrected_dates) != len(df):
1270
- raise ValueError("Length of corrected_dates does not match the original DataFrame")
1271
-
1272
- # Assign the corrected dates back to the DataFrame
1273
- df[column_name] = corrected_dates
1274
- return df
1275
-
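A small sketch of convert_mixed_formats_dates (sample dates are illustrative). Note the heuristic: when the day part of an 8-digit date is 12 or less, the function swaps day and month.

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({"date": ["2023-05-01", "2023-01-25"]})
# "2023-05-01" has a day part <= 12, so it is reinterpreted as 2023-01-05;
# "2023-01-25" is left as-is because 25 cannot be a month.
cleaned = ims.convert_mixed_formats_dates(df, "date")
print(cleaned)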
1276
- def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
1277
- # Ensure the date column is in datetime format
1278
- df[date_column] = pd.to_datetime(df[date_column])
1279
-
1280
- # Generate the full date range with the specified frequency
1281
- full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
1282
-
1283
- # Create a new dataframe with the full date range
1284
- full_date_df = pd.DataFrame({date_column: full_date_range})
1285
-
1286
- # Merge the original dataframe with the new full date range dataframe
1287
- df_full = full_date_df.merge(df, on=date_column, how='left')
1288
-
1289
- # Fill missing values with 0
1290
- df_full.fillna(0, inplace=True)
1291
-
1292
- return df_full
1293
-
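A usage sketch for fill_weekly_date_range (column names and values are illustrative): missing weeks between the earliest and latest dates are inserted and their values filled with 0.

import pandas as pd

ims = dataprocessing()
df = pd.DataFrame({
    "obs": ["2024-01-01", "2024-01-15"],   # the Monday 2024-01-08 is missing
    "sales": [100, 120],
})
# Inserts the missing 2024-01-08 week and fills its numeric values with 0.
full = ims.fill_weekly_date_range(df, "obs", freq="W-MON")
print(full)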
1294
-
1295
-
1296
-
1297
-
1298
-
1299
-
1300
-
1301
- ########################################################################################################################################
1302
- ########################################################################################################################################
1303
-
1304
-
1305
-
1306
-
1307
-
1308
-
1309
-
1310
-
1311
-
1312
-
1313
-
1314
-
1315
- ims_proc = dataprocessing()
1316
-
1317
- class datapull:
1318
-
1319
- def help(self):
1320
- print("This is the help section. The functions in the package are as follows:")
1321
-
1322
- print("\n1. pull_fred_data")
1323
- print(" - Description: Get data from FRED by using series id tokens.")
1324
- print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
1325
- print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")
1326
-
1327
- print("\n2. pull_boe_data")
1328
- print(" - Description: Fetch and process Bank of England interest rate data.")
1329
- print(" - Usage: pull_boe_data(week_commencing)")
1330
- print(" - Example: pull_boe_data('mon')")
1331
-
1332
- print("\n3. pull_ons_data")
1333
- print(" - Description: Fetch and process time series data from the ONS API.")
1334
- print(" - Usage: pull_ons_data(series_list, week_commencing)")
1335
- print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
1336
-
1337
- print("\n4. pull_oecd")
1338
- print(" - Description: Fetch macroeconomic data from OECD and other sources for a specified country.")
1339
- print(" - Usage: pull_macro(country='GBR', week_commencing='mon')")
1340
- print(" - Example: pull_macro('GBR', 'mon')")
1341
-
1342
- print("\n5. get_google_mobility_data")
1343
- print(" - Description: Fetch Google Mobility data for the specified country.")
1344
- print(" - Usage: get_google_mobility_data(country, wc)")
1345
- print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
1346
-
1347
- print("\n6. pull_combined_dummies")
1348
- print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
1349
- print(" - Usage: pull_combined_dummies(week_commencing)")
1350
- print(" - Example: pull_combined_dummies('mon')")
1351
-
1352
- print("\n7. pull_weather")
1353
- print(" - Description: Fetch and process historical weather data for the specified country.")
1354
- print(" - Usage: pull_weather(week_commencing, country)")
1355
- print(" - Example: pull_weather('mon', 'GBR')")
1356
-
1357
- ############################################################### MACRO ##########################################################################
1358
-
1359
- def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
1360
- '''
1361
- Parameters
1362
- ----------
1363
- week_commencing : str
1364
- specify the day for the week commencing; the default is 'mon' (e.g., 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
1365
-
1366
- series_id_list : list[str]
1367
- provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
1368
- ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]
1369
-
1370
- Returns
1371
- ----------
1372
- pd.DataFrame
1373
- Return a data frame with FRED data according to the series IDs provided
1374
-
1375
- Example
1376
- ----------
1377
- pull_fred_data("mon", ["GCEC1", "SP500"])
1378
- '''
1379
- # Fred API
1380
- fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
1381
-
1382
- # Fetch the metadata for each series to get the full names
1383
- series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
1384
-
1385
- # Download data from series id list
1386
- fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
1387
-
1388
- # Data processing
1389
- date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
1390
- fred_series_df = pd.DataFrame(date_range)
1391
-
1392
- for series_id, series_data in fred_series.items():
1393
- series_data = series_data.reset_index()
1394
- series_data.columns = ['OBS', series_names[series_id]] # Use the series name as the column header
1395
- fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
1396
-
1397
- # Handle duplicate columns
1398
- for col in fred_series_df.columns:
1399
- if '_x' in col:
1400
- base_col = col.replace('_x', '')
1401
- fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
1402
- fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
1403
-
1404
- # Ensure sum_columns are present in the DataFrame
1405
- sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
1406
-
1407
- # Aggregate results by week
1408
- fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
1409
- date_column="OBS",
1410
- group_columns=[],
1411
- sum_columns=sum_columns,
1412
- wc=week_commencing,
1413
- aggregation="average")
1414
-
1415
- # Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
1416
- fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
1417
-
1418
- return fred_df_final
1419
-
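A minimal usage sketch mirroring the docstring example (assumes the datapull class below the help section is in scope, the fredapi dependency is installed, and network access is available):

dp = datapull()
# Weekly averages (week commencing Monday) of the requested FRED series.
fred_weekly = dp.pull_fred_data("mon", ["GCEC1", "SP500"])
print(fred_weekly.head())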
1420
- def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
1421
- """
1422
- Fetch and process Bank of England interest rate data.
1423
-
1424
- Args:
1425
- week_commencing (str): The starting day of the week for aggregation.
1426
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1427
- Default is "mon".
1428
- max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
1429
- delay (int): Delay in seconds between retry attempts. Default is 5.
1430
-
1431
- Returns:
1432
- pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
1433
- The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
1434
- and 'macro_boe_intr_rate' contains the average interest rate for the week.
1435
- """
1436
- # Week commencing dictionary
1437
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1438
-
1439
- # Function to fetch the data with retries
1440
- def fetch_data_with_retries(url, max_retries, delay):
1441
- for attempt in range(max_retries):
1442
- try:
1443
- html_table = pd.read_html(url)[0]
1444
- return html_table
1445
- except Exception as e:
1446
- print(f"Attempt {attempt + 1} failed: {e}")
1447
- if attempt < max_retries - 1:
1448
- time.sleep(delay)
1449
- else:
1450
- raise
1451
-
1452
- # Import HTML data from Bank of England rate
1453
- url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
1454
- html_table = fetch_data_with_retries(url, max_retries, delay)
1455
-
1456
- df = pd.DataFrame(html_table)
1457
- df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
1458
-
1459
- # Change date column to datetime and find the corresponding week to the date
1460
- df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
1461
- df.sort_values("OBS", axis=0, inplace=True)
1462
-
1463
- # Create a daily date range and find the week commencing for that day
1464
- date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
1465
- df_daily = pd.DataFrame(date_range, columns=["OBS"])
1466
-
1467
- # Adjust each date to the specified week commencing day
1468
- df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1469
-
1470
- # Outer merge the daily date range on the boe dataframe and forward fill in the blanks
1471
- df_final = df_daily.merge(df, on='OBS', how="left")
1472
- df_final["macro_boe_intr_rate"].ffill(inplace=True)
1473
-
1474
- # Group by the week start date and get the mean of the interest rates for each week
1475
- df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()
1476
-
1477
- df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
1478
- df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)
1479
-
1480
- return df_final
1481
-
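A usage sketch for pull_boe_data (requires network access to the Bank of England site; the retry settings fall back to their defaults):

dp = datapull()
# Weekly average Bank of England bank rate, week commencing Monday.
boe_weekly = dp.pull_boe_data("mon")
print(boe_weekly.tail())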
1482
- def pull_ons_data(self, series_list, week_commencing):
1483
- """
1484
- Fetch and process time series data from the ONS API.
1485
-
1486
- Args:
1487
- series_list (list): A list of dictionaries where each dictionary represents a time series.
1488
- Each dictionary should have the keys 'series_id' and 'dataset_id'.
1489
- week_commencing (str): The starting day of the week for aggregation.
1490
- Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1491
-
1492
- Returns:
1493
- pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
1494
- commencing dates and other columns contain the aggregated time series values.
1495
- """
1496
-
1497
- def parse_quarter(date_str):
1498
- """Parses a string in 'YYYY Q#' format into a datetime object."""
1499
- year, quarter = date_str.split(' ')
1500
- quarter_number = int(quarter[1])
1501
- month = (quarter_number - 1) * 3 + 1
1502
- return pd.Timestamp(f"{year}-{month:02d}-01")
1503
-
1504
- # Generate a date range from 1950-01-01 to today
1505
- date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
1506
- daily_df = pd.DataFrame(date_range, columns=['OBS'])
1507
-
1508
- # Keep track of the renamed value columns
1509
- value_columns = []
1510
-
1511
- for series in series_list:
1512
- series_id = series['series_id']
1513
- dataset_id = series['dataset_id']
1514
-
1515
- # Construct the URL for data
1516
- data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
1517
-
1518
- # Make the request to the ONS API for data
1519
- data_response = requests.get(data_url)
1520
-
1521
- # Check if the request was successful
1522
- if data_response.status_code != 200:
1523
- print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
1524
- continue
1525
-
1526
- # Parse the JSON response for data
1527
- data = data_response.json()
1528
-
1529
- # Attempt to extract the name of the time series from the data response
1530
- series_name = data.get('description', {}).get('title', 'Value')
1531
-
1532
- # Determine the most granular time series data available
1533
- if 'months' in data and data['months']:
1534
- time_series_data = data['months']
1535
- elif 'quarters' in data and data['quarters']:
1536
- time_series_data = data['quarters']
1537
- elif 'years' in data and data['years']:
1538
- time_series_data = data['years']
1539
- else:
1540
- print("No time series data found in the response")
1541
- continue
1542
-
1543
- # Create a DataFrame from the time series data
1544
- df = pd.DataFrame(time_series_data)
1545
-
1546
- # Handle different frequencies in the data
1547
- if 'date' in df.columns:
1548
- if any(df['date'].str.contains('Q')):
1549
- df['date'] = df['date'].apply(parse_quarter)
1550
- else:
1551
- df['date'] = pd.to_datetime(df['date'])
1552
-
1553
- df = df.rename(columns={'date': 'OBS', 'value': series_name})
1554
-
1555
- # Rename the value column
1556
- new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
1557
- df = df.rename(columns={series_name: new_col_name})
1558
-
1559
- # Track the renamed value column
1560
- value_columns.append(new_col_name)
1561
-
1562
- # Merge the data based on the observation date
1563
- daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')
1564
-
1565
- # Ensure columns are numeric
1566
- for col in value_columns:
1567
- if col in daily_df.columns:
1568
- daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
1569
- else:
1570
- print(f"Column {col} not found in daily_df")
1571
-
1572
- # Aggregate results by week
1573
- ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
1574
- date_column="OBS",
1575
- group_columns=[],
1576
- sum_columns=value_columns,
1577
- wc=week_commencing,
1578
- aggregation="average")
1579
-
1580
- return ons_df_final
1581
-
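A usage sketch for pull_ons_data using the series/dataset IDs from the help text above (requires network access to the ONS API):

dp = datapull()
ons_weekly = dp.pull_ons_data([{"series_id": "LMSBSA", "dataset_id": "LMS"}], "mon")
print(ons_weekly.head())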
1582
- def pull_macro(self, country: str = "GBR", week_commencing: str = "mon"):
1583
- # Change country input to list
1584
- countries_list = [country]
1585
-
1586
- # Check if the data wants to be inputted at any other week commencing date
1587
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1588
-
1589
- # Two useful functions for quarterly data
1590
- # Define a function to get quarterly data
1591
- def get_quarter(p_date: datetime.date) -> int:
1592
- return (p_date.month - 1) // 3 + 1
1593
-
1594
- # Define a function to get the last day of the quarter
1595
- def get_last_day_of_the_quarter(p_date: datetime.date):
1596
- quarter = get_quarter(p_date)
1597
- return datetime(p_date.year + 3 * quarter // 12, 3 * quarter % 12 + 1, 1) + pd.Timedelta(days=-1)
1598
-
1599
- # For the monthly data
1600
- data_M, subjects_M, measures_M = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
1601
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
1602
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
1603
- 'PRINTO01'],
1604
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
1605
- frequency='M', startDate='2015-01')
1606
- data_M = data_M.stack(level=[0, -1, -2]).reset_index()
1607
-
1608
- data_Q, subjects_Q, measures_Q = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
1609
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
1610
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
1611
- 'PRINTO01'],
1612
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
1613
- frequency='Q', startDate='2015-01')
1614
-
1615
- data_Q = data_Q.stack(level=[0, -1, -2]).reset_index()
1616
-
1617
- # Create a data frame dictionary to store your monthly data frames
1618
- DataFrameDict_M = {elem: pd.DataFrame() for elem in countries_list}
1619
- for key in DataFrameDict_M.keys():
1620
- DataFrameDict_M[key] = data_M[:][data_M.country == key]
1621
-
1622
- # Create a data frame dictionary to store your quarterly data frames
1623
- DataFrameDict_Q = {elem: pd.DataFrame() for elem in countries_list}
1624
- for key in DataFrameDict_Q.keys():
1625
- DataFrameDict_Q[key] = data_Q[:][data_Q.country == key]
1626
-
1627
- # Create a monthly list of the dataframes to iterate through
1628
- countries_df_list_M = []
1629
- for i in countries_list:
1630
- df = pd.DataFrame(DataFrameDict_M[i])
1631
- df.rename(columns={0: 'Values'}, inplace=True)
1632
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
1633
- countries_df_list_M.append(df)
1634
-
1635
- # Create a quarterly list of the dataframes to iterate through
1636
- countries_df_list_Q = []
1637
- for i in countries_list:
1638
- df = pd.DataFrame(DataFrameDict_Q[i])
1639
- df.rename(columns={0: 'Values'}, inplace=True)
1640
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
1641
- countries_df_list_Q.append(df)
1642
-
1643
- combined_countries_df_list = list(zip(countries_df_list_M, countries_df_list_Q))
1644
-
1645
- # Loop through and create dataframes for every country
1646
- for index, data in enumerate(combined_countries_df_list):
1647
- # Find country being extracted
1648
- country = countries_list[index]
1649
- print(country)
1650
-
1651
- # For consumer confidence
1652
- # For countries with no data
1653
- if country in ['CAN', 'IND', 'NOR']:
1654
- Consumer_Confidence_Index_df_M = pd.DataFrame()
1655
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
1656
- # For countries with quarterly data
1657
- elif country in []:
1658
- Consumer_Confidence_Index_df_Q = data[1]['CSCICP03']['IXNSA']
1659
- Consumer_Confidence_Index_df_Q.rename('consumer_confidence_index', inplace=True)
1660
- Consumer_Confidence_Index_df_M = pd.DataFrame()
1661
- # For countries with monthly data
1662
- else:
1663
- Consumer_Confidence_Index_df_M = data[0]['CSCICP03']['IXNSA']
1664
- Consumer_Confidence_Index_df_M.rename('consumer_confidence_index', inplace=True)
1665
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
1666
-
1667
- # For consumer prices for COST OF LIVING
1668
- # For countries with no data
1669
- if country in []:
1670
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
1671
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
1672
- # For countries with quarterly data
1673
- elif country in ['AUS', 'NZL']:
1674
- Consumer_Price_Index_Cost_Of_Living_df_Q = data[1]['CPALTT01']['IXNB']
1675
- Consumer_Price_Index_Cost_Of_Living_df_Q.rename('consumer_price_index_cost_of_living', inplace=True)
1676
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
1677
- # For countries with monthly data
1678
- else:
1679
- Consumer_Price_Index_Cost_Of_Living_df_M = data[0]['CPALTT01']['IXNB']
1680
- Consumer_Price_Index_Cost_Of_Living_df_M.rename('consumer_price_index_cost_of_living', inplace=True)
1681
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
1682
-
1683
- # For consumer prices FOR INFLATION
1684
- # For countries with no data
1685
- if country in []:
1686
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
1687
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
1688
- # For countries with quarterly data
1689
- elif country in ['AUS', 'NZL']:
1690
- Consumer_Price_Index_Inflation_df_Q = data[1]['CPALTT01']['GY']
1691
- Consumer_Price_Index_Inflation_df_Q.rename('consumer_price_index_inflation', inplace=True)
1692
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
1693
- # For countries with monthly data
1694
- else:
1695
- Consumer_Price_Index_Inflation_df_M = data[0]['CPALTT01']['GY']
1696
- Consumer_Price_Index_Inflation_df_M.rename('consumer_price_index_inflation', inplace=True)
1697
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
1698
-
1699
- # For GDP Index Smoothed
1700
- # For countries with no data
1701
- if country in ['NLD', 'CHE', 'NZL', 'SWE', 'NOR']:
1702
- GDP_Index_Smoothed_df_M = pd.DataFrame()
1703
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
1704
- # For countries with quarterly data
1705
- elif country in []:
1706
- GDP_Index_Smoothed_df_Q = data[1]['LORSGPRT']['STSA']
1707
- GDP_Index_Smoothed_df_Q.rename('gdp_index_smoothed', inplace=True)
1708
- GDP_Index_Smoothed_df_M = pd.DataFrame()
1709
- # For countries with monthly data
1710
- else:
1711
- GDP_Index_Smoothed_df_M = data[0]['LORSGPRT']['STSA']
1712
- GDP_Index_Smoothed_df_M.rename('gdp_index_smoothed', inplace=True)
1713
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
1714
-
1715
- # For Harmonised Unemployment Index
1716
- # For countries with no data
1717
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
1718
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
1719
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
1720
- # For countries with quarterly data
1721
- elif country in ['NZL']:
1722
- Harmonised_Unemployment_Index_df_Q = data[1]['LRHUTTTT']['STSA']
1723
- Harmonised_Unemployment_Index_df_Q.rename('harmonised_unemployment_index', inplace=True)
1724
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
1725
- # For countries with monthly data
1726
- else:
1727
- Harmonised_Unemployment_Index_df_M = data[0]['LRHUTTTT']['STSA']
1728
- Harmonised_Unemployment_Index_df_M.rename('harmonised_unemployment_index', inplace=True)
1729
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
1730
-
1731
- # For hourly earnings index manufacturing
1732
- # For countries with no data
1733
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
1734
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
1735
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
1736
- # For countries with quarterly data
1737
- elif country in ['FRA', 'DEU', 'ESP', 'AUS', 'NZL', 'KOR', 'NOR']:
1738
- Hourly_Earnings_Index_Manufacturing_df_Q = data[1]['LCEAMN01']['IXOBSA']
1739
- Hourly_Earnings_Index_Manufacturing_df_Q.rename('hourly_earnings_index_manufacturing', inplace=True)
1740
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
1741
- # For countries with monthly data
1742
- else:
1743
- Hourly_Earnings_Index_Manufacturing_df_M = data[0]['LCEAMN01']['IXOBSA']
1744
- Hourly_Earnings_Index_Manufacturing_df_M.rename('hourly_earnings_index_manufacturing', inplace=True)
1745
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
1746
-
1747
- # For Short Term Interest Rate
1748
- # For countries with no data
1749
- if country in []:
1750
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
1751
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
1752
- # For countries with quarterly data
1753
- elif country in []:
1754
- Short_Term_Interest_Rate_df_Q = data[1]['IR3TIB01']['ST']
1755
- Short_Term_Interest_Rate_df_Q.rename('short_term_interest_rate', inplace=True)
1756
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
1757
- # For countries with monthly data
1758
- else:
1759
- Short_Term_Interest_Rate_df_M = data[0]['IR3TIB01']['ST']
1760
- Short_Term_Interest_Rate_df_M.rename('short_term_interest_rate', inplace=True)
1761
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
1762
-
1763
- # For Industrial Product Growth on Previous Period
1764
- # For countries with no data
1765
- if country in ['ZAF', 'CHN']:
1766
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
1767
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
1768
- # For countries with quarterly data
1769
- elif country in ['AUS', 'NZL']:
1770
- Industrial_Product_Growth_on_Previous_Period_df_Q = data[1]['PRINTO01']['GPSA']
1771
- Industrial_Product_Growth_on_Previous_Period_df_Q.rename('industrial_product_growth_on_previous_period', inplace=True)
1772
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
1773
- # For countries with monthly data
1774
- else:
1775
- Industrial_Product_Growth_on_Previous_Period_df_M = data[0]['PRINTO01']['GPSA']
1776
- Industrial_Product_Growth_on_Previous_Period_df_M.rename('industrial_product_growth_on_previous_period', inplace=True)
1777
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
1778
-
1779
- # For Industrial Production Index
1780
- # For countries with no data
1781
- if country in ['ZAF', 'CHN']:
1782
- Industrial_Production_Index_df_M = pd.DataFrame()
1783
- Industrial_Production_Index_df_Q = pd.DataFrame()
1784
- # For countries with quarterly data
1785
- elif country in ['AUS', 'NZL']:
1786
- Industrial_Production_Index_df_Q = data[1]['PRINTO01']['IXOBSA']
1787
- Industrial_Production_Index_df_Q.rename('industrial_production_index', inplace=True)
1788
- Industrial_Production_Index_df_M = pd.DataFrame()
1789
- # For countries with monthly data
1790
- else:
1791
- Industrial_Production_Index_df_M = data[0]['PRINTO01']['IXOBSA']
1792
- Industrial_Production_Index_df_M.rename('industrial_production_index', inplace=True)
1793
- Industrial_Production_Index_df_Q = pd.DataFrame()
1794
-
1795
- # Create monthly macroeconomic dataframe
1796
- all_dfs_list_M = [Consumer_Confidence_Index_df_M,
1797
- Consumer_Price_Index_Cost_Of_Living_df_M,
1798
- Consumer_Price_Index_Inflation_df_M,
1799
- GDP_Index_Smoothed_df_M,
1800
- Harmonised_Unemployment_Index_df_M,
1801
- Hourly_Earnings_Index_Manufacturing_df_M,
1802
- Short_Term_Interest_Rate_df_M,
1803
- Industrial_Product_Growth_on_Previous_Period_df_M,
1804
- Industrial_Production_Index_df_M]
1805
-
1806
- # Check if any dataframes are empty and if there are remove them
1807
- all_dfs_list_M = [df for df in all_dfs_list_M if not df.empty]
1808
- cif_Macroeconomic_df_M = pd.concat(all_dfs_list_M, axis=1)
1809
-
1810
- # Create quarterly macroeconomic dataframe
1811
- all_dfs_list_Q = [Consumer_Confidence_Index_df_Q,
1812
- Consumer_Price_Index_Cost_Of_Living_df_Q,
1813
- Consumer_Price_Index_Inflation_df_Q,
1814
- GDP_Index_Smoothed_df_Q,
1815
- Harmonised_Unemployment_Index_df_Q,
1816
- Hourly_Earnings_Index_Manufacturing_df_Q,
1817
- Short_Term_Interest_Rate_df_Q,
1818
- Industrial_Product_Growth_on_Previous_Period_df_Q,
1819
- Industrial_Production_Index_df_Q]
1820
-
1821
- # Check if any dataframes are empty and if there are remove them
1822
- all_dfs_list_Q = [df for df in all_dfs_list_Q if not df.empty]
1823
- if all_dfs_list_Q != []:
1824
- macroeconomic_monthly_df_Q = pd.concat(all_dfs_list_Q, axis=1)
1825
- else:
1826
- macroeconomic_monthly_df_Q = pd.DataFrame()
1827
-
1828
- # For USD GBP Exchange Rate
1829
- # If it's the UK add this series else don't
1830
- if countries_list[index] == 'GBR':
1831
- USD_GBP_Exchange_Rate_df = pd.read_csv(
1832
- 'https://stats.oecd.org/SDMX-JSON/data/MEI_FIN/CCUS.' + countries_list[index] + '.M/OECD?contentType=csv')
1833
- USD_GBP_Exchange_Rate_df.head()
1834
- USD_GBP_Exchange_Rate_df_pivot = pd.pivot_table(USD_GBP_Exchange_Rate_df, values='Value', index='TIME',
1835
- columns='Subject')
1836
- USD_GBP_Exchange_Rate_df_pivot_final = USD_GBP_Exchange_Rate_df_pivot.loc["2015-01":]
1837
- USD_GBP_Exchange_Rate_df_pivot_final.rename(
1838
- columns={'Currency exchange rates, monthly average': 'usd_gbp_exchange_rate'}, inplace=True)
1839
-
1840
- # Create final monthly dataframe
1841
- macroeconomic_monthly_df_M = pd.concat([cif_Macroeconomic_df_M, USD_GBP_Exchange_Rate_df_pivot_final], axis=1)
1842
- else:
1843
- # Create final monthly dataframe
1844
- macroeconomic_monthly_df_M = cif_Macroeconomic_df_M
1845
-
1846
- # Create the final W/C Sunday dataframe
1847
- # For monthly data
1848
- macroeconomic_monthly_df_M['Date'] = macroeconomic_monthly_df_M.index
1849
- df_M = macroeconomic_monthly_df_M.set_index(pd.to_datetime(macroeconomic_monthly_df_M['Date'])).drop(columns='Date')
1850
- df_M.fillna(method="ffill", inplace=True)
1851
- df_M.reset_index(inplace=True)
1852
-
1853
- daily_records = []
1854
- # Iterate over each row in the DataFrame
1855
- for _, row in df_M.iterrows():
1856
- # Calculate the number of days in the month
1857
- num_days = calendar.monthrange(row["Date"].year, row["Date"].month)[1]
1858
- # Create a new record for each day of the month
1859
- for day in range(1, num_days + 1):
1860
- daily_row = row.copy()
1861
- daily_row["Date"] = row["Date"].replace(day=day)
1862
- daily_records.append(daily_row)
1863
-
1864
- # Convert the list of daily records into a DataFrame
1865
- daily_df = pd.DataFrame(daily_records)
1866
-
1867
- # Extend dataframe to include the current data if needed
1868
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
1869
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
1870
- q = pd.Series(datelist, name="Date")
1871
- s = pd.DataFrame(extended_data, columns=list(df_M.columns[1:]))
1872
- extended_daily_df = pd.concat([q, s], axis=1)
1873
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
1874
-
1875
- # Create a week commencing column
1876
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
1877
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
1878
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1879
- extended_daily_df.drop("Date", axis=1, inplace=True)
1880
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
1881
-
1882
- # Take a weekly average
1883
- macroeconomic_weekly_df_M = extended_daily_df.groupby('Date').mean()
1884
-
1885
- # For quarterly data
1886
- # If there are quarterly datasets
1887
- if all_dfs_list_Q != []:
1888
- macroeconomic_monthly_df_Q['Date'] = macroeconomic_monthly_df_Q.index
1889
- df_Q = macroeconomic_monthly_df_Q.set_index(pd.to_datetime(macroeconomic_monthly_df_Q['Date'])).drop(
1890
- columns='Date')
1891
- df_Q.fillna(method="ffill", inplace=True)
1892
- df_Q.reset_index(inplace=True)
1893
-
1894
- daily_records = []
1895
- for _, row in df_Q.iterrows():
1896
- year = row["Date"].year
1897
- month = row["Date"].month
1898
- day = row["Date"].day
1899
- last_date = get_last_day_of_the_quarter(datetime(year, month, day).date())
1900
- all_days = pd.date_range(row["Date"], last_date, freq="D")
1901
-
1902
- # Create a new record for each day of the quarter
1903
- for day in all_days:
1904
- daily_row = row.copy()
1905
- daily_row["Date"] = row["Date"].replace(day=day.day, month=day.month)
1906
- daily_records.append(daily_row)
1907
-
1908
- # Convert the list of daily records into a DataFrame
1909
- daily_df = pd.DataFrame(daily_records)
1910
-
1911
- # Extend dataframe to include data up to today
1912
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
1913
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
1914
- q = pd.Series(datelist, name="Date")
1915
- s = pd.DataFrame(extended_data, columns=list(df_Q.columns[1:]))
1916
- extended_daily_df = pd.concat([q, s], axis=1)
1917
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
1918
-
1919
- # Create a week commencing column
1920
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
1921
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
1922
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1923
- extended_daily_df.drop("Date", axis=1, inplace=True)
1924
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
1925
-
1926
- # Take a weekly average
1927
- macroeconomic_weekly_df_Q = extended_daily_df.groupby('Date').mean()
1928
-
1929
- # Merge the two datasets together
1930
- if all_dfs_list_Q != []:
1931
- macroeconomic_weekly_df = macroeconomic_weekly_df_M.merge(macroeconomic_weekly_df_Q, left_index=True,
1932
- right_index=True)
1933
- # If there are no quarterly datasets
1934
- else:
1935
- macroeconomic_weekly_df = macroeconomic_weekly_df_M
1936
-
1937
- # Change datetime format
1938
- macroeconomic_weekly_df.index = macroeconomic_weekly_df.index.strftime('%d/%m/%Y')
1939
-
1940
- macroeconomic_weekly_df.reset_index()
1941
- macroeconomic_weekly_df.reset_index(drop=False, inplace=True)
1942
- macroeconomic_weekly_df.rename(columns={'Date': 'OBS'}, inplace=True)
1943
-
1944
- return macroeconomic_weekly_df
1945
-
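A usage sketch for pull_macro (assumes the cif dependency is installed and the OECD endpoints it queries are reachable):

dp = datapull()
# OECD MEI-based weekly macro series for the UK, week commencing Monday.
macro_weekly = dp.pull_macro("GBR", "mon")
print(macro_weekly.head())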
1946
- def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
1947
- """
1948
- Fetch Google Mobility data for the specified country.
1949
-
1950
- Parameters:
1951
- - country (str): The name of the country for which to fetch data.
1952
-
1953
- Returns:
1954
- - pd.DataFrame: A DataFrame containing the Google Mobility data.
1955
- """
1956
- # URL of the Google Mobility Reports CSV file
1957
- url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
1958
-
1959
- # Fetch the CSV file
1960
- response = requests.get(url)
1961
- if response.status_code != 200:
1962
- raise Exception(f"Failed to fetch data: {response.status_code}")
1963
-
1964
- # Load the CSV file into a pandas DataFrame
1965
- csv_data = StringIO(response.text)
1966
- df = pd.read_csv(csv_data)
1967
-
1968
- # Filter the DataFrame for the specified country
1969
- country_df = df[df['country_region'] == country]
1970
-
1971
- final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
1972
- 'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
1973
- 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")
1974
-
1975
- final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
1976
- return final_covid1
1977
-
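A usage sketch for get_google_mobility_data (downloads the full Global Mobility Report CSV, so it needs network access and can be slow):

dp = datapull()
uk_mobility = dp.get_google_mobility_data("United Kingdom", "mon")
print(uk_mobility.head())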
1978
- ############################################################### Seasonality ##########################################################################
1979
-
1980
- def pull_combined_dummies(self, week_commencing):
1981
- # Week commencing dictionary
1982
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1983
-
1984
- # Create daily date range dataframe
1985
- date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
1986
- df_daily = pd.DataFrame(date_range, columns=["Date"])
1987
-
1988
- # Create weekly date range dataframe
1989
- df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1990
- df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
1991
- df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
1992
-
1993
- df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
1994
- df_weekly_start.set_index("Date", inplace=True)
1995
-
1996
- # Create individual weekly dummies
1997
- dummy_columns = {}
1998
- for i in range(len(df_weekly_start)):
1999
- col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
2000
- dummy_columns[col_name] = [0] * len(df_weekly_start)
2001
- dummy_columns[col_name][i] = 1
2002
-
2003
- df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
2004
- df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
2005
-
2006
- # Create monthly dummies
2007
- df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
2008
- df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
2009
- df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2010
- df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
2011
-
2012
- df_monthly_dummies.set_index("Date", inplace=True)
2013
- df_monthly_dummies = df_monthly_dummies / 7
2014
-
2015
- # Combine weekly and monthly dataframes
2016
- df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
2017
-
2018
- # Create weekly dummies
2019
- df_combined.reset_index(inplace=True)
2020
- df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
2021
- df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
2022
-
2023
- # Create yearly dummies
2024
- df_combined["Year"] = df_combined["Date"].dt.year
2025
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
2026
-
2027
- # Add constant
2028
- df_combined["Constant"] = 1
2029
-
2030
- # Add trend
2031
- df_combined["Trend"] = df_combined.index + 1
2032
-
2033
- # Set date as index
2034
- df_combined.set_index("Date", inplace=True)
2035
-
2036
- # Create COVID lockdown dummies
2037
- lockdown_periods = [
2038
- # Lockdown 1
2039
- ("2020-03-23", "2020-05-24"),
2040
- # Lockdown 2
2041
- ("2020-11-05", "2020-12-02"),
2042
- # Lockdown 3
2043
- ("2021-01-04", "2021-03-08")
2044
- ]
2045
-
2046
- df_covid = pd.DataFrame(date_range, columns=["Date"])
2047
- df_covid["national_lockdown"] = 0
2048
-
2049
- for start, end in lockdown_periods:
2050
- df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1
2051
-
2052
- df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2053
- df_covid.drop("Date", axis=1, inplace=True)
2054
- df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
2055
- df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
2056
- df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)
2057
-
2058
- df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
2059
- df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
2060
- df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)
2061
-
2062
- df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
2063
- df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)
2064
-
2065
- df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
2066
- df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
2067
- df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)
2068
-
2069
- df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
2070
- df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)
2071
-
2072
- df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
2073
- df_final_covid.reset_index(inplace=True)
2074
- df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
2075
-
2076
- # Create seasonal indicators for the last day and last Friday of the month
2077
- min_date = '2019-12-29'
2078
- max_date = datetime.today().strftime('%Y-%m-%d')
2079
- date_range_seas = pd.date_range(start=min_date, end=max_date)
2080
-
2081
- df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
2082
- df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)
2083
-
2084
- def is_last_friday(date):
2085
- last_day_of_month = date.to_period('M').to_timestamp('M')
2086
- last_day_weekday = last_day_of_month.dayofweek
2087
- if last_day_weekday >= 4:
2088
- days_to_subtract = last_day_weekday - 4
2089
- else:
2090
- days_to_subtract = last_day_weekday + 3
2091
- last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
2092
- return 1 if date == last_friday else 0
2093
-
2094
- df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)
2095
-
2096
- df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2097
- df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
2098
- df_seas.set_index("Date", inplace=True)
2099
-
2100
- # Combine all dataframes
2101
- df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
2102
- df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
2103
- df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')
2104
-
2105
- # Fill any NaN values with 0
2106
- df_final_combined.fillna(0, inplace=True)
2107
-
2108
- return df_final_combined
2109
-
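A usage sketch for pull_combined_dummies (no external data required):

dp = datapull()
# One dummy per week since 2015, monthly/weekly/yearly seasonality, trend,
# constant, UK lockdown indicators and end-of-month flags, week commencing Monday.
dummies = dp.pull_combined_dummies("mon")
print(dummies.head())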
2110
- def pull_weather(self, week_commencing, country) -> pd.DataFrame:
2111
- import pandas as pd
2112
- import urllib.request
2113
- from datetime import datetime
2114
- import requests
2115
- from geopy.geocoders import Nominatim
2116
-
2117
- # Week commencing dictionary
2118
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
2119
-
2120
- # Country dictionary
2121
- country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}
2122
-
2123
- # Function to flatten a list of nested lists into a list
2124
- def flatten_list(nested_list):
2125
- return [item for sublist in nested_list for item in sublist]
2126
-
2127
- # Choose country
2128
- country = country_dict[country]
2129
-
2130
- # Choose start and end dates
2131
- start_day = 1
2132
- start_month = 1
2133
- start_year = 2014
2134
- formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
2135
- today = datetime.now()
2136
- end_day = today.day
2137
- end_month = today.month
2138
- end_year = today.year
2139
-
2140
- if country == "GB__ASOS":
2141
- stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
2142
- "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
2143
- "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
2144
- "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
2145
- "&stations=EGNT"]
2146
- elif country == "AU__ASOS":
2147
- stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
2148
- "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
2149
- "&stations=YPAD", "&stations=YPPH"]
2150
- elif country == "USCRN":
2151
- stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
2152
- "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
2153
- "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
2154
- "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
2155
- elif country == "DE__ASOS":
2156
- stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
2157
- "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
2158
- "&stations=EDLW", "&stations=EDDM"]
2159
- elif country == "FR__ASOS":
2160
- stations = ["&stations=LFPB"]
2161
- elif country == "Canada":
2162
- institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
2163
- "CA_NU_ASOS"]
2164
- stations_list = [[] for _ in range(5)]
2165
- stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
2166
- "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
2167
-
2168
- stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
2169
- "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
2170
- "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
2171
- "&stations=CYWK", "&stations=CYWK"])
2172
-
2173
- stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
2174
- "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
2175
- "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
2176
- "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
2177
- "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
2178
-
2179
- stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
2180
- "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
2181
- "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
2182
- "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
2183
-
2184
- stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
2185
- "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
2186
- "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
2187
- "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
2188
-
2189
- elif country == "ZA__ASOS":
2190
- cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
2191
- stations = []
2192
-
2193
- for city in cities:
2194
- geolocator = Nominatim(user_agent="MyApp")
2195
- location = geolocator.geocode(city)
2196
- stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
2197
-
2198
- # Temperature
2199
- if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
2200
- # We start by making a data frame of the following weather stations
2201
- station_query = ''.join(stations)
2202
-
2203
- raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
2204
- station_query,
2205
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2206
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2207
- raw_weather = urllib.request.urlopen(raw_weather_list)
2208
- raw_weather = pd.read_csv(raw_weather)
2209
-
2210
- # Replace occurrences of "None" with 0
2211
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2212
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2213
-
2214
- # Remove any data that isn't temperature-related
2215
- weather = raw_weather.iloc[:, 0:4]
2216
-
2217
- weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
2218
-
2219
- # Estimate mean temperature
2220
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2221
-
2222
- # Convert Fahrenheit to Celsius for max_temp_f
2223
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2224
-
2225
- # Convert Fahrenheit to Celsius for min_temp_f
2226
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2227
-
2228
- # Convert Fahrenheit to Celsius for mean_temp_f
2229
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2230
-
2231
- # Aggregate the data to the chosen week-commencing day, taking the average
2232
- # Convert the date column to a Date type
2233
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2234
-
2235
- # Determine the starting chosen day for each date
2236
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2237
-
2238
- # Group by week_starting and summarize
2239
- numeric_columns = weather.select_dtypes(include='number').columns
2240
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2241
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2242
- "min_temp_f": "avg_min_temp_f",
2243
- "mean_temp_f": "avg_mean_temp_f",
2244
- "max_temp_c": "avg_max_temp_c",
2245
- "min_temp_c": "avg_min_temp_c",
2246
- "mean_temp_c": "avg_mean_temp_c"}, inplace=True)
2247
- elif country == "Canada":
2248
- for i in range(len(institute_vector)):
2249
- station_query_temp = ''.join(flatten_list(stations_list[i]))
2250
- institute_temp = institute_vector[i]
2251
- raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
2252
- station_query_temp,
2253
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2254
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2255
- raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
2256
- raw_weather_temp = pd.read_csv(raw_weather_temp)
2257
-
2258
- if len(raw_weather_temp.index) == 0:
2259
- continue
2260
- raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
2261
-
2262
- if i == 1:
2263
- raw_weather = raw_weather_temp
2264
- else:
2265
- raw_weather = pd.concat([raw_weather, raw_weather_temp])
2266
-
2267
- # Drop error column if it exists
2268
- if 'ERROR: Invalid network specified' in list(raw_weather.columns):
2269
- raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
2270
-
2271
- # Replace none values
2272
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2273
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2274
- raw_weather["precip_in"].replace("None", 0, inplace=True)
2275
-
2276
- weather = raw_weather
2277
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2278
-
2279
- # Estimate mean temperature
2280
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2281
-
2282
- # Convert Fahrenheit to Celsius for max_temp_f
2283
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2284
-
2285
- # Convert Fahrenheit to Celsius for min_temp_f
2286
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2287
-
2288
- # Convert Fahrenheit to Celsius for mean_temp_f
2289
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2290
-
2291
- # Aggregate the data to the chosen week-commencing day, taking the average
2292
- # Convert the date column to a Date type
2293
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2294
-
2295
- # Determine the starting chosen day for each date
2296
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2297
-
2298
- # Group by week_starting and summarize
2299
- numeric_columns = weather.select_dtypes(include='number').columns
2300
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2301
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2302
- "min_temp_f": "avg_min_temp_f",
2303
- "mean_temp_f": "avg_mean_temp_f",
2304
- "max_temp_c": "avg_max_temp_c",
2305
- "min_temp_c": "avg_min_temp_c",
2306
- "mean_temp_c": "avg_mean_temp_c",
2307
- "precip_in": "avg_mean_perc"}, inplace=True)
2308
- elif country == "ZA__ASOS":
2309
- weather_data_list = []
2310
-
2311
- for city in cities:
2312
- geolocator = Nominatim(user_agent="MyApp")
2313
- location = geolocator.geocode(city)
2314
- url = "https://archive-api.open-meteo.com/v1/archive"
2315
-
2316
- params = {
2317
- "latitude": location.latitude,
2318
- "longitude": location.longitude,
2319
- "start_date": formatted_date,
2320
- "end_date": today.strftime("%Y-%m-%d"),
2321
- "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
2322
- "timezone": "auto"
2323
- }
2324
-
2325
- response = requests.get(url, params=params)
2326
- response_data = response.json()
2327
-
2328
- daily_data = response_data["daily"]
2329
- dates = daily_data["time"]
2330
-
2331
- data = pd.DataFrame({
2332
- "day": dates,
2333
- "max_temp_f": daily_data["temperature_2m_max"],
2334
- "min_temp_f": daily_data["temperature_2m_min"],
2335
- "precip_in": daily_data["precipitation_sum"]
2336
- })
2337
- data["city"] = city
2338
- weather_data_list.append(data)
2339
-
2340
- weather = pd.concat(weather_data_list)
2341
-
2342
- # Convert the date column to a Date type
2343
- weather["day"] = pd.to_datetime(weather["day"])
2344
-
2345
- # Replace None values
2346
- weather["max_temp_f"].replace("None", 0, inplace=True)
2347
- weather["min_temp_f"].replace("None", 0, inplace=True)
2348
- weather["precip_in"].replace("None", 0, inplace=True)
2349
-
2350
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2351
-
2352
- # Estimate mean temperature
2353
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2354
-
2355
- # Convert Fahrenheit to Celsius for max_temp_f
2356
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2357
-
2358
- # Convert Fahrenheit to Celsius for min_temp_f
2359
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2360
-
2361
- # Convert Fahrenheit to Celsius for mean_temp_f
2362
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2363
-
2364
- # Determine the starting chosen day for each date
2365
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2366
-
2367
- # Group by week_starting and summarize
2368
- numeric_columns = weather.select_dtypes(include='number').columns
2369
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2370
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2371
- "min_temp_f": "avg_min_temp_f",
2372
- "mean_temp_f": "avg_mean_temp_f",
2373
- "max_temp_c": "avg_max_temp_c",
2374
- "min_temp_c": "avg_min_temp_c",
2375
- "mean_temp_c": "avg_mean_temp_c",
2376
- "precip_in": "avg_mean_perc"}, inplace=True)
2377
-
2378
- else:
2379
- # Build the Iowa Environmental Mesonet daily-data request URL for the chosen weather stations and read it into a DataFrame
2380
- station_query = ''.join(stations)
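- # (station_query is assumed to already contain URL query fragments such as '&stations=...',
- # built from the stations list earlier in this function)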
2381
-
2382
- raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
2383
- station_query,
2384
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2385
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2386
- raw_weather = urllib.request.urlopen(raw_weather_list)
2387
- raw_weather = pd.read_csv(raw_weather)
2388
-
2389
- raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
2390
-
2391
- # Replace the occurrences of "None" with 0
2392
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2393
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2394
- raw_weather["precip_in"].replace("None", 0, inplace=True)
2395
-
2396
- # The non-weather columns were dropped above; continue processing on the cleaned data
2397
- weather = raw_weather
2398
-
2399
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2400
-
2401
- # Estimate mean temperature
2402
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2403
-
2404
- # Convert Fahrenheit to Celsius for max_temp_f
2405
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2406
-
2407
- # Convert Fahrenheit to Celsius for min_temp_f
2408
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2409
-
2410
- # Convert Fahrenheit to Celsius for mean_temp_f
2411
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2412
-
2413
- # Aggregate the data to the chosen week-commencing day, taking the average of the data
2414
- # Convert the date column to a Date type
2415
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2416
-
2417
- # Determine the starting chosen day for each date
2418
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2419
-
2420
- # Group by week_starting and summarize
2421
- numeric_columns = weather.select_dtypes(include='number').columns
2422
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2423
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2424
- "min_temp_f": "avg_min_temp_f",
2425
- "mean_temp_f": "avg_mean_temp_f",
2426
- "max_temp_c": "avg_max_temp_c",
2427
- "min_temp_c": "avg_min_temp_c",
2428
- "mean_temp_c": "avg_mean_temp_c",
2429
- "precip_in": "avg_mean_perc"}, inplace=True)
2430
-
2431
- # Rainfall
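- # For each supported network a fixed set of major cities is geocoded with Nominatim and daily
- # precipitation totals are fetched from the Open-Meteo archive API, then averaged to the same
- # week-commencing grain as the temperature data above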
2432
- if country == "GB__ASOS":
2433
- # Define cities and date range
2434
- cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
2435
-
2436
- start_date = formatted_date
2437
- end_date = today.strftime("%Y-%m-%d")
2438
-
2439
- # Initialize an empty list to store the weather data for each city
2440
- weather_data_list = []
2441
-
2442
- # Loop through each city and fetch weather data
2443
- for city in cities:
2444
- # Initialize Nominatim API
2445
- geolocator = Nominatim(user_agent="MyApp")
2446
- location = geolocator.geocode(city)
2447
- url = "https://archive-api.open-meteo.com/v1/archive"
2448
-
2449
- params = {
2450
- "latitude": location.latitude,
2451
- "longitude": location.longitude,
2452
- "start_date": start_date,
2453
- "end_date": end_date,
2454
- "daily": "precipitation_sum",
2455
- "timezone": "auto"
2456
- }
2457
-
2458
- response = requests.get(url, params=params)
2459
- response_data = response.json()
2460
-
2461
- daily_data = response_data["daily"]["precipitation_sum"]
2462
- dates = response_data["daily"]["time"]
2463
-
2464
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2465
- data["city"] = city
2466
-
2467
- weather_data_list.append(data)
2468
-
2469
- # Combine all city data into a single data frame
2470
- all_weather_data = pd.concat(weather_data_list)
2471
-
2472
- # Convert the date column to a Date type
2473
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2474
-
2475
- # Set up the week-commencing column
2476
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2477
-
2478
- # Group by week_starting and summarize
2479
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
2480
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2481
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2482
-
2483
- # Change index to datetime
2484
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2485
-
2486
- elif country == "AU__ASOS":
2487
-
2488
- # Define cities and date range
2489
- cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
2490
-
2491
- start_date = formatted_date
2492
- end_date = today.strftime("%Y-%m-%d")
2493
-
2494
- # Initialize an empty list to store the weather data for each city
2495
- weather_data_list = []
2496
-
2497
- # Loop through each city and fetch weather data
2498
- for city in cities:
2499
- # Initialize Nominatim API
2500
- geolocator = Nominatim(user_agent="MyApp")
2501
- location = geolocator.geocode(city)
2502
- url = "https://archive-api.open-meteo.com/v1/archive"
2503
-
2504
- params = {
2505
- "latitude": location.latitude,
2506
- "longitude": location.longitude,
2507
- "start_date": start_date,
2508
- "end_date": end_date,
2509
- "daily": "precipitation_sum",
2510
- "timezone": "auto"
2511
- }
2512
-
2513
- response = requests.get(url, params=params)
2514
- response_data = response.json()
2515
-
2516
- daily_data = response_data["daily"]["precipitation_sum"]
2517
- dates = response_data["daily"]["time"]
2518
-
2519
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2520
- data["city"] = city
2521
-
2522
- weather_data_list.append(data)
2523
-
2524
- # Combine all city data into a single data frame
2525
- all_weather_data = pd.concat(weather_data_list)
2526
-
2527
- # Convert the date column to a Date type
2528
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2529
-
2530
- # Set up the week-commencing column
2531
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2532
-
2533
- # Group by week_starting and summarize
2534
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
2535
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2536
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2537
-
2538
- # Change index to datetime
2539
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2540
-
2541
- elif country == "DE__ASOS":
2542
-
2543
- # Define cities and date range
2544
- cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
2545
-
2546
- start_date = formatted_date
2547
- end_date = today.strftime("%Y-%m-%d")
2548
-
2549
- # Initialize an empty list to store the weather data for each city
2550
- weather_data_list = []
2551
-
2552
- # Loop through each city and fetch weather data
2553
- for city in cities:
2554
- # Initialize Nominatim API
2555
- geolocator = Nominatim(user_agent="MyApp")
2556
- location = geolocator.geocode(city)
2557
- url = "https://archive-api.open-meteo.com/v1/archive"
2558
-
2559
- params = {
2560
- "latitude": location.latitude,
2561
- "longitude": location.longitude,
2562
- "start_date": start_date,
2563
- "end_date": end_date,
2564
- "daily": "precipitation_sum",
2565
- "timezone": "auto"
2566
- }
2567
-
2568
- response = requests.get(url, params=params)
2569
- response_data = response.json()
2570
-
2571
- daily_data = response_data["daily"]["precipitation_sum"]
2572
- dates = response_data["daily"]["time"]
2573
-
2574
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2575
- data["city"] = city
2576
-
2577
- weather_data_list.append(data)
2578
-
2579
- # Combine all city data into a single data frame
2580
- all_weather_data = pd.concat(weather_data_list)
2581
-
2582
- # Convert the date column to a Date type
2583
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2584
-
2585
- # Set up the week-commencing column
2586
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2587
-
2588
- # Group by week_starting and summarize
2589
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
2590
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2591
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2592
-
2593
- # Change index to datetime
2594
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2595
-
2596
- elif country == "FR__ASOS":
2597
-
2598
- # Define cities and date range
2599
- cities = ["Paris"]
2600
-
2601
- start_date = formatted_date
2602
- end_date = today.strftime("%Y-%m-%d")
2603
-
2604
- # Initialize an empty list to store the weather data for each city
2605
- weather_data_list = []
2606
-
2607
- # Loop through each city and fetch weather data
2608
- for city in cities:
2609
- # Initialize Nominatim API
2610
- geolocator = Nominatim(user_agent="MyApp")
2611
- location = geolocator.geocode(city)
2612
- url = "https://archive-api.open-meteo.com/v1/archive"
2613
-
2614
- params = {
2615
- "latitude": location.latitude,
2616
- "longitude": location.longitude,
2617
- "start_date": start_date,
2618
- "end_date": end_date,
2619
- "daily": "precipitation_sum",
2620
- "timezone": "auto"
2621
- }
2622
-
2623
- response = requests.get(url, params=params)
2624
- response_data = response.json()
2625
-
2626
- daily_data = response_data["daily"]["precipitation_sum"]
2627
- dates = response_data["daily"]["time"]
2628
-
2629
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2630
- data["city"] = city
2631
-
2632
- weather_data_list.append(data)
2633
-
2634
- # Combine all city data into a single data frame
2635
- all_weather_data = pd.concat(weather_data_list)
2636
-
2637
- # Convert the date column to a Date type
2638
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2639
-
2640
- # Set up the week-commencing column
2641
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2642
-
2643
- # Group by week_starting and summarize
2644
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
2645
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2646
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2647
-
2648
- # Change index to datetime
2649
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2650
-
2651
- elif country == "ZA__ASOS":
2652
- cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
2653
- start_date = formatted_date
2654
- end_date = today.strftime("%Y-%m-%d")
2655
-
2656
- weather_data_list = []
2657
-
2658
- for city in cities:
2659
- geolocator = Nominatim(user_agent="MyApp")
2660
- location = geolocator.geocode(city)
2661
- url = "https://archive-api.open-meteo.com/v1/archive"
2662
-
2663
- params = {
2664
- "latitude": location.latitude,
2665
- "longitude": location.longitude,
2666
- "start_date": start_date,
2667
- "end_date": end_date,
2668
- "daily": "precipitation_sum",
2669
- "timezone": "auto"
2670
- }
2671
-
2672
- response = requests.get(url, params=params)
2673
- response_data = response.json()
2674
-
2675
- daily_data = response_data["daily"]["precipitation_sum"]
2676
- dates = response_data["daily"]["time"]
2677
-
2678
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2679
- data["city"] = city
2680
-
2681
- weather_data_list.append(data)
2682
-
2683
- # Combine all city data into a single data frame
2684
- all_weather_data = pd.concat(weather_data_list)
2685
-
2686
- # Convert the date column to a Date type
2687
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2688
-
2689
- # Set up the week-commencing column
2690
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2691
-
2692
- # Group by week_starting and summarize
2693
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
2694
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2695
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2696
-
2697
- # Change index to datetime
2698
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2699
-
2700
- # Merge the dataframes
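- # Rainfall is only collected for the networks listed below, so weekly_avg_rain only exists in those cases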
2701
- if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
2702
- merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
2703
- else:
2704
- merged_df = weekly_avg_temp
2705
-
2706
- merged_df.reset_index(drop=False, inplace=True)
2707
- merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)
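- # rename_cols (defined elsewhere in this package) is assumed to prefix the data columns with 'seas_'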
2708
-
2709
- final_weather = ims_proc.rename_cols(merged_df, 'seas_')
2710
-
2711
- return final_weather