imsciences-0.9.5.9-py3-none-any.whl → imsciences-0.9.6.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/pull.py +14 -11
- {imsciences-0.9.5.9.dist-info → imsciences-0.9.6.2.dist-info}/METADATA +1 -1
- imsciences-0.9.6.2.dist-info/RECORD +11 -0
- dataprocessing/__init__.py +0 -1
- dataprocessing/data-processing-functions.py +0 -2
- dataprocessing/datafunctions.py +0 -2
- imsciences/datafunctions-IMS-24Ltp-3.py +0 -2711
- imsciences/datafunctions.py +0 -3351
- imsciences/datapull.py +0 -374
- imsciences-0.9.5.9.dist-info/PKG-INFO-IMS-24Ltp-3 +0 -24
- imsciences-0.9.5.9.dist-info/RECORD +0 -22
- imsciencesdataprocessing/__init__.py +0 -1
- imsciencesdataprocessing/datafunctions.py +0 -2
- imsdataprocessing/__init__.py +0 -1
- imsdataprocessing/datafunctions.py +0 -2
- {imsciences-0.9.5.9.dist-info → imsciences-0.9.6.2.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.5.9.dist-info → imsciences-0.9.6.2.dist-info}/WHEEL +0 -0
- {imsciences-0.9.5.9.dist-info → imsciences-0.9.6.2.dist-info}/top_level.txt +0 -0
@@ -1,2711 +0,0 @@
-import pandas as pd
-import calendar
-import os
-import plotly.express as px
-import plotly.graph_objs as go
-import numpy as np
-import datetime
-import re
-import pandas as pd
-from fredapi import Fred
-import time
-from datetime import datetime,timedelta
-from cif import cif
-from io import StringIO
-import urllib
-import requests_cache
-import urllib.request
-import requests
-from geopy.geocoders import Nominatim
-import subprocess
-import json
-
-class dataprocessing:
-
-    def help(self):
-        print("This is the help section. The functions in the package are as follows:")
-
-        print("\n1. get_wd_levels")
-        print(" - Description: Get the working directory with the option of moving up parents.")
-        print(" - Usage: get_wd_levels(levels)")
-        print(" - Example: get_wd_levels(0)")
-
-        print("\n2. remove_rows")
-        print(" - Description: Removes a specified number of rows from a pandas DataFrame.")
-        print(" - Usage: remove_rows(data_frame, num_rows_to_remove)")
-        print(" - Example: remove_rows(df, 2)")
-
-        print("\n3. aggregate_daily_to_wc_long")
-        print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
-        print(" - Usage: aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation='sum')")
-        print(" - Example: aggregate_daily_to_wc_long(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average')")
-
-        print("\n4. convert_monthly_to_daily")
-        print(" - Description: Converts monthly data in a DataFrame to daily data by expanding and dividing the numeric values.")
-        print(" - Usage: convert_monthly_to_daily(df, date_column, divide)")
-        print(" - Example: convert_monthly_to_daily(df, 'date')")
-
-        print("\n5. plot_two")
-        print(" - Description: Plots specified columns from two different DataFrames using a shared date column. Useful for comparing data.")
-        print(" - Usage: plot_two(df1, col1, df2, col2, date_column, same_axis=True)")
-        print(" - Example: plot_two(df1, 'cost', df2, 'cost', 'obs', True)")
-
-        print("\n6. remove_nan_rows")
-        print(" - Description: Removes rows from a DataFrame where the specified column has NaN values.")
-        print(" - Usage: remove_nan_rows(df, col_to_remove_rows)")
-        print(" - Example: remove_nan_rows(df, 'date')")
-
-        print("\n7. filter_rows")
-        print(" - Description: Filters the DataFrame based on whether the values in a specified column are in a provided list.")
-        print(" - Usage: filter_rows(df, col_to_filter, list_of_filters)")
-        print(" - Example: filter_rows(df, 'country', ['UK', 'IE'])")
-
-        print("\n8. plot_one")
-        print(" - Description: Plots a specified column from a DataFrame.")
-        print(" - Usage: plot_one(df1, col1, date_column)")
-        print(" - Example: plot_one(df, 'Spend', 'OBS')")
-
-        print("\n9. week_of_year_mapping")
-        print(" - Description: Converts a week column in 'yyyy-Www' or 'yyyy-ww' format to week commencing date.")
-        print(" - Usage: week_of_year_mapping(df, week_col, start_day_str)")
-        print(" - Example: week_of_year_mapping(df, 'week', 'mon')")
-
-        print("\n10. exclude_rows")
-        print(" - Description: Removes rows from a DataFrame based on whether the values in a specified column are not in a provided list.")
-        print(" - Usage: exclude_rows(df, col_to_filter, list_of_filters)")
-        print(" - Example: exclude_rows(df, 'week', ['2022-W20', '2022-W21'])")
-
-        print("\n11. rename_cols")
-        print(" - Description: Renames columns in a pandas DataFrame.")
-        print(" - Usage: rename_cols(df, name)")
-        print(" - Example: rename_cols(df, 'ame_facebook'")
-
-        print("\n12. merge_new_and_old")
-        print(" - Description: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.")
-        print(" - Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.")
-        print(" - Usage: merge_new_and_old(old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS')")
-        print(" - Example: merge_new_and_old(df1, 'old_col', df2, 'new_col', '2023-01-15')")
-
-        print("\n13. merge_dataframes_on_date")
-        print(" - Description: Merge a list of DataFrames on a common column.")
-        print(" - Usage: merge_dataframes_on_date(dataframes, common_column='OBS', merge_how='outer')")
-        print(" - Example: merge_dataframes_on_date([df1, df2, df3], common_column='OBS', merge_how='outer')")
-
-        print("\n14. merge_and_update_dfs")
-        print(" - Description: Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available, and returns a dataframe sorted by the key column.")
-        print(" - Usage: merge_and_update_dfs(df1, df2, key_column)")
-        print(" - Example: merged_dataframe = merge_and_update_dfs(processed_facebook, finalised_meta, 'OBS')")
-
-        print("\n15. convert_us_to_uk_dates")
-        print(" - Description: Convert a DataFrame column with mixed date formats to datetime.")
-        print(" - Usage: convert_us_to_uk_dates(df, date_col)")
-        print(" - Example: convert_us_to_uk_dates(df, 'date')")
-
-        print("\n16. combine_sheets")
-        print(" - Description: Combines multiple DataFrames from a dictionary into a single DataFrame.")
-        print(" - Usage: combine_sheets(all_sheets)")
-        print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
-
-        print("\n17. pivot_table")
-        print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
-        print(" - Usage: pivot_table(df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
-        print(" - Example: pivot_table(df, {'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, 'OBS', 'Channel Short Names', 'Value', fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True)")
-
-        print("\n18. apply_lookup_table_for_columns")
-        print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
-        print(" - Usage: classify_within_column(df, col_names, to_find_dict, if_not_in_country_dict='Other'), new_column_name='Mapping'")
-        print(" - Example: classify_within_column(df, ['campaign type','media type'], {'France Paid Social FB|paid social': 'facebook','France Paid Social TW|paid social': 'twitter'}, 'other','mapping')")
-
-        print("\n19. aggregate_daily_to_wc_wide")
-        print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
-        print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)")
-        print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)")
-
-        print("\n20. merge_cols_with_seperator")
-        print(" - Description: Merge multiple columns in a dataframe into 1 column with a seperator '_'.Can be used if multiple columns are needed for a LUT.")
-        print(" - Usage: merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = 'Merged',starting_prefix_str=None,ending_prefix_str=None)")
-        print(" - Example: merge_cols_with_seperator(df, ['Campaign','Product'],seperator='|','Merged Columns',starting_prefix_str='start_',ending_prefix_str='_end')")
-
-        print("\n21. check_sum_of_df_cols_are_equal")
-        print(" - Description: Checks if the sum of two columns in two dataframes are the same, and provides the sums of each column and the difference between them.")
-        print(" - Usage: check_sum_of_df_cols_are_equal(df_1,df_2,cols_1,cols_2)")
-        print(" - Example: check_sum_of_df_cols_are_equal(df_1,df_2,'Media Cost','Spend')")
-
-        print("\n22. convert_2_df_cols_to_dict")
-        print(" - Description: Can be used to create an LUT. Creates a dictionary using two columns in a dataframe.")
-        print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
-        print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
-
-        print("\n23. create_FY_and_H_columns")
-        print(" - Description: Used to create a financial year, half year, and financial half year column.")
-        print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY,short_format='No',half_years='No',combined_FY_and_H='No')")
-        print(" - Example: create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023',short_format='Yes',half_years='Yes',combined_FY_and_H='Yes')")
-
-        print("\n24. keyword_lookup_replacement")
-        print(" - Description: Essentially provides an if statement with a xlookup if a value is something. Updates certain chosen values in a specified column of the DataFrame based on a lookup dictionary.")
-        print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name='Updated Column')")
-        print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel','segment','product'], qlik_dict_for_channel,output_column_name='Channel New')")
-
-        print("\n25. create_new_version_of_col_using_LUT")
-        print(" - Description: Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table. The lookup is based on a column in the dataframe.")
-        print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col,value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
-        print(" - Example: keyword_lookup_replacement(df, '*Campaign Name','Campaign Type',search_campaign_name_retag_lut,'Campaign Name New')")
-
-        print("\n26. convert_df_wide_2_long")
-        print(" - Description: Changes a dataframe from wide to long format.")
-        print(" - Usage: convert_df_wide_2_long(df,value_cols,variable_col_name='Stacked',value_col_name='Value')")
-        print(" - Example: keyword_lookup_replacement(df, ['Media Cost','Impressions','Clicks'],variable_col_name='Metric')")
-
-        print("\n27. manually_edit_data")
-        print(" - Description: Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe.")
-        print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
-        print(" - Example: keyword_lookup_replacement(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''},'Master Include',1,change_in_existing_df_col = 'Yes',new_col_to_change_name = 'Master Include',manual_edit_col_name = 'Manual Changes')")
-
-        print("\n28. format_numbers_with_commas")
-        print(" - Description: Converts data in numerical format into numbers with commas and a chosen decimal place length.")
-        print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
-        print(" - Example: format_numbers_with_commas(df,1)")
-
-        print("\n29. filter_df_on_multiple_conditions")
-        print(" - Description: Filters dataframe on multiple conditions, which come in the form of a dictionary.")
-        print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
-        print(" - Example: filter_df_on_multiple_conditions(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''})")
-
-        print("\n30. read_and_concatenate_files")
-        print(" - Description: Read and Concatinate all files of one type in a folder.")
-        print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
-        print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
-
-        print("\n31. remove zero values")
-        print(" - Description: Remove zero values in a specified column.")
-        print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
-        print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
-
-        print("\n32. upgrade all packages")
-        print(" - Description: Upgrades all packages.")
-        print(" - Usage: upgrade_outdated_packages()")
-        print(" - Example: upgrade_outdated_packages()")
-
-        print("\n33. Convert Mixed Formats Dates")
-        print(" - Description: Convert a mix of US and UK dates to datetime.")
-        print(" - Usage: convert_mixed_formats_dates(df, datecol)")
-        print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
-
-        print("\n34. Fill Weekly Missing Dates")
-        print(" - Description: Fill in any missing weeks with 0.")
-        print(" - Usage: fill_weekly_date_range(self, df, date_column, freq)")
-        print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
-
-    def get_wd_levels(self, levels):
-        """
-        Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
-
-        Parameters:
-        - data_frame: pandas DataFrame
-            The input data frame.
-        - num_rows_to_remove: int
-            The number of levels to move up pathways.
-
-        Returns:
-        - Current wd
-        """
-
-        directory = os.getcwd()
-        for _ in range(levels):
-            directory = os.path.dirname(directory)
-        return directory
-
-    def remove_rows(self, data_frame, num_rows_to_remove):
-        """
-        Removes the specified number of rows from the given data frame, including the top row containing column names.
-        The next row will be treated as the new set of column headings.
-
-        Parameters:
-        - data_frame: pandas DataFrame
-            The input data frame.
-        - num_rows_to_remove: int
-            The number of rows to remove from the data frame, starting from the original header.
-
-        Returns:
-        - pandas DataFrames
-            The modified data frame with rows removed and new column headings.
-
-        Raises:
-        - TypeError: If num_rows_to_remove is not an integer.
-        - ValueError: If num_rows_to_remove is negative or exceeds the total number of rows.
-        """
-
-        if not isinstance(num_rows_to_remove, int):
-            raise TypeError("num_rows_to_remove must be an integer")
-
-        if num_rows_to_remove < 0 or num_rows_to_remove >= len(data_frame):
-            raise ValueError("Number of rows to remove must be non-negative and less than the total number of rows in the data frame.")
-
-        if num_rows_to_remove == 0:
-            return data_frame
-
-        new_header = data_frame.iloc[num_rows_to_remove - 1]
-        modified_data_frame = data_frame[num_rows_to_remove:]
-        modified_data_frame.columns = new_header
-
-        return modified_data_frame
-
-    def aggregate_daily_to_wc_long(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum') -> pd.DataFrame:
-        """
-        Aggregates daily data into weekly data, starting on a specified day of the week,
-        and groups the data by additional specified columns. It aggregates specified numeric columns
-        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
-        The day column is renamed from 'Day' to 'OBS'.
-
-        Parameters:
-        - df: pandas DataFrame
-            The input DataFrame containing daily data.
-        - date_column: string
-            The name of the column in the DataFrame that contains date information.
-        - group_columns: list of strings
-            Additional column names to group by along with the weekly grouping.
-        - sum_columns: list of strings
-            Numeric column names to be aggregated during aggregation.
-        - wc: string
-            The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
-        - aggregation: string, optional (default 'sum')
-            Aggregation method, either 'sum', 'average', or 'count'.
-
-        Returns:
-        - pandas DataFrame
-            A new DataFrame with weekly aggregated data. The index is reset,
-            and columns represent the grouped and aggregated metrics. The DataFrame
-            is in long format, with separate columns for each combination of
-            grouped metrics.
-        """
-
-        # Map the input week commencing day to a weekday number (0=Monday, 6=Sunday)
-        days = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
-        if wc.lower() not in days:
-            return print(f"Incorrect week commencing day input: '{wc}'. Please choose a valid day of the week (e.g., 'sun', 'mon', etc.).")
-
-        start_day = days[wc.lower()]
-
-        # Make a copy of the DataFrame
-        df_copy = df.copy()
-
-        # Convert the date column to datetime
-        df_copy[date_column] = pd.to_datetime(df_copy[date_column])
-
-        # Determine the start of each week
-        df_copy['week_start'] = df_copy[date_column].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - start_day) % 7))
-
-        # Convert sum_columns to numeric and fill NaNs with 0, retaining decimal values
-        for col in sum_columns:
-            df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
-
-        # Group by the new week start column and additional columns, then aggregate the numeric columns
-        if aggregation == 'average':
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].mean().reset_index()
-        elif aggregation == 'count':
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].count().reset_index()
-        else:  # Default to 'sum' if any other value is provided
-            grouped = df_copy.groupby(['week_start'] + group_columns)[sum_columns].sum().reset_index()
-
-        # Rename 'week_start' column to 'OBS'
-        grouped = grouped.rename(columns={'week_start': 'OBS'})
-
-        return grouped
-
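Illustrative note (not part of the published diff): a minimal sketch of how the removed aggregate_daily_to_wc_long above could be called, assuming the dataprocessing class defined in this file is instantiated directly; the column names and values are hypothetical.

import pandas as pd

ims = dataprocessing()
daily = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=14, freq='D'),  # two full Mon-Sun weeks
    'platform': ['meta'] * 14,
    'cost': [10.0] * 14,
})
weekly = ims.aggregate_daily_to_wc_long(daily, 'date', ['platform'], ['cost'], wc='mon')
# Expected result: one row per week commencing Monday, with cost summed to 70.0 per week.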
-    def convert_monthly_to_daily(self, df, date_column, divide = True):
-        """
-        Convert a DataFrame with monthly data to daily data.
-        This function takes a DataFrame and a date column, then it expands each
-        monthly record into daily records by dividing the numeric values by the number of days in that month.
-
-        :param df: DataFrame with monthly data.
-        :param date_column: The name of the column containing the date.
-        :param divide: boolean divide by the number of days in a month (default True)
-        :return: A new DataFrame with daily data.
-        """
-
-        # Convert date_column to datetime
-        df[date_column] = pd.to_datetime(df[date_column])
-
-        # Initialize an empty list to hold the daily records
-        daily_records = []
-
-        # Iterate over each row in the DataFrame
-        for _, row in df.iterrows():
-            # Calculate the number of days in the month
-            num_days = calendar.monthrange(row[date_column].year, row[date_column].month)[1]
-
-            # Create a new record for each day of the month
-            for day in range(1, num_days + 1):
-                daily_row = row.copy()
-                daily_row[date_column] = row[date_column].replace(day=day)
-
-                # Divide each numeric value by the number of days in the month
-                for col in df.columns:
-                    if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
-                        if divide == True:
-                            daily_row[col] = row[col] / num_days
-                        else:
-                            daily_row[col] = row[col]
-                daily_records.append(daily_row)
-
-        # Convert the list of daily records into a DataFrame
-        daily_df = pd.DataFrame(daily_records)
-
-        return daily_df
-
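Illustrative note (not part of the diff): a hypothetical call to the removed convert_monthly_to_daily above, continuing the earlier sketch (pandas as pd, class instantiated directly); the 'spend' column is an assumption.

monthly = pd.DataFrame({'date': ['2024-01-01', '2024-02-01'], 'spend': [310.0, 290.0]})
daily = dataprocessing().convert_monthly_to_daily(monthly, 'date')
# January expands to 31 rows of 10.0 each; February 2024 (29 days) to 29 rows of 10.0,
# because divide=True splits each monthly value evenly across the days of that month.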
-    def plot_two(self, df1, col1, df2, col2, date_column, same_axis=True):
-        """
-        Plots specified columns from two different dataframes with both different and the same lengths,
-        using a specified date column as the X-axis, and charting on either the same or separate y axes.
-
-        :param df1: First DataFrame
-        :param col1: Column name from the first DataFrame
-        :param df2: Second DataFrame
-        :param col2: Column name from the second DataFrame
-        :param date_column: The name of the date column to use for the X-axis
-        :param same_axis: If True, plot both traces on the same y-axis; otherwise, use separate y-axes.
-        :return: Plotly figure
-        """
-        # Ensure date columns are datetime
-        df1[date_column] = pd.to_datetime(df1[date_column])
-        df2[date_column] = pd.to_datetime(df2[date_column])
-
-        # Create traces for the first and second dataframes
-        trace1 = go.Scatter(x=df1[date_column], y=df1[col1], mode='lines', name=col1, yaxis='y1')
-
-        if same_axis:
-            trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y1')
-        else:
-            trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y2')
-
-        # Define layout for the plot
-        layout = go.Layout(
-            title="",
-            xaxis=dict(title="OBS", showline=True, linecolor='black'),
-            yaxis=dict(title="", showline=True, linecolor='black', rangemode='tozero'),
-            yaxis2=dict(title="", overlaying='y', side='right', showline=True, linecolor='black', rangemode='tozero'),
-            showlegend=True,
-            plot_bgcolor='white'  # Set the plot background color to white
-        )
-
-        # Create the figure with the defined layout and traces
-        fig = go.Figure(data=[trace1, trace2], layout=layout)
-
-        return fig
-
-    def remove_nan_rows(self, df, col_to_remove_rows):
-        # This line drops rows where the specified column has NaN values
-        return df.dropna(subset=[col_to_remove_rows])
-
-    def filter_rows(self, df, col_to_filter, list_of_filters):
-        # This line filters the DataFrame based on whether the values in the specified column are in the list_of_filters
-        return df[df[col_to_filter].isin(list_of_filters)]
-
-    def plot_one(self, df1, col1, date_column):
-        """
-        Plots specified column from a DataFrame with white background and black axes,
-        using a specified date column as the X-axis.
-
-        :param df1: DataFrame
-        :param col1: Column name from the DataFrame
-        :param date_column: The name of the date column to use for the X-axis
-        """
-
-        # Check if columns exist in the DataFrame
-        if col1 not in df1.columns or date_column not in df1.columns:
-            raise ValueError("Column not found in DataFrame")
-
-        # Check if the date column is in datetime format, if not convert it
-        if not pd.api.types.is_datetime64_any_dtype(df1[date_column]):
-            df1[date_column] = pd.to_datetime(df1[date_column])
-
-        # Plotting using Plotly Express
-        fig = px.line(df1, x=date_column, y=col1)
-
-        # Update layout for white background and black axes lines, and setting y-axis to start at 0
-        fig.update_layout(
-            plot_bgcolor='white',
-            xaxis=dict(
-                showline=True,
-                linecolor='black'
-            ),
-            yaxis=dict(
-                showline=True,
-                linecolor='black',
-                rangemode='tozero'  # Setting Y-axis to start at 0 if suitable
-            )
-        )
-
-        return fig
-
-    def week_of_year_mapping(self,df, week_col, start_day_str):
-
-        # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
-        day_mapping = {
-            'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
-        }
-
-        # Convert the day string to a number, or raise an error if not valid
-        start_day = day_mapping.get(start_day_str.lower())
-        if start_day is None:
-            raise ValueError(f"Invalid day input: '{start_day_str}'. Please use one of 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'.")
-
-        # Function to convert week number to start date of the week
-        def week_to_startdate(week_str, start_day):
-            year, week = map(int, week_str.split('-W'))
-            first_day_of_year = datetime(year, 1, 1)
-            first_weekday_of_year = first_day_of_year.weekday()  # Monday is 0 and Sunday is 6
-
-            # Calculate days to adjust to the desired start day of the week
-            days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
-            start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)
-
-            # Calculate the start of the desired week
-            start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
-            return start_of_week
-
-        # Apply the function to each row in the specified week column
-        df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
-        return df
-
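Illustrative note (not part of the diff): a sketch of the removed week_of_year_mapping above, with a hypothetical 'week' column in the 'yyyy-Www' format the function expects.

weeks = pd.DataFrame({'week': ['2024-W01', '2024-W02']})
mapped = dataprocessing().week_of_year_mapping(weeks, 'week', 'mon')
# Adds an 'OBS' column holding each week's computed Monday start date as a 'dd/mm/YYYY' string.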
-    def exclude_rows(self, df, col_to_filter, list_of_filters):
-        # This line filters the DataFrame based on whether the values in the specified column are not in the list_of_filters
-        return df[~df[col_to_filter].isin(list_of_filters)]
-
-    def rename_cols(self, df, name = 'ame_'):
-        new_columns = {}
-        for col in df.columns:
-            if col != 'OBS':
-                new_col_name = name + col.replace(" ", "_").lower()
-            else:
-                new_col_name = col
-            new_columns[col] = new_col_name
-        return df.rename(columns=new_columns)
-
-    def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
-        """
-        Creates a new DataFrame with two columns: one for dates and one for merged numeric values.
-        Merges numeric values from specified columns in the old and new DataFrames based on a given cutoff date.
-
-        Parameters:
-        - old_df: pandas DataFrame
-            The old DataFrame from which to take the numeric values up to the specified date.
-        - old_col: str
-            The name of the numeric column in the old DataFrame whose values are to be taken.
-        - new_df: pandas DataFrame
-            The new DataFrame from which to take the numeric values from the specified date onwards.
-        - new_col: str
-            The name of the numeric column in the new DataFrame whose values are to be taken.
-        - cutoff_date: str
-            The cut-off date in 'YYYY-MM-DD' format to split the data between the two DataFrames.
-        - date_col_name: str, optional (default 'OBS')
-            The name of the date column in both DataFrames.
-
-        Returns:
-        - pandas DataFrame
-            A new DataFrame with two columns: 'Date' and a column named after 'new_col' containing merged numeric values.
-        """
-
-        # Convert date columns in both dataframes to datetime for comparison
-        old_df[date_col_name] = pd.to_datetime(old_df[date_col_name])
-        new_df[date_col_name] = pd.to_datetime(new_df[date_col_name])
-
-        # Convert the cutoff date string to datetime
-        cutoff_date = pd.to_datetime(cutoff_date)
-
-        # Split old and new dataframes based on the cutoff date
-        old_values = old_df[old_df[date_col_name] <= cutoff_date]
-        new_values = new_df[new_df[date_col_name] > cutoff_date]
-
-        # Create a new DataFrame with two columns: 'Date' and a column named after 'new_col'
-        merged_df = pd.DataFrame({
-            'OBS': pd.concat([old_values[date_col_name], new_values[date_col_name]], ignore_index=True),
-            new_col: pd.concat([old_values[old_col], new_values[new_col]], ignore_index=True)
-        })
-
-        return merged_df
-
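Illustrative note (not part of the diff): a hypothetical splice using the removed merge_new_and_old above; the spend_v1/spend_v2 column names are assumptions for the sketch.

old = pd.DataFrame({'OBS': pd.date_range('2023-01-01', periods=4, freq='W'), 'spend_v1': [1, 2, 3, 4]})
new = pd.DataFrame({'OBS': pd.date_range('2023-01-01', periods=4, freq='W'), 'spend_v2': [10, 20, 30, 40]})
spliced = dataprocessing().merge_new_and_old(old, 'spend_v1', new, 'spend_v2', '2023-01-15')
# Rows dated on or before the cutoff keep the spend_v1 values; later rows take spend_v2.
# The result has just two columns: 'OBS' and 'spend_v2'.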
-    def merge_dataframes_on_column(self, dataframes, common_column='OBS', merge_how='outer'):
-        """
-        Merge a list of DataFrames on a common column.
-
-        Parameters:
-        - dataframes: A list of DataFrames to merge.
-        - common_column: The name of the common column to merge on.
-        - merge_how: The type of merge to perform ('inner', 'outer', 'left', or 'right').
-
-        Returns:
-        - A merged DataFrame.
-        """
-        if not dataframes:
-            return None
-
-        merged_df = dataframes[0]  # Start with the first DataFrame
-
-        for df in dataframes[1:]:
-            merged_df = pd.merge(merged_df, df, on=common_column, how=merge_how)
-
-        # Check if the common column is of datetime dtype
-        if merged_df[common_column].dtype == 'datetime64[ns]':
-            merged_df[common_column] = pd.to_datetime(merged_df[common_column])
-            merged_df = merged_df.sort_values(by=common_column)
-        merged_df = merged_df.fillna(0)
-
-        return merged_df
-
-    def merge_and_update_dfs(self, df1, df2, key_column):
-        """
-        Merges two dataframes on a key column, updates the first dataframe's columns with the second's where available,
-        and returns a dataframe sorted by the key column.
-
-        Parameters:
-        df1 (DataFrame): The first dataframe to merge (e.g., processed_facebook).
-        df2 (DataFrame): The second dataframe to merge (e.g., finalised_meta).
-        key_column (str): The name of the column to merge and sort by (e.g., 'OBS').
-
-        Returns:
-        DataFrame: The merged and updated dataframe.
-        """
-
-        # Sort both DataFrames by the key column
-        df1_sorted = df1.sort_values(by=key_column)
-        df2_sorted = df2.sort_values(by=key_column)
-
-        # Perform the full outer merge
-        merged_df = pd.merge(df1_sorted, df2_sorted, on=key_column, how='outer', suffixes=('', '_finalised'))
-
-        # Update with non-null values from df2
-        for column in merged_df.columns:
-            if column.endswith('_finalised'):
-                original_column = column.replace('_finalised', '')
-                merged_df.loc[merged_df[column].notnull(), original_column] = merged_df.loc[merged_df[column].notnull(), column]
-                merged_df.drop(column, axis=1, inplace=True)
-
-        # Sort the merged DataFrame by the key column
-        merged_df.sort_values(by=key_column, inplace=True)
-
-        # Handle null values (optional, can be adjusted as needed)
-        merged_df.fillna(0, inplace=True)
-
-        return merged_df
-
-    def convert_us_to_uk_dates(self, df, date_col):
-        """
-        Processes the date column of a DataFrame to remove hyphens and slashes,
-        and converts it to a datetime object.
-
-        Parameters:
-        df (pd.DataFrame): The DataFrame containing the date column.
-        date_col (str): The name of the date column.
-
-        Returns:
-        pd.DataFrame: The DataFrame with the processed date column.
-        """
-        df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
-        df[date_col] = pd.to_datetime(
-            df[date_col].str.slice(0, 2) + '/' +
-            df[date_col].str.slice(2, 4) + '/' +
-            df[date_col].str.slice(4, 8),
-            format='%m/%d/%Y'
-        )
-        return df
-
-    def combine_sheets(self, all_sheets):
-        """
-        Combines multiple DataFrames from a dictionary into a single DataFrame.
-        Adds a column 'SheetName' indicating the origin sheet of each row.
-
-        Parameters:
-        all_sheets (dict): A dictionary of DataFrames, typically read from an Excel file with multiple sheets.
-
-        Returns:
-        DataFrame: A concatenated DataFrame with an additional 'SheetName' column.
-        """
-        combined_df = pd.DataFrame()
-
-        for sheet_name, df in all_sheets.items():
-            df['SheetName'] = sheet_name
-            combined_df = pd.concat([combined_df, df], ignore_index=True)
-
-        return combined_df
-
-    def pivot_table(self, df, filters_dict, index_col, columns, values_col, fill_value=0,aggfunc='sum',margins=False,margins_name="Total",datetime_trans_needed=True):
-        """
-        Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
-
-        Args:
-        df (pandas.DataFrame): The DataFrame containing the data.
-        filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
-        index_col (str): Name of Column for your pivot table to index on
-        columns (str): Name of Columns for your pivot table.
-        values_col (str): Name of Values Columns for your pivot table.
-        fill_value (int, optional): The value to replace nan with. Defaults to 0.
-        aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
-        margins (bool, optional): Whether the pivot table needs a total rows and column. Defaults to False.
-        margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
-        datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to False.
-
-        Returns:
-        pandas.DataFrame: The pivot table specified
-        """
-
-        # Create the filtered df by applying the conditions
-        df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
-
-        # Ensure OBS is in datetime format for proper sorting
-        df_filtered = df_filtered.copy()
-
-        # If datetime transformation is needed
-        if datetime_trans_needed is True:
-            df_filtered.loc[:,index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
-
-        # Create the pivot table
-        pivoted_df = df_filtered.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc=aggfunc,margins=margins,margins_name=margins_name)
-
-        # Handling MultiIndex columns if present, making them a flat structure
-        if isinstance(pivoted_df.columns, pd.MultiIndex):
-            pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
-        else:
-            pivoted_df.columns = pivoted_df.columns.map(str)
-
-        # Reset the pivot before returning
-        pivoted_df = pivoted_df.reset_index()
-
-        # Sort by OBS from oldest to newest
-        if datetime_trans_needed is True:
-            # pivoted_df = pivoted_df.reset_index()
-            pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col])  # Ensure sorting works correctly
-            pivoted_df = pivoted_df.sort_values(by=index_col)
-
-            # Convert OBS back to a string in YYYY-MM-DD format for display purposes
-            pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
-
-        # Set index back to date column
-        # pivoted_df.set_index(index_col,inplace=True)
-
-        # Fill in any NaNs
-        pivoted_df = pivoted_df.fillna(fill_value)
-
-        return pivoted_df
-
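Illustrative note (not part of the diff): a sketch of the removed pivot_table wrapper above. It delegates filtering to filter_df_on_multiple_conditions, whose definition is not shown in this diff, so the filter-string format below is copied from the help() text and should be treated as an assumption; df stands in for a long-format DataFrame with the named columns.

filters = {'Master Include': ' == 1'}
pivoted = dataprocessing().pivot_table(df, filters, 'OBS', 'Channel Short Names', 'Value',
                                       fill_value=0, aggfunc='sum', datetime_trans_needed=True)
# Filters df, pivots 'Value' by 'Channel Short Names' with one row per 'OBS' date,
# sorts by date, and returns the dates as 'YYYY-MM-DD' strings.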
-    def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
-        """
-        Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
-
-        Parameters:
-        df (pandas.DataFrame): The DataFrame containing the data.
-        col_names (list of str): these are the columns which are used for the lookup. One column or several columns can be inputted as a list, provided there is a merged column to lookup on. If there are multiple columns to look up on then a merged column must be inputted as the key of the dictionary of format e.g. col1|col2|col3
-        to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
-        if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
-        new_column_name (str, optional): name of new column. Defaults to "Mapping".
-
-        Returns:
-        pandas.DataFrame: DataFrame with a new column containing the look up table results.
-        """
-
-        # Create regex pattern from the dictionary keys
-        regex_pattern = "|".join(re.escape(key) for key in to_find_dict.keys())
-
-        # Preprocess DataFrame if multiple columns
-        if len(col_names) > 1:
-            df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
-            col_to_use = "Merged"
-        else:
-            col_to_use = col_names[0]
-
-        # Extract the first match using the regex pattern
-        matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
-
-        # Map the matches to the corresponding values in the dictionary
-        df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
-
-        # Drop intermediate column if created
-        if len(col_names) > 1:
-            df.drop(columns=["Merged"], inplace=True)
-
-        return df
-
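Illustrative note (not part of the diff): a minimal sketch of the removed apply_lookup_table_for_columns above; the campaign names and the lookup dictionary are hypothetical.

campaigns = pd.DataFrame({'campaign': ['UK_Search_Brand', 'UK_Social_FB', 'DE_Display']})
lut = {'search': 'Paid Search', 'social': 'Paid Social'}
tagged = dataprocessing().apply_lookup_table_for_columns(campaigns, ['campaign'], lut,
                                                         if_not_in_dict='Other', new_column_name='Channel')
# Substring matching is case-insensitive: 'UK_Search_Brand' -> 'Paid Search',
# 'UK_Social_FB' -> 'Paid Social', and 'DE_Display' falls back to 'Other'.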
-    def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
-        """
-        Aggregates daily data into weekly data, starting on a specified day of the week,
-        and groups the data by additional specified columns. It aggregates specified numeric columns
-        by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
-        of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
-        The day column is renamed from 'Day' to 'OBS'.
-
-        Parameters:
-        - df: pandas DataFrame
-            The input DataFrame containing daily data.
-        - date_column: string
-            The name of the column in the DataFrame that contains date information.
-        - group_columns: list of strings
-            Additional column names to group by along with the weekly grouping.
-        - sum_columns: list of strings
-            Numeric column names to be aggregated during aggregation.
-        - wc: string
-            The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
-        - aggregation: string, optional (default 'sum')
-            Aggregation method, either 'sum', 'average', or 'count'.
-        - include_totals: boolean, optional (default False)
-            If True, include total columns for each sum_column.
-
-        Returns:
-        - pandas DataFrame
-            A new DataFrame with weekly aggregated data. The index is reset,
-            and columns represent the grouped and aggregated metrics. The DataFrame
-            is in wide format, with separate columns for each combination of
-            grouped metrics.
-        """
-
-        grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
-
-        # Pivot the data to wide format
-        if group_columns:
-            wide_df = grouped.pivot_table(index='OBS',
-                                          columns=group_columns,
-                                          values=sum_columns,
-                                          aggfunc='first')
-            # Flatten the multi-level column index and create combined column names
-            wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
-        else:
-            wide_df = grouped.set_index('OBS')
-
-        # Fill NaN values with 0
-        wide_df = wide_df.fillna(0)
-
-        # Adding total columns for each unique sum_column, if include_totals is True
-        if include_totals:
-            for col in sum_columns:
-                total_column_name = f'Total {col}'
-                if group_columns:
-                    columns_to_sum = [column for column in wide_df.columns if col in column]
-                else:
-                    columns_to_sum = [col]
-                wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
-
-        # Reset the index of the final DataFrame
-        wide_df = wide_df.reset_index()
-
-        return wide_df
-
-    def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
-        """
-        Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up
-
-        Parameters:
-        df (pandas.DataFrame): Dataframe to make changes to.
-        col_names (list): list of columm names ot merge.
-        seperator (str, optional): Name of column outputted. Defaults to "_".
-        output_column_name (str, optional): Name of column outputted. Defaults to "Merged".
-        starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
-        ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
-
-        Raises:
-        ValueError: if more less than two column names are inputted in the list there is nothing to merge on
-
-        Returns:
-        pandas.DataFrame: DataFrame with additional merged column
-        """
-        # Specify more than one column must be entered
-        if len(col_names) < 2:
-            raise ValueError("2 or more columns must be specified to merge")
-
-        # Create a new column with the merged columns
-        df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
-
-        # Add string before
-        if starting_prefix_str is not None:
-            df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
-
-        # Add string after
-        if ending_prefix_str is not None:
-            df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
-
-        return df
-
-    def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
-        """
-        Checks the sum of two different dataframe column or columns are equal
-
-        Parameters:
-        df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
-        df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
-        cols_1 (list of str): Columns from first dataframe to sum.
-        cols_2 (list of str): Columns from second dataframe to sum.
-
-        Returns:
-        Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
-        """
-        # Find the sum of both sets of columns
-        df_1_sum = df_1[cols_1].sum().sum()
-        df_2_sum = df_2[cols_2].sum().sum()
-
-        # If the the two columns are
-        if df_1_sum == df_2_sum:
-            Answer = "They are equal"
-        if df_1_sum != df_2_sum:
-            Answer = "They are different by " + str(df_2_sum-df_1_sum)
-
-        return Answer,df_1_sum,df_2_sum
-
-    def convert_2_df_cols_to_dict(self, df, key_col, value_col):
-        """
-        Create a dictionary mapping from two columns of a DataFrame.
-
-        Parameters:
-        df (pd.DataFrame): The DataFrame containing the data.
-        key_col (str): The column name to use as keys in the dictionary.
-        value_col (str): The column name to use as values in the dictionary.
-
-        Returns:
-        dict: A dictionary with keys from 'key_col' and values from 'value_col'.
-        """
-        if key_col not in df or value_col not in df:
-            raise ValueError("Specified columns are not in the DataFrame")
-
-        return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
-
-    def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
-        """
-        Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
-
-        Parameters:
-        df (pandas.DataFrame): Dataframe to operate on.
-        index_col (str): Name of the column to use for datetime
-        start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
-        starting_FY (str): String used to specify which FY the start date refers to, needs to be formatted LONG e.g. FY2021
-        short_format (str, optional): String used to specify if short format is desired (e.g. FY21) or if long format is desired (e.g. FY2021). Defaults to "No".
-        half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
-        combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".
-
-        Returns:
-        pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
-        """
-
-        try:
-            start_date = datetime.strptime(start_date, '%Y-%m-%d')
-        except ValueError:
-            print("Error: Date must be of format yyyy-mm-dd")
-            return df
-
-        df["OBS"] = pd.to_datetime(df[index_col])
-        df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
-
-        df[index_col] = pd.to_datetime(df[index_col])
-
-        start_year = int(starting_FY[2:])
-
-        def calculate_FY_vectorized(date_series):
-            years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
-            fy = 'FY' + (start_year + years_since_start).astype(str)
-            if short_format == "Yes":
-                fy = 'FY' + fy.str[-2:]
-            return fy
-
-        df['FY'] = calculate_FY_vectorized(df[index_col])
-
-        if half_years == "Yes" or combined_FY_and_H == "Yes":
-            def calculate_half_year_vectorized(date_series):
-                fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
-                fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
-                fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
-                half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
-                return half_year
-
-            df['Half Years'] = calculate_half_year_vectorized(df[index_col])
-
-        if combined_FY_and_H == "Yes":
-            df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
-
-        return df
-
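Illustrative note (not part of the diff): a sketch of the removed create_FY_and_H_columns above using the defaults (FY column only); the weekly index column is hypothetical.

weekly = pd.DataFrame({'Week (M-S)': pd.date_range('2022-10-03', periods=60, freq='W-MON')})
with_fy = dataprocessing().create_FY_and_H_columns(weekly, 'Week (M-S)', '2022-10-03', 'FY2023')
# Weeks within roughly 364 days of 2022-10-03 are tagged 'FY2023', after which the label rolls to 'FY2024'.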
-    def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name="Updated Column"):
-        """
-        This function updates values in a specified column of the DataFrame based on a lookup dictionary.
-        It first merges several columns into a new 'Merged' column, then uses this merged column to determine
-        if replacements are needed based on the dictionary.
-
-        Parameters:
-        df (pd.DataFrame): The DataFrame to process.
-        col (str): The name of the column whose values are potentially replaced.
-        replacement_rows (str): The specific value in 'col' to check for replacements.
-        cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
-        replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
-        output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
-
-        Returns:
-        pd.DataFrame: The modified DataFrame with updated values in the specified column.
-        """
-        df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
-
-        def replace_values(x):
-            if x[col] == replacement_rows:
-                merged_value = x['Merged']
-                if merged_value in replacement_lookup_dict:
-                    return replacement_lookup_dict[merged_value]
-            return x[col]
-
-        df[output_column_name] = df.apply(replace_values, axis=1)
-
-        return df
-
-    def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
-        """
-        Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
-        The lookup is based on a column in the dataframe. Can only input one column and output one new column.
-
-        Parameters:
-        df (pandas.DataFrame): The DataFrame containing the data.
-        keys_col (str): The name of the column which the LUT will be refercing to ouput a value.
-        value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
-        dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
-        new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
-
-        Returns:
-        pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
-        """
-
-        # Extract columns to change using new dictionary
-        smaller_df = df[[keys_col,value_col]]
-
-        # Use the new dictionary to create a new LUT
-        smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
-
-        # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
-        smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
-
-        # Drop the extra unecessary cols
-        smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
-
-        # # Output dataframes as dictionary to be used in a LUT
-        new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
-
-        # # Use new dictionary to create a new version of an old column
-        df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
-
-        return df_final
-
-    def convert_df_wide_2_long(self, df,value_cols,variable_col_name='Stacked',value_col_name='Value'):
-        """
-        Changes a dataframe from wide to long format.
-
-        Args:
-        df (pandas.DataFrame): The DataFrame containing the data.
-        value_cols (list of str or str if only one): list of column names which are to be transformed from several columns into one.
-        variable_col_name (str, optional): Name of new variables column, which contains the names of the columns which have been stacked into one. Defaults to 'Stacked'.
-        value_col_name (str, optional): Name of the new value column which contains all the data from the stacked columns. Defaults to 'Value'.
-
-        Returns:
-        pandas.DataFrame:: Returns dataframe transformed from long to wide.
-
-        Raises:
-        ValueError: If number of column names to be depivoted is less than 2, then this function is not neccesary.
-        """
-
-        # Check length of value cols is greater than 1
-        if len(value_cols) < 2:
-            raise ValueError("Number of inputs in list must be greater than 1")
-
-        # Find the columns that are not to be depivoted into one column
-        id_vars = list(set(df.columns.tolist()) - set(value_cols))
-
-        # Melt all columns chosen into one column
-        df_final = pd.melt(df, id_vars,value_cols,var_name=variable_col_name,value_name=value_col_name)
-
-        return df_final
-
1019
|
-
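A minimal pandas sketch of the same melt operation, with made-up column names, in case the wide-to-long terminology is unfamiliar:

```python
import pandas as pd

wide = pd.DataFrame({
    "OBS": ["2024-01-01", "2024-01-08"],
    "tv_spend": [100, 120],
    "radio_spend": [40, 35],
})

# id_vars are left untouched; the two spend columns are stacked into one pair of columns.
long = pd.melt(wide, id_vars=["OBS"], value_vars=["tv_spend", "radio_spend"],
               var_name="Stacked", value_name="Value")
print(long)
```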
    def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
        """
        Allows any cell in a dataframe to be updated manually by applying filters and choosing a column to edit.

        Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        filters_dict (dict): Dictionary of conditions for the boolean mask, i.e. what to filter the df on to reach the chosen cell.
        col_to_change (str): Name of the column to edit.
        new_value (any): The new value for the cell.
        change_in_existing_df_col (str, optional): "Yes" or "No"; whether to make the change in an existing column. Defaults to "No".
        new_col_to_change_name (str, optional): Name of the new column to copy the edited column into and to make the change in. Defaults to 'New'.
        manual_edit_col_name (str, optional): Name of the current manual edits column; if one is not specified it will be created. Defaults to None.
        add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
        existing_note_col_name (str, optional): If there is an existing notes column, it can be specified here. Defaults to None.
        note (str, optional): The note to be added to the column. Defaults to None.

        Raises:
        TypeError: The column to change must be specified as a single string, not a list.
        ValueError: Only "Yes" or "No" are valid inputs for whether to make the change in an existing column.
        ValueError: Only "Yes" or "No" are valid inputs for whether to make a new notes column.

        Returns:
        pandas.DataFrame: Dataframe with manual changes added.
        """
        # Raise a type error if more than one col is supplied
        if isinstance(col_to_change, list):
            raise TypeError("Col to change must be specified as a string, not a list")

        # Raise a value error if the input is invalid for change_in_existing_df_col
        if change_in_existing_df_col not in ["Yes", "No"]:
            raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")

        # Raise a value error if the input is invalid for add_notes
        if add_notes not in ["Yes", "No"]:
            raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")

        # Create the filtered df by applying the conditions
        df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)

        # Create a new column to add the changes to if desired, else edit the chosen column in place
        col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
        if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
            df = df.copy()
            df[new_col_to_change_name] = df[col_to_change]

        # Update the chosen cell in the chosen column
        df.loc[df_filtered.index, col_to_update] = new_value

        # Add a manual edit column if desired, or use the one specified
        if manual_edit_col_name:
            if manual_edit_col_name not in df.columns:
                df[manual_edit_col_name] = 0
            df.loc[df_filtered.index, manual_edit_col_name] = 1
        elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
            df['Manual Changes'] = 0
            df.loc[df_filtered.index, 'Manual Changes'] = 1

        # Add a note if desired, in a new column or an existing column
        if add_notes == "Yes":
            note_col = existing_note_col_name if existing_note_col_name else 'Notes'
            if note_col not in df.columns:
                df[note_col] = None
            df.loc[df_filtered.index, note_col] = note

        return df

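The mechanics behind this method are just a boolean mask plus `DataFrame.loc` assignment, with extra columns recording that an edit happened. A small hand-rolled sketch (hypothetical column names and values, not the package API):

```python
import pandas as pd

df = pd.DataFrame({"Channel": ["TV", "Radio", "TV"], "Spend": [100.0, 40.0, 90.0]})

# Build the mask from the filter conditions, then write the new value and flag the edit.
mask = (df["Channel"] == "TV") & (df["Spend"] < 95)
df.loc[mask, "Spend"] = 95.0
df["Manual Changes"] = 0
df.loc[mask, "Manual Changes"] = 1
df.loc[mask, "Notes"] = "Floor applied to low TV spend"
print(df)
```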
    def format_numbers_with_commas(self, df, decimal_length_chosen=2):
        """
        Converts numerical data into strings formatted with thousands commas and a chosen number of decimal places.

        Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        decimal_length_chosen (int, optional): Number of decimal places to keep. Defaults to 2.

        Returns:
        pandas.DataFrame: The dataframe with the chosen updated format.
        """
        def format_number_with_commas(x, decimal_length=decimal_length_chosen):
            if isinstance(x, (int, float)):
                if decimal_length is not None:
                    formatted_number = "{:,.{}f}".format(x, decimal_length)
                else:
                    formatted_number = "{:,}".format(x)
                return formatted_number
            else:
                return x  # Return unchanged if not a number

        # Apply the function across all columns using applymap()
        formatted_df = df.applymap(format_number_with_commas)

        return formatted_df

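For reference, the same comma-and-decimals formatting can be done inline with a lambda; the frame below is made up and only illustrates the format string used above:

```python
import pandas as pd

df = pd.DataFrame({"impressions": [1234567, 89012], "cost": [1234.5678, 99.1]})

# Thousands separators plus two decimal places, leaving non-numeric cells untouched.
formatted = df.applymap(lambda x: "{:,.2f}".format(x) if isinstance(x, (int, float)) else x)
print(formatted)
```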
    def filter_df_on_multiple_conditions(self, df, filters_dict):
        """
        Filter a dataframe based on multiple conditions.

        Args:
        df (pandas.DataFrame): Dataframe to filter on.
        filters_dict (dict): Dictionary mapping column names to condition strings of the form "<operator> <value>", e.g. {"Spend": ">= 10"}.

        Returns:
        pandas.DataFrame: Filtered DataFrame.
        """
        mask = pd.Series(True, index=df.index)
        for col, cond in filters_dict.items():
            cond = cond.strip()
            operator, value = cond.split(maxsplit=1)

            # If the value is a string condition, strip any surrounding quotes and whitespace
            if "'" in value:
                value = value.strip().strip("'\"")
            # If not a string, e.g. a datetime or number condition, evaluate the string into a value
            else:
                value = eval(value)

            if operator == "==":
                temp_mask = (df[col] == value)
            elif operator == "!=":
                temp_mask = (df[col] != value)
            elif operator == ">=":
                temp_mask = (df[col] >= value)
            elif operator == "<=":
                temp_mask = (df[col] <= value)
            elif operator == ">":
                temp_mask = (df[col] > value)
            elif operator == "<":
                temp_mask = (df[col] < value)
            mask &= temp_mask

        # Create the filtered df by applying the conditions
        df_filtered = df[mask]

        return df_filtered

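The parser above just ANDs one comparison per column. A hand-built equivalent for a made-up filter dictionary, to show what the condition strings resolve to:

```python
import pandas as pd

df = pd.DataFrame({"Channel": ["TV", "Radio", "TV"], "Spend": [100, 40, 90]})

# The same "operator value" convention the parser expects:
filters_dict = {"Channel": "== 'TV'", "Spend": ">= 95"}

# Equivalent hand-built mask, one condition per column, combined with &.
mask = (df["Channel"] == "TV") & (df["Spend"] >= 95)
print(df[mask])
```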
    def read_and_concatenate_files(self, folder_path, file_type='csv'):
        """
        Reads all files of a specified type (CSV or XLSX) from a given folder
        and concatenates them into a single DataFrame.

        Parameters:
        folder_path (str): The path to the folder containing the files.
        file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.

        Returns:
        pd.DataFrame: A DataFrame containing the concatenated data from all files.
        """

        # Initialize an empty list to hold dataframes
        dataframes = []

        # Define file extension based on file_type
        if file_type == 'csv':
            extension = '.csv'
        elif file_type == 'xlsx':
            extension = '.xlsx'
        else:
            raise ValueError("file_type must be either 'csv' or 'xlsx'")

        # Loop through all files in the folder
        for filename in os.listdir(folder_path):
            # Check if the file has the correct extension
            if filename.endswith(extension):
                file_path = os.path.join(folder_path, filename)
                # Read the file into a DataFrame
                if file_type == 'csv':
                    df = pd.read_csv(file_path)
                elif file_type == 'xlsx':
                    df = pd.read_excel(file_path)
                # Append the DataFrame to the list
                dataframes.append(df)

        # Concatenate all DataFrames into a single DataFrame
        combined_df = pd.concat(dataframes, ignore_index=True)

        return combined_df

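A self-contained sketch of the same read-then-concatenate pattern, using a temporary folder so it runs without any pre-existing files:

```python
import os
import tempfile
import pandas as pd

# Write two small CSVs into a temporary folder, then read and stack them.
folder = tempfile.mkdtemp()
pd.DataFrame({"a": [1, 2]}).to_csv(os.path.join(folder, "one.csv"), index=False)
pd.DataFrame({"a": [3]}).to_csv(os.path.join(folder, "two.csv"), index=False)

frames = [pd.read_csv(os.path.join(folder, f)) for f in os.listdir(folder) if f.endswith(".csv")]
combined = pd.concat(frames, ignore_index=True)
print(combined)
```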
    def remove_zero_values(self, data_frame, column_to_filter):
        """
        Removes rows with zero values in the given column.

        Parameters:
        data_frame - input data frame
        column_to_filter - the column to filter zero values out of

        Returns:
        Pandas data frame without the zero-value rows
        """

        # Keep only the rows where the chosen column is not zero
        return data_frame.loc[~(data_frame[column_to_filter] == 0)]

    def upgrade_outdated_packages(self):
        try:
            # Get all installed packages
            installed_packages_result = subprocess.run("pip list --format=json", shell=True, capture_output=True, text=True)
            installed_packages = json.loads(installed_packages_result.stdout)

            # Get the list of outdated packages
            outdated_packages_result = subprocess.run("pip list --outdated --format=json", shell=True, capture_output=True, text=True)
            outdated_packages = json.loads(outdated_packages_result.stdout)

            # Create a set of outdated package names for quick lookup
            outdated_package_names = {pkg['name'] for pkg in outdated_packages}

            # Upgrade only outdated packages
            for package in installed_packages:
                package_name = package['name']
                if package_name in outdated_package_names:
                    try:
                        print(f"Upgrading package: {package_name}")
                        upgrade_result = subprocess.run(f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True)
                        if upgrade_result.returncode == 0:
                            print(f"Successfully upgraded {package_name}")
                        else:
                            print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
                    except Exception as e:
                        print(f"An error occurred while upgrading {package_name}: {e}")
                else:
                    print(f"{package_name} is already up to date")
        except Exception as e:
            print(f"An error occurred during the upgrade process: {e}")

    def convert_mixed_formats_dates(self, df, column_name):
        # Convert initial dates to datetime with coercion to handle errors
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
        df[column_name] = df[column_name].astype(str)
        corrected_dates = []

        for date_str in df[column_name]:
            date_str = date_str.replace('-', '').replace('/', '')
            if len(date_str) == 8:
                year = date_str[:4]
                month = date_str[4:6]
                day = date_str[6:8]
                if int(day) <= 12:
                    # Swap month and day
                    corrected_date_str = f"{year}-{day}-{month}"
                else:
                    corrected_date_str = f"{year}-{month}-{day}"
                # Convert to datetime
                corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
            else:
                corrected_date = pd.to_datetime(date_str, errors='coerce')

            corrected_dates.append(corrected_date)

        # Check the length of the corrected_dates list
        if len(corrected_dates) != len(df):
            raise ValueError("Length of corrected_dates does not match the original DataFrame")

        # Assign the corrected dates back to the DataFrame
        df[column_name] = corrected_dates
        return df

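Related pandas behaviour, for context: `errors='coerce'` turns unparseable strings into `NaT` instead of raising, and `dayfirst` controls how ambiguous day/month strings are read. The values below are made up; this is not the method above, just the per-element parsing idiom it builds on.

```python
import pandas as pd

dates = pd.Series(["20230407", "07/04/2023", "not a date"])

# Parse one value at a time with coercion, mirroring the per-row loop above:
# unparseable strings become NaT rather than raising.
parsed = dates.apply(lambda s: pd.to_datetime(s, errors="coerce", dayfirst=True))
print(parsed)
```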
    def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
        # Ensure the date column is in datetime format
        df[date_column] = pd.to_datetime(df[date_column])

        # Generate the full date range with the specified frequency
        full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)

        # Create a new dataframe with the full date range
        full_date_df = pd.DataFrame({date_column: full_date_range})

        # Merge the original dataframe with the new full date range dataframe
        df_full = full_date_df.merge(df, on=date_column, how='left')

        # Fill missing values with 0
        df_full.fillna(0, inplace=True)

        return df_full

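A compact, runnable sketch of the same gap-filling pattern (made-up data): build the complete weekly index, left-join the observations onto it, and fill the weeks that had no rows with 0.

```python
import pandas as pd

df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-01", "2024-01-15"]), "sales": [10, 7]})

full_range = pd.date_range(df["OBS"].min(), df["OBS"].max(), freq="W-MON")
full = pd.DataFrame({"OBS": full_range}).merge(df, on="OBS", how="left").fillna(0)
print(full)  # 2024-01-08 appears with sales 0
```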

########################################################################################################################################
########################################################################################################################################

ims_proc = dataprocessing()

class datapull:

    def help(self):
        print("This is the help section. The functions in the package are as follows:")

        print("\n1. pull_fred_data")
        print(" - Description: Get data from FRED by using series id tokens.")
        print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
        print(" - Example: pull_fred_data('mon', ['GPDIC1', 'Y057RX1Q020SBEA', 'GCEC1', 'ND000333Q', 'Y006RX1Q020SBEA'])")

        print("\n2. pull_boe_data")
        print(" - Description: Fetch and process Bank of England interest rate data.")
        print(" - Usage: pull_boe_data(week_commencing)")
        print(" - Example: pull_boe_data('mon')")

        print("\n3. pull_ons_data")
        print(" - Description: Fetch and process time series data from the ONS API.")
        print(" - Usage: pull_ons_data(series_list, week_commencing)")
        print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")

        print("\n4. pull_macro")
        print(" - Description: Fetch macroeconomic data from OECD and other sources for a specified country.")
        print(" - Usage: pull_macro(country='GBR', week_commencing='mon')")
        print(" - Example: pull_macro('GBR', 'mon')")

        print("\n5. get_google_mobility_data")
        print(" - Description: Fetch Google Mobility data for the specified country.")
        print(" - Usage: get_google_mobility_data(country, wc)")
        print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")

        print("\n6. pull_combined_dummies")
        print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
        print(" - Usage: pull_combined_dummies(week_commencing)")
        print(" - Example: pull_combined_dummies('mon')")

        print("\n7. pull_weather")
        print(" - Description: Fetch and process historical weather data for the specified country.")
        print(" - Usage: pull_weather(week_commencing, country)")
        print(" - Example: pull_weather('mon', 'GBR')")

    ############################################################### MACRO ##########################################################################

    def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]) -> pd.DataFrame:
        '''
        Parameters
        ----------
        week_commencing : str
            specify the day for the week commencing, the default is 'mon' (options: 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')

        series_id_list : list[str]
            provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
            ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1", "ND000333Q", "Y006RX1Q020SBEA"]

        Returns
        ----------
        pd.DataFrame
            Returns a data frame with FRED data according to the series IDs provided

        Example
        ----------
        pull_fred_data("mon", ["GCEC1", "SP500"])
        '''
        # Fred API
        fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')

        # Fetch the metadata for each series to get the full names
        series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}

        # Download data from series id list
        fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}

        # Data processing
        date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
        fred_series_df = pd.DataFrame(date_range)

        for series_id, series_data in fred_series.items():
            series_data = series_data.reset_index()
            series_data.columns = ['OBS', series_names[series_id]]  # Use the series name as the column header
            fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')

        # Handle duplicate columns
        for col in fred_series_df.columns:
            if '_x' in col:
                base_col = col.replace('_x', '')
                fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
                fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)

        # Ensure sum_columns are present in the DataFrame
        sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]

        # Aggregate results by week
        fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
                                                            date_column="OBS",
                                                            group_columns=[],
                                                            sum_columns=sum_columns,
                                                            wc=week_commencing,
                                                            aggregation="average")

        # Remove anything after the first ':' in the column names and rename, except for 'OBS'
        fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]

        return fred_df_final

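For a quick, standalone feel of the `fredapi` calls used above (a personal API key and network access are required; the key below is a placeholder, and the simple Sunday-ending weekly resample is only an approximation of the package's own week-commencing aggregation):

```python
from fredapi import Fred

fred = Fred(api_key="YOUR_FRED_API_KEY")           # placeholder key
series = fred.get_series("GCEC1")                  # pandas Series indexed by observation date
title = fred.get_series_info("GCEC1").title        # human-readable name used for the column header

weekly = series.resample("W").ffill()              # carry the latest value forward into each week
print(title)
print(weekly.tail())
```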
    def pull_boe_data(self, week_commencing="mon", max_retries=30, delay=5):
        """
        Fetch and process Bank of England interest rate data.

        Args:
            week_commencing (str): The starting day of the week for aggregation.
                                   Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
                                   Default is "mon".
            max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 30.
            delay (int): Delay in seconds between retry attempts. Default is 5.

        Returns:
            pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
                          The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
                          and 'macro_boe_intr_rate' contains the average interest rate for the week.
        """
        # Week commencing dictionary
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Function to fetch the data with retries
        def fetch_data_with_retries(url, max_retries, delay):
            for attempt in range(max_retries):
                try:
                    html_table = pd.read_html(url)[0]
                    return html_table
                except Exception as e:
                    print(f"Attempt {attempt + 1} failed: {e}")
                    if attempt < max_retries - 1:
                        time.sleep(delay)
                    else:
                        raise

        # Import HTML data from the Bank of England rate page
        url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
        html_table = fetch_data_with_retries(url, max_retries, delay)

        df = pd.DataFrame(html_table)
        df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)

        # Change the date column to datetime and sort chronologically
        df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
        df.sort_values("OBS", axis=0, inplace=True)

        # Create a daily date range and find the week commencing for each day
        date_range = pd.date_range(df["OBS"].iloc[0], datetime.today(), freq="d")
        df_daily = pd.DataFrame(date_range, columns=["OBS"])

        # Adjust each date to the specified week commencing day
        df_daily['Week_Commencing'] = df_daily["OBS"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

        # Left merge the daily date range onto the BoE dataframe and forward fill the blanks
        df_final = df_daily.merge(df, on='OBS', how="left")
        df_final["macro_boe_intr_rate"].ffill(inplace=True)

        # Group by the week start date and get the mean of the interest rates for each week
        df_final = df_final.groupby('Week_Commencing')['macro_boe_intr_rate'].mean().reset_index()

        df_final['Week_Commencing'] = df_final['Week_Commencing'].dt.strftime('%d/%m/%Y')
        df_final.rename(columns={'Week_Commencing': 'OBS'}, inplace=True)

        return df_final

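The core pattern here is scrape, reindex to daily, forward-fill the step changes, then average per week. A minimal sketch against the same Bank Rate URL (needs network access and an HTML parser such as lxml installed; the plain weekly resample is an approximation, not the method's exact week-commencing logic):

```python
import pandas as pd

url = "https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp"
rates = pd.read_html(url)[0].rename(columns={"Date Changed": "OBS", "Rate": "rate"})
rates["OBS"] = pd.to_datetime(rates["OBS"], format="%d %b %y")
rates = rates.sort_values("OBS").set_index("OBS")

# Reindex to daily, forward-fill the step changes, then average per calendar week.
daily = rates.reindex(pd.date_range(rates.index.min(), pd.Timestamp.today(), freq="D")).ffill()
weekly = daily.resample("W").mean()
print(weekly.tail())
```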
    def pull_ons_data(self, series_list, week_commencing):
        """
        Fetch and process time series data from the ONS API.

        Args:
            series_list (list): A list of dictionaries where each dictionary represents a time series.
                                Each dictionary should have the keys 'series_id' and 'dataset_id'.
            week_commencing (str): The starting day of the week for aggregation.
                                   Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".

        Returns:
            pd.DataFrame: A DataFrame with weekly aggregated ONS data. The 'OBS' column contains the week
                          commencing dates and other columns contain the aggregated time series values.
        """

        def parse_quarter(date_str):
            """Parses a string in 'YYYY Q#' format into a datetime object."""
            year, quarter = date_str.split(' ')
            quarter_number = int(quarter[1])
            month = (quarter_number - 1) * 3 + 1
            return pd.Timestamp(f"{year}-{month:02d}-01")

        # Generate a date range from 1950-01-01 to today
        date_range = pd.date_range(start="1950-01-01", end=datetime.today(), freq='D')
        daily_df = pd.DataFrame(date_range, columns=['OBS'])

        # Keep track of the renamed value columns
        value_columns = []

        for series in series_list:
            series_id = series['series_id']
            dataset_id = series['dataset_id']

            # Construct the URL for data
            data_url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"

            # Make the request to the ONS API for data
            data_response = requests.get(data_url)

            # Check if the request was successful
            if data_response.status_code != 200:
                print(f"Failed to fetch data for series {series_id}: {data_response.status_code} {data_response.text}")
                continue

            # Parse the JSON response for data
            data = data_response.json()

            # Attempt to extract the name of the time series from the data response
            series_name = data.get('description', {}).get('title', 'Value')

            # Determine the most granular time series data available
            if 'months' in data and data['months']:
                time_series_data = data['months']
            elif 'quarters' in data and data['quarters']:
                time_series_data = data['quarters']
            elif 'years' in data and data['years']:
                time_series_data = data['years']
            else:
                print("No time series data found in the response")
                continue

            # Create a DataFrame from the time series data
            df = pd.DataFrame(time_series_data)

            # Handle different frequencies in the data
            if 'date' in df.columns:
                if any(df['date'].str.contains('Q')):
                    df['date'] = df['date'].apply(parse_quarter)
                else:
                    df['date'] = pd.to_datetime(df['date'])

            df = df.rename(columns={'date': 'OBS', 'value': series_name})

            # Rename the value column
            new_col_name = 'macro_' + series_name.lower().replace(':', '').replace(' ', '_').replace('-', '_')
            df = df.rename(columns={series_name: new_col_name})

            # Track the renamed value column
            value_columns.append(new_col_name)

            # Merge the data based on the observation date
            daily_df = pd.merge_asof(daily_df, df[['OBS', new_col_name]], on='OBS', direction='backward')

        # Ensure columns are numeric
        for col in value_columns:
            if col in daily_df.columns:
                daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
            else:
                print(f"Column {col} not found in daily_df")

        # Aggregate results by week
        ons_df_final = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
                                                           date_column="OBS",
                                                           group_columns=[],
                                                           sum_columns=value_columns,
                                                           wc=week_commencing,
                                                           aggregation="average")

        return ons_df_final

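To see the shape of the JSON this method parses, a bare fetch against the same endpoint pattern can help (network access required; the series/dataset pair below is just the example from help(), and the response structure is assumed to match what the method reads: a description.title field plus monthly records with date and value):

```python
import pandas as pd
import requests

url = "https://api.ons.gov.uk/timeseries/LMSBSA/dataset/LMS/data"
payload = requests.get(url, timeout=30).json()

title = payload.get("description", {}).get("title", "Value")
obs = pd.DataFrame(payload.get("months", []))   # empty frame if only quarterly/annual data exist
print(title)
print(obs[["date", "value"]].head() if not obs.empty else "no monthly observations")
```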
    def pull_macro(self, country: str = "GBR", week_commencing: str = "mon"):
        # Change country input to list
        countries_list = [country]

        # Check if the data wants to be inputted at any other week commencing date
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Two useful functions for quarterly data
        # Define a function to get quarterly data
        def get_quarter(p_date: datetime.date) -> int:
            return (p_date.month - 1) // 3 + 1

        # Define a function to get the last day of the quarter
        def get_last_day_of_the_quarter(p_date: datetime.date):
            quarter = get_quarter(p_date)
            return datetime(p_date.year + 3 * quarter // 12, 3 * quarter % 12 + 1, 1) + pd.Timedelta(days=-1)

        # For the monthly data
        data_M, subjects_M, measures_M = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
                                                                     subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
                                                                              'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
                                                                              'PRINTO01'],
                                                                     measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
                                                                     frequency='M', startDate='2015-01')
        data_M = data_M.stack(level=[0, -1, -2]).reset_index()

        # For the quarterly data
        data_Q, subjects_Q, measures_Q = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
                                                                     subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
                                                                              'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
                                                                              'PRINTO01'],
                                                                     measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
                                                                     frequency='Q', startDate='2015-01')
        data_Q = data_Q.stack(level=[0, -1, -2]).reset_index()

        # Create a data frame dictionary to store your monthly data frames
        DataFrameDict_M = {elem: pd.DataFrame() for elem in countries_list}
        for key in DataFrameDict_M.keys():
            DataFrameDict_M[key] = data_M[:][data_M.country == key]

        # Create a data frame dictionary to store your quarterly data frames
        DataFrameDict_Q = {elem: pd.DataFrame() for elem in countries_list}
        for key in DataFrameDict_Q.keys():
            DataFrameDict_Q[key] = data_Q[:][data_Q.country == key]

        # Create a monthly list of the dataframes to iterate through
        countries_df_list_M = []
        for i in countries_list:
            df = pd.DataFrame(DataFrameDict_M[i])
            df.rename(columns={0: 'Values'}, inplace=True)
            df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
            countries_df_list_M.append(df)

        # Create a quarterly list of the dataframes to iterate through
        countries_df_list_Q = []
        for i in countries_list:
            df = pd.DataFrame(DataFrameDict_Q[i])
            df.rename(columns={0: 'Values'}, inplace=True)
            df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
            countries_df_list_Q.append(df)

        combined_countries_df_list = list(zip(countries_df_list_M, countries_df_list_Q))

        # Each indicator is defined by: output column name, OECD subject code, OECD measure code,
        # countries with no data at all, and countries where only quarterly data is available.
        indicator_specs = [
            ('consumer_confidence_index', 'CSCICP03', 'IXNSA', ['CAN', 'IND', 'NOR'], []),
            ('consumer_price_index_cost_of_living', 'CPALTT01', 'IXNB', [], ['AUS', 'NZL']),
            ('consumer_price_index_inflation', 'CPALTT01', 'GY', [], ['AUS', 'NZL']),
            ('gdp_index_smoothed', 'LORSGPRT', 'STSA', ['NLD', 'CHE', 'NZL', 'SWE', 'NOR'], []),
            ('harmonised_unemployment_index', 'LRHUTTTT', 'STSA', ['IND', 'CHE', 'ZAF', 'CHN'], ['NZL']),
            ('hourly_earnings_index_manufacturing', 'LCEAMN01', 'IXOBSA', ['IND', 'CHE', 'ZAF', 'CHN'], ['FRA', 'DEU', 'ESP', 'AUS', 'NZL', 'KOR', 'NOR']),
            ('short_term_interest_rate', 'IR3TIB01', 'ST', [], []),
            ('industrial_product_growth_on_previous_period', 'PRINTO01', 'GPSA', ['ZAF', 'CHN'], ['AUS', 'NZL']),
            ('industrial_production_index', 'PRINTO01', 'IXOBSA', ['ZAF', 'CHN'], ['AUS', 'NZL']),
        ]

        # Loop through and create dataframes for every country
        for index, data in enumerate(combined_countries_df_list):
            # Find country being extracted
            country = countries_list[index]
            print(country)

            # Collect each indicator as a monthly or quarterly series depending on availability
            all_dfs_list_M = []
            all_dfs_list_Q = []
            for col_name, subject, measure, no_data_countries, quarterly_countries in indicator_specs:
                # For countries with no data, skip the indicator entirely
                if country in no_data_countries:
                    continue
                # For countries with quarterly data
                elif country in quarterly_countries:
                    series = data[1][subject][measure]
                    series.rename(col_name, inplace=True)
                    all_dfs_list_Q.append(series)
                # For countries with monthly data
                else:
                    series = data[0][subject][measure]
                    series.rename(col_name, inplace=True)
                    all_dfs_list_M.append(series)

            # Create monthly macroeconomic dataframe
            cif_Macroeconomic_df_M = pd.concat(all_dfs_list_M, axis=1)

            # Create quarterly macroeconomic dataframe
            if all_dfs_list_Q != []:
                macroeconomic_monthly_df_Q = pd.concat(all_dfs_list_Q, axis=1)
            else:
                macroeconomic_monthly_df_Q = pd.DataFrame()

            # For USD GBP Exchange Rate
            # If it's the UK add this series else don't
            if countries_list[index] == 'GBR':
                USD_GBP_Exchange_Rate_df = pd.read_csv(
                    'https://stats.oecd.org/SDMX-JSON/data/MEI_FIN/CCUS.' + countries_list[index] + '.M/OECD?contentType=csv')
                USD_GBP_Exchange_Rate_df.head()
                USD_GBP_Exchange_Rate_df_pivot = pd.pivot_table(USD_GBP_Exchange_Rate_df, values='Value', index='TIME',
                                                                columns='Subject')
                USD_GBP_Exchange_Rate_df_pivot_final = USD_GBP_Exchange_Rate_df_pivot.loc["2015-01":]
                USD_GBP_Exchange_Rate_df_pivot_final.rename(
                    columns={'Currency exchange rates, monthly average': 'usd_gbp_exchange_rate'}, inplace=True)

                # Create final monthly dataframe
                macroeconomic_monthly_df_M = pd.concat([cif_Macroeconomic_df_M, USD_GBP_Exchange_Rate_df_pivot_final], axis=1)
            else:
                # Create final monthly dataframe
                macroeconomic_monthly_df_M = cif_Macroeconomic_df_M

            # Create the final W/C dataframe
            # For monthly data
            macroeconomic_monthly_df_M['Date'] = macroeconomic_monthly_df_M.index
            df_M = macroeconomic_monthly_df_M.set_index(pd.to_datetime(macroeconomic_monthly_df_M['Date'])).drop(columns='Date')
            df_M.fillna(method="ffill", inplace=True)
            df_M.reset_index(inplace=True)

            daily_records = []
            # Iterate over each row in the DataFrame
            for _, row in df_M.iterrows():
                # Calculate the number of days in the month
                num_days = calendar.monthrange(row["Date"].year, row["Date"].month)[1]
                # Create a new record for each day of the month
                for day in range(1, num_days + 1):
                    daily_row = row.copy()
                    daily_row["Date"] = row["Date"].replace(day=day)
                    daily_records.append(daily_row)

            # Convert the list of daily records into a DataFrame
            daily_df = pd.DataFrame(daily_records)

            # Extend dataframe to include the current data if needed
            datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
            extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
            q = pd.Series(datelist, name="Date")
            s = pd.DataFrame(extended_data, columns=list(df_M.columns[1:]))
            extended_daily_df = pd.concat([q, s], axis=1)
            extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)

            # Create a week commencing column
            extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
            extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
                lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
            extended_daily_df.drop("Date", axis=1, inplace=True)
            extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)

            # Take a weekly average
            macroeconomic_weekly_df_M = extended_daily_df.groupby('Date').mean()

            # For quarterly data
            # If there are quarterly datasets
            if all_dfs_list_Q != []:
                macroeconomic_monthly_df_Q['Date'] = macroeconomic_monthly_df_Q.index
                df_Q = macroeconomic_monthly_df_Q.set_index(pd.to_datetime(macroeconomic_monthly_df_Q['Date'])).drop(
                    columns='Date')
                df_Q.fillna(method="ffill", inplace=True)
                df_Q.reset_index(inplace=True)

                daily_records = []
                for _, row in df_Q.iterrows():
                    year = row["Date"].year
                    month = row["Date"].month
                    day = row["Date"].day
                    last_date = get_last_day_of_the_quarter(datetime(year, month, day).date())
                    all_days = pd.date_range(row["Date"], last_date, freq="D")

                    # Create a new record for each day of the quarter
                    for day in all_days:
                        daily_row = row.copy()
                        daily_row["Date"] = row["Date"].replace(day=day.day, month=day.month)
                        daily_records.append(daily_row)

                # Convert the list of daily records into a DataFrame
                daily_df = pd.DataFrame(daily_records)

                # Extend dataframe to include data up to today
                datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
                extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
                q = pd.Series(datelist, name="Date")
                s = pd.DataFrame(extended_data, columns=list(df_Q.columns[1:]))
                extended_daily_df = pd.concat([q, s], axis=1)
                extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)

                # Create a week commencing column
                extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
                extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
                    lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
                extended_daily_df.drop("Date", axis=1, inplace=True)
                extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)

                # Take a weekly average
                macroeconomic_weekly_df_Q = extended_daily_df.groupby('Date').mean()

            # Merge the two datasets together
            if all_dfs_list_Q != []:
                macroeconomic_weekly_df = macroeconomic_weekly_df_M.merge(macroeconomic_weekly_df_Q, left_index=True,
                                                                          right_index=True)
            # If there are no quarterly datasets
            else:
                macroeconomic_weekly_df = macroeconomic_weekly_df_M

            # Change datetime format
            macroeconomic_weekly_df.index = macroeconomic_weekly_df.index.strftime('%d/%m/%Y')

            macroeconomic_weekly_df.reset_index(drop=False, inplace=True)
            macroeconomic_weekly_df.rename(columns={'Date': 'OBS'}, inplace=True)

        return macroeconomic_weekly_df

    def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
        """
        Fetch Google Mobility data for the specified country.

        Parameters:
        - country (str): The name of the country for which to fetch data.
        - wc (str): The week commencing day used for the weekly aggregation.

        Returns:
        - pd.DataFrame: A DataFrame containing the Google Mobility data.
        """
        # URL of the Google Mobility Reports CSV file
        url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"

        # Fetch the CSV file
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch data: {response.status_code}")

        # Load the CSV file into a pandas DataFrame
        csv_data = StringIO(response.text)
        df = pd.read_csv(csv_data)

        # Filter the DataFrame for the specified country
        country_df = df[df['country_region'] == country]

        final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
                                                                                   'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
                                                                                   'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")

        final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
        return final_covid1

    ############################################################### Seasonality ##########################################################################

def pull_combined_dummies(self, week_commencing):
|
|
1981
|
-
# Week commencing dictionary
|
|
1982
|
-
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
1983
|
-
|
|
1984
|
-
# Create daily date range dataframe
|
|
1985
|
-
date_range = pd.date_range(datetime(2015, 1, 1), datetime.today(), freq="d")
|
|
1986
|
-
df_daily = pd.DataFrame(date_range, columns=["Date"])
|
|
1987
|
-
|
|
1988
|
-
# Create weekly date range dataframe
|
|
1989
|
-
df_daily['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
1990
|
-
df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
|
|
1991
|
-
df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
|
|
1992
|
-
|
|
1993
|
-
df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
|
|
1994
|
-
df_weekly_start.set_index("Date", inplace=True)
|
|
1995
|
-
|
|
1996
|
-
# Create individual weekly dummies
|
|
1997
|
-
dummy_columns = {}
|
|
1998
|
-
for i in range(len(df_weekly_start)):
|
|
1999
|
-
col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
|
|
2000
|
-
dummy_columns[col_name] = [0] * len(df_weekly_start)
|
|
2001
|
-
dummy_columns[col_name][i] = 1
|
|
2002
|
-
|
|
2003
|
-
df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
|
|
2004
|
-
df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
|
|
2005
|
-
|
|
2006
|
-
# Create monthly dummies
|
|
2007
|
-
df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
|
|
2008
|
-
df_monthly_dummies = pd.get_dummies(df_daily, prefix="seas", columns=["Month"])
|
|
2009
|
-
df_monthly_dummies['week_start'] = df_daily["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2010
|
-
df_monthly_dummies = df_monthly_dummies.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
|
|
2011
|
-
|
|
2012
|
-
df_monthly_dummies.set_index("Date", inplace=True)
|
|
2013
|
-
df_monthly_dummies = df_monthly_dummies / 7
|
|
2014
|
-
|
|
2015
|
-
# Combine weekly and monthly dataframes
|
|
2016
|
-
df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
|
|
2017
|
-
|
|
2018
|
-
# Create weekly dummies
|
|
2019
|
-
df_combined.reset_index(inplace=True)
|
|
2020
|
-
df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
|
|
2021
|
-
df_combined = pd.get_dummies(df_combined, prefix="wk", columns=["Week"])
|
|
2022
|
-
|
|
2023
|
-
# Create yearly dummies
|
|
2024
|
-
df_combined["Year"] = df_combined["Date"].dt.year
|
|
2025
|
-
df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"])
|
|
2026
|
-
|
|
2027
|
-
# Add constant
|
|
2028
|
-
df_combined["Constant"] = 1
|
|
2029
|
-
|
|
2030
|
-
# Add trend
|
|
2031
|
-
df_combined["Trend"] = df_combined.index + 1
|
|
2032
|
-
|
|
2033
|
-
# Set date as index
|
|
2034
|
-
df_combined.set_index("Date", inplace=True)
|
|
2035
|
-
|
|
2036
|
-
        # Create COVID lockdown dummies
        lockdown_periods = [
            # Lockdown 1
            ("2020-03-23", "2020-05-24"),
            # Lockdown 2
            ("2020-11-05", "2020-12-02"),
            # Lockdown 3
            ("2021-01-04", "2021-03-08")
        ]

        df_covid = pd.DataFrame(date_range, columns=["Date"])
        df_covid["national_lockdown"] = 0

        for start, end in lockdown_periods:
            df_covid.loc[(df_covid["Date"] >= start) & (df_covid["Date"] <= end), "national_lockdown"] = 1

        df_covid['week_start'] = df_covid["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_covid.drop("Date", axis=1, inplace=True)
        df_covid.rename(columns={"week_start": "OBS"}, inplace=True)
        df_national_lockdown_total = df_covid.groupby('OBS').sum(numeric_only=True)
        df_national_lockdown_total.rename(columns={"national_lockdown": "covid_uk_national_lockdown_total"}, inplace=True)

        df_national_lockdown_1 = df_national_lockdown_total.copy(deep=True)
        df_national_lockdown_2 = df_national_lockdown_total.copy(deep=True)
        df_national_lockdown_3 = df_national_lockdown_total.copy(deep=True)

        df_national_lockdown_1.loc[df_national_lockdown_1.index > "2020-05-24"] = 0
        df_national_lockdown_1.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_1"}, inplace=True)

        df_national_lockdown_2.loc[df_national_lockdown_2.index < "2020-11-05"] = 0
        df_national_lockdown_2.loc[df_national_lockdown_2.index > "2020-12-02"] = 0
        df_national_lockdown_2.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_2"}, inplace=True)

        df_national_lockdown_3.loc[df_national_lockdown_3.index < "2021-01-04"] = 0
        df_national_lockdown_3.rename(columns={"covid_uk_national_lockdown_total": "covid_uk_national_lockdown_3"}, inplace=True)

        df_final_covid = pd.concat([df_national_lockdown_total, df_national_lockdown_1, df_national_lockdown_2, df_national_lockdown_3], axis=1)
        df_final_covid.reset_index(inplace=True)
        df_final_covid.rename(columns={"index": "OBS"}, inplace=True)
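        # Worked example of the lockdown dummies: the daily 0/1 flag is summed per week, so
        # a week that falls entirely inside Lockdown 1 gets covid_uk_national_lockdown_total = 7,
        # while a week straddling 2020-03-23 gets only the number of lockdown days in that week.
        # The _1/_2/_3 copies then zero out everything outside their own date window.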
        # Create seasonal indicators for the last day and last Friday of the month
        min_date = '2019-12-29'
        max_date = datetime.today().strftime('%Y-%m-%d')
        date_range_seas = pd.date_range(start=min_date, end=max_date)

        df_seas = pd.DataFrame(date_range_seas, columns=['Date'])
        df_seas['Last_Day_of_Month'] = df_seas['Date'].apply(lambda x: 1 if x == x.to_period('M').to_timestamp('M') else 0)

        def is_last_friday(date):
            last_day_of_month = date.to_period('M').to_timestamp('M')
            last_day_weekday = last_day_of_month.dayofweek
            if last_day_weekday >= 4:
                days_to_subtract = last_day_weekday - 4
            else:
                days_to_subtract = last_day_weekday + 3
            last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
            return 1 if date == last_friday else 0

        df_seas['Last_Friday_of_Month'] = df_seas['Date'].apply(is_last_friday)

        df_seas['week_start'] = df_seas["Date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
        df_seas = df_seas.groupby('week_start').sum(numeric_only=True).reset_index().rename(columns={'week_start': "Date"})
        df_seas.set_index("Date", inplace=True)
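        # Worked example of is_last_friday: if a month ends on a Sunday (dayofweek 6),
        # 6 - 4 = 2 days are subtracted, landing on the Friday; if it ends on a Monday
        # (dayofweek 0), 0 + 3 = 3 days are subtracted. After the weekly groupby, each
        # indicator is 1 for the week containing that day and 0 elsewhere.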
        # Combine all dataframes
        df_combined = df_combined.reset_index().rename(columns={"Date": "OBS"})
        df_final_combined = pd.merge(df_combined, df_final_covid, how='left', left_on='OBS', right_on='OBS')
        df_final_combined = pd.merge(df_final_combined, df_seas, how='left', left_on='OBS', right_on='Date')

        # Fill any NaN values with 0
        df_final_combined.fillna(0, inplace=True)

        return df_final_combined
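    # Illustrative use of the returned frame (variable names hypothetical): the result is
    # keyed on OBS (week start) and can be left-joined onto a weekly KPI table, e.g.
    #     model_df = kpi_df.merge(seasonality_df, on="OBS", how="left")
    # where seasonality_df is the frame returned above and kpi_df is any weekly dataset.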
    def pull_weather(self, week_commencing, country) -> pd.DataFrame:
        import pandas as pd
        import urllib.request
        from datetime import datetime
        import requests
        from geopy.geocoders import Nominatim

        # Week commencing dictionary
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Country dictionary
        country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}

        # Function to flatten a list of nested lists into a list
        def flatten_list(nested_list):
            return [item for sublist in nested_list for item in sublist]

        # Choose country
        country = country_dict[country]

        # Choose start and end dates
        start_day = 1
        start_month = 1
        start_year = 2014
        formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
        today = datetime.now()
        end_day = today.day
        end_month = today.month
        end_year = today.year
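        # Worked example of the week-start rollback used throughout: with
        # week_commencing="mon" (day_dict value 0), a Wednesday (weekday 2) is shifted back
        # (2 - 0) % 7 = 2 days to that week's Monday; with week_commencing="sun" (value 6),
        # the same Wednesday is shifted back (2 - 6) % 7 = 3 days to the preceding Sunday.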
        if country == "GB__ASOS":
            stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
                        "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
                        "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
                        "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
                        "&stations=EGNT"]
        elif country == "AU__ASOS":
            stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
                        "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
                        "&stations=YPAD", "&stations=YPPH"]
        elif country == "USCRN":
            stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797",  # North
                        "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301",  # Midwest
                        "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862",  # South
                        "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"]  # West
        elif country == "DE__ASOS":
            stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
                        "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
                        "&stations=EDLW", "&stations=EDDM"]
        elif country == "FR__ASOS":
            stations = ["&stations=LFPB"]
        elif country == "Canada":
            institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
                                "CA_NU_ASOS"]
            stations_list = [[] for _ in range(5)]
            stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
                                     "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])

            stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
                                     "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
                                     "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
                                     "&stations=CYWK", "&stations=CYWK"])

            stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
                                     "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
                                     "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
                                     "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
                                     "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])

            stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
                                     "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
                                     "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
                                     "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])

            stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
                                     "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
                                     "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
                                     "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])

        elif country == "ZA__ASOS":
            cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
            stations = []

            for city in cities:
                geolocator = Nominatim(user_agent="MyApp")
                location = geolocator.geocode(city)
                stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
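        # Example of the request these station fragments feed (values shown for GB,
        # 2014-01-01 to today): the fragments are concatenated onto the IEM daily endpoint,
        # e.g. https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=GB__ASOS
        #      &stations=EGCC&stations=EGNM...&year1=2014&month1=1&day1=1&year2=...&month2=...&day2=...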
        # Temperature
        if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
            # We start by pulling a data frame for the chosen weather stations
            station_query = ''.join(stations)

            raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
                                        station_query,
                                        "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
                                        "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
            raw_weather = urllib.request.urlopen(raw_weather_list)
            raw_weather = pd.read_csv(raw_weather)

            # Replace any occurrences of "None" with 0
            raw_weather["max_temp_f"].replace("None", 0, inplace=True)
            raw_weather["min_temp_f"].replace("None", 0, inplace=True)

            # Remove any data that isn't temperature-related
            weather = raw_weather.iloc[:, 0:4]

            weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)

            # Estimate mean temperature
            weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2

            # Convert Fahrenheit to Celsius for max_temp_f
            weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9

            # Convert Fahrenheit to Celsius for min_temp_f
            weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9

            # Convert Fahrenheit to Celsius for mean_temp_f
            weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9

            # Aggregate the data to the chosen week-commencing day, taking the average
            # Convert the date column to a Date type
            weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")

            # Determine the chosen week-start day for each date
            weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

            # Group by week_starting and summarize
            numeric_columns = weather.select_dtypes(include='number').columns
            weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
            weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
                                            "min_temp_f": "avg_min_temp_f",
                                            "mean_temp_f": "avg_mean_temp_f",
                                            "max_temp_c": "avg_max_temp_c",
                                            "min_temp_c": "avg_min_temp_c",
                                            "mean_temp_c": "avg_mean_temp_c"}, inplace=True)
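        # Worked example of the conversion above: a 68 °F daily maximum becomes
        # (68 - 32) * 5 / 9 = 20 °C, and a week of daily values is then averaged into a
        # single avg_max_temp_c figure per week_starting.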
elif country == "Canada":
|
|
2248
|
-
for i in range(len(institute_vector)):
|
|
2249
|
-
station_query_temp = ''.join(flatten_list(stations_list[i]))
|
|
2250
|
-
institute_temp = institute_vector[i]
|
|
2251
|
-
raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
|
|
2252
|
-
station_query_temp,
|
|
2253
|
-
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
2254
|
-
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
2255
|
-
raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
|
|
2256
|
-
raw_weather_temp = pd.read_csv(raw_weather_temp)
|
|
2257
|
-
|
|
2258
|
-
if len(raw_weather_temp.index) == 0:
|
|
2259
|
-
continue
|
|
2260
|
-
raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
|
|
2261
|
-
|
|
2262
|
-
if i == 1:
|
|
2263
|
-
raw_weather = raw_weather_temp
|
|
2264
|
-
else:
|
|
2265
|
-
raw_weather = pd.concat([raw_weather, raw_weather_temp])
|
|
2266
|
-
|
|
2267
|
-
# Drop error column if it exists
|
|
2268
|
-
if 'ERROR: Invalid network specified' in list(raw_weather.columns):
|
|
2269
|
-
raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
|
|
2270
|
-
|
|
2271
|
-
# Replace none values
|
|
2272
|
-
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
2273
|
-
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
2274
|
-
raw_weather["precip_in"].replace("None", 0, inplace=True)
|
|
2275
|
-
|
|
2276
|
-
weather = raw_weather
|
|
2277
|
-
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
2278
|
-
|
|
2279
|
-
# Estimate mean temperature
|
|
2280
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
2281
|
-
|
|
2282
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
2283
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
2284
|
-
|
|
2285
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
2286
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
2287
|
-
|
|
2288
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
2289
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
2290
|
-
|
|
2291
|
-
# Aggregate the data to week commencing sunday taking the average of the data
|
|
2292
|
-
# Convert the date column to a Date type
|
|
2293
|
-
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
2294
|
-
|
|
2295
|
-
# Determine the starting chosen day for each date
|
|
2296
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2297
|
-
|
|
2298
|
-
# Group by week_starting and summarize
|
|
2299
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
2300
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
2301
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
2302
|
-
"min_temp_f": "avg_min_temp_f",
|
|
2303
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
2304
|
-
"max_temp_c": "avg_max_temp_c",
|
|
2305
|
-
"min_temp_c": "avg_min_temp_c",
|
|
2306
|
-
"mean_temp_c": "avg_mean_temp_c",
|
|
2307
|
-
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
2308
|
-
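        # For reference, flatten_list(stations_list[0]) collapses the single nested list into
        # a flat list of "&stations=..." fragments, e.g. ["&stations=CYQM", "&stations=CERM", ...],
        # which is the shape the join above expects.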
elif country == "ZA__ASOS":
|
|
2309
|
-
weather_data_list = []
|
|
2310
|
-
|
|
2311
|
-
for city in cities:
|
|
2312
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
2313
|
-
location = geolocator.geocode(city)
|
|
2314
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2315
|
-
|
|
2316
|
-
params = {
|
|
2317
|
-
"latitude": location.latitude,
|
|
2318
|
-
"longitude": location.longitude,
|
|
2319
|
-
"start_date": formatted_date,
|
|
2320
|
-
"end_date": today.strftime("%Y-%m-%d"),
|
|
2321
|
-
"daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
|
|
2322
|
-
"timezone": "auto"
|
|
2323
|
-
}
|
|
2324
|
-
|
|
2325
|
-
response = requests.get(url, params=params)
|
|
2326
|
-
response_data = response.json()
|
|
2327
|
-
|
|
2328
|
-
daily_data = response_data["daily"]
|
|
2329
|
-
dates = daily_data["time"]
|
|
2330
|
-
|
|
2331
|
-
data = pd.DataFrame({
|
|
2332
|
-
"day": dates,
|
|
2333
|
-
"max_temp_f": daily_data["temperature_2m_max"],
|
|
2334
|
-
"min_temp_f": daily_data["temperature_2m_min"],
|
|
2335
|
-
"precip_in": daily_data["precipitation_sum"]
|
|
2336
|
-
})
|
|
2337
|
-
data["city"] = city
|
|
2338
|
-
weather_data_list.append(data)
|
|
2339
|
-
|
|
2340
|
-
weather = pd.concat(weather_data_list)
|
|
2341
|
-
|
|
2342
|
-
# Convert the date column to a Date type
|
|
2343
|
-
weather["day"] = pd.to_datetime(weather["day"])
|
|
2344
|
-
|
|
2345
|
-
# Replace None values
|
|
2346
|
-
weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
2347
|
-
weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
2348
|
-
weather["precip_in"].replace("None", 0, inplace=True)
|
|
2349
|
-
|
|
2350
|
-
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
2351
|
-
|
|
2352
|
-
# Estimate mean temperature
|
|
2353
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
2354
|
-
|
|
2355
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
2356
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
2357
|
-
|
|
2358
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
2359
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
2360
|
-
|
|
2361
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
2362
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
2363
|
-
|
|
2364
|
-
# Determine the starting chosen day for each date
|
|
2365
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2366
|
-
|
|
2367
|
-
# Group by week_starting and summarize
|
|
2368
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
2369
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
2370
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
2371
|
-
"min_temp_f": "avg_min_temp_f",
|
|
2372
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
2373
|
-
"max_temp_c": "avg_max_temp_c",
|
|
2374
|
-
"min_temp_c": "avg_min_temp_c",
|
|
2375
|
-
"mean_temp_c": "avg_mean_temp_c",
|
|
2376
|
-
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
2377
|
-
|
|
2378
|
-
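        # Shape of the Open-Meteo archive response the branch above parses (illustrative
        # values):
        #   {"daily": {"time": ["2014-01-01", ...],
        #              "temperature_2m_max": [28.1, ...],
        #              "temperature_2m_min": [17.4, ...],
        #              "precipitation_sum": [0.0, ...]}}
        # The lists are parallel, one entry per day, which is why they can be zipped
        # straight into a DataFrame.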
        else:
            # We start by pulling a data frame for the chosen weather stations
            station_query = ''.join(stations)

            raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
                                        station_query,
                                        "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
                                        "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
            raw_weather = urllib.request.urlopen(raw_weather_list)
            raw_weather = pd.read_csv(raw_weather)

            # Keep only the temperature- and precipitation-related columns
            raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]

            # Replace any occurrences of "None" with 0
            raw_weather["max_temp_f"].replace("None", 0, inplace=True)
            raw_weather["min_temp_f"].replace("None", 0, inplace=True)
            raw_weather["precip_in"].replace("None", 0, inplace=True)

            weather = raw_weather

            weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)

            # Estimate mean temperature
            weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2

            # Convert Fahrenheit to Celsius for max_temp_f
            weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9

            # Convert Fahrenheit to Celsius for min_temp_f
            weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9

            # Convert Fahrenheit to Celsius for mean_temp_f
            weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9

            # Aggregate the data to the chosen week-commencing day, taking the average
            # Convert the date column to a Date type
            weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")

            # Determine the chosen week-start day for each date
            weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

            # Group by week_starting and summarize
            numeric_columns = weather.select_dtypes(include='number').columns
            weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
            weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
                                            "min_temp_f": "avg_min_temp_f",
                                            "mean_temp_f": "avg_mean_temp_f",
                                            "max_temp_c": "avg_max_temp_c",
                                            "min_temp_c": "avg_min_temp_c",
                                            "mean_temp_c": "avg_mean_temp_c",
                                            "precip_in": "avg_mean_perc"}, inplace=True)
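        # Example of the weekly aggregation above: seven daily max_temp_f readings of
        # 60, 61, 62, 63, 64, 65 and 66 in one week_starting bucket average to
        # avg_max_temp_f = 63 for that week.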
        # Rainfall
        if country == "GB__ASOS":
            # Define cities and date range
            cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]

            start_date = formatted_date
            end_date = today.strftime("%Y-%m-%d")

            # Initialize an empty list to store the weather data for each city
            weather_data_list = []

            # Loop through each city and fetch weather data
            for city in cities:
                # Initialize Nominatim API
                geolocator = Nominatim(user_agent="MyApp")
                location = geolocator.geocode(city)
                url = "https://archive-api.open-meteo.com/v1/archive"

                params = {
                    "latitude": location.latitude,
                    "longitude": location.longitude,
                    "start_date": start_date,
                    "end_date": end_date,
                    "daily": "precipitation_sum",
                    "timezone": "auto"
                }

                response = requests.get(url, params=params)
                response_data = response.json()

                daily_data = response_data["daily"]["precipitation_sum"]
                dates = response_data["daily"]["time"]

                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
                data["city"] = city

                weather_data_list.append(data)

            # Combine all city data into a single data frame
            all_weather_data = pd.concat(weather_data_list)

            # Convert the date column to a Date type
            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])

            # Set week commencing col up
            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

            # Group by week_starting and summarize
            numeric_columns = all_weather_data.select_dtypes(include='number').columns
            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)

            # Change index to datetime
            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
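        # The AU, DE, FR and ZA rainfall branches below repeat this block with different
        # city lists. A minimal refactor sketch (not in the original file; helper name
        # hypothetical) would hoist the per-city pull into one function:
        #
        #     def _city_rainfall(city):
        #         location = Nominatim(user_agent="MyApp").geocode(city)
        #         params = {"latitude": location.latitude, "longitude": location.longitude,
        #                   "start_date": start_date, "end_date": end_date,
        #                   "daily": "precipitation_sum", "timezone": "auto"}
        #         daily = requests.get("https://archive-api.open-meteo.com/v1/archive",
        #                              params=params).json()["daily"]
        #         return pd.DataFrame({"date": daily["time"], "rainfall": daily["precipitation_sum"],
        #                              "city": city})
        #
        # so each branch would reduce to pd.concat(_city_rainfall(c) for c in cities).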
elif country == "AU__ASOS":
|
|
2487
|
-
|
|
2488
|
-
# Define cities and date range
|
|
2489
|
-
cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
|
|
2490
|
-
|
|
2491
|
-
start_date = formatted_date
|
|
2492
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
2493
|
-
|
|
2494
|
-
# Initialize an empty list to store the weather data for each city
|
|
2495
|
-
weather_data_list = []
|
|
2496
|
-
|
|
2497
|
-
# Loop through each city and fetch weather data
|
|
2498
|
-
for city in cities:
|
|
2499
|
-
# Initialize Nominatim API
|
|
2500
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
2501
|
-
location = geolocator.geocode(city)
|
|
2502
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2503
|
-
|
|
2504
|
-
params = {
|
|
2505
|
-
"latitude": location.latitude,
|
|
2506
|
-
"longitude": location.longitude,
|
|
2507
|
-
"start_date": start_date,
|
|
2508
|
-
"end_date": end_date,
|
|
2509
|
-
"daily": "precipitation_sum",
|
|
2510
|
-
"timezone": "auto"
|
|
2511
|
-
}
|
|
2512
|
-
|
|
2513
|
-
response = requests.get(url, params=params)
|
|
2514
|
-
response_data = response.json()
|
|
2515
|
-
|
|
2516
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
2517
|
-
dates = response_data["daily"]["time"]
|
|
2518
|
-
|
|
2519
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
2520
|
-
data["city"] = city
|
|
2521
|
-
|
|
2522
|
-
weather_data_list.append(data)
|
|
2523
|
-
|
|
2524
|
-
# Combine all city data into a single data frame
|
|
2525
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
2526
|
-
|
|
2527
|
-
# Convert the date column to a Date type
|
|
2528
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
2529
|
-
|
|
2530
|
-
# Set week commencing col up
|
|
2531
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2532
|
-
|
|
2533
|
-
# Group by week_starting and summarize
|
|
2534
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
2535
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
2536
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
2537
|
-
|
|
2538
|
-
# Change index to datetime
|
|
2539
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
2540
|
-
|
|
2541
|
-
elif country == "DE__ASOS":
|
|
2542
|
-
|
|
2543
|
-
# Define cities and date range
|
|
2544
|
-
cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
|
|
2545
|
-
|
|
2546
|
-
start_date = formatted_date
|
|
2547
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
2548
|
-
|
|
2549
|
-
# Initialize an empty list to store the weather data for each city
|
|
2550
|
-
weather_data_list = []
|
|
2551
|
-
|
|
2552
|
-
# Loop through each city and fetch weather data
|
|
2553
|
-
for city in cities:
|
|
2554
|
-
# Initialize Nominatim API
|
|
2555
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
2556
|
-
location = geolocator.geocode(city)
|
|
2557
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2558
|
-
|
|
2559
|
-
params = {
|
|
2560
|
-
"latitude": location.latitude,
|
|
2561
|
-
"longitude": location.longitude,
|
|
2562
|
-
"start_date": start_date,
|
|
2563
|
-
"end_date": end_date,
|
|
2564
|
-
"daily": "precipitation_sum",
|
|
2565
|
-
"timezone": "auto"
|
|
2566
|
-
}
|
|
2567
|
-
|
|
2568
|
-
response = requests.get(url, params=params)
|
|
2569
|
-
response_data = response.json()
|
|
2570
|
-
|
|
2571
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
2572
|
-
dates = response_data["daily"]["time"]
|
|
2573
|
-
|
|
2574
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
2575
|
-
data["city"] = city
|
|
2576
|
-
|
|
2577
|
-
weather_data_list.append(data)
|
|
2578
|
-
|
|
2579
|
-
# Combine all city data into a single data frame
|
|
2580
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
2581
|
-
|
|
2582
|
-
# Convert the date column to a Date type
|
|
2583
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
2584
|
-
|
|
2585
|
-
# Set week commencing col up
|
|
2586
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2587
|
-
|
|
2588
|
-
# Group by week_starting and summarize
|
|
2589
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
2590
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
2591
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
2592
|
-
|
|
2593
|
-
# Change index to datetime
|
|
2594
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
2595
|
-
|
|
2596
|
-
elif country == "FR__ASOS":
|
|
2597
|
-
|
|
2598
|
-
# Define cities and date range
|
|
2599
|
-
cities = ["Paris"]
|
|
2600
|
-
|
|
2601
|
-
start_date = formatted_date
|
|
2602
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
2603
|
-
|
|
2604
|
-
# Initialize an empty list to store the weather data for each city
|
|
2605
|
-
weather_data_list = []
|
|
2606
|
-
|
|
2607
|
-
# Loop through each city and fetch weather data
|
|
2608
|
-
for city in cities:
|
|
2609
|
-
# Initialize Nominatim API
|
|
2610
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
2611
|
-
location = geolocator.geocode(city)
|
|
2612
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2613
|
-
|
|
2614
|
-
params = {
|
|
2615
|
-
"latitude": location.latitude,
|
|
2616
|
-
"longitude": location.longitude,
|
|
2617
|
-
"start_date": start_date,
|
|
2618
|
-
"end_date": end_date,
|
|
2619
|
-
"daily": "precipitation_sum",
|
|
2620
|
-
"timezone": "auto"
|
|
2621
|
-
}
|
|
2622
|
-
|
|
2623
|
-
response = requests.get(url, params=params)
|
|
2624
|
-
response_data = response.json()
|
|
2625
|
-
|
|
2626
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
2627
|
-
dates = response_data["daily"]["time"]
|
|
2628
|
-
|
|
2629
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
2630
|
-
data["city"] = city
|
|
2631
|
-
|
|
2632
|
-
weather_data_list.append(data)
|
|
2633
|
-
|
|
2634
|
-
# Combine all city data into a single data frame
|
|
2635
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
2636
|
-
|
|
2637
|
-
# Convert the date column to a Date type
|
|
2638
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
2639
|
-
|
|
2640
|
-
# Set week commencing col up
|
|
2641
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2642
|
-
|
|
2643
|
-
# Group by week_starting and summarize
|
|
2644
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
2645
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
2646
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
2647
|
-
|
|
2648
|
-
# Change index to datetime
|
|
2649
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
2650
|
-
|
|
2651
|
-
elif country == "ZA__ASOS":
|
|
2652
|
-
cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
|
|
2653
|
-
start_date = formatted_date
|
|
2654
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
2655
|
-
|
|
2656
|
-
weather_data_list = []
|
|
2657
|
-
|
|
2658
|
-
for city in cities:
|
|
2659
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
2660
|
-
location = geolocator.geocode(city)
|
|
2661
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
2662
|
-
|
|
2663
|
-
params = {
|
|
2664
|
-
"latitude": location.latitude,
|
|
2665
|
-
"longitude": location.longitude,
|
|
2666
|
-
"start_date": start_date,
|
|
2667
|
-
"end_date": end_date,
|
|
2668
|
-
"daily": "precipitation_sum",
|
|
2669
|
-
"timezone": "auto"
|
|
2670
|
-
}
|
|
2671
|
-
|
|
2672
|
-
response = requests.get(url, params=params)
|
|
2673
|
-
response_data = response.json()
|
|
2674
|
-
|
|
2675
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
2676
|
-
dates = response_data["daily"]["time"]
|
|
2677
|
-
|
|
2678
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
2679
|
-
data["city"] = city
|
|
2680
|
-
|
|
2681
|
-
weather_data_list.append(data)
|
|
2682
|
-
|
|
2683
|
-
# Combine all city data into a single data frame
|
|
2684
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
2685
|
-
|
|
2686
|
-
# Convert the date column to a Date type
|
|
2687
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
2688
|
-
|
|
2689
|
-
# Set week commencing col up
|
|
2690
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
2691
|
-
|
|
2692
|
-
# Group by week_starting and summarize
|
|
2693
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
2694
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
2695
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
2696
|
-
|
|
2697
|
-
# Change index to datetime
|
|
2698
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
2699
|
-
|
|
2700
|
-
        # Merge the dataframes
        if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
            merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
        else:
            merged_df = weekly_avg_temp

        merged_df.reset_index(drop=False, inplace=True)
        merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)

        final_weather = ims_proc.rename_cols(merged_df, 'seas_')

        return final_weather
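    # Illustrative call (assuming the enclosing class is instantiated as `ims`, a
    # hypothetical name): ims.pull_weather(week_commencing="mon", country="GBR") returns a
    # weekly frame keyed on OBS with seas_-prefixed temperature and rainfall columns.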