imsciences 0.5.4.8__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imsciences/__init__.py +4 -1
- imsciences/datafunctions-IMS-24Ltp-3.py +2711 -0
- imsciences/datafunctions.py +2841 -169
- imsciences/datapull.py +374 -0
- imsciences/geo.py +195 -0
- imsciences/mmm.py +1415 -0
- imsciences/pull.py +1483 -0
- imsciences/unittesting.py +1064 -0
- imsciences/vis.py +196 -0
- imsciences-0.9.3.dist-info/LICENSE.txt +21 -0
- imsciences-0.9.3.dist-info/METADATA +330 -0
- imsciences-0.9.3.dist-info/PKG-INFO-IMS-24Ltp-3 +24 -0
- imsciences-0.9.3.dist-info/RECORD +22 -0
- {imsciences-0.5.4.8.dist-info → imsciences-0.9.3.dist-info}/WHEEL +1 -1
- imsciences-0.5.4.8.dist-info/METADATA +0 -95
- imsciences-0.5.4.8.dist-info/RECORD +0 -13
- {imsciences-0.5.4.8.dist-info → imsciences-0.9.3.dist-info}/top_level.txt +0 -0
imsciences/datafunctions.py
CHANGED
|
@@ -3,8 +3,31 @@ import calendar
|
|
|
3
3
|
import os
|
|
4
4
|
import plotly.express as px
|
|
5
5
|
import plotly.graph_objs as go
|
|
6
|
-
from dateutil.parser import parse
|
|
7
6
|
import numpy as np
|
|
7
|
+
import re
|
|
8
|
+
from fredapi import Fred
|
|
9
|
+
import time
|
|
10
|
+
from datetime import datetime, timedelta
|
|
11
|
+
from io import StringIO
|
|
12
|
+
import requests
|
|
13
|
+
import subprocess
|
|
14
|
+
import json
|
|
15
|
+
import xml.etree.ElementTree as ET
|
|
16
|
+
from bs4 import BeautifulSoup
|
|
17
|
+
import yfinance as yf
|
|
18
|
+
import holidays
|
|
19
|
+
from dateutil.easter import easter
|
|
20
|
+
from google.analytics.data_v1beta import BetaAnalyticsDataClient
|
|
21
|
+
from google.analytics.data_v1beta.types import DateRange
|
|
22
|
+
from google.analytics.data_v1beta.types import Dimension
|
|
23
|
+
from google.analytics.data_v1beta.types import Metric
|
|
24
|
+
from google.analytics.data_v1beta.types import RunReportRequest
|
|
25
|
+
from google.analytics.data_v1beta.types import OrderBy
|
|
26
|
+
from google.analytics.data_v1beta.types import Filter
|
|
27
|
+
from google.analytics.data_v1beta.types import FilterExpression
|
|
28
|
+
from google.analytics.data_v1beta.types import FilterExpressionList
|
|
29
|
+
from google.auth.exceptions import DefaultCredentialsError
|
|
30
|
+
import logging
|
|
8
31
|
|
|
9
32
|
class dataprocessing:
|
|
10
33
|
|
|
@@ -63,8 +86,8 @@ class dataprocessing:
|
|
|
63
86
|
|
|
64
87
|
print("\n11. rename_cols")
|
|
65
88
|
print(" - Description: Renames columns in a pandas DataFrame.")
|
|
66
|
-
print(" - Usage: rename_cols(df,
|
|
67
|
-
print(" - Example: rename_cols(df,
|
|
89
|
+
print(" - Usage: rename_cols(df, name)")
|
|
90
|
+
print(" - Example: rename_cols(df, 'ame_facebook'")
|
|
68
91
|
|
|
69
92
|
print("\n12. merge_new_and_old")
|
|
70
93
|
print(" - Description: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.")
|
|
@@ -92,21 +115,142 @@ class dataprocessing:
|
|
|
92
115
|
print(" - Usage: combine_sheets(all_sheets)")
|
|
93
116
|
print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
|
|
94
117
|
|
|
95
|
-
print("\n17.
|
|
118
|
+
print("\n17. pivot_table")
|
|
96
119
|
print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
|
|
97
|
-
print(" - Usage:
|
|
98
|
-
print(" - Example:
|
|
120
|
+
print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False',fill_missing_weekly_dates=False,week_commencing='W-MON')")
|
|
121
|
+
print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True',fill_missing_weekly_dates=True,week_commencing='W-MON')")
|
|
99
122
|
|
|
100
|
-
print("\n18.
|
|
101
|
-
print(" - Description: Allows you to map a dictionary of substrings within a column.")
|
|
102
|
-
print(" - Usage:
|
|
103
|
-
print(" - Example:
|
|
123
|
+
print("\n18. apply_lookup_table_for_columns")
|
|
124
|
+
print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
|
|
125
|
+
print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
|
|
126
|
+
print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend':'spd','clicks':'clk'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
|
|
104
127
|
|
|
105
128
|
print("\n19. aggregate_daily_to_wc_wide")
|
|
106
129
|
print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
|
|
107
130
|
print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)")
|
|
108
131
|
print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)")
|
|
109
132
|
|
|
133
|
+
print("\n20. merge_cols_with_seperator")
|
|
134
|
+
print(" - Description: Merge multiple columns in a dataframe into 1 column with a seperator '_'.Can be used if multiple columns are needed for a LUT.")
|
|
135
|
+
print(" - Usage: merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = 'Merged',starting_prefix_str=None,ending_prefix_str=None)")
|
|
136
|
+
print(" - Example: merge_cols_with_seperator(df, ['Campaign','Product'],seperator='|','Merged Columns',starting_prefix_str='start_',ending_prefix_str='_end')")
|
|
137
|
+
|
|
138
|
+
print("\n21. check_sum_of_df_cols_are_equal")
|
|
139
|
+
print(" - Description: Checks if the sum of two columns in two dataframes are the same, and provides the sums of each column and the difference between them.")
|
|
140
|
+
print(" - Usage: check_sum_of_df_cols_are_equal(df_1,df_2,cols_1,cols_2)")
|
|
141
|
+
print(" - Example: check_sum_of_df_cols_are_equal(df_1,df_2,'Media Cost','Spend')")
|
|
142
|
+
|
|
143
|
+
print("\n22. convert_2_df_cols_to_dict")
|
|
144
|
+
print(" - Description: Can be used to create an LUT. Creates a dictionary using two columns in a dataframe.")
|
|
145
|
+
print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
|
|
146
|
+
print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
|
|
147
|
+
|
|
148
|
+
print("\n23. create_FY_and_H_columns")
|
|
149
|
+
print(" - Description: Used to create a financial year, half year, and financial half year column.")
|
|
150
|
+
print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY,short_format='No',half_years='No',combined_FY_and_H='No')")
|
|
151
|
+
print(" - Example: create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023',short_format='Yes',half_years='Yes',combined_FY_and_H='Yes')")
|
|
152
|
+
|
|
153
|
+
print("\n24. keyword_lookup_replacement")
|
|
154
|
+
print(" - Description: Essentially provides an if statement with a xlookup if a value is something. Updates certain chosen values in a specified column of the DataFrame based on a lookup dictionary.")
|
|
155
|
+
print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name='Updated Column')")
|
|
156
|
+
print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel','segment','product'], qlik_dict_for_channel,output_column_name='Channel New')")
|
|
157
|
+
|
|
158
|
+
print("\n25. create_new_version_of_col_using_LUT")
|
|
159
|
+
print(" - Description: Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table. The lookup is based on a column in the dataframe.")
|
|
160
|
+
print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col,value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
|
|
161
|
+
print(" - Example: keyword_lookup_replacement(df, '*Campaign Name','Campaign Type',search_campaign_name_retag_lut,'Campaign Name New')")
|
|
162
|
+
|
|
163
|
+
print("\n26. convert_df_wide_2_long")
|
|
164
|
+
print(" - Description: Changes a dataframe from wide to long format.")
|
|
165
|
+
print(" - Usage: convert_df_wide_2_long(df,value_cols,variable_col_name='Stacked',value_col_name='Value')")
|
|
166
|
+
print(" - Example: keyword_lookup_replacement(df, ['Media Cost','Impressions','Clicks'],variable_col_name='Metric')")
|
|
167
|
+
|
|
168
|
+
print("\n27. manually_edit_data")
|
|
169
|
+
print(" - Description: Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe.")
|
|
170
|
+
print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
|
|
171
|
+
print(" - Example: keyword_lookup_replacement(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''},'Master Include',1,change_in_existing_df_col = 'Yes',new_col_to_change_name = 'Master Include',manual_edit_col_name = 'Manual Changes')")
|
|
172
|
+
|
|
173
|
+
print("\n28. format_numbers_with_commas")
|
|
174
|
+
print(" - Description: Converts data in numerical format into numbers with commas and a chosen decimal place length.")
|
|
175
|
+
print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
|
|
176
|
+
print(" - Example: format_numbers_with_commas(df,1)")
|
|
177
|
+
|
|
178
|
+
print("\n29. filter_df_on_multiple_conditions")
|
|
179
|
+
print(" - Description: Filters dataframe on multiple conditions, which come in the form of a dictionary.")
|
|
180
|
+
print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
|
|
181
|
+
print(" - Example: filter_df_on_multiple_conditions(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''})")
|
|
182
|
+
|
|
183
|
+
print("\n30. read_and_concatenate_files")
|
|
184
|
+
print(" - Description: Read and Concatinate all files of one type in a folder.")
|
|
185
|
+
print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
|
|
186
|
+
print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
|
|
187
|
+
|
|
188
|
+
print("\n31. remove_zero_values")
|
|
189
|
+
print(" - Description: Remove zero values in a specified column.")
|
|
190
|
+
print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
|
|
191
|
+
print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
|
|
192
|
+
|
|
193
|
+
print("\n32. upgrade_outdated_packages")
|
|
194
|
+
print(" - Description: Upgrades all packages.")
|
|
195
|
+
print(" - Usage: upgrade_outdated_packages()")
|
|
196
|
+
print(" - Example: upgrade_outdated_packages()")
|
|
197
|
+
|
|
198
|
+
print("\n33. convert_mixed_formats_dates")
|
|
199
|
+
print(" - Description: Convert a mix of US and UK dates to datetime.")
|
|
200
|
+
print(" - Usage: convert_mixed_formats_dates(df, datecol)")
|
|
201
|
+
print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
|
|
202
|
+
|
|
203
|
+
print("\n34. fill_weekly_date_range")
|
|
204
|
+
print(" - Description: Fill in any missing weeks with 0.")
|
|
205
|
+
print(" - Usage: fill_weekly_date_range(df, date_column, freq)")
|
|
206
|
+
print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
|
|
207
|
+
|
|
208
|
+
print("\n35. add_prefix_and_suffix")
|
|
209
|
+
print(" - Description: Add Prefix and/or Suffix to Column Headers.")
|
|
210
|
+
print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
|
|
211
|
+
print(" - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")
|
|
212
|
+
|
|
213
|
+
print("\n36. create_dummies")
|
|
214
|
+
print(" - Description: Changes time series to 0s and 1s based off threshold")
|
|
215
|
+
print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
|
|
216
|
+
print(" - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")
|
|
217
|
+
|
|
218
|
+
print("\n37. replace_substrings")
|
|
219
|
+
print(" - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
|
|
220
|
+
print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
|
|
221
|
+
print(" - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")
|
|
222
|
+
|
|
223
|
+
print("\n38. add_total_column")
|
|
224
|
+
print(" - Description: Sums all columns with the option to exclude an date column to create a total column")
|
|
225
|
+
print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
|
|
226
|
+
print(" - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
|
|
227
|
+
|
|
228
|
+
print("\n39. apply_lookup_table_based_on_substring")
|
|
229
|
+
print(" - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
|
|
230
|
+
print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
|
|
231
|
+
print(" - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
|
|
232
|
+
|
|
233
|
+
print("\n40. compare_overlap")
|
|
234
|
+
print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
|
|
235
|
+
print(" - Usage: compare_overlap(df1, df2, date_col)")
|
|
236
|
+
print(" - Example: compare_overlap(df_1, df_2, 'obs')")
|
|
237
|
+
|
|
238
|
+
print("\n41. week_commencing_2_week_commencing_conversion")
|
|
239
|
+
print(" - Description: Take a week commencing column say sunday and creates a new column with a different week commencing e.g. monday")
|
|
240
|
+
print(" - Usage: week_commencing_2_week_commencing_conversion(df,date_col,week_commencing='sun')")
|
|
241
|
+
print(" - Example: week_commencing_2_week_commencing_conversion(df,'obs,week_commencing='mon')")
|
|
242
|
+
|
|
243
|
+
print("\n42. plot_chart")
|
|
244
|
+
print(" - Description: Plots a range of charts including line, area, scatter, bubble, bar etc.")
|
|
245
|
+
print(" - Usage: plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs)")
|
|
246
|
+
print(" - Example: plot_chart(df, 'obs', df.cols, chart_type='line', title='Spend Over Time', x_title='Date', y_title='Spend')")
|
|
247
|
+
|
|
248
|
+
print("\n43. plot_two_with_common_cols")
|
|
249
|
+
print(" - Description: Plots the number of charts in two dataframes for which there are two common column names")
|
|
250
|
+
print(" - Usage: plot_two_with_common_cols(df1, df2, date_column, same_axis=True)")
|
|
251
|
+
print(" - Example: plot_two_with_common_cols(df_1, df_2,date_column='obs')")
|
|
252
|
+
|
|
253
|
+
|
|
110
254
|
def get_wd_levels(self, levels):
|
|
111
255
|
"""
|
|
112
256
|
Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
|
|
@@ -138,7 +282,7 @@ class dataprocessing:
|
|
|
138
282
|
The number of rows to remove from the data frame, starting from the original header.
|
|
139
283
|
|
|
140
284
|
Returns:
|
|
141
|
-
- pandas
|
|
285
|
+
- pandas DataFrames
|
|
142
286
|
The modified data frame with rows removed and new column headings.
|
|
143
287
|
|
|
144
288
|
Raises:
|
|
@@ -224,71 +368,6 @@ class dataprocessing:
|
|
|
224
368
|
|
|
225
369
|
return grouped
|
|
226
370
|
|
|
227
|
-
def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
|
|
228
|
-
"""
|
|
229
|
-
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
230
|
-
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
231
|
-
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
232
|
-
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
233
|
-
The day column is renamed from 'Day' to 'OBS'.
|
|
234
|
-
|
|
235
|
-
Parameters:
|
|
236
|
-
- df: pandas DataFrame
|
|
237
|
-
The input DataFrame containing daily data.
|
|
238
|
-
- date_column: string
|
|
239
|
-
The name of the column in the DataFrame that contains date information.
|
|
240
|
-
- group_columns: list of strings
|
|
241
|
-
Additional column names to group by along with the weekly grouping.
|
|
242
|
-
- sum_columns: list of strings
|
|
243
|
-
Numeric column names to be aggregated during aggregation.
|
|
244
|
-
- wc: string
|
|
245
|
-
The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
|
|
246
|
-
- aggregation: string, optional (default 'sum')
|
|
247
|
-
Aggregation method, either 'sum', 'average', or 'count'.
|
|
248
|
-
- include_totals: boolean, optional (default False)
|
|
249
|
-
If True, include total columns for each sum_column.
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
Returns:
|
|
254
|
-
- pandas DataFrame
|
|
255
|
-
A new DataFrame with weekly aggregated data. The index is reset,
|
|
256
|
-
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
257
|
-
is in wide format, with separate columns for each combination of
|
|
258
|
-
grouped metrics.
|
|
259
|
-
"""
|
|
260
|
-
|
|
261
|
-
grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
|
|
262
|
-
|
|
263
|
-
# Pivot the data to wide format
|
|
264
|
-
if group_columns:
|
|
265
|
-
wide_df = grouped.pivot_table(index='OBS',
|
|
266
|
-
columns=group_columns,
|
|
267
|
-
values=sum_columns,
|
|
268
|
-
aggfunc='first')
|
|
269
|
-
# Flatten the multi-level column index and create combined column names
|
|
270
|
-
wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
|
|
271
|
-
else:
|
|
272
|
-
wide_df = grouped.set_index('OBS')
|
|
273
|
-
|
|
274
|
-
# Fill NaN values with 0
|
|
275
|
-
wide_df = wide_df.fillna(0)
|
|
276
|
-
|
|
277
|
-
# Adding total columns for each unique sum_column, if include_totals is True
|
|
278
|
-
if include_totals:
|
|
279
|
-
for col in sum_columns:
|
|
280
|
-
total_column_name = f'Total {col}'
|
|
281
|
-
if group_columns:
|
|
282
|
-
columns_to_sum = [column for column in wide_df.columns if col in column]
|
|
283
|
-
else:
|
|
284
|
-
columns_to_sum = [col]
|
|
285
|
-
wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
|
|
286
|
-
|
|
287
|
-
# Reset the index of the final DataFrame
|
|
288
|
-
wide_df = wide_df.reset_index()
|
|
289
|
-
|
|
290
|
-
return wide_df
|
|
291
|
-
|
|
292
371
|
def convert_monthly_to_daily(self, df, date_column, divide = True):
|
|
293
372
|
"""
|
|
294
373
|
Convert a DataFrame with monthly data to daily data.
|
|
@@ -320,7 +399,7 @@ class dataprocessing:
|
|
|
320
399
|
# Divide each numeric value by the number of days in the month
|
|
321
400
|
for col in df.columns:
|
|
322
401
|
if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
|
|
323
|
-
if divide
|
|
402
|
+
if divide is True:
|
|
324
403
|
daily_row[col] = row[col] / num_days
|
|
325
404
|
else:
|
|
326
405
|
daily_row[col] = row[col]
|
|
@@ -344,7 +423,10 @@ class dataprocessing:
|
|
|
344
423
|
:param same_axis: If True, plot both traces on the same y-axis; otherwise, use separate y-axes.
|
|
345
424
|
:return: Plotly figure
|
|
346
425
|
"""
|
|
347
|
-
|
|
426
|
+
# Ensure date columns are datetime
|
|
427
|
+
df1[date_column] = pd.to_datetime(df1[date_column])
|
|
428
|
+
df2[date_column] = pd.to_datetime(df2[date_column])
|
|
429
|
+
|
|
348
430
|
# Create traces for the first and second dataframes
|
|
349
431
|
trace1 = go.Scatter(x=df1[date_column], y=df1[col1], mode='lines', name=col1, yaxis='y1')
|
|
350
432
|
|
|
@@ -352,7 +434,7 @@ class dataprocessing:
|
|
|
352
434
|
trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y1')
|
|
353
435
|
else:
|
|
354
436
|
trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y2')
|
|
355
|
-
|
|
437
|
+
|
|
356
438
|
# Define layout for the plot
|
|
357
439
|
layout = go.Layout(
|
|
358
440
|
title="",
|
|
@@ -413,8 +495,8 @@ class dataprocessing:
|
|
|
413
495
|
|
|
414
496
|
return fig
|
|
415
497
|
|
|
416
|
-
def week_of_year_mapping(self,
|
|
417
|
-
|
|
498
|
+
def week_of_year_mapping(self,df, week_col, start_day_str):
|
|
499
|
+
|
|
418
500
|
# Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
|
|
419
501
|
day_mapping = {
|
|
420
502
|
'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
|
|
@@ -429,15 +511,15 @@ class dataprocessing:
|
|
|
429
511
|
def week_to_startdate(week_str, start_day):
|
|
430
512
|
year, week = map(int, week_str.split('-W'))
|
|
431
513
|
first_day_of_year = datetime(year, 1, 1)
|
|
432
|
-
|
|
433
|
-
days_to_add = (7 - day_of_week + 1) if day_of_week > 4 else (1 - day_of_week)
|
|
434
|
-
start_of_iso_week = first_day_of_year + timedelta(days=days_to_add)
|
|
514
|
+
first_weekday_of_year = first_day_of_year.weekday() # Monday is 0 and Sunday is 6
|
|
435
515
|
|
|
436
|
-
#
|
|
437
|
-
|
|
438
|
-
|
|
516
|
+
# Calculate days to adjust to the desired start day of the week
|
|
517
|
+
days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
|
|
518
|
+
start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)
|
|
439
519
|
|
|
440
|
-
|
|
520
|
+
# Calculate the start of the desired week
|
|
521
|
+
start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
|
|
522
|
+
return start_of_week
|
|
441
523
|
|
|
442
524
|
# Apply the function to each row in the specified week column
|
|
443
525
|
df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
|
|
@@ -447,22 +529,15 @@ class dataprocessing:
|
|
|
447
529
|
# This line filters the DataFrame based on whether the values in the specified column are not in the list_of_filters
|
|
448
530
|
return df[~df[col_to_filter].isin(list_of_filters)]
|
|
449
531
|
|
|
450
|
-
def rename_cols(self, df,
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
Returns:
|
|
461
|
-
- pandas DataFrame
|
|
462
|
-
The DataFrame with renamed columns.
|
|
463
|
-
"""
|
|
464
|
-
|
|
465
|
-
return df.rename(columns=cols_to_rename)
|
|
532
|
+
def rename_cols(self, df, name = 'ame_'):
|
|
533
|
+
new_columns = {}
|
|
534
|
+
for col in df.columns:
|
|
535
|
+
if col != 'OBS':
|
|
536
|
+
new_col_name = name + col.replace(" ", "_").lower()
|
|
537
|
+
else:
|
|
538
|
+
new_col_name = col
|
|
539
|
+
new_columns[col] = new_col_name
|
|
540
|
+
return df.rename(columns=new_columns)
|
|
466
541
|
|
|
467
542
|
def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
|
|
468
543
|
"""
|
|
@@ -572,32 +647,24 @@ class dataprocessing:
|
|
|
572
647
|
return merged_df
|
|
573
648
|
|
|
574
649
|
def convert_us_to_uk_dates(self, df, date_col):
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
month, day = day, month
|
|
594
|
-
return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
|
|
595
|
-
else:
|
|
596
|
-
# Handle already correct or non-standard formats cautiously
|
|
597
|
-
return d
|
|
598
|
-
|
|
599
|
-
# Apply the fix to the specified column
|
|
600
|
-
df[date_col] = df[date_col].apply(lambda x: fix_date(x) if not pd.isnull(x) else x)
|
|
650
|
+
"""
|
|
651
|
+
Processes the date column of a DataFrame to remove hyphens and slashes,
|
|
652
|
+
and converts it to a datetime object.
|
|
653
|
+
|
|
654
|
+
Parameters:
|
|
655
|
+
df (pd.DataFrame): The DataFrame containing the date column.
|
|
656
|
+
date_col (str): The name of the date column.
|
|
657
|
+
|
|
658
|
+
Returns:
|
|
659
|
+
pd.DataFrame: The DataFrame with the processed date column.
|
|
660
|
+
"""
|
|
661
|
+
df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
|
|
662
|
+
df[date_col] = pd.to_datetime(
|
|
663
|
+
df[date_col].str.slice(0, 2) + '/' +
|
|
664
|
+
df[date_col].str.slice(2, 4) + '/' +
|
|
665
|
+
df[date_col].str.slice(4, 8),
|
|
666
|
+
format='%m/%d/%Y'
|
|
667
|
+
)
|
|
601
668
|
return df
|
|
602
669
|
|
|
603
670
|
def combine_sheets(self, all_sheets):
|
|
@@ -619,61 +686,2666 @@ class dataprocessing:
|
|
|
619
686
|
|
|
620
687
|
return combined_df
|
|
621
688
|
|
|
622
|
-
def
|
|
623
|
-
|
|
624
|
-
|
|
689
|
+
def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
|
|
690
|
+
"""
|
|
691
|
+
Provides the ability to create pivot tables, filtering the data to get to data you want and then pivoting on certain columns
|
|
692
|
+
|
|
693
|
+
Args:
|
|
694
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
695
|
+
index_col (str): Name of Column for your pivot table to index on
|
|
696
|
+
columns (str): Name of Columns for your pivot table.
|
|
697
|
+
values_col (str): Name of Values Columns for your pivot table.
|
|
698
|
+
filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
|
|
699
|
+
fill_value (int, optional): The value to replace nan with. Defaults to 0.
|
|
700
|
+
aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
|
|
701
|
+
margins (bool, optional): Whether the pivot table needs a total rows and column. Defaults to False.
|
|
702
|
+
margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
|
|
703
|
+
datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to False.
|
|
704
|
+
reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
|
|
705
|
+
fill_missing_weekly_dates (bool, optional): Fills in any weekly missing dates. Defaults to False.
|
|
706
|
+
week_commencing (str,optional): Fills in missing weeks if option is specified. Defaults to 'W-MON'.
|
|
707
|
+
|
|
708
|
+
Returns:
|
|
709
|
+
pandas.DataFrame: The pivot table specified
|
|
710
|
+
"""
|
|
625
711
|
|
|
626
|
-
#
|
|
627
|
-
if
|
|
628
|
-
|
|
629
|
-
|
|
712
|
+
# Validate inputs
|
|
713
|
+
if index_col not in df.columns:
|
|
714
|
+
raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
|
|
715
|
+
if columns not in df.columns:
|
|
716
|
+
raise ValueError(f"columns '{columns}' not found in DataFrame.")
|
|
717
|
+
if values_col not in df.columns:
|
|
718
|
+
raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
|
|
719
|
+
|
|
720
|
+
# Apply filters if provided
|
|
721
|
+
if filters_dict:
|
|
722
|
+
df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
|
|
630
723
|
else:
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
724
|
+
df_filtered = df.copy()
|
|
725
|
+
|
|
726
|
+
# Ensure index column is in datetime format if needed
|
|
727
|
+
if datetime_trans_needed:
|
|
728
|
+
df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
|
|
729
|
+
|
|
730
|
+
# Create the pivot table
|
|
731
|
+
pivoted_df = df_filtered.pivot_table(
|
|
732
|
+
index=index_col,
|
|
733
|
+
columns=columns,
|
|
734
|
+
values=values_col,
|
|
735
|
+
aggfunc=aggfunc,
|
|
736
|
+
margins=margins,
|
|
737
|
+
margins_name=margins_name,
|
|
738
|
+
)
|
|
739
|
+
|
|
740
|
+
# Handle column headers
|
|
635
741
|
if isinstance(pivoted_df.columns, pd.MultiIndex):
|
|
636
|
-
pivoted_df.columns = [
|
|
742
|
+
pivoted_df.columns = [
|
|
743
|
+
"_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
|
|
744
|
+
for col in pivoted_df.columns.values
|
|
745
|
+
]
|
|
637
746
|
else:
|
|
638
747
|
pivoted_df.columns = pivoted_df.columns.map(str)
|
|
748
|
+
|
|
749
|
+
# Reset the index
|
|
750
|
+
pivoted_df.reset_index(inplace=True)
|
|
751
|
+
|
|
752
|
+
# Handle sorting and formatting of index column
|
|
753
|
+
if datetime_trans_needed:
|
|
754
|
+
pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
|
|
755
|
+
pivoted_df.sort_values(by=index_col, inplace=True)
|
|
756
|
+
pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
|
|
757
|
+
|
|
758
|
+
# Fill missing values
|
|
759
|
+
pivoted_df.fillna(fill_value, inplace=True)
|
|
760
|
+
|
|
761
|
+
# Fill missing weekly dates if specified
|
|
762
|
+
if fill_missing_weekly_dates:
|
|
763
|
+
pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
|
|
764
|
+
|
|
765
|
+
return pivoted_df
|
|
766
|
+
|
|
767
|
+
def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
|
|
768
|
+
"""
|
|
769
|
+
Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
|
|
770
|
+
|
|
771
|
+
Parameters:
|
|
772
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
773
|
+
col_names (list of str): these are the columns which are used for the lookup. One column or several columns can be inputted as a list, provided there is a merged column to lookup on. If there are multiple columns to look up on then a merged column must be inputted as the key of the dictionary of format e.g. col1|col2|col3
|
|
774
|
+
to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
|
|
775
|
+
if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
|
|
776
|
+
new_column_name (str, optional): name of new column. Defaults to "Mapping".
|
|
777
|
+
|
|
778
|
+
Returns:
|
|
779
|
+
pandas.DataFrame: DataFrame with a new column containing the look up table results.
|
|
780
|
+
"""
|
|
781
|
+
|
|
782
|
+
# Create regex pattern with word boundaries from the dictionary
|
|
783
|
+
regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())
|
|
784
|
+
|
|
785
|
+
# Preprocess DataFrame if multiple columns
|
|
786
|
+
if len(col_names) > 1:
|
|
787
|
+
df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
|
|
788
|
+
col_to_use = "Merged"
|
|
789
|
+
else:
|
|
790
|
+
col_to_use = col_names[0]
|
|
791
|
+
|
|
792
|
+
# Extract the first match using the regex pattern
|
|
793
|
+
matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
|
|
794
|
+
|
|
795
|
+
# Map the matches to the corresponding values in the dictionary
|
|
796
|
+
df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
|
|
797
|
+
|
|
798
|
+
# Drop intermediate column if created
|
|
799
|
+
if len(col_names) > 1:
|
|
800
|
+
df.drop(columns=["Merged"], inplace=True)
|
|
801
|
+
|
|
802
|
+
return df
|
|
803
|
+
|
|
804
|
+
def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
|
|
805
|
+
"""
|
|
806
|
+
Aggregates daily data into weekly data, starting on a specified day of the week,
|
|
807
|
+
and groups the data by additional specified columns. It aggregates specified numeric columns
|
|
808
|
+
by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
|
|
809
|
+
of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
|
|
810
|
+
The day column is renamed from 'Day' to 'OBS'.
|
|
811
|
+
|
|
812
|
+
Parameters:
|
|
813
|
+
- df: pandas DataFrame
|
|
814
|
+
The input DataFrame containing daily data.
|
|
815
|
+
- date_column: string
|
|
816
|
+
The name of the column in the DataFrame that contains date information.
|
|
817
|
+
- group_columns: list of strings
|
|
818
|
+
Additional column names to group by along with the weekly grouping.
|
|
819
|
+
- sum_columns: list of strings
|
|
820
|
+
Numeric column names to be aggregated during aggregation.
|
|
821
|
+
- wc: string
|
|
822
|
+
The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
|
|
823
|
+
- aggregation: string, optional (default 'sum')
|
|
824
|
+
Aggregation method, either 'sum', 'average', or 'count'.
|
|
825
|
+
- include_totals: boolean, optional (default False)
|
|
826
|
+
If True, include total columns for each sum_column.
|
|
827
|
+
|
|
828
|
+
Returns:
|
|
829
|
+
- pandas DataFrame
|
|
830
|
+
A new DataFrame with weekly aggregated data. The index is reset,
|
|
831
|
+
and columns represent the grouped and aggregated metrics. The DataFrame
|
|
832
|
+
is in wide format, with separate columns for each combination of
|
|
833
|
+
grouped metrics.
|
|
834
|
+
"""
|
|
639
835
|
|
|
640
|
-
|
|
641
|
-
pivoted_df = pivoted_df.reset_index()
|
|
836
|
+
grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
|
|
642
837
|
|
|
643
|
-
#
|
|
644
|
-
|
|
645
|
-
|
|
838
|
+
# Pivot the data to wide format
|
|
839
|
+
if group_columns:
|
|
840
|
+
wide_df = grouped.pivot_table(index='OBS',
|
|
841
|
+
columns=group_columns,
|
|
842
|
+
values=sum_columns,
|
|
843
|
+
aggfunc='first')
|
|
844
|
+
# Flatten the multi-level column index and create combined column names
|
|
845
|
+
wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
|
|
846
|
+
else:
|
|
847
|
+
wide_df = grouped.set_index('OBS')
|
|
848
|
+
|
|
849
|
+
# Fill NaN values with 0
|
|
850
|
+
wide_df = wide_df.fillna(0)
|
|
851
|
+
|
|
852
|
+
# Adding total columns for each unique sum_column, if include_totals is True
|
|
853
|
+
if include_totals:
|
|
854
|
+
for col in sum_columns:
|
|
855
|
+
total_column_name = f'Total {col}'
|
|
856
|
+
if group_columns:
|
|
857
|
+
columns_to_sum = [column for column in wide_df.columns if col in column]
|
|
858
|
+
else:
|
|
859
|
+
columns_to_sum = [col]
|
|
860
|
+
wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
|
|
861
|
+
|
|
862
|
+
# Reset the index of the final DataFrame
|
|
863
|
+
wide_df = wide_df.reset_index()
|
|
864
|
+
|
|
865
|
+
return wide_df
|
|
866
|
+
|
|
867
|
+
def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
|
|
868
|
+
"""
|
|
869
|
+
Creates a new column in the dataframe that merges 2 or more columns together with a "_" seperator, possibly to be used for a look up table where multiple columns are being looked up
|
|
870
|
+
|
|
871
|
+
Parameters:
|
|
872
|
+
df (pandas.DataFrame): Dataframe to make changes to.
|
|
873
|
+
col_names (list): list of columm names ot merge.
|
|
874
|
+
seperator (str, optional): Name of column outputted. Defaults to "_".
|
|
875
|
+
output_column_name (str, optional): Name of column outputted. Defaults to "Merged".
|
|
876
|
+
starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
|
|
877
|
+
ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
|
|
878
|
+
|
|
879
|
+
Raises:
|
|
880
|
+
ValueError: if more less than two column names are inputted in the list there is nothing to merge on
|
|
881
|
+
|
|
882
|
+
Returns:
|
|
883
|
+
pandas.DataFrame: DataFrame with additional merged column
|
|
884
|
+
"""
|
|
885
|
+
# Specify more than one column must be entered
|
|
886
|
+
if len(col_names) < 2:
|
|
887
|
+
raise ValueError("2 or more columns must be specified to merge")
|
|
646
888
|
|
|
647
|
-
#
|
|
648
|
-
|
|
889
|
+
# Create a new column with the merged columns
|
|
890
|
+
df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
|
|
891
|
+
|
|
892
|
+
# Add string before
|
|
893
|
+
if starting_prefix_str is not None:
|
|
894
|
+
df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
|
|
649
895
|
|
|
650
|
-
#
|
|
651
|
-
|
|
896
|
+
# Add string after
|
|
897
|
+
if ending_prefix_str is not None:
|
|
898
|
+
df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
|
|
899
|
+
|
|
900
|
+
return df
|
|
901
|
+
|
|
902
|
+
def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
|
|
903
|
+
"""
|
|
904
|
+
Checks the sum of two different dataframe column or columns are equal
|
|
905
|
+
|
|
906
|
+
Parameters:
|
|
907
|
+
df_1 (pandas.DataFrame): First dataframe for columnsa to be summed on.
|
|
908
|
+
df_2 (pandas.DataFrame): Second dataframe for columnsa to be summed on.
|
|
909
|
+
cols_1 (list of str): Columns from first dataframe to sum.
|
|
910
|
+
cols_2 (list of str): Columns from second dataframe to sum.
|
|
911
|
+
|
|
912
|
+
Returns:
|
|
913
|
+
Tuple: Answer is the true or false answer to whether sums are the same, df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
|
|
914
|
+
"""
|
|
915
|
+
# Find the sum of both sets of columns
|
|
916
|
+
df_1_sum = df_1[cols_1].sum().sum()
|
|
917
|
+
df_2_sum = df_2[cols_2].sum().sum()
|
|
652
918
|
|
|
653
|
-
|
|
919
|
+
# If the the two columns are
|
|
920
|
+
if df_1_sum == df_2_sum:
|
|
921
|
+
Answer = "They are equal"
|
|
922
|
+
if df_1_sum != df_2_sum:
|
|
923
|
+
Answer = "They are different by " + str(df_2_sum-df_1_sum)
|
|
924
|
+
|
|
925
|
+
return Answer,df_1_sum,df_2_sum
|
|
926
|
+
|
|
927
|
+
def convert_2_df_cols_to_dict(self, df, key_col, value_col):
|
|
928
|
+
"""
|
|
929
|
+
Create a dictionary mapping from two columns of a DataFrame.
|
|
930
|
+
|
|
931
|
+
Parameters:
|
|
932
|
+
df (pd.DataFrame): The DataFrame containing the data.
|
|
933
|
+
key_col (str): The column name to use as keys in the dictionary.
|
|
934
|
+
value_col (str): The column name to use as values in the dictionary.
|
|
935
|
+
|
|
936
|
+
Returns:
|
|
937
|
+
dict: A dictionary with keys from 'key_col' and values from 'value_col'.
|
|
938
|
+
"""
|
|
939
|
+
if key_col not in df or value_col not in df:
|
|
940
|
+
raise ValueError("Specified columns are not in the DataFrame")
|
|
941
|
+
|
|
942
|
+
return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
|
|
654
943
|
|
|
655
|
-
def
|
|
944
|
+
def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
|
|
945
|
+
"""
|
|
946
|
+
Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
|
|
947
|
+
|
|
948
|
+
Parameters:
|
|
949
|
+
df (pandas.DataFrame): Dataframe to operate on.
|
|
950
|
+
index_col (str): Name of the column to use for datetime
|
|
951
|
+
start_date (str): String used to specify the start date of an FY specified, needs to be of format "yyyy-mm-dd" e.g. 2021-11-31
|
|
952
|
+
starting_FY (str): String used to specify which FY the start date refers to, needs to be formatted LONG e.g. FY2021
|
|
953
|
+
short_format (str, optional): String used to specify if short format is desired (e.g. FY21) or if long format is desired (e.g. FY2021). Defaults to "No".
|
|
954
|
+
half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
|
|
955
|
+
combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".
|
|
956
|
+
|
|
957
|
+
Returns:
|
|
958
|
+
pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
|
|
656
959
|
"""
|
|
657
|
-
Classify entries in a DataFrame column based on a dictionary of substrings to class mappings.
|
|
658
960
|
|
|
961
|
+
try:
|
|
962
|
+
start_date = datetime.strptime(start_date, '%Y-%m-%d')
|
|
963
|
+
except ValueError:
|
|
964
|
+
print("Error: Date must be of format yyyy-mm-dd")
|
|
965
|
+
return df
|
|
966
|
+
|
|
967
|
+
df["OBS"] = pd.to_datetime(df[index_col])
|
|
968
|
+
df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
|
|
969
|
+
|
|
970
|
+
df[index_col] = pd.to_datetime(df[index_col])
|
|
971
|
+
|
|
972
|
+
start_year = int(starting_FY[2:])
|
|
973
|
+
|
|
974
|
+
def calculate_FY_vectorized(date_series):
|
|
975
|
+
years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
|
|
976
|
+
fy = 'FY' + (start_year + years_since_start).astype(str)
|
|
977
|
+
if short_format == "Yes":
|
|
978
|
+
fy = 'FY' + fy.str[-2:]
|
|
979
|
+
return fy
|
|
980
|
+
|
|
981
|
+
df['FY'] = calculate_FY_vectorized(df[index_col])
|
|
982
|
+
|
|
983
|
+
if half_years == "Yes" or combined_FY_and_H == "Yes":
|
|
984
|
+
def calculate_half_year_vectorized(date_series):
|
|
985
|
+
fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
|
|
986
|
+
fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
|
|
987
|
+
fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
|
|
988
|
+
half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
|
|
989
|
+
return half_year
|
|
990
|
+
|
|
991
|
+
df['Half Years'] = calculate_half_year_vectorized(df[index_col])
|
|
992
|
+
|
|
993
|
+
if combined_FY_and_H == "Yes":
|
|
994
|
+
df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
|
|
995
|
+
|
|
996
|
+
return df
|
|
997
|
+
|
|
998
|
+
def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
|
|
999
|
+
"""
|
|
1000
|
+
This function updates values in a specified column of the DataFrame based on a lookup dictionary.
|
|
1001
|
+
It first merges several columns into a new 'Merged' column, then uses this merged column to determine
|
|
1002
|
+
if replacements are needed based on the dictionary.
|
|
1003
|
+
|
|
659
1004
|
Parameters:
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
1005
|
+
df (pd.DataFrame): The DataFrame to process.
|
|
1006
|
+
col (str): The name of the column whose values are potentially replaced.
|
|
1007
|
+
replacement_rows (str): The specific value in 'col' to check for replacements.
|
|
1008
|
+
cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
|
|
1009
|
+
replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
|
|
1010
|
+
output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
|
|
1011
|
+
|
|
1012
|
+
Returns:
|
|
1013
|
+
pd.DataFrame: The modified DataFrame with updated values in the specified column.
|
|
1014
|
+
"""
|
|
1015
|
+
# Create a merged column from specified columns
|
|
1016
|
+
df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
|
|
1017
|
+
|
|
1018
|
+
# Replace values in the specified column based on the lookup
|
|
1019
|
+
def replace_values(x):
|
|
1020
|
+
if x[col] == replacement_rows:
|
|
1021
|
+
merged_value = x['Merged']
|
|
1022
|
+
if merged_value in replacement_lookup_dict:
|
|
1023
|
+
return replacement_lookup_dict[merged_value]
|
|
1024
|
+
return x[col]
|
|
664
1025
|
|
|
1026
|
+
# Apply replacement logic
|
|
1027
|
+
df[output_column_name] = df.apply(replace_values, axis=1)
|
|
1028
|
+
|
|
1029
|
+
# Drop the intermediate 'Merged' column
|
|
1030
|
+
df.drop(columns=['Merged'], inplace=True)
|
|
1031
|
+
|
|
1032
|
+
return df
|
|
1033
|
+
|
|
1034
|
+
def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
|
|
1035
|
+
"""
|
|
1036
|
+
Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
|
|
1037
|
+
The lookup is based on a column in the dataframe. Can only input one column and output one new column.
|
|
1038
|
+
|
|
1039
|
+
Parameters:
|
|
1040
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
1041
|
+
keys_col (str): The name of the column which the LUT will be refercing to ouput a value.
|
|
1042
|
+
value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
|
|
1043
|
+
dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
|
|
1044
|
+
new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
|
|
1045
|
+
|
|
665
1046
|
Returns:
|
|
666
|
-
|
|
1047
|
+
pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
|
|
667
1048
|
"""
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
1049
|
+
|
|
1050
|
+
# Extract columns to change using new dictionary
|
|
1051
|
+
smaller_df = df[[keys_col,value_col]]
|
|
1052
|
+
|
|
1053
|
+
# Use the new dictionary to create a new LUT
|
|
1054
|
+
smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
|
|
1055
|
+
|
|
1056
|
+
# In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
|
|
1057
|
+
smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
|
|
1058
|
+
|
|
1059
|
+
# Drop the extra unecessary cols
|
|
1060
|
+
smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
|
|
1061
|
+
|
|
1062
|
+
# # Output dataframes as dictionary to be used in a LUT
|
|
1063
|
+
new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
|
|
675
1064
|
|
|
676
|
-
#
|
|
677
|
-
|
|
1065
|
+
# # Use new dictionary to create a new version of an old column
|
|
1066
|
+
df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
|
|
678
1067
|
|
|
679
|
-
return
|
|
1068
|
+
return df_final
|
|
1069
|
+
|
|
1070
|
+
def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
|
|
1071
|
+
"""
|
|
1072
|
+
Changes a dataframe from wide to long format.
|
|
1073
|
+
|
|
1074
|
+
Args:
|
|
1075
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
1076
|
+
value_cols (list of str or str if only one): List of column names to transform from several columns into one.
|
|
1077
|
+
variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
|
|
1078
|
+
value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.
|
|
1079
|
+
|
|
1080
|
+
Returns:
|
|
1081
|
+
pandas.DataFrame: DataFrame transformed from wide to long format.
|
|
1082
|
+
|
|
1083
|
+
Raises:
|
|
1084
|
+
ValueError: If the number of columns to depivot is less than 2.
|
|
1085
|
+
"""
|
|
1086
|
+
# Check length of value_cols is greater than 1
|
|
1087
|
+
if len(value_cols) < 2:
|
|
1088
|
+
raise ValueError("Number of inputs in list must be greater than 1")
|
|
1089
|
+
|
|
1090
|
+
# Find the columns that are not to be depivoted into one column
|
|
1091
|
+
id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
|
|
1092
|
+
|
|
1093
|
+
# Melt all columns chosen into one column
|
|
1094
|
+
df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
|
|
1095
|
+
|
|
1096
|
+
# Sort column order to match expected output
|
|
1097
|
+
ordered_columns = id_vars + [variable_col_name, value_col_name]
|
|
1098
|
+
df_final = df_final[ordered_columns]
|
|
1099
|
+
|
|
1100
|
+
return df_final
|
|
1101
|
+
|
|
1102
|
+
def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
|
|
1103
|
+
"""
|
|
1104
|
+
Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe
|
|
1105
|
+
|
|
1106
|
+
Args:
|
|
1107
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
1108
|
+
filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
|
|
1109
|
+
col_to_change (str): String name of column to edit
|
|
1110
|
+
new_value (any): Value of new input for cell
|
|
1111
|
+
change_in_existing_df_col (str, optional): Input of Yes or No to describe whether to make the change in an existing column. Defaults to "No".
|
|
1112
|
+
new_col_to_change_name (str, optional): Name of the new column to copy the column being edited into and to make the change in. Defaults to 'New'.
|
|
1113
|
+
manual_edit_col_name (str, optional): Name of the current manual edits column, if one is not specified it will be created. Defaults to None.
|
|
1114
|
+
add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
|
|
1115
|
+
existing_note_col_name (str, optional): If there is an existing notes column this can be specified. Defaults to None.
|
|
1116
|
+
note (str), optional): The string of the note to be added to the column. Defaults to None.
|
|
1117
|
+
|
|
1118
|
+
Raises:
|
|
1119
|
+
TypeError: The column for the column to change can only be specified as one column as it is a string not a list
|
|
1120
|
+
ValueError: You can only input the values of "Yes" or "No" for whether to make the change in existing column
|
|
1121
|
+
ValueError: You can only input the values of "Yes" or "No" for whether to make a new notes column
|
|
1122
|
+
|
|
1123
|
+
Returns:
|
|
1124
|
+
pandas.DataFrame: Dataframe with manual changes added
|
|
1125
|
+
"""
|
|
1126
|
+
|
|
1127
|
+
# Raise type error if more than one col is supported
|
|
1128
|
+
if isinstance(col_to_change, list):
|
|
1129
|
+
raise TypeError("Col to change must be specified as a string, not a list")
|
|
1130
|
+
|
|
1131
|
+
# Raises value error if input is invalid for change_in_existing_df_col
|
|
1132
|
+
if change_in_existing_df_col not in ["Yes", "No"]:
|
|
1133
|
+
raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
|
|
1134
|
+
|
|
1135
|
+
# Raises value error if input is invalid for add_notes_col
|
|
1136
|
+
if add_notes not in ["Yes", "No"]:
|
|
1137
|
+
raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
|
|
1138
|
+
|
|
1139
|
+
# Validate filters_dict format
|
|
1140
|
+
for col, cond in filters_dict.items():
|
|
1141
|
+
if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
|
|
1142
|
+
raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
|
|
1143
|
+
|
|
1144
|
+
# Create the filtered df by applying the conditions
|
|
1145
|
+
df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
|
|
1146
|
+
|
|
1147
|
+
# Create a new column to add the changes if desired, else edit in the current chosen column
|
|
1148
|
+
col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
|
|
1149
|
+
if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
|
|
1150
|
+
df = df.copy()
|
|
1151
|
+
df[new_col_to_change_name] = df[col_to_change]
|
|
1152
|
+
|
|
1153
|
+
# Update the new cell in the chosen column
|
|
1154
|
+
df.loc[df_filtered.index, col_to_update] = new_value
|
|
1155
|
+
|
|
1156
|
+
# Add in manual edit column if desired or specify where one already is
|
|
1157
|
+
if manual_edit_col_name:
|
|
1158
|
+
if manual_edit_col_name not in df.columns:
|
|
1159
|
+
df[manual_edit_col_name] = 0
|
|
1160
|
+
df.loc[df_filtered.index, manual_edit_col_name] = 1
|
|
1161
|
+
elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
|
|
1162
|
+
df['Manual Changes'] = 0
|
|
1163
|
+
df.loc[df_filtered.index, 'Manual Changes'] = 1
|
|
1164
|
+
|
|
1165
|
+
# Add note if desired in new column or an existing column
|
|
1166
|
+
if add_notes == "Yes":
|
|
1167
|
+
note_col = existing_note_col_name if existing_note_col_name else 'Notes'
|
|
1168
|
+
if note_col not in df.columns:
|
|
1169
|
+
df[note_col] = None
|
|
1170
|
+
df.loc[df_filtered.index, note_col] = note
|
|
1171
|
+
|
|
1172
|
+
return df
|
|
1173
|
+
|
|
1174
|
+
def format_numbers_with_commas(self, df, decimal_length_chosen=2):
|
|
1175
|
+
"""
|
|
1176
|
+
Converts data in numerical format into numbers with commas and a chosen decimal place length.
|
|
1177
|
+
|
|
1178
|
+
Args:
|
|
1179
|
+
df (pandas.DataFrame): The DataFrame containing the data.
|
|
1180
|
+
decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
|
|
1181
|
+
|
|
1182
|
+
Returns:
|
|
1183
|
+
pandas.DataFrame: The DataFrame with the chosen updated format.
|
|
1184
|
+
"""
|
|
1185
|
+
def format_number_with_commas(x, decimal_length=decimal_length_chosen):
|
|
1186
|
+
if pd.isna(x): # Preserve None/NaN values
|
|
1187
|
+
return pd.NA # Explicitly normalize to pd.NA
|
|
1188
|
+
elif isinstance(x, (int, float)):
|
|
1189
|
+
if decimal_length is not None:
|
|
1190
|
+
format_str = f"{{:,.{decimal_length}f}}"
|
|
1191
|
+
return format_str.format(x)
|
|
1192
|
+
else:
|
|
1193
|
+
return f"{x:,}"
|
|
1194
|
+
else:
|
|
1195
|
+
return x # Return unchanged if not a number
|
|
1196
|
+
|
|
1197
|
+
# Apply formatting column by column
|
|
1198
|
+
formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
|
|
1199
|
+
|
|
1200
|
+
return formatted_df
|
|
1201
|
+
|
|
1202
|
+
def filter_df_on_multiple_conditions(self, df, filters_dict):
|
|
1203
|
+
"""
|
|
1204
|
+
Filter a dataframe based on mulitple conditions
|
|
1205
|
+
|
|
1206
|
+
Args:
|
|
1207
|
+
df (pandas.DatFrame): Dataframe to filter on
|
|
1208
|
+
filters_dict (dict): Dictionary with strings as conditions
|
|
1209
|
+
|
|
1210
|
+
Returns:
|
|
1211
|
+
pandas.DatFrame: Filtered Da
|
|
1212
|
+
"""
|
|
1213
|
+
mask = pd.Series(True, index=df.index)
|
|
1214
|
+
for col, cond in filters_dict.items():
|
|
1215
|
+
cond = cond.strip()
|
|
1216
|
+
operator, value = cond.split(maxsplit=1)
|
|
1217
|
+
|
|
1218
|
+
# If value is a string condition make sure to check if there are new lines
|
|
1219
|
+
if "'" in value:
|
|
1220
|
+
value = value.strip().strip("'\"")
|
|
1221
|
+
# If not a string e.g. datetime or number condition you need to transform the string into a value
|
|
1222
|
+
else:
|
|
1223
|
+
value = eval(value)
|
|
1224
|
+
|
|
1225
|
+
if operator == "==":
|
|
1226
|
+
temp_mask = (df[col] == value)
|
|
1227
|
+
elif operator == "!=":
|
|
1228
|
+
temp_mask = (df[col] != value)
|
|
1229
|
+
elif operator == ">=":
|
|
1230
|
+
temp_mask = (df[col] >= value)
|
|
1231
|
+
elif operator == "<=":
|
|
1232
|
+
temp_mask = (df[col] <= value)
|
|
1233
|
+
elif operator == ">":
|
|
1234
|
+
temp_mask = (df[col] > value)
|
|
1235
|
+
elif operator == "<":
|
|
1236
|
+
temp_mask = (df[col] < value)
|
|
1237
|
+
mask &= temp_mask
|
|
1238
|
+
|
|
1239
|
+
# Create the filtered df by applying the conditions
|
|
1240
|
+
df_filtered = df[mask]
|
|
1241
|
+
|
|
1242
|
+
return df_filtered
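# --- Usage sketch (illustrative only, not part of the original module). ---
# Shows the expected shape of the condition strings: "<operator> <value>",
# with string values quoted and numbers left bare (they are passed to eval()).
# The DataFrame and column names are hypothetical.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({
    "channel": ["tv", "radio", "tv", "search"],
    "spend":   [100,  50,      200,  10],
})
filtered = dp.filter_df_on_multiple_conditions(
    df, {"channel": "== 'tv'", "spend": ">= 150"}
)
# Keeps only the row where channel == 'tv' AND spend >= 150.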
|
|
1243
|
+
|
|
1244
|
+
def read_and_concatenate_files(self, folder_path, file_type='csv'):
|
|
1245
|
+
"""
|
|
1246
|
+
Reads all files of a specified type (CSV or XLSX) from a given folder
|
|
1247
|
+
and concatenates them into a single DataFrame.
|
|
1248
|
+
|
|
1249
|
+
Parameters:
|
|
1250
|
+
folder_path (str): The path to the folder containing the files.
|
|
1251
|
+
file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
|
|
1252
|
+
|
|
1253
|
+
Returns:
|
|
1254
|
+
pd.DataFrame: A DataFrame containing the concatenated data from all files.
|
|
1255
|
+
"""
|
|
1256
|
+
|
|
1257
|
+
# Initialize an empty list to hold dataframes
|
|
1258
|
+
dataframes = []
|
|
1259
|
+
|
|
1260
|
+
# Define file extension based on file_type
|
|
1261
|
+
if file_type == 'csv':
|
|
1262
|
+
extension = '.csv'
|
|
1263
|
+
elif file_type == 'xlsx':
|
|
1264
|
+
extension = '.xlsx'
|
|
1265
|
+
else:
|
|
1266
|
+
raise ValueError("file_type must be either 'csv' or 'xlsx'")
|
|
1267
|
+
|
|
1268
|
+
# Loop through all files in the folder
|
|
1269
|
+
for filename in os.listdir(folder_path):
|
|
1270
|
+
# Check if the file has the correct extension
|
|
1271
|
+
if filename.endswith(extension):
|
|
1272
|
+
file_path = os.path.join(folder_path, filename)
|
|
1273
|
+
# Read the file into a DataFrame
|
|
1274
|
+
if file_type == 'csv':
|
|
1275
|
+
df = pd.read_csv(file_path)
|
|
1276
|
+
elif file_type == 'xlsx':
|
|
1277
|
+
df = pd.read_excel(file_path)
|
|
1278
|
+
# Append the DataFrame to the list
|
|
1279
|
+
dataframes.append(df)
|
|
1280
|
+
|
|
1281
|
+
# Concatenate all DataFrames into a single DataFrame
|
|
1282
|
+
combined_df = pd.concat(dataframes, ignore_index=True)
|
|
1283
|
+
|
|
1284
|
+
return combined_df
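# --- Usage sketch (illustrative only, not part of the original module). ---
# The folder path below is a hypothetical placeholder.
dp = dataprocessing()
combined = dp.read_and_concatenate_files(r"C:\data\weekly_exports", file_type="csv")
# Every *.csv file in the folder is read and stacked into one DataFrame;
# columns are aligned by name and the original row indices are discarded.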
|
|
1285
|
+
|
|
1286
|
+
def remove_zero_values(self, data_frame, column_to_filter):
|
|
1287
|
+
"""
|
|
1288
|
+
Removes rows where the given column equals zero
|
|
1289
|
+
|
|
1290
|
+
Parameters:
|
|
1291
|
+
data_frame - input data frame
|
|
1292
|
+
column_to_filter - a column to filter out zero values from
|
|
1293
|
+
|
|
1294
|
+
Returns:
|
|
1295
|
+
Pandas data frame without zero values in the given column
|
|
1296
|
+
"""
|
|
1297
|
+
|
|
1298
|
+
# Drop rows where the chosen column equals zero
|
|
1299
|
+
return data_frame.loc[~(data_frame[column_to_filter] ==0)]
|
|
1300
|
+
|
|
1301
|
+
def upgrade_outdated_packages(self):
|
|
1302
|
+
try:
|
|
1303
|
+
# Get all installed packages
|
|
1304
|
+
installed_packages_result = subprocess.run("pip list --format=json", shell=True, capture_output=True, text=True)
|
|
1305
|
+
installed_packages = json.loads(installed_packages_result.stdout)
|
|
1306
|
+
|
|
1307
|
+
# Get the list of outdated packages
|
|
1308
|
+
outdated_packages_result = subprocess.run("pip list --outdated --format=json", shell=True, capture_output=True, text=True)
|
|
1309
|
+
outdated_packages = json.loads(outdated_packages_result.stdout)
|
|
1310
|
+
|
|
1311
|
+
# Create a set of outdated package names for quick lookup
|
|
1312
|
+
outdated_package_names = {pkg['name'] for pkg in outdated_packages}
|
|
1313
|
+
|
|
1314
|
+
# Upgrade only outdated packages
|
|
1315
|
+
for package in installed_packages:
|
|
1316
|
+
package_name = package['name']
|
|
1317
|
+
if package_name in outdated_package_names:
|
|
1318
|
+
try:
|
|
1319
|
+
print(f"Upgrading package: {package_name}")
|
|
1320
|
+
upgrade_result = subprocess.run(f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True)
|
|
1321
|
+
if upgrade_result.returncode == 0:
|
|
1322
|
+
print(f"Successfully upgraded {package_name}")
|
|
1323
|
+
else:
|
|
1324
|
+
print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
|
|
1325
|
+
except Exception as e:
|
|
1326
|
+
print(f"An error occurred while upgrading {package_name}: {e}")
|
|
1327
|
+
else:
|
|
1328
|
+
print(f"{package_name} is already up to date")
|
|
1329
|
+
except Exception as e:
|
|
1330
|
+
print(f"An error occurred during the upgrade process: {e}")
|
|
1331
|
+
|
|
1332
|
+
def convert_mixed_formats_dates(self, df, column_name):
|
|
1333
|
+
# Convert initial dates to datetime with coercion to handle errors
|
|
1334
|
+
df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
|
|
1335
|
+
df[column_name] = df[column_name].astype(str)
|
|
1336
|
+
corrected_dates = []
|
|
1337
|
+
|
|
1338
|
+
for date_str in df[column_name]:
|
|
1339
|
+
date_str = date_str.replace('-', '').replace('/', '')
|
|
1340
|
+
if len(date_str) == 8:
|
|
1341
|
+
year = date_str[:4]
|
|
1342
|
+
month = date_str[4:6]
|
|
1343
|
+
day = date_str[6:8]
|
|
1344
|
+
if int(day) <= 12:
|
|
1345
|
+
# Swap month and day
|
|
1346
|
+
corrected_date_str = f"{year}-{day}-{month}"
|
|
1347
|
+
else:
|
|
1348
|
+
corrected_date_str = f"{year}-{month}-{day}"
|
|
1349
|
+
# Convert to datetime
|
|
1350
|
+
corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
|
|
1351
|
+
else:
|
|
1352
|
+
corrected_date = pd.to_datetime(date_str, errors='coerce')
|
|
1353
|
+
|
|
1354
|
+
corrected_dates.append(corrected_date)
|
|
1355
|
+
|
|
1356
|
+
# Check length of the corrected_dates list
|
|
1357
|
+
if len(corrected_dates) != len(df):
|
|
1358
|
+
raise ValueError("Length of corrected_dates does not match the original DataFrame")
|
|
1359
|
+
|
|
1360
|
+
# Assign the corrected dates back to the DataFrame
|
|
1361
|
+
df[column_name] = corrected_dates
|
|
1362
|
+
return df
|
|
1363
|
+
|
|
1364
|
+
def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
|
|
1365
|
+
# Ensure the date column is in datetime format
|
|
1366
|
+
df[date_column] = pd.to_datetime(df[date_column])
|
|
1367
|
+
|
|
1368
|
+
# Generate the full date range with the specified frequency
|
|
1369
|
+
full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
|
|
1370
|
+
|
|
1371
|
+
# Create a new dataframe with the full date range
|
|
1372
|
+
full_date_df = pd.DataFrame({date_column: full_date_range})
|
|
1373
|
+
|
|
1374
|
+
# Merge the original dataframe with the new full date range dataframe
|
|
1375
|
+
df_full = full_date_df.merge(df, on=date_column, how='left')
|
|
1376
|
+
|
|
1377
|
+
# Fill missing values with 0
|
|
1378
|
+
df_full.fillna(0, inplace=True)
|
|
1379
|
+
|
|
1380
|
+
return df_full
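# --- Usage sketch (illustrative only, not part of the original module). ---
# Demonstrates filling in a missing week; the data below are invented.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({
    "OBS":   pd.to_datetime(["2024-01-01", "2024-01-15"]),  # 2024-01-08 missing
    "sales": [120, 90],
})
full = dp.fill_weekly_date_range(df, "OBS", freq="W-MON")
# Adds the missing 2024-01-08 row, with "sales" filled as 0.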
|
|
1381
|
+
|
|
1382
|
+
def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
|
|
1383
|
+
"""
|
|
1384
|
+
Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.
|
|
1385
|
+
|
|
1386
|
+
Args:
|
|
1387
|
+
df (pd.DataFrame): The DataFrame whose column names will be modified.
|
|
1388
|
+
prefix (str, optional): The prefix to add to each column name. Default is an empty string.
|
|
1389
|
+
suffix (str, optional): The suffix to add to each column name. Default is an empty string.
|
|
1390
|
+
date_col (str, optional): The name of the column to exclude from adding prefix and suffix, typically a date column. Default is None.
|
|
1391
|
+
|
|
1392
|
+
Returns:
|
|
1393
|
+
pd.DataFrame: The DataFrame with updated column names.
|
|
1394
|
+
"""
|
|
1395
|
+
|
|
1396
|
+
# If there is no date column
|
|
1397
|
+
if date_col is None:
|
|
1398
|
+
# Add prefixes and suffixes to all columns
|
|
1399
|
+
df.columns = [prefix + col + suffix for col in df.columns]
|
|
1400
|
+
else:
|
|
1401
|
+
# Add prefixes and suffixes to all columns except the date column
|
|
1402
|
+
df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]
|
|
1403
|
+
|
|
1404
|
+
return df
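# --- Usage sketch (illustrative only, not part of the original module). ---
# Column names and values are invented for demonstration.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-01"]), "spend": [100], "clicks": [5]})
df = dp.add_prefix_and_suffix(df, prefix="media_", suffix="_gbp", date_col="OBS")
# Columns become: OBS, media_spend_gbp, media_clicks_gbp (the date column is left untouched).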
|
|
1405
|
+
|
|
1406
|
+
def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
|
|
1407
|
+
"""
|
|
1408
|
+
Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
|
|
1409
|
+
Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
|
|
1410
|
+
|
|
1411
|
+
Args:
|
|
1412
|
+
df (pd.DataFrame): The DataFrame to process.
|
|
1413
|
+
date_col (str, optional): The column name to exclude from the dummy conversion, typically a date column. Default is None.
|
|
1414
|
+
dummy_threshold (int, optional): The threshold value; values greater than this become 1, others become 0. Default is 0.
|
|
1415
|
+
add_total_dummy_col (str, optional): If set to any value other than 'No', adds a column that contains the max value (1 or 0) for each row. Default is 'No'.
|
|
1416
|
+
total_col_name (str, optional): The name of the total column to add if add_total_dummy_col is not 'No'. Default is 'total'.
|
|
1417
|
+
|
|
1418
|
+
Returns:
|
|
1419
|
+
pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
|
|
1420
|
+
"""
|
|
1421
|
+
|
|
1422
|
+
# If there is no date column
|
|
1423
|
+
if date_col is None:
|
|
1424
|
+
df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
|
|
1425
|
+
|
|
1426
|
+
if add_total_dummy_col != 'No':
|
|
1427
|
+
# Find max value of rows
|
|
1428
|
+
df[total_col_name] = df.max(axis=1)
|
|
1429
|
+
|
|
1430
|
+
# If there is a date column
|
|
1431
|
+
else:
|
|
1432
|
+
# Create dummies for all columns except the date column
|
|
1433
|
+
df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
|
|
1434
|
+
lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
|
|
1435
|
+
)
|
|
1436
|
+
|
|
1437
|
+
if add_total_dummy_col != 'No':
|
|
1438
|
+
# Find max value of rows
|
|
1439
|
+
df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)
|
|
1440
|
+
|
|
1441
|
+
return df
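# --- Usage sketch (illustrative only, not part of the original module). ---
# The channel columns and spend figures are invented for demonstration.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({
    "OBS":   pd.to_datetime(["2024-01-01", "2024-01-08"]),
    "tv":    [0.0, 250.0],
    "radio": [30.0, 0.0],
})
dummies = dp.create_dummies(df, date_col="OBS", dummy_threshold=0,
                            add_total_dummy_col="Yes", total_col_name="any_media")
# tv/radio become 1 wherever spend > 0, and "any_media" is 1 if any channel is active that week.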
|
|
1442
|
+
|
|
1443
|
+
def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
|
|
1444
|
+
"""
|
|
1445
|
+
Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
|
|
1446
|
+
Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.
|
|
1447
|
+
|
|
1448
|
+
Args:
|
|
1449
|
+
df (pd.DataFrame): The DataFrame containing the column to modify.
|
|
1450
|
+
column (str): The column name where the replacements will be made.
|
|
1451
|
+
replacements (dict): A dictionary where keys are substrings to replace and values are the replacement strings.
|
|
1452
|
+
to_lower (bool, optional): If True, the column values will be converted to lowercase before applying replacements. Default is False.
|
|
1453
|
+
new_column (str, optional): If provided, the replacements will be applied to this new column. If None, the existing column will be modified. Default is None.
|
|
1454
|
+
|
|
1455
|
+
Returns:
|
|
1456
|
+
pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
|
|
1457
|
+
"""
|
|
1458
|
+
if new_column is not None:
|
|
1459
|
+
# Create a new column for replacements
|
|
1460
|
+
df[new_column] = df[column]
|
|
1461
|
+
temp_column = new_column
|
|
1462
|
+
else:
|
|
1463
|
+
# Modify the existing column
|
|
1464
|
+
temp_column = column
|
|
1465
|
+
|
|
1466
|
+
# Optionally convert to lowercase
|
|
1467
|
+
if to_lower:
|
|
1468
|
+
df[temp_column] = df[temp_column].str.lower()
|
|
1469
|
+
|
|
1470
|
+
# Apply substring replacements
|
|
1471
|
+
for old, new in replacements.items():
|
|
1472
|
+
df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
|
|
1473
|
+
|
|
1474
|
+
return df
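# --- Usage sketch (illustrative only, not part of the original module). ---
# The campaign names and replacement mapping are invented for demonstration.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({"campaign": ["UK_Search_Brand", "UK_Social_Prospecting"]})
df = dp.replace_substrings(
    df, "campaign", {"uk_": "", "_": " "}, to_lower=True, new_column="campaign_clean"
)
# campaign_clean -> "search brand", "social prospecting"; the original column is preserved.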
|
|
1475
|
+
|
|
1476
|
+
def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
|
|
1477
|
+
"""
|
|
1478
|
+
Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.
|
|
1479
|
+
|
|
1480
|
+
Args:
|
|
1481
|
+
df (pd.DataFrame): The DataFrame to modify.
|
|
1482
|
+
exclude_col (str, optional): The column name to exclude from the sum. Default is None.
|
|
1483
|
+
total_col_name (str, optional): The name of the new total column. Default is 'Total'.
|
|
1484
|
+
|
|
1485
|
+
Returns:
|
|
1486
|
+
pd.DataFrame: The DataFrame with an added total column.
|
|
1487
|
+
"""
|
|
1488
|
+
if exclude_col and exclude_col in df.columns:
|
|
1489
|
+
# Ensure the column to exclude exists before dropping
|
|
1490
|
+
df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
|
|
1491
|
+
else:
|
|
1492
|
+
# Sum across all columns if no column is specified to exclude
|
|
1493
|
+
df[total_col_name] = df.sum(axis=1)
|
|
1494
|
+
|
|
1495
|
+
return df
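# --- Usage sketch (illustrative only, not part of the original module). ---
# Column names and values are invented for demonstration.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-01"]), "tv": [100.0], "radio": [40.0]})
df = dp.add_total_column(df, exclude_col="OBS", total_col_name="total_spend")
# total_spend = 140.0, with the OBS date column excluded from the row sum.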
|
|
1496
|
+
|
|
1497
|
+
def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
|
|
1498
|
+
"""
|
|
1499
|
+
Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
|
|
1500
|
+
|
|
1501
|
+
Args:
|
|
1502
|
+
df (pd.DataFrame): The DataFrame containing the column to categorize.
|
|
1503
|
+
column_name (str): The name of the column in the DataFrame that contains the text data to categorize.
|
|
1504
|
+
category_dict (dict): A dictionary where keys are substrings to search for in the text and values are the categories to assign when a substring is found.
|
|
1505
|
+
new_col_name (str, optional): The name of the new column to be created in the DataFrame, which will hold the resulting categories. Default is 'Category'.
|
|
1506
|
+
other_label (str, optional): The name given to category if no substring from the dictionary is found in the cell
|
|
1507
|
+
|
|
1508
|
+
Returns:
|
|
1509
|
+
pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
|
|
1510
|
+
"""
|
|
1511
|
+
|
|
1512
|
+
def categorize_text(text):
|
|
1513
|
+
"""
|
|
1514
|
+
Assigns a category to a single text string based on the presence of substrings from a dictionary.
|
|
1515
|
+
|
|
1516
|
+
Args:
|
|
1517
|
+
text (str): The text string to categorize.
|
|
1518
|
+
|
|
1519
|
+
Returns:
|
|
1520
|
+
str: The category assigned based on the first matching substring found in the text. If no
|
|
1521
|
+
matching substring is found, returns other_label.
|
|
1522
|
+
"""
|
|
1523
|
+
for key, category in category_dict.items():
|
|
1524
|
+
if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
|
|
1525
|
+
return category
|
|
1526
|
+
return other_label # Default category if no match is found
|
|
1527
|
+
|
|
1528
|
+
# Apply the categorize_text function to each element in the specified column
|
|
1529
|
+
df[new_col_name] = df[column_name].apply(categorize_text)
|
|
1530
|
+
return df
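# --- Usage sketch (illustrative only, not part of the original module). ---
# The placement names and lookup table are invented for demonstration.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({"placement": ["YouTube_Skippable", "Meta_Feed", "TikTok_TopView"]})
lookup = {"youtube": "Online Video", "meta": "Paid Social", "tiktok": "Paid Social"}
df = dp.apply_lookup_table_based_on_substring(df, "placement", lookup,
                                              new_col_name="channel_group",
                                              other_label="Other")
# channel_group -> "Online Video", "Paid Social", "Paid Social" (case-insensitive matching).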
|
|
1531
|
+
|
|
1532
|
+
def compare_overlap(self, df1, df2, date_col):
|
|
1533
|
+
"""
|
|
1534
|
+
Compare overlapping periods between two DataFrames and provide a summary of total differences.
|
|
1535
|
+
|
|
1536
|
+
Args:
|
|
1537
|
+
df1 (pandas.DataFrame): First DataFrame containing date-based data.
|
|
1538
|
+
df2 (pandas.DataFrame): Second DataFrame containing date-based data.
|
|
1539
|
+
date_col (str): The name of the date column used for aligning data.
|
|
1540
|
+
|
|
1541
|
+
Returns:
|
|
1542
|
+
tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
|
|
1543
|
+
"""
|
|
1544
|
+
# Ensure date columns are in datetime format
|
|
1545
|
+
df1[date_col] = pd.to_datetime(df1[date_col])
|
|
1546
|
+
df2[date_col] = pd.to_datetime(df2[date_col])
|
|
1547
|
+
|
|
1548
|
+
# Determine the overlap period
|
|
1549
|
+
start_date = max(df1[date_col].min(), df2[date_col].min())
|
|
1550
|
+
end_date = min(df1[date_col].max(), df2[date_col].max())
|
|
1551
|
+
|
|
1552
|
+
# Filter DataFrames to the overlapping period
|
|
1553
|
+
df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
|
|
1554
|
+
df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
|
|
1555
|
+
|
|
1556
|
+
# Merge the DataFrames on the date column
|
|
1557
|
+
merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
|
|
1558
|
+
|
|
1559
|
+
# Get common columns, excluding the date column
|
|
1560
|
+
common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
|
|
1561
|
+
|
|
1562
|
+
# Create a DataFrame for differences
|
|
1563
|
+
diff_df = pd.DataFrame({date_col: merged_df[date_col]})
|
|
1564
|
+
|
|
1565
|
+
total_diff_list = []
|
|
1566
|
+
for col in common_cols:
|
|
1567
|
+
diff_col = f'diff_{col}'
|
|
1568
|
+
diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
|
|
1569
|
+
|
|
1570
|
+
# Sum differences for the column
|
|
1571
|
+
total_diff = diff_df[diff_col].sum()
|
|
1572
|
+
total_diff_list.append({'Column': col, 'Total Difference': total_diff})
|
|
1573
|
+
|
|
1574
|
+
# Create summary DataFrame
|
|
1575
|
+
total_diff_df = pd.DataFrame(total_diff_list)
|
|
1576
|
+
|
|
1577
|
+
return diff_df, total_diff_df
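# --- Usage sketch (illustrative only, not part of the original module). ---
# Two toy extracts with one overlapping week; all values are invented.
import pandas as pd
dp = dataprocessing()
old = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-01", "2024-01-08"]), "sales": [100, 110]})
new = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-08", "2024-01-15"]), "sales": [105, 120]})
diff_df, summary_df = dp.compare_overlap(old, new, "OBS")
# Only 2024-01-08 overlaps: diff_sales = 110 - 105 = 5, and summary_df totals the difference per column.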
|
|
1578
|
+
|
|
1579
|
+
def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
|
|
1580
|
+
"""
|
|
1581
|
+
Convert a DataFrame's date column so that each date is mapped back
|
|
1582
|
+
to the 'week_commencing' day of the *current ISO week*.
|
|
1583
|
+
|
|
1584
|
+
Args:
|
|
1585
|
+
df (pandas.DataFrame): The DataFrame with date-based data.
|
|
1586
|
+
date_col (str): The name of the date column.
|
|
1587
|
+
week_commencing (str): The desired start of the week.
|
|
1588
|
+
('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday).
|
|
1589
|
+
Uses ISO day numbering (Mon=1, ..., Sun=7).
|
|
1590
|
+
|
|
1591
|
+
Returns:
|
|
1592
|
+
pandas.DataFrame: Original DataFrame with an extra column
|
|
1593
|
+
'week_start_<week_commencing>' containing the
|
|
1594
|
+
start-of-week date for each row.
|
|
1595
|
+
"""
|
|
1596
|
+
# ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
|
|
1597
|
+
iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
|
|
1598
|
+
|
|
1599
|
+
target_day = iso_day_dict[week_commencing]
|
|
1600
|
+
|
|
1601
|
+
def map_to_week_start(date_val):
|
|
1602
|
+
delta = (date_val.isoweekday() - target_day) % 7
|
|
1603
|
+
return date_val - pd.Timedelta(days=delta)
|
|
1604
|
+
|
|
1605
|
+
# Apply the transformation
|
|
1606
|
+
new_col = f"week_start_{week_commencing}"
|
|
1607
|
+
df[new_col] = df[date_col].apply(map_to_week_start)
|
|
1608
|
+
|
|
1609
|
+
return df
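# --- Usage sketch (illustrative only, not part of the original module). ---
# Maps a Wednesday and a Sunday back to the Monday of their ISO week; dates are invented.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({"OBS": pd.to_datetime(["2024-01-03", "2024-01-07"])})  # Wed, Sun
df = dp.week_commencing_2_week_commencing_conversion_isoweekday(df, "OBS", week_commencing="mon")
# week_start_mon -> 2024-01-01 for both rows.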
|
|
1610
|
+
|
|
1611
|
+
def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
|
|
1612
|
+
"""
|
|
1613
|
+
Plot various types of charts using Plotly.
|
|
1614
|
+
|
|
1615
|
+
Args:
|
|
1616
|
+
df (pandas.DataFrame): DataFrame containing the data.
|
|
1617
|
+
date_col (str): The name of the column with date information.
|
|
1618
|
+
value_cols (list): List of columns to plot.
|
|
1619
|
+
chart_type (str): Type of chart to plot ('line', 'bar', 'scatter', 'pie', 'box', 'heatmap', 'area', 'bubble', 'funnel', 'waterfall', 'contour', 'scatter3d').
|
|
1620
|
+
title (str): Title of the chart.
|
|
1621
|
+
x_title (str): Title of the x-axis.
|
|
1622
|
+
y_title (str): Title of the y-axis.
|
|
1623
|
+
**kwargs: Additional keyword arguments for customization.
|
|
1624
|
+
|
|
1625
|
+
Returns:
|
|
1626
|
+
plotly.graph_objects.Figure: The Plotly figure object.
|
|
1627
|
+
"""
|
|
1628
|
+
# Ensure the date column is in datetime format
|
|
1629
|
+
df[date_col] = pd.to_datetime(df[date_col])
|
|
1630
|
+
|
|
1631
|
+
# Initialize the figure
|
|
1632
|
+
fig = go.Figure()
|
|
1633
|
+
|
|
1634
|
+
# Make sure the date col is excluded from the line cols
|
|
1635
|
+
value_cols = [x for x in value_cols if x!=date_col]
|
|
1636
|
+
|
|
1637
|
+
# Add each value column to the plot based on the chart type
|
|
1638
|
+
for col in value_cols:
|
|
1639
|
+
if chart_type == 'line':
|
|
1640
|
+
fig.add_trace(go.Scatter(
|
|
1641
|
+
x=df[date_col],
|
|
1642
|
+
y=df[col],
|
|
1643
|
+
mode='lines',
|
|
1644
|
+
name=col,
|
|
1645
|
+
**kwargs
|
|
1646
|
+
))
|
|
1647
|
+
elif chart_type == 'bar':
|
|
1648
|
+
fig.add_trace(go.Bar(
|
|
1649
|
+
x=df[date_col],
|
|
1650
|
+
y=df[col],
|
|
1651
|
+
name=col,
|
|
1652
|
+
**kwargs
|
|
1653
|
+
))
|
|
1654
|
+
elif chart_type == 'scatter':
|
|
1655
|
+
fig.add_trace(go.Scatter(
|
|
1656
|
+
x=df[date_col],
|
|
1657
|
+
y=df[col],
|
|
1658
|
+
mode='markers',
|
|
1659
|
+
name=col,
|
|
1660
|
+
**kwargs
|
|
1661
|
+
))
|
|
1662
|
+
elif chart_type == 'histogram':
|
|
1663
|
+
fig.add_trace(go.Histogram(
|
|
1664
|
+
x=df[col],
|
|
1665
|
+
name=col,
|
|
1666
|
+
**kwargs
|
|
1667
|
+
))
|
|
1668
|
+
elif chart_type == 'pie':
|
|
1669
|
+
fig.add_trace(go.Pie(
|
|
1670
|
+
labels=df[date_col], # or another column for labels
|
|
1671
|
+
values=df[col],
|
|
1672
|
+
name=col,
|
|
1673
|
+
**kwargs
|
|
1674
|
+
))
|
|
1675
|
+
elif chart_type == 'box':
|
|
1676
|
+
fig.add_trace(go.Box(
|
|
1677
|
+
y=df[col],
|
|
1678
|
+
name=col,
|
|
1679
|
+
**kwargs
|
|
1680
|
+
))
|
|
1681
|
+
elif chart_type == 'heatmap':
|
|
1682
|
+
fig.add_trace(go.Heatmap(
|
|
1683
|
+
z=df.pivot_table(index=date_col, columns=value_cols[0], values=value_cols[1]),
|
|
1684
|
+
x=df[value_cols[0]],
|
|
1685
|
+
y=df[date_col],
|
|
1686
|
+
**kwargs
|
|
1687
|
+
))
|
|
1688
|
+
elif chart_type == 'area':
|
|
1689
|
+
fig.add_trace(go.Scatter(
|
|
1690
|
+
x=df[date_col],
|
|
1691
|
+
y=df[col],
|
|
1692
|
+
mode='lines', # Use 'lines+markers' if you want markers
|
|
1693
|
+
fill='tozeroy', # Fill the area under the line
|
|
1694
|
+
name=col,
|
|
1695
|
+
**kwargs
|
|
1696
|
+
))
|
|
1697
|
+
elif chart_type == 'bubble':
|
|
1698
|
+
fig.add_trace(go.Scatter(
|
|
1699
|
+
x=df[value_cols[0]],
|
|
1700
|
+
y=df[value_cols[1]],
|
|
1701
|
+
mode='markers',
|
|
1702
|
+
marker=dict(size=df[value_cols[2]]),
|
|
1703
|
+
name='Bubble Chart',
|
|
1704
|
+
**kwargs
|
|
1705
|
+
))
|
|
1706
|
+
elif chart_type == 'funnel':
|
|
1707
|
+
fig.add_trace(go.Funnel(
|
|
1708
|
+
y=df[date_col],
|
|
1709
|
+
x=df[col],
|
|
1710
|
+
**kwargs
|
|
1711
|
+
))
|
|
1712
|
+
elif chart_type == 'waterfall':
|
|
1713
|
+
fig.add_trace(go.Waterfall(
|
|
1714
|
+
x=df[date_col],
|
|
1715
|
+
y=df[col],
|
|
1716
|
+
measure=df[value_cols[1]], # measures like 'increase', 'decrease', 'total'
|
|
1717
|
+
**kwargs
|
|
1718
|
+
))
|
|
1719
|
+
elif chart_type == 'contour':
|
|
1720
|
+
fig.add_trace(go.Contour(
|
|
1721
|
+
z=df.pivot_table(index=value_cols[0], columns=value_cols[1], values=value_cols[2]),
|
|
1722
|
+
x=df[value_cols[0]],
|
|
1723
|
+
y=df[value_cols[1]],
|
|
1724
|
+
**kwargs
|
|
1725
|
+
))
|
|
1726
|
+
elif chart_type == 'scatter3d':
|
|
1727
|
+
fig.add_trace(go.Scatter3d(
|
|
1728
|
+
x=df[value_cols[0]],
|
|
1729
|
+
y=df[value_cols[1]],
|
|
1730
|
+
z=df[value_cols[2]],
|
|
1731
|
+
mode='markers',
|
|
1732
|
+
**kwargs
|
|
1733
|
+
))
|
|
1734
|
+
else:
|
|
1735
|
+
raise ValueError(f"Unsupported chart type: {chart_type}")
|
|
1736
|
+
|
|
1737
|
+
# Update the layout of the figure
|
|
1738
|
+
fig.update_layout(
|
|
1739
|
+
title=title,
|
|
1740
|
+
xaxis_title=x_title,
|
|
1741
|
+
yaxis_title=y_title,
|
|
1742
|
+
legend_title='Series',
|
|
1743
|
+
template='plotly_dark'
|
|
1744
|
+
)
|
|
1745
|
+
|
|
1746
|
+
return fig
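# --- Usage sketch (illustrative only, not part of the original module). ---
# A simple line chart from a toy weekly series; data are invented.
import pandas as pd
dp = dataprocessing()
df = pd.DataFrame({
    "OBS":   pd.date_range("2024-01-01", periods=8, freq="W-MON"),
    "sales": [100, 120, 90, 140, 150, 130, 160, 170],
})
fig = dp.plot_chart(df, date_col="OBS", value_cols=["sales"],
                    chart_type="line", title="Weekly sales")
fig.show()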
|
|
1747
|
+
|
|
1748
|
+
def plot_two_with_common_cols(self, df1, df2, date_column, same_axis=True):
|
|
1749
|
+
"""
|
|
1750
|
+
Plot multiple series from two DataFrames with common columns using a specified date column for the X-axis.
|
|
1751
|
+
|
|
1752
|
+
Args:
|
|
1753
|
+
df1 (pandas.DataFrame): The first DataFrame containing data to plot.
|
|
1754
|
+
df2 (pandas.DataFrame): The second DataFrame containing data to plot.
|
|
1755
|
+
date_column (str): The name of the date column in the DataFrames.
|
|
1756
|
+
same_axis (bool, optional): Whether to plot the series on the same y-axis. Defaults to True.
|
|
1757
|
+
|
|
1758
|
+
Returns:
|
|
1759
|
+
list: A list of Plotly figures generated from the common columns.
|
|
1760
|
+
"""
|
|
1761
|
+
# Find common columns between df1 and df2, excluding the date column
|
|
1762
|
+
common_columns = list(set(df1.columns).intersection(set(df2.columns)) - {date_column})
|
|
1763
|
+
|
|
1764
|
+
# Generate col_pairs list for plot_two function
|
|
1765
|
+
col_pairs = [(col, col) for col in common_columns]
|
|
1766
|
+
|
|
1767
|
+
# Loop through the common columns and plot each pair
|
|
1768
|
+
figs = []
|
|
1769
|
+
for col1, col2 in col_pairs:
|
|
1770
|
+
# Call the existing plot_two function
|
|
1771
|
+
fig = self.plot_two(df1, col1, df2, col2, date_column, same_axis=same_axis)
|
|
1772
|
+
figs.append(fig)
|
|
1773
|
+
|
|
1774
|
+
return figs
|
|
1775
|
+
|
|
1776
|
+
########################################################################################################################################
|
|
1777
|
+
########################################################################################################################################
|
|
1778
|
+
|
|
1779
|
+
ims_proc = dataprocessing()
|
|
1780
|
+
|
|
1781
|
+
class datapull:
|
|
1782
|
+
|
|
1783
|
+
def help(self):
|
|
1784
|
+
print("This is the help section. The functions in the package are as follows:")
|
|
1785
|
+
|
|
1786
|
+
print("\n1. pull_fred_data")
|
|
1787
|
+
print(" - Description: Get data from FRED by using series id tokens.")
|
|
1788
|
+
print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
|
|
1789
|
+
print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
|
|
1790
|
+
|
|
1791
|
+
print("\n2. pull_boe_data")
|
|
1792
|
+
print(" - Description: Fetch and process Bank of England interest rate data.")
|
|
1793
|
+
print(" - Usage: pull_boe_data(week_commencing)")
|
|
1794
|
+
print(" - Example: pull_boe_data('mon')")
|
|
1795
|
+
|
|
1796
|
+
print("\n3. pull_oecd")
|
|
1797
|
+
print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
|
|
1798
|
+
print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
|
|
1799
|
+
print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
|
|
1800
|
+
|
|
1801
|
+
print("\n4. get_google_mobility_data")
|
|
1802
|
+
print(" - Description: Fetch Google Mobility data for the specified country.")
|
|
1803
|
+
print(" - Usage: get_google_mobility_data(country, wc)")
|
|
1804
|
+
print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
|
|
1805
|
+
|
|
1806
|
+
print("\n5. pull_seasonality")
|
|
1807
|
+
print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
|
|
1808
|
+
print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
|
|
1809
|
+
print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
|
|
1810
|
+
|
|
1811
|
+
print("\n6. pull_weather")
|
|
1812
|
+
print(" - Description: Fetch and process historical weather data for the specified country.")
|
|
1813
|
+
print(" - Usage: pull_weather(week_commencing, country)")
|
|
1814
|
+
print(" - Example: pull_weather('mon', 'GBR')")
|
|
1815
|
+
|
|
1816
|
+
print("\n7. pull_macro_ons_uk")
|
|
1817
|
+
print(" - Description: Fetch and process time series data from the Beta ONS API.")
|
|
1818
|
+
print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
|
|
1819
|
+
print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
|
|
1820
|
+
|
|
1821
|
+
print("\n8. pull_yfinance")
|
|
1822
|
+
print(" - Description: Fetch and process time series data from the Beta ONS API.")
|
|
1823
|
+
print(" - Usage: pull_yfinance(tickers, week_start_day)")
|
|
1824
|
+
print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
|
|
1825
|
+
|
|
1826
|
+
print("\n9. pull_ga")
|
|
1827
|
+
print(" - Description: Pull in GA4 data for geo experiments.")
|
|
1828
|
+
print(" - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)")
|
|
1829
|
+
print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
|
|
1830
|
+
|
|
1831
|
+
############################################################### MACRO ##########################################################################
|
|
1832
|
+
|
|
1833
|
+
def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
|
|
1834
|
+
'''
|
|
1835
|
+
Parameters
|
|
1836
|
+
----------
|
|
1837
|
+
week_commencing : str
|
|
1838
|
+
specify the day for the week commencing; the default is 'mon' (options: 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
|
|
1839
|
+
|
|
1840
|
+
series_id_list : list[str]
|
|
1841
|
+
provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
|
|
1842
|
+
["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
|
|
1843
|
+
|
|
1844
|
+
Returns
|
|
1845
|
+
----------
|
|
1846
|
+
pd.DataFrame
|
|
1847
|
+
Return a data frame with FRED data according to the series IDs provided
|
|
1848
|
+
'''
|
|
1849
|
+
# Fred API
|
|
1850
|
+
fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
|
|
1851
|
+
|
|
1852
|
+
# Fetch the metadata for each series to get the full names
|
|
1853
|
+
series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
|
|
1854
|
+
|
|
1855
|
+
# Download data from series id list
|
|
1856
|
+
fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
|
|
1857
|
+
|
|
1858
|
+
# Data processing
|
|
1859
|
+
date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
|
|
1860
|
+
fred_series_df = pd.DataFrame(date_range)
|
|
1861
|
+
|
|
1862
|
+
for series_id, series_data in fred_series.items():
|
|
1863
|
+
series_data = series_data.reset_index()
|
|
1864
|
+
series_data.columns = ['OBS', series_names[series_id]] # Use the series name as the column header
|
|
1865
|
+
fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
|
|
1866
|
+
|
|
1867
|
+
# Handle duplicate columns
|
|
1868
|
+
for col in fred_series_df.columns:
|
|
1869
|
+
if '_x' in col:
|
|
1870
|
+
base_col = col.replace('_x', '')
|
|
1871
|
+
fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
|
|
1872
|
+
fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
|
|
1873
|
+
|
|
1874
|
+
# Ensure sum_columns are present in the DataFrame
|
|
1875
|
+
sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
|
|
1876
|
+
|
|
1877
|
+
# Aggregate results by week
|
|
1878
|
+
fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
|
|
1879
|
+
date_column="OBS",
|
|
1880
|
+
group_columns=[],
|
|
1881
|
+
sum_columns=sum_columns,
|
|
1882
|
+
wc=week_commencing,
|
|
1883
|
+
aggregation="average")
|
|
1884
|
+
|
|
1885
|
+
# Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
|
|
1886
|
+
fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
|
|
1887
|
+
|
|
1888
|
+
return fred_df_final
|
|
1889
|
+
|
|
1890
|
+
def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
|
|
1891
|
+
"""
|
|
1892
|
+
Fetch and process Bank of England interest rate data.
|
|
1893
|
+
|
|
1894
|
+
Args:
|
|
1895
|
+
week_commencing (str): The starting day of the week for aggregation.
|
|
1896
|
+
Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
|
|
1897
|
+
Default is "mon".
|
|
1898
|
+
max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
|
|
1899
|
+
delay (int): Delay in seconds between retry attempts. Default is 5.
|
|
1900
|
+
|
|
1901
|
+
Returns:
|
|
1902
|
+
pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
|
|
1903
|
+
The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
|
|
1904
|
+
and 'macro_boe_intr_rate' contains the average interest rate for the week.
|
|
1905
|
+
"""
|
|
1906
|
+
# Week commencing dictionary
|
|
1907
|
+
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
1908
|
+
|
|
1909
|
+
# URL of the Bank of England data page
|
|
1910
|
+
url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
|
|
1911
|
+
|
|
1912
|
+
# Retry logic for HTTP request
|
|
1913
|
+
for attempt in range(max_retries):
|
|
1914
|
+
try:
|
|
1915
|
+
# Set up headers to mimic a browser request
|
|
1916
|
+
headers = {
|
|
1917
|
+
"User-Agent": (
|
|
1918
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
1919
|
+
"Chrome/91.0.4472.124 Safari/537.36"
|
|
1920
|
+
)
|
|
1921
|
+
}
|
|
1922
|
+
response = requests.get(url, headers=headers)
|
|
1923
|
+
response.raise_for_status() # Raise an exception for HTTP errors
|
|
1924
|
+
break
|
|
1925
|
+
except requests.exceptions.RequestException as e:
|
|
1926
|
+
print(f"Attempt {attempt + 1} failed: {e}")
|
|
1927
|
+
if attempt < max_retries - 1:
|
|
1928
|
+
time.sleep(delay)
|
|
1929
|
+
else:
|
|
1930
|
+
raise
|
|
1931
|
+
|
|
1932
|
+
# Parse the HTML page
|
|
1933
|
+
soup = BeautifulSoup(response.content, "html.parser")
|
|
1934
|
+
|
|
1935
|
+
# Find the table on the page
|
|
1936
|
+
table = soup.find("table") # Locate the first table
|
|
1937
|
+
table_html = str(table) # Convert table to string
|
|
1938
|
+
df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
|
|
1939
|
+
|
|
1940
|
+
# Rename and clean up columns
|
|
1941
|
+
df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
|
|
1942
|
+
df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
|
|
1943
|
+
df.sort_values("OBS", inplace=True)
|
|
1944
|
+
|
|
1945
|
+
# Create a daily date range
|
|
1946
|
+
date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
|
|
1947
|
+
df_daily = pd.DataFrame(date_range, columns=["OBS"])
|
|
1948
|
+
|
|
1949
|
+
# Adjust each date to the specified week commencing day
|
|
1950
|
+
df_daily["Week_Commencing"] = df_daily["OBS"].apply(
|
|
1951
|
+
lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
1952
|
+
)
|
|
1953
|
+
|
|
1954
|
+
# Merge and forward-fill missing rates
|
|
1955
|
+
df_daily = df_daily.merge(df, on="OBS", how="left")
|
|
1956
|
+
df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
|
|
1957
|
+
|
|
1958
|
+
# Group by week commencing and calculate the average rate
|
|
1959
|
+
df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
|
|
1960
|
+
df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
|
|
1961
|
+
df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
|
|
1962
|
+
|
|
1963
|
+
return df_final
|
|
1964
|
+
|
|
1965
|
+
def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
|
|
1966
|
+
"""
|
|
1967
|
+
Fetch and process time series data from the OECD API.
|
|
1968
|
+
|
|
1969
|
+
Args:
|
|
1970
|
+
country (str): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
|
|
1971
|
+
week_commencing (str): The starting day of the week for aggregation.
|
|
1972
|
+
Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
|
|
1973
|
+
start_date (str): Dataset start date in the format "YYYY-MM-DD"
|
|
1974
|
+
|
|
1975
|
+
Returns:
|
|
1976
|
+
pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
|
|
1977
|
+
commencing dates, and other columns contain the aggregated time series values.
|
|
1978
|
+
"""
|
|
1979
|
+
|
|
1980
|
+
def parse_quarter(date_str):
|
|
1981
|
+
"""Parses a string in 'YYYY-Q#' format into a datetime object."""
|
|
1982
|
+
year, quarter = date_str.split('-')
|
|
1983
|
+
quarter_number = int(quarter[1])
|
|
1984
|
+
month = (quarter_number - 1) * 3 + 1
|
|
1985
|
+
return pd.Timestamp(f"{year}-{month:02d}-01")
|
|
1986
|
+
|
|
1987
|
+
# Generate a date range from 1950-01-01 to today
|
|
1988
|
+
date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
|
|
1989
|
+
|
|
1990
|
+
url_details = [
|
|
1991
|
+
["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
|
|
1992
|
+
["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
|
|
1993
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
|
|
1994
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
|
|
1995
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
|
|
1996
|
+
["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
|
|
1997
|
+
["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
|
|
1998
|
+
["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
|
|
1999
|
+
["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
|
|
2000
|
+
["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
|
|
2001
|
+
["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
|
|
2002
|
+
["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
|
|
2003
|
+
["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
|
|
2004
|
+
["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
|
|
2005
|
+
]
|
|
2006
|
+
|
|
2007
|
+
# Create empty final dataframe
|
|
2008
|
+
oecd_df_final = pd.DataFrame()
|
|
2009
|
+
|
|
2010
|
+
daily_df = pd.DataFrame({'OBS': date_range})
|
|
2011
|
+
value_columns = []
|
|
2012
|
+
|
|
2013
|
+
# Iterate for each variable of interest
|
|
2014
|
+
for series_details in url_details:
|
|
2015
|
+
series = series_details[0]
|
|
2016
|
+
dataset_id = series_details[1]
|
|
2017
|
+
filter = series_details[2]
|
|
2018
|
+
col_name = series_details[3]
|
|
2019
|
+
|
|
2020
|
+
# check if request was successful and determine the most granular data available
|
|
2021
|
+
for freq in ['M', 'Q', 'A']:
|
|
2022
|
+
|
|
2023
|
+
if series in ["UNE_LF_M", "EAR"]:
|
|
2024
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
|
|
2025
|
+
elif series in ["B1GQ"]:
|
|
2026
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
|
|
2027
|
+
else:
|
|
2028
|
+
data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
|
|
2029
|
+
|
|
2030
|
+
# Make the request to the OECD API for data
|
|
2031
|
+
data_response = requests.get(data_url)
|
|
2032
|
+
|
|
2033
|
+
# Check if the request was successful
|
|
2034
|
+
if data_response.status_code != 200:
|
|
2035
|
+
print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
|
|
2036
|
+
url_test = False
|
|
2037
|
+
continue
|
|
2038
|
+
else:
|
|
2039
|
+
url_test = True
|
|
2040
|
+
break
|
|
2041
|
+
|
|
2042
|
+
# get data for the next variable if url doesn't exist
|
|
2043
|
+
if url_test is False:
|
|
2044
|
+
continue
|
|
2045
|
+
|
|
2046
|
+
root = ET.fromstring(data_response.content)
|
|
2047
|
+
|
|
2048
|
+
# Define namespaces if necessary (the namespace is included in the tags)
|
|
2049
|
+
namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
|
|
2050
|
+
|
|
2051
|
+
# Lists to store the data
|
|
2052
|
+
dates = []
|
|
2053
|
+
values = []
|
|
2054
|
+
|
|
2055
|
+
# Iterate over all <Obs> elements and extract date and value
|
|
2056
|
+
for obs in root.findall('.//generic:Obs', namespaces):
|
|
2057
|
+
|
|
2058
|
+
# Extracting the time period (date)
|
|
2059
|
+
time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
|
|
2060
|
+
|
|
2061
|
+
# Extracting the observation value
|
|
2062
|
+
value = obs.find('.//generic:ObsValue', namespaces).get('value')
|
|
2063
|
+
|
|
2064
|
+
# Storing the data
|
|
2065
|
+
if time_period and value:
|
|
2066
|
+
dates.append(time_period)
|
|
2067
|
+
values.append(float(value)) # Convert value to float
|
|
2068
|
+
|
|
2069
|
+
# Add variable names that were found to a list
|
|
2070
|
+
value_columns.append(col_name)
|
|
2071
|
+
|
|
2072
|
+
# Creating a DataFrame
|
|
2073
|
+
data = pd.DataFrame({'OBS': dates, col_name: values})
|
|
2074
|
+
|
|
2075
|
+
# Convert date strings into datetime format
|
|
2076
|
+
if freq == 'Q':
|
|
2077
|
+
data['OBS'] = data['OBS'].apply(parse_quarter)
|
|
2078
|
+
else:
|
|
2079
|
+
# Display the DataFrame
|
|
2080
|
+
data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
|
|
2081
|
+
|
|
2082
|
+
# Sort data by chronological order
|
|
2083
|
+
data.sort_values(by='OBS', inplace=True)
|
|
2084
|
+
|
|
2085
|
+
# Merge the data based on the observation date
|
|
2086
|
+
daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
|
|
2087
|
+
|
|
2088
|
+
|
|
2089
|
+
# Ensure columns are numeric
|
|
2090
|
+
for col in value_columns:
|
|
2091
|
+
if col in daily_df.columns:
|
|
2092
|
+
daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
|
|
2093
|
+
else:
|
|
2094
|
+
print(f"Column {col} not found in daily_df")
|
|
2095
|
+
|
|
2096
|
+
# Aggregate results by week
|
|
2097
|
+
country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
|
|
2098
|
+
date_column="OBS",
|
|
2099
|
+
group_columns=[],
|
|
2100
|
+
sum_columns=value_columns,
|
|
2101
|
+
wc=week_commencing,
|
|
2102
|
+
aggregation="average")
|
|
2103
|
+
|
|
2104
|
+
oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
|
|
2105
|
+
|
|
2106
|
+
return oecd_df_final
|
|
2107
|
+
|
|
2108
|
+
def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
|
|
2109
|
+
"""
|
|
2110
|
+
Fetch Google Mobility data for the specified country.
|
|
2111
|
+
|
|
2112
|
+
Parameters:
|
|
2113
|
+
- country (str): The name of the country for which to fetch data.
|
|
2114
|
+
|
|
2115
|
+
Returns:
|
|
2116
|
+
- pd.DataFrame: A DataFrame containing the Google Mobility data.
|
|
2117
|
+
"""
|
|
2118
|
+
# URL of the Google Mobility Reports CSV file
|
|
2119
|
+
url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
|
|
2120
|
+
|
|
2121
|
+
# Fetch the CSV file
|
|
2122
|
+
response = requests.get(url)
|
|
2123
|
+
if response.status_code != 200:
|
|
2124
|
+
raise Exception(f"Failed to fetch data: {response.status_code}")
|
|
2125
|
+
|
|
2126
|
+
# Load the CSV file into a pandas DataFrame
|
|
2127
|
+
csv_data = StringIO(response.text)
|
|
2128
|
+
df = pd.read_csv(csv_data, low_memory=False)
|
|
2129
|
+
|
|
2130
|
+
# Filter the DataFrame for the specified country
|
|
2131
|
+
country_df = df[df['country_region'] == country]
|
|
2132
|
+
|
|
2133
|
+
final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
|
|
2134
|
+
'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
|
|
2135
|
+
'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")
|
|
2136
|
+
|
|
2137
|
+
final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
|
|
2138
|
+
return final_covid1
|
|
2139
|
+
|
|
2140
|
+
############################################################### Seasonality ##########################################################################
|
|
2141
|
+
|
|
2142
|
+
def pull_seasonality(self, week_commencing, start_date, countries):
|
|
2143
|
+
# ---------------------------------------------------------------------
|
|
2144
|
+
# 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
|
|
2145
|
+
# ---------------------------------------------------------------------
|
|
2146
|
+
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
2147
|
+
|
|
2148
|
+
# ---------------------------------------------------------------------
|
|
2149
|
+
# 1. Create daily date range from start_date to today
|
|
2150
|
+
# ---------------------------------------------------------------------
|
|
2151
|
+
date_range = pd.date_range(
|
|
2152
|
+
start=pd.to_datetime(start_date),
|
|
2153
|
+
end=datetime.today(),
|
|
2154
|
+
freq="D"
|
|
2155
|
+
)
|
|
2156
|
+
df_daily = pd.DataFrame(date_range, columns=["Date"])
|
|
2157
|
+
|
|
2158
|
+
# ---------------------------------------------------------------------
|
|
2159
|
+
# 1.1 Identify "week_start" for each daily row, based on week_commencing
|
|
2160
|
+
# ---------------------------------------------------------------------
|
|
2161
|
+
df_daily['week_start'] = df_daily["Date"].apply(
|
|
2162
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
2163
|
+
)
|
|
2164
|
+
|
|
2165
|
+
# ---------------------------------------------------------------------
|
|
2166
|
+
# 2. Build a weekly index (df_weekly_start) with dummy columns
|
|
2167
|
+
# ---------------------------------------------------------------------
|
|
2168
|
+
df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
|
|
2169
|
+
df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
|
|
2170
|
+
|
|
2171
|
+
# Set index to weekly "start of week"
|
|
2172
|
+
df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
|
|
2173
|
+
df_weekly_start.set_index("Date", inplace=True)
|
|
2174
|
+
|
|
2175
|
+
# Create individual weekly dummies
|
|
2176
|
+
dummy_columns = {}
|
|
2177
|
+
for i in range(len(df_weekly_start)):
|
|
2178
|
+
col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
|
|
2179
|
+
dummy_columns[col_name] = [0] * len(df_weekly_start)
|
|
2180
|
+
dummy_columns[col_name][i] = 1
|
|
2181
|
+
|
|
2182
|
+
df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
|
|
2183
|
+
df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
|
|
2184
|
+
|
|
2185
|
+
# ---------------------------------------------------------------------
|
|
2186
|
+
# 3. Public holidays (daily) from 'holidays' package + each holiday name
|
|
2187
|
+
# ---------------------------------------------------------------------
|
|
2188
|
+
for country in countries:
|
|
2189
|
+
country_holidays = holidays.CountryHoliday(
|
|
2190
|
+
country,
|
|
2191
|
+
years=range(int(start_date[:4]), datetime.today().year + 1)
|
|
2192
|
+
)
|
|
2193
|
+
# Daily indicator: 1 if that date is a holiday
|
|
2194
|
+
df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
|
|
2195
|
+
lambda x: 1 if x in country_holidays else 0
|
|
2196
|
+
)
|
|
2197
|
+
# Create columns for specific holiday names
|
|
2198
|
+
for date_hol, name in country_holidays.items():
|
|
2199
|
+
col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
|
|
2200
|
+
if col_name not in df_daily.columns:
|
|
2201
|
+
df_daily[col_name] = 0
|
|
2202
|
+
df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
|
|
2203
|
+
|
|
2204
|
+
# ---------------------------------------------------------------------
|
|
2205
|
+
# 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
|
|
2206
|
+
# We'll add daily columns for each.
|
|
2207
|
+
# ---------------------------------------------------------------------
|
|
2208
|
+
# Initialize columns
|
|
2209
|
+
extra_cols = [
|
|
2210
|
+
"seas_valentines_day",
|
|
2211
|
+
"seas_halloween",
|
|
2212
|
+
"seas_fathers_day_us_uk",
|
|
2213
|
+
"seas_mothers_day_us",
|
|
2214
|
+
"seas_mothers_day_uk",
|
|
2215
|
+
"seas_good_friday",
|
|
2216
|
+
"seas_easter_monday",
|
|
2217
|
+
"seas_black_friday",
|
|
2218
|
+
"seas_cyber_monday",
|
|
2219
|
+
]
|
|
2220
|
+
for c in extra_cols:
|
|
2221
|
+
df_daily[c] = 0 # default zero
|
|
2222
|
+
|
|
2223
|
+
# Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
|
|
2224
|
+
# weekday: Monday=0, Tuesday=1, ... Sunday=6
|
|
2225
|
+
def nth_weekday_of_month(year, month, weekday, nth):
|
|
2226
|
+
"""
|
|
2227
|
+
Returns date of the nth <weekday> in <month> of <year>.
|
|
2228
|
+
E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
|
|
2229
|
+
"""
|
|
2230
|
+
# 1st day of the month
|
|
2231
|
+
d = datetime(year, month, 1)
|
|
2232
|
+
# What is the weekday of day #1?
|
|
2233
|
+
w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
|
|
2234
|
+
# If we want, e.g. Sunday=6, we see how many days to add
|
|
2235
|
+
delta = (weekday - w) % 7
|
|
2236
|
+
# This is the first <weekday> in that month
|
|
2237
|
+
first_weekday = d + timedelta(days=delta)
|
|
2238
|
+
# Now add 7*(nth-1) days
|
|
2239
|
+
return first_weekday + timedelta(days=7 * (nth-1))
|
|
2240
|
+
|
|
2241
|
+
def get_good_friday(year):
|
|
2242
|
+
"""Good Friday is 2 days before Easter Sunday."""
|
|
2243
|
+
return easter(year) - timedelta(days=2)
|
|
2244
|
+
|
|
2245
|
+
def get_easter_monday(year):
|
|
2246
|
+
"""Easter Monday is 1 day after Easter Sunday."""
|
|
2247
|
+
return easter(year) + timedelta(days=1)
|
|
2248
|
+
|
|
2249
|
+
def get_black_friday(year):
|
|
2250
|
+
"""
|
|
2251
|
+
Black Friday = day after US Thanksgiving,
|
|
2252
|
+
and US Thanksgiving is the 4th Thursday in November.
|
|
2253
|
+
"""
|
|
2254
|
+
# 4th Thursday in November
|
|
2255
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
|
|
2256
|
+
return fourth_thursday + timedelta(days=1)
|
|
2257
|
+
|
|
2258
|
+
def get_cyber_monday(year):
|
|
2259
|
+
"""Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
|
|
2260
|
+
# 4th Thursday in November
|
|
2261
|
+
fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
|
|
2262
|
+
return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
|
|
2263
|
+
|
|
2264
|
+
# Loop over each year in range
|
|
2265
|
+
start_yr = int(start_date[:4])
|
|
2266
|
+
end_yr = datetime.today().year
|
|
2267
|
+
|
|
2268
|
+
for yr in range(start_yr, end_yr + 1):
|
|
2269
|
+
# Valentines = Feb 14
|
|
2270
|
+
valentines_day = datetime(yr, 2, 14)
|
|
2271
|
+
# Halloween = Oct 31
|
|
2272
|
+
halloween_day = datetime(yr, 10, 31)
|
|
2273
|
+
# Father's Day (US & UK) = 3rd Sunday in June
|
|
2274
|
+
fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
|
|
2275
|
+
# Mother's Day US = 2nd Sunday in May
|
|
2276
|
+
mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
|
|
2277
|
+
# Mother's Day UK ("Mothering Sunday") is the 4th Sunday in Lent,
# i.e. exactly three weeks before Easter Sunday. Easter - 21 days is
# therefore already a Sunday; the weekday check below is just a safeguard.
|
|
2286
|
+
mothering_sunday = easter(yr) - timedelta(days=21)
|
|
2287
|
+
# If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
|
|
2288
|
+
while mothering_sunday.weekday() != 6: # Sunday=6
|
|
2289
|
+
mothering_sunday -= timedelta(days=1)
|
|
2290
|
+
|
|
2291
|
+
# Good Friday, Easter Monday
|
|
2292
|
+
gf = get_good_friday(yr)
|
|
2293
|
+
em = get_easter_monday(yr)
|
|
2294
|
+
|
|
2295
|
+
# Black Friday, Cyber Monday
|
|
2296
|
+
bf = get_black_friday(yr)
|
|
2297
|
+
cm = get_cyber_monday(yr)
|
|
2298
|
+
|
|
2299
|
+
# Mark them in df_daily if in range
|
|
2300
|
+
for special_date, col in [
|
|
2301
|
+
(valentines_day, "seas_valentines_day"),
|
|
2302
|
+
(halloween_day, "seas_halloween"),
|
|
2303
|
+
(fathers_day, "seas_fathers_day_us_uk"),
|
|
2304
|
+
(mothers_day_us, "seas_mothers_day_us"),
|
|
2305
|
+
(mothering_sunday, "seas_mothers_day_uk"),
|
|
2306
|
+
(gf, "seas_good_friday"),
|
|
2307
|
+
(em, "seas_easter_monday"),
|
|
2308
|
+
(bf, "seas_black_friday"),
|
|
2309
|
+
(cm, "seas_cyber_monday"),
|
|
2310
|
+
]:
|
|
2311
|
+
# Convert to pd.Timestamp:
|
|
2312
|
+
special_ts = pd.Timestamp(special_date)
|
|
2313
|
+
|
|
2314
|
+
            # Only set the flag if the special date falls within the daily range
            if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
                df_daily.loc[df_daily["Date"] == special_ts, col] = 1

        # ---------------------------------------------------------------------
        # 4. Add daily indicators for last day & last Friday of month
        #    Then aggregate them to weekly level using .max()
        # ---------------------------------------------------------------------
        # Last day of month (daily)
        df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
            lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
        )

        # Last Friday of month (daily)
        def is_last_friday(date):
            # Last day of the month
            last_day_of_month = date.to_period("M").to_timestamp("M")
            last_day_weekday = last_day_of_month.weekday()  # Monday=0, ..., Sunday=6
            # Determine how many days to go back from the last day to reach Friday (weekday=4)
            if last_day_weekday >= 4:
                days_to_subtract = last_day_weekday - 4
            else:
                days_to_subtract = last_day_weekday + 3
            last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
            return 1 if date == last_friday else 0

        df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)

        # ---------------------------------------------------------------------
        # 5. Weekly aggregation for holiday columns & monthly dummies
        # ---------------------------------------------------------------------
        # For monthly dummies, create a daily col "Month", then get_dummies
        df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
        df_monthly_dummies = pd.get_dummies(
            df_daily,
            prefix="seas",
            columns=["Month"],
            dtype=int
        )
        # Recalculate 'week_start' (already in df_daily, but just to be sure)
        df_monthly_dummies['week_start'] = df_daily['week_start']

        # Group monthly dummies with .sum() so the daily flags can be spread across the week
        df_monthly_dummies = (
            df_monthly_dummies
            .groupby('week_start')
            .sum(numeric_only=True)  # sum the daily flags
            .reset_index()
            .rename(columns={'week_start': "Date"})
            .set_index("Date")
        )
        # Divide the monthly dummies by 7 to distribute them across that week.
        # get_dummies with prefix "seas" on the "Month" column yields names such as
        # "seas_january", so the month columns are selected by name.
        month_dummy_names = [f"seas_{m.lower()}" for m in calendar.month_name if m]
        monthly_cols = [c for c in df_monthly_dummies.columns if c in month_dummy_names]
        df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7

        # Group holiday & special-day columns by .max() => binary at weekly level
        df_holidays = (
            df_daily
            .groupby('week_start')
            .max(numeric_only=True)  # if any day=1 in that week, the entire week=1
            .reset_index()
            .rename(columns={'week_start': "Date"})
            .set_index("Date")
        )

        # ---------------------------------------------------------------------
        # 6. Combine weekly start, monthly dummies, holiday flags
        # ---------------------------------------------------------------------
        df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
        df_combined = pd.concat([df_combined, df_holidays], axis=1)
        df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]

        # ---------------------------------------------------------------------
        # 7. Create weekly dummies for Week of Year & yearly dummies
        # ---------------------------------------------------------------------
        df_combined.reset_index(inplace=True)
        df_combined.rename(columns={"index": "old_index"}, inplace=True)  # just in case

        df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)

        df_combined["Year"] = df_combined["Date"].dt.year
        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)

        # ---------------------------------------------------------------------
        # 8. Add constant & trend
        # ---------------------------------------------------------------------
        df_combined["Constant"] = 1
        df_combined["Trend"] = df_combined.index + 1

        # ---------------------------------------------------------------------
        # 9. Rename Date -> OBS and return
        # ---------------------------------------------------------------------
        df_combined.rename(columns={"Date": "OBS"}, inplace=True)

        return df_combined
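        # Worked example of the monthly spreading above (illustrative, assuming a
        # Monday week start): the week commencing 2024-04-29 contains two April days
        # and five May days, so after the divide-by-7 step that row carries
        # seas_april = 2/7 and seas_may = 5/7.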
    def pull_weather(self, week_commencing, country) -> pd.DataFrame:
        import pandas as pd
        import urllib.request  # noqa: F811
        from datetime import datetime
        import requests
        from geopy.geocoders import Nominatim  # noqa: F811

        # Week commencing dictionary
        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}

        # Country dictionary
        country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}

        # Function to flatten a list of nested lists into a single list
        def flatten_list(nested_list):
            return [item for sublist in nested_list for item in sublist]

        # Choose country
        country = country_dict[country]

        # Choose start and end dates
        start_day = 1
        start_month = 1
        start_year = 2014
        formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
        today = datetime.now()
        end_day = today.day
        end_month = today.month
        end_year = today.year

        if country == "GB__ASOS":
            stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
                        "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
                        "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
                        "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
                        "&stations=EGNT"]
        elif country == "AU__ASOS":
            stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
                        "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
                        "&stations=YPAD", "&stations=YPPH"]
        elif country == "USCRN":
            stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797",  # North
                        "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301",  # Midwest
                        "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862",  # South
                        "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"]  # West
        elif country == "DE__ASOS":
            stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
                        "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
                        "&stations=EDLW", "&stations=EDDM"]
        elif country == "FR__ASOS":
            stations = ["&stations=LFPB"]
        elif country == "Canada":
            institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
                                "CA_NU_ASOS"]
            stations_list = [[] for _ in range(5)]
            stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
                                     "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])

            stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
                                     "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
                                     "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
                                     "&stations=CYWK", "&stations=CYWK"])

            stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
                                     "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
                                     "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
                                     "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
                                     "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])

            stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
                                     "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
                                     "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
                                     "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])

            stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
                                     "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
                                     "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
                                     "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])

        elif country == "ZA__ASOS":
            cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
            stations = []

            for city in cities:
                geolocator = Nominatim(user_agent="MyApp")
                location = geolocator.geocode(city)
                stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
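        # For reference, the station lists above are assembled into an IEM request of
        # this general shape (illustrative, using the variables defined above):
        # https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=<country>&stations=EGCC...&year1=<start_year>&month1=<start_month>&day1=<start_day>&year2=<end_year>&month2=<end_month>&day2=<end_day>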
        # Temperature
        # Shared post-processing: estimate the mean temperature, convert Fahrenheit to
        # Celsius, assign each day to its chosen week start and average by week.
        def weekly_temperature_summary(weather):
            # Estimate mean temperature
            weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2

            # Convert Fahrenheit to Celsius
            weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
            weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
            weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9

            # Convert the date column to a Date type
            weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")

            # Determine the chosen starting day of the week for each date
            weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

            # Group by week_starting and summarise (rename silently skips columns that are absent)
            numeric_columns = weather.select_dtypes(include='number').columns
            weekly = weather.groupby("week_starting")[numeric_columns].mean()
            weekly.rename(columns={"max_temp_f": "avg_max_temp_f",
                                   "min_temp_f": "avg_min_temp_f",
                                   "mean_temp_f": "avg_mean_temp_f",
                                   "max_temp_c": "avg_max_temp_c",
                                   "min_temp_c": "avg_min_temp_c",
                                   "mean_temp_c": "avg_mean_temp_c",
                                   "precip_in": "avg_mean_perc"}, inplace=True)
            return weekly

        if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
            # We start by making a data frame of the chosen weather stations
            station_query = ''.join(stations)

            raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
                                        station_query,
                                        "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
                                        "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
            raw_weather = urllib.request.urlopen(raw_weather_list)
            raw_weather = pd.read_csv(raw_weather)

            # Replace the occurrences of "None" with 0
            raw_weather["max_temp_f"].replace("None", 0, inplace=True)
            raw_weather["min_temp_f"].replace("None", 0, inplace=True)

            # Remove any data that isn't temperature-related
            weather = raw_weather.iloc[:, 0:4]
            weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)

            weekly_avg_temp = weekly_temperature_summary(weather)

        elif country == "Canada":
            # Fetch each provincial network in turn and stack the results
            raw_weather = pd.DataFrame()
            for i in range(len(institute_vector)):
                station_query_temp = ''.join(flatten_list(stations_list[i]))
                institute_temp = institute_vector[i]
                raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
                                            station_query_temp,
                                            "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
                                            "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
                raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
                raw_weather_temp = pd.read_csv(raw_weather_temp)

                if len(raw_weather_temp.index) == 0:
                    continue
                raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]

                # Append this network's stations to the combined frame
                raw_weather = pd.concat([raw_weather, raw_weather_temp])

            # Drop error column if it exists
            if 'ERROR: Invalid network specified' in list(raw_weather.columns):
                raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)

            # Replace "None" values
            raw_weather["max_temp_f"].replace("None", 0, inplace=True)
            raw_weather["min_temp_f"].replace("None", 0, inplace=True)
            raw_weather["precip_in"].replace("None", 0, inplace=True)

            weather = raw_weather
            weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)

            weekly_avg_temp = weekly_temperature_summary(weather)

        elif country == "ZA__ASOS":
            weather_data_list = []

            for city in cities:
                geolocator = Nominatim(user_agent="MyApp")
                location = geolocator.geocode(city)
                url = "https://archive-api.open-meteo.com/v1/archive"

                params = {
                    "latitude": location.latitude,
                    "longitude": location.longitude,
                    "start_date": formatted_date,
                    "end_date": today.strftime("%Y-%m-%d"),
                    "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
                    "timezone": "auto"
                }

                response = requests.get(url, params=params)
                response_data = response.json()

                daily_data = response_data["daily"]
                dates = daily_data["time"]

                # NOTE: Open-Meteo returns daily temperatures in Celsius by default,
                # although the columns below are named *_f and go through the same
                # Fahrenheit conversion as the other branches.
                data = pd.DataFrame({
                    "day": dates,
                    "max_temp_f": daily_data["temperature_2m_max"],
                    "min_temp_f": daily_data["temperature_2m_min"],
                    "precip_in": daily_data["precipitation_sum"]
                })
                data["city"] = city
                weather_data_list.append(data)

            weather = pd.concat(weather_data_list)

            # Convert the date column to a Date type
            weather["day"] = pd.to_datetime(weather["day"])

            # Replace "None" values
            weather["max_temp_f"].replace("None", 0, inplace=True)
            weather["min_temp_f"].replace("None", 0, inplace=True)
            weather["precip_in"].replace("None", 0, inplace=True)

            weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)

            weekly_avg_temp = weekly_temperature_summary(weather)

        else:
            # USCRN: same request as above but keeping precipitation
            station_query = ''.join(stations)

            raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
                                        station_query,
                                        "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
                                        "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
            raw_weather = urllib.request.urlopen(raw_weather_list)
            raw_weather = pd.read_csv(raw_weather)

            raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]

            # Replace the occurrences of "None" with 0
            raw_weather["max_temp_f"].replace("None", 0, inplace=True)
            raw_weather["min_temp_f"].replace("None", 0, inplace=True)
            raw_weather["precip_in"].replace("None", 0, inplace=True)

            weather = raw_weather
            weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)

            weekly_avg_temp = weekly_temperature_summary(weather)
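        # Illustrative check of the week_starting arithmetic used above, assuming
        # week_commencing="mon": 2024-01-03 is a Wednesday (weekday 2), so
        # (2 - 0) % 7 = 2 days are subtracted, giving a week_starting of Monday 2024-01-01.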
        # Rainfall
        # Daily precipitation totals are pulled from the Open-Meteo archive for a set
        # of representative cities per country and averaged to weekly values.
        rainfall_cities = {
            "GB__ASOS": ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"],
            "AU__ASOS": ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"],
            "DE__ASOS": ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"],
            "FR__ASOS": ["Paris"],
            "ZA__ASOS": ["Johannesburg", "Cape Town", "Durban", "Pretoria"],
        }

        if country in rainfall_cities:
            # Define cities and date range
            cities = rainfall_cities[country]
            start_date = formatted_date
            end_date = today.strftime("%Y-%m-%d")

            # Initialize an empty list to store the weather data for each city
            weather_data_list = []

            # Loop through each city and fetch weather data
            for city in cities:
                # Initialize Nominatim API
                geolocator = Nominatim(user_agent="MyApp")
                location = geolocator.geocode(city)
                url = "https://archive-api.open-meteo.com/v1/archive"

                params = {
                    "latitude": location.latitude,
                    "longitude": location.longitude,
                    "start_date": start_date,
                    "end_date": end_date,
                    "daily": "precipitation_sum",
                    "timezone": "auto"
                }

                response = requests.get(url, params=params)
                response_data = response.json()

                daily_data = response_data["daily"]["precipitation_sum"]
                dates = response_data["daily"]["time"]

                data = pd.DataFrame({"date": dates, "rainfall": daily_data})
                data["city"] = city

                weather_data_list.append(data)

            # Combine all city data into a single data frame
            all_weather_data = pd.concat(weather_data_list)

            # Convert the date column to a Date type
            all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])

            # Set week commencing col up
            all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))

            # Group by week_starting and summarize
            numeric_columns = all_weather_data.select_dtypes(include='number').columns
            weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
            weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)

            # Change index to datetime
            weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)

        # Merge the dataframes
        if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
            merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
        else:
            merged_df = weekly_avg_temp

        merged_df.reset_index(drop=False, inplace=True)
        merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)

        final_weather = ims_proc.rename_cols(merged_df, 'seas_')

        return final_weather
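        # Minimal usage sketch (illustrative; ims_proc is the module-level instance
        # this function already relies on for rename_cols):
        #   uk_weather = ims_proc.pull_weather("mon", "GBR")
        # The result is expected to hold one row per week with an OBS column plus the
        # prefixed average temperature and rainfall columns.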
    def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
        """
        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
        aggregates it to weekly averages, and renames variables based on specified rules.

        Parameters:
            cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
            week_start_day (str): The day the week starts on (e.g., 'mon', 'sun').
            sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').

        Returns:
            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' (week commencing) column
                          and all series as renamed columns.
        """
        # Define CDIDs for sectors and defaults
        sector_cdids = {
            "fast_food": ["L7TD", "L78Q", "DOAD"],
            "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
        }

        default_cdids = sector_cdids["default"]
        sector_specific_cdids = sector_cdids.get(sector, [])
        standard_cdids = list(set(default_cdids + sector_specific_cdids))  # Avoid duplicates

        # Combine standard CDIDs and additional CDIDs
        if cdid_list is None:
            cdid_list = []
        cdid_list = list(set(standard_cdids + cdid_list))  # Avoid duplicates

        base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
        base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
        combined_df = pd.DataFrame()

        # Map week start day to pandas weekday convention
        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
        if week_start_day not in days_map:
            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
        week_start = days_map[week_start_day]

        for cdid in cdid_list:
            try:
                # Search for the series
                search_url = f"{base_search_url}{cdid}"
                search_response = requests.get(search_url)
                search_response.raise_for_status()
                search_data = search_response.json()

                items = search_data.get("items", [])
                if not items:
                    print(f"No data found for CDID: {cdid}")
                    continue

                # Extract series name and latest release URI
                series_name = items[0].get("title", f"Series_{cdid}")
                latest_date = max(
                    datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
                    for item in items if "release_date" in item
                )
                latest_uri = next(
                    item["uri"] for item in items
                    if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
                )

                # Fetch the dataset
                data_url = f"{base_data_url}{latest_uri}"
                data_response = requests.get(data_url)
                data_response.raise_for_status()
                data_json = data_response.json()

                # Detect the frequency and process accordingly
                if "months" in data_json and data_json["months"]:
                    frequency_key = "months"
                elif "quarters" in data_json and data_json["quarters"]:
                    frequency_key = "quarters"
                elif "years" in data_json and data_json["years"]:
                    frequency_key = "years"
                else:
                    print(f"Unsupported frequency or no data for CDID: {cdid}")
                    continue

                # Prepare the DataFrame
                df = pd.DataFrame(data_json[frequency_key])

                # Parse the 'date' field based on frequency
                if frequency_key == "months":
                    df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
                elif frequency_key == "quarters":
                    def parse_quarter(quarter_str):
                        year, qtr = quarter_str.split(" Q")
                        month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
                        return datetime(int(year), month, 1)
                    df["date"] = df["date"].apply(parse_quarter)
                elif frequency_key == "years":
                    df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")

                df["value"] = pd.to_numeric(df["value"], errors="coerce")
                df.rename(columns={"value": series_name}, inplace=True)

                # Combine data
                df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
                if combined_df.empty:
                    combined_df = df
                else:
                    combined_df = pd.merge(combined_df, df, on="date", how="outer")

            except requests.exceptions.RequestException as e:
                print(f"Error fetching data for CDID {cdid}: {e}")
            except (KeyError, ValueError) as e:
                print(f"Error processing data for CDID {cdid}: {e}")

        if not combined_df.empty:
            # Forward-fill each series across a full daily calendar before averaging by week
            min_date = combined_df["date"].min()
            max_date = datetime.today()
            date_range = pd.date_range(start=min_date, end=max_date, freq='D')
            daily_df = pd.DataFrame(date_range, columns=['date'])
            daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
            daily_df = daily_df.ffill()

            # Aggregate to weekly frequency
            daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
            weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()

            # Tidy the ONS series titles into snake_case macro_* column names
            def clean_column_name(name):
                name = re.sub(r"\(.*?\)", "", name)
                name = re.split(r":", name)[0]
                name = re.sub(r"\d+", "", name)
                name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
                name = re.sub(r"[^\w\s]", "", name)
                name = name.replace(" ", "_")
                name = re.sub(r"_+", "_", name)
                name = name.rstrip("_")
                return f"macro_{name.lower()}_uk"

            weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)

            weekly_df = weekly_df.fillna(0)

            return weekly_df
        else:
            print("No data available to process.")
            return pd.DataFrame()
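        # Usage sketch (illustrative CDIDs; the defaults above are always included):
        #   ons_df = ims_proc.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")
        # As an example of the renaming, an ONS title such as
        # "CPI INDEX 00: ALL ITEMS 2015=100" would come out of clean_column_name
        # as "macro_cpi_index_uk".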
    def pull_yfinance(self, tickers=None, week_start_day="mon"):
        """
        Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
        aggregates it to weekly averages, and renames variables.

        Parameters:
            tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
            week_start_day (str): The day the week starts on (e.g., 'mon', 'sun').

        Returns:
            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
                          and aggregated stock data for the specified tickers, with NaN values filled with 0.
        """
        # Define default tickers
        default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]

        # Combine default tickers with additional ones
        if tickers is None:
            tickers = []
        tickers = list(set(default_tickers + tickers))  # Ensure no duplicates

        # Automatically set end_date to today
        end_date = datetime.today().strftime("%Y-%m-%d")

        # Mapping week start day to pandas weekday convention
        days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
        if week_start_day not in days_map:
            raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
        week_start = days_map[week_start_day]

        # Fetch data for all tickers without specifying a start date to get all available data
        data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)

        # Process the data
        combined_df = pd.DataFrame()
        for ticker in tickers:
            try:
                # Extract the ticker's data
                ticker_data = data[ticker] if len(tickers) > 1 else data
                ticker_data = ticker_data.reset_index()

                # Ensure necessary columns are present
                if "Close" not in ticker_data.columns:
                    raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")

                # Keep only relevant columns
                ticker_data = ticker_data[["Date", "Close"]]
                ticker_data.rename(columns={"Close": ticker}, inplace=True)

                # Merge data
                if combined_df.empty:
                    combined_df = ticker_data
                else:
                    combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")

            except KeyError:
                print(f"Data for ticker {ticker} not available.")
            except Exception as e:
                print(f"Error processing ticker {ticker}: {e}")

        if not combined_df.empty:
            # Convert to daily frequency
            combined_df["Date"] = pd.to_datetime(combined_df["Date"])
            combined_df.set_index("Date", inplace=True)

            # Fill missing dates
            min_date = combined_df.index.min()
            max_date = combined_df.index.max()
            daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
            combined_df = combined_df.reindex(daily_index)
            combined_df.index.name = "Date"
            combined_df = combined_df.ffill()

            # Aggregate to weekly frequency
            combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
            weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()

            # Fill NaN values with 0
            weekly_df = weekly_df.fillna(0)

            # Clean column names
            def clean_column_name(name):
                name = re.sub(r"[^\w\s]", "", name)
                return f"macro_{name.lower()}"

            weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]

            return weekly_df

        else:
            print("No data available to process.")
            return pd.DataFrame()
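        # Usage sketch (default tickers only: FTSE 100, S&P 500 and the GBP crosses):
        #   fin_df = ims_proc.pull_yfinance(week_start_day="mon")
        # After cleaning, "^FTSE" becomes the column "macro_ftse" and "GBPUSD=X"
        # becomes "macro_gbpusdx".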
    def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
        """
        Pulls Google Analytics data using the BetaAnalyticsDataClient.

        Parameters:
            credentials_file (str): Path to the JSON credentials file.
            property_id (str): Google Analytics property ID.
            start_date (str): Start date in 'YYYY-MM-DD' format.
            country (str): Country to filter the data by.
            metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).

        Returns:
            pd.DataFrame: A pandas DataFrame containing the fetched data.
        """
        try:
            end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

            if not os.path.exists(credentials_file):
                raise FileNotFoundError(f"Credentials file '{credentials_file}' not found.")
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file

            try:
                client = BetaAnalyticsDataClient()
            except DefaultCredentialsError as e:
                raise DefaultCredentialsError(
                    f"Failed to initialize Google Analytics client: {e}"
                )

            def format_report(request):
                response = client.run_report(request)
                # Row index
                row_index_names = [header.name for header in response.dimension_headers]
                row_header = []
                for i in range(len(row_index_names)):
                    row_header.append([row.dimension_values[i].value for row in response.rows])

                row_index_named = pd.MultiIndex.from_arrays(np.array(row_header), names=np.array(row_index_names))
                # Row flat data
                metric_names = [header.name for header in response.metric_headers]
                data_values = []
                for i in range(len(metric_names)):
                    data_values.append([row.metric_values[i].value for row in response.rows])

                output = pd.DataFrame(data=np.transpose(np.array(data_values, dtype='f')),
                                      index=row_index_named, columns=metric_names)
                return output

            all_dfs = []
            offset_value = 0
            batch_size = 100000

            while True:
                metric_objects = [Metric(name=metric) for metric in metrics]

                request = RunReportRequest(
                    property='properties/' + property_id,
                    dimensions=[Dimension(name="date"), Dimension(name="city")],
                    metrics=metric_objects,
                    order_bys=[OrderBy(dimension={'dimension_name': 'date'}),
                               OrderBy(dimension={'dimension_name': 'city'})],
                    date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
                    limit=batch_size,
                    offset=offset_value,
                    dimension_filter=FilterExpression(
                        and_group=FilterExpressionList(
                            expressions=[
                                FilterExpression(
                                    filter=Filter(
                                        field_name="country",
                                        string_filter=Filter.StringFilter(value=country),
                                    )
                                ),
                            ]
                        )
                    )
                )

                df = format_report(request)
                if df.empty:
                    break

                df = df.reset_index()
                df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
                all_dfs.append(df)
                offset_value += batch_size

            if not all_dfs:
                return pd.DataFrame()

            final_df = pd.concat(all_dfs, ignore_index=True)
            return final_df

        except FileNotFoundError as e:
            logging.error(f"FileNotFoundError: {e}")
            raise
        except DefaultCredentialsError as e:
            logging.error(f"DefaultCredentialsError: {e}")
            raise
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            raise
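        # Usage sketch (hypothetical credentials path and property ID):
        #   ga_df = ims_proc.pull_ga("service_account.json", "123456789", "2023-01-01",
        #                            "United Kingdom", ["totalUsers", "sessions"])
        # The result is daily, city-level data; aggregate it to weekly level afterwards
        # if it is going into the same dataset as the weekly pulls above.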