imsciences 0.5.4.8__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,8 +3,31 @@ import calendar
3
3
  import os
4
4
  import plotly.express as px
5
5
  import plotly.graph_objs as go
6
- from dateutil.parser import parse
7
6
  import numpy as np
7
+ import re
8
+ from fredapi import Fred
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ from io import StringIO
12
+ import requests
13
+ import subprocess
14
+ import json
15
+ import xml.etree.ElementTree as ET
16
+ from bs4 import BeautifulSoup
17
+ import yfinance as yf
18
+ import holidays
19
+ from dateutil.easter import easter
20
+ from google.analytics.data_v1beta import BetaAnalyticsDataClient
21
+ from google.analytics.data_v1beta.types import DateRange
22
+ from google.analytics.data_v1beta.types import Dimension
23
+ from google.analytics.data_v1beta.types import Metric
24
+ from google.analytics.data_v1beta.types import RunReportRequest
25
+ from google.analytics.data_v1beta.types import OrderBy
26
+ from google.analytics.data_v1beta.types import Filter
27
+ from google.analytics.data_v1beta.types import FilterExpression
28
+ from google.analytics.data_v1beta.types import FilterExpressionList
29
+ from google.auth.exceptions import DefaultCredentialsError
30
+ import logging
8
31
 
9
32
  class dataprocessing:
10
33
 
@@ -63,8 +86,8 @@ class dataprocessing:
63
86
 
64
87
  print("\n11. rename_cols")
65
88
  print(" - Description: Renames columns in a pandas DataFrame.")
66
- print(" - Usage: rename_cols(df, cols_to_rename)")
67
- print(" - Example: rename_cols(df, {'old_col_name': 'new_col_name'})")
89
+ print(" - Usage: rename_cols(df, name)")
90
+ print(" - Example: rename_cols(df, 'ame_facebook'")
68
91
 
69
92
  print("\n12. merge_new_and_old")
70
93
  print(" - Description: Creates a new DataFrame with two columns: one for dates and one for merged numeric values.")
@@ -92,21 +115,142 @@ class dataprocessing:
92
115
  print(" - Usage: combine_sheets(all_sheets)")
93
116
  print(" - Example: combine_sheets({'Sheet1': df1, 'Sheet2': df2})")
94
117
 
95
- print("\n17. dynamic_pivot")
118
+ print("\n17. pivot_table")
96
119
  print(" - Description: Dynamically pivots a DataFrame based on specified columns.")
97
- print(" - Usage: dynamic_pivot(data_frame, index_col, columns, values_col, fill_value=0)")
98
- print(" - Example: dynamic_pivot(df, 'Date', ['Category1', 'Category2'], ['Value1'])")
120
+ print(" - Usage: pivot_table(df, index_col, columns, values_col, filters_dict=None, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'False',fill_missing_weekly_dates=False,week_commencing='W-MON')")
121
+ print(" - Example: pivot_table(df, 'OBS', 'Channel Short Names', 'Value',filters_dict={'Master Include':' == 1','OBS':' >= datetime(2019,9,9)','Metric Short Names':' == 'spd''}, fill_value=0,aggfunc='sum',margins=False,margins_name='Total',datetime_trans_needed=True,reverse_header_order = 'True',fill_missing_weekly_dates=True,week_commencing='W-MON')")
99
122
 
100
- print("\n18. classify_within_column")
101
- print(" - Description: Allows you to map a dictionary of substrings within a column.")
102
- print(" - Usage: classify_within_column(df, col_name, to_find_dict, if_not_in_country_dict='Other')")
103
- print(" - Example: classify_within_column(df, 'campaign', {'uk_': 'uk'}, 'other')")
123
+ print("\n18. apply_lookup_table_for_columns")
124
+ print(" - Description: Equivalent of xlookup in excel. Allows you to map a dictionary of substrings within a column. If multiple columns are need for the LUT then a | seperator is needed.")
125
+ print(" - Usage: apply_lookup_table_for_columns(df, col_names, to_find_dict, if_not_in_dict='Other', new_column_name='Mapping')")
126
+ print(" - Example: apply_lookup_table_for_columns(df, col_names, {'spend':'spd','clicks':'clk'}, if_not_in_dict='Other', new_column_name='Metrics Short')")
104
127
 
105
128
  print("\n19. aggregate_daily_to_wc_wide")
106
129
  print(" - Description: Aggregates daily data into weekly data, grouping and summing specified columns, starting on a specified day of the week.")
107
130
  print(" - Usage: aggregate_daily_to_wc_wide(df, date_column, group_columns, sum_columns, wc, aggregation='sum', include_totals=False)")
108
131
  print(" - Example: aggregate_daily_to_wc_wide(df, 'date', ['platform'], ['cost', 'impressions', 'clicks'], 'mon', 'average', True)")
109
132
 
133
+ print("\n20. merge_cols_with_seperator")
134
+ print(" - Description: Merge multiple columns in a dataframe into 1 column with a seperator '_'.Can be used if multiple columns are needed for a LUT.")
135
+ print(" - Usage: merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = 'Merged',starting_prefix_str=None,ending_prefix_str=None)")
136
+ print(" - Example: merge_cols_with_seperator(df, ['Campaign','Product'],seperator='|','Merged Columns',starting_prefix_str='start_',ending_prefix_str='_end')")
137
+
138
+ print("\n21. check_sum_of_df_cols_are_equal")
139
+ print(" - Description: Checks if the sum of two columns in two dataframes are the same, and provides the sums of each column and the difference between them.")
140
+ print(" - Usage: check_sum_of_df_cols_are_equal(df_1,df_2,cols_1,cols_2)")
141
+ print(" - Example: check_sum_of_df_cols_are_equal(df_1,df_2,'Media Cost','Spend')")
142
+
143
+ print("\n22. convert_2_df_cols_to_dict")
144
+ print(" - Description: Can be used to create an LUT. Creates a dictionary using two columns in a dataframe.")
145
+ print(" - Usage: convert_2_df_cols_to_dict(df, key_col, value_col)")
146
+ print(" - Example: convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')")
147
+
148
+ print("\n23. create_FY_and_H_columns")
149
+ print(" - Description: Used to create a financial year, half year, and financial half year column.")
150
+ print(" - Usage: create_FY_and_H_columns(df, index_col, start_date, starting_FY,short_format='No',half_years='No',combined_FY_and_H='No')")
151
+ print(" - Example: create_FY_and_H_columns(df, 'Week (M-S)', '2022-10-03', 'FY2023',short_format='Yes',half_years='Yes',combined_FY_and_H='Yes')")
152
+
153
+ print("\n24. keyword_lookup_replacement")
154
+ print(" - Description: Essentially provides an if statement with a xlookup if a value is something. Updates certain chosen values in a specified column of the DataFrame based on a lookup dictionary.")
155
+ print(" - Usage: keyword_lookup_replacement(df, col, replacement_rows, cols_to_merge, replacement_lookup_dict,output_column_name='Updated Column')")
156
+ print(" - Example: keyword_lookup_replacement(df, 'channel', 'Paid Search Generic', ['channel','segment','product'], qlik_dict_for_channel,output_column_name='Channel New')")
157
+
158
+ print("\n25. create_new_version_of_col_using_LUT")
159
+ print(" - Description: Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table. The lookup is based on a column in the dataframe.")
160
+ print(" - Usage: create_new_version_of_col_using_LUT(df, keys_col,value_col, dict_for_specific_changes, new_col_name='New Version of Old Col')")
161
+ print(" - Example: keyword_lookup_replacement(df, '*Campaign Name','Campaign Type',search_campaign_name_retag_lut,'Campaign Name New')")
162
+
163
+ print("\n26. convert_df_wide_2_long")
164
+ print(" - Description: Changes a dataframe from wide to long format.")
165
+ print(" - Usage: convert_df_wide_2_long(df,value_cols,variable_col_name='Stacked',value_col_name='Value')")
166
+ print(" - Example: keyword_lookup_replacement(df, ['Media Cost','Impressions','Clicks'],variable_col_name='Metric')")
167
+
168
+ print("\n27. manually_edit_data")
169
+ print(" - Description: Allows the capability to manually update any cell in dataframe by applying filters and chosing a column to edit in dataframe.")
170
+ print(" - Usage: manually_edit_data(df, filters_dict, col_to_change, new_value, change_in_existing_df_col='No', new_col_to_change_name='New', manual_edit_col_name=None, add_notes='No', existing_note_col_name=None, note=None)")
171
+ print(" - Example: keyword_lookup_replacement(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''},'Master Include',1,change_in_existing_df_col = 'Yes',new_col_to_change_name = 'Master Include',manual_edit_col_name = 'Manual Changes')")
172
+
173
+ print("\n28. format_numbers_with_commas")
174
+ print(" - Description: Converts data in numerical format into numbers with commas and a chosen decimal place length.")
175
+ print(" - Usage: format_numbers_with_commas(df, decimal_length_chosen=2)")
176
+ print(" - Example: format_numbers_with_commas(df,1)")
177
+
178
+ print("\n29. filter_df_on_multiple_conditions")
179
+ print(" - Description: Filters dataframe on multiple conditions, which come in the form of a dictionary.")
180
+ print(" - Usage: filter_df_on_multiple_conditions(df, filters_dict)")
181
+ print(" - Example: filter_df_on_multiple_conditions(df, {'OBS':' <= datetime(2023,1,23)','File_Name':' == 'France media''})")
182
+
183
+ print("\n30. read_and_concatenate_files")
184
+ print(" - Description: Read and Concatinate all files of one type in a folder.")
185
+ print(" - Usage: read_and_concatenate_files(folder_path, file_type='csv')")
186
+ print(" - Example: read_and_concatenate_files(folder_path, file_type='csv')")
187
+
188
+ print("\n31. remove_zero_values")
189
+ print(" - Description: Remove zero values in a specified column.")
190
+ print(" - Usage: remove_zero_values(self, data_frame, column_to_filter)")
191
+ print(" - Example: remove_zero_values(None, df, 'Funeral_Delivery')")
192
+
193
+ print("\n32. upgrade_outdated_packages")
194
+ print(" - Description: Upgrades all packages.")
195
+ print(" - Usage: upgrade_outdated_packages()")
196
+ print(" - Example: upgrade_outdated_packages()")
197
+
198
+ print("\n33. convert_mixed_formats_dates")
199
+ print(" - Description: Convert a mix of US and UK dates to datetime.")
200
+ print(" - Usage: convert_mixed_formats_dates(df, datecol)")
201
+ print(" - Example: convert_mixed_formats_dates(df, 'OBS')")
202
+
203
+ print("\n34. fill_weekly_date_range")
204
+ print(" - Description: Fill in any missing weeks with 0.")
205
+ print(" - Usage: fill_weekly_date_range(df, date_column, freq)")
206
+ print(" - Example: fill_weekly_date_range(df, 'OBS', 'W-MON')")
207
+
208
+ print("\n35. add_prefix_and_suffix")
209
+ print(" - Description: Add Prefix and/or Suffix to Column Headers.")
210
+ print(" - Usage: add_prefix_and_suffix(df, prefix='', suffix='', date_col=None)")
211
+ print(" - Example: add_prefix_and_suffix(df, prefix='media_', suffix='_spd', date_col='obs')")
212
+
213
+ print("\n36. create_dummies")
214
+ print(" - Description: Changes time series to 0s and 1s based off threshold")
215
+ print(" - Usage: create_dummies(df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total')")
216
+ print(" - Example: create_dummies(df, date_col='obs', dummy_threshold=100, add_total_dummy_col='Yes', total_col_name='med_total_dum')")
217
+
218
+ print("\n37. replace_substrings")
219
+ print(" - Description: Replace substrings in column of strings based off dictionary, can also change column to lower")
220
+ print(" - Usage: replace_substrings(df, column, replacements, to_lower=False, new_column=None)")
221
+ print(" - Example: replace_substrings(df, 'Influencer Handle', replacement_dict, to_lower=True, new_column='Short Version')")
222
+
223
+ print("\n38. add_total_column")
224
+ print(" - Description: Sums all columns with the option to exclude an date column to create a total column")
225
+ print(" - Usage: add_total_column(df, exclude_col=None, total_col_name='Total')")
226
+ print(" - Example: add_total_column(df, exclude_col='obs', total_col_name='total_media_spd')")
227
+
228
+ print("\n39. apply_lookup_table_based_on_substring")
229
+ print(" - Description: Equivalent of xlookup in excel, but only based on substrings. If a substring is found in a cell, than look it up in the dictionary. Otherwise use the other label")
230
+ print(" - Usage: apply_lookup_table_based_on_substring(df, column_name, category_dict, new_col_name='Category', other_label='Other')")
231
+ print(" - Example: apply_lookup_table_based_on_substring(df, 'Campaign Name', campaign_dict, new_col_name='Campaign Name Short', other_label='Full Funnel')")
232
+
233
+ print("\n40. compare_overlap")
234
+ print(" - Description: With two matching dataset, it takes the common columns and rows and takes the difference between them, outputing a differences and total differences table")
235
+ print(" - Usage: compare_overlap(df1, df2, date_col)")
236
+ print(" - Example: compare_overlap(df_1, df_2, 'obs')")
237
+
238
+ print("\n41. week_commencing_2_week_commencing_conversion")
239
+ print(" - Description: Take a week commencing column say sunday and creates a new column with a different week commencing e.g. monday")
240
+ print(" - Usage: week_commencing_2_week_commencing_conversion(df,date_col,week_commencing='sun')")
241
+ print(" - Example: week_commencing_2_week_commencing_conversion(df,'obs,week_commencing='mon')")
242
+
243
+ print("\n42. plot_chart")
244
+ print(" - Description: Plots a range of charts including line, area, scatter, bubble, bar etc.")
245
+ print(" - Usage: plot_chart(df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs)")
246
+ print(" - Example: plot_chart(df, 'obs', df.cols, chart_type='line', title='Spend Over Time', x_title='Date', y_title='Spend')")
247
+
248
+ print("\n43. plot_two_with_common_cols")
249
+ print(" - Description: Plots the number of charts in two dataframes for which there are two common column names")
250
+ print(" - Usage: plot_two_with_common_cols(df1, df2, date_column, same_axis=True)")
251
+ print(" - Example: plot_two_with_common_cols(df_1, df_2,date_column='obs')")
252
+
253
+
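The entries above appear to be printed by the class's built-in help listing and mirror the instance methods defined below. A minimal usage sketch for two of them (the DataFrame, the column names, and the assumption that dataprocessing() takes no constructor arguments are illustrative, not taken from the package):

    import pandas as pd
    dp = dataprocessing()  # class defined below
    df = pd.DataFrame({'OBS': ['2024-01-01', '2024-01-08'], 'TV Spend': [100.0, 250.0]})
    df = dp.rename_cols(df, name='ame_')              # 'TV Spend' -> 'ame_tv_spend', 'OBS' untouched
    df = dp.fill_weekly_date_range(df, 'OBS', freq='W-MON')  # pads any missing weeks with 0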
110
254
  def get_wd_levels(self, levels):
111
255
  """
112
256
  Gets the current wd of whoever is working on it and gives the options to move the number of levels up.
@@ -138,7 +282,7 @@ class dataprocessing:
138
282
  The number of rows to remove from the data frame, starting from the original header.
139
283
 
140
284
  Returns:
141
- - pandas DataFrame
285
+ - pandas DataFrames
142
286
  The modified data frame with rows removed and new column headings.
143
287
 
144
288
  Raises:
@@ -224,71 +368,6 @@ class dataprocessing:
224
368
 
225
369
  return grouped
226
370
 
227
- def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
228
- """
229
- Aggregates daily data into weekly data, starting on a specified day of the week,
230
- and groups the data by additional specified columns. It aggregates specified numeric columns
231
- by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
232
- of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
233
- The day column is renamed from 'Day' to 'OBS'.
234
-
235
- Parameters:
236
- - df: pandas DataFrame
237
- The input DataFrame containing daily data.
238
- - date_column: string
239
- The name of the column in the DataFrame that contains date information.
240
- - group_columns: list of strings
241
- Additional column names to group by along with the weekly grouping.
242
- - sum_columns: list of strings
243
- Numeric column names to be aggregated during aggregation.
244
- - wc: string
245
- The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
246
- - aggregation: string, optional (default 'sum')
247
- Aggregation method, either 'sum', 'average', or 'count'.
248
- - include_totals: boolean, optional (default False)
249
- If True, include total columns for each sum_column.
250
-
251
-
252
-
253
- Returns:
254
- - pandas DataFrame
255
- A new DataFrame with weekly aggregated data. The index is reset,
256
- and columns represent the grouped and aggregated metrics. The DataFrame
257
- is in wide format, with separate columns for each combination of
258
- grouped metrics.
259
- """
260
-
261
- grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
262
-
263
- # Pivot the data to wide format
264
- if group_columns:
265
- wide_df = grouped.pivot_table(index='OBS',
266
- columns=group_columns,
267
- values=sum_columns,
268
- aggfunc='first')
269
- # Flatten the multi-level column index and create combined column names
270
- wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
271
- else:
272
- wide_df = grouped.set_index('OBS')
273
-
274
- # Fill NaN values with 0
275
- wide_df = wide_df.fillna(0)
276
-
277
- # Adding total columns for each unique sum_column, if include_totals is True
278
- if include_totals:
279
- for col in sum_columns:
280
- total_column_name = f'Total {col}'
281
- if group_columns:
282
- columns_to_sum = [column for column in wide_df.columns if col in column]
283
- else:
284
- columns_to_sum = [col]
285
- wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
286
-
287
- # Reset the index of the final DataFrame
288
- wide_df = wide_df.reset_index()
289
-
290
- return wide_df
291
-
292
371
  def convert_monthly_to_daily(self, df, date_column, divide = True):
293
372
  """
294
373
  Convert a DataFrame with monthly data to daily data.
@@ -320,7 +399,7 @@ class dataprocessing:
320
399
  # Divide each numeric value by the number of days in the month
321
400
  for col in df.columns:
322
401
  if pd.api.types.is_numeric_dtype(df[col]) and col != date_column:
323
- if divide == True:
402
+ if divide is True:
324
403
  daily_row[col] = row[col] / num_days
325
404
  else:
326
405
  daily_row[col] = row[col]
@@ -344,7 +423,10 @@ class dataprocessing:
344
423
  :param same_axis: If True, plot both traces on the same y-axis; otherwise, use separate y-axes.
345
424
  :return: Plotly figure
346
425
  """
347
-
426
+ # Ensure date columns are datetime
427
+ df1[date_column] = pd.to_datetime(df1[date_column])
428
+ df2[date_column] = pd.to_datetime(df2[date_column])
429
+
348
430
  # Create traces for the first and second dataframes
349
431
  trace1 = go.Scatter(x=df1[date_column], y=df1[col1], mode='lines', name=col1, yaxis='y1')
350
432
 
@@ -352,7 +434,7 @@ class dataprocessing:
352
434
  trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y1')
353
435
  else:
354
436
  trace2 = go.Scatter(x=df2[date_column], y=df2[col2], mode='lines', name=col2, yaxis='y2')
355
-
437
+
356
438
  # Define layout for the plot
357
439
  layout = go.Layout(
358
440
  title="",
@@ -413,8 +495,8 @@ class dataprocessing:
413
495
 
414
496
  return fig
415
497
 
416
- def week_of_year_mapping(self, df, week_col, start_day_str):
417
- from datetime import datetime, timedelta
498
+ def week_of_year_mapping(self,df, week_col, start_day_str):
499
+
418
500
  # Mapping of string day names to day numbers (1 for Monday, 7 for Sunday)
419
501
  day_mapping = {
420
502
  'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7
@@ -429,15 +511,15 @@ class dataprocessing:
429
511
  def week_to_startdate(week_str, start_day):
430
512
  year, week = map(int, week_str.split('-W'))
431
513
  first_day_of_year = datetime(year, 1, 1)
432
- day_of_week = first_day_of_year.isocalendar()[2]
433
- days_to_add = (7 - day_of_week + 1) if day_of_week > 4 else (1 - day_of_week)
434
- start_of_iso_week = first_day_of_year + timedelta(days=days_to_add)
514
+ first_weekday_of_year = first_day_of_year.weekday() # Monday is 0 and Sunday is 6
435
515
 
436
- # Adjust start day
437
- days_to_shift = (start_day - 1) % 7
438
- start_of_week = start_of_iso_week + timedelta(days=days_to_shift)
516
+ # Calculate days to adjust to the desired start day of the week
517
+ days_to_adjust = (start_day - 1 - first_weekday_of_year) % 7
518
+ start_of_iso_week = first_day_of_year + timedelta(days=days_to_adjust)
439
519
 
440
- return start_of_week + timedelta(weeks=week - 1)
520
+ # Calculate the start of the desired week
521
+ start_of_week = start_of_iso_week + timedelta(weeks=week - 1)
522
+ return start_of_week
441
523
 
442
524
  # Apply the function to each row in the specified week column
443
525
  df['OBS'] = df[week_col].apply(lambda x: week_to_startdate(x, start_day)).dt.strftime('%d/%m/%Y')
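A short sketch of how the reworked week_of_year_mapping can be called (assumes a dataprocessing instance and week labels in the 'YYYY-Www' form that week_to_startdate parses; the data is illustrative):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'Week': ['2024-W01', '2024-W02'], 'Sales': [10, 12]})
    dp.week_of_year_mapping(df, 'Week', 'mon')
    print(df['OBS'])  # week start dates rendered as dd/mm/YYYY strings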
@@ -447,22 +529,15 @@ class dataprocessing:
447
529
  # This line filters the DataFrame based on whether the values in the specified column are not in the list_of_filters
448
530
  return df[~df[col_to_filter].isin(list_of_filters)]
449
531
 
450
- def rename_cols(self, df, cols_to_rename):
451
- """
452
- Renames columns in a pandas DataFrame.
453
-
454
- Parameters:
455
- - df: pandas DataFrame
456
- The DataFrame whose columns are to be renamed.
457
- - cols_to_rename: dict
458
- A dictionary where keys are the current column names and values are the new column names.
459
-
460
- Returns:
461
- - pandas DataFrame
462
- The DataFrame with renamed columns.
463
- """
464
-
465
- return df.rename(columns=cols_to_rename)
532
+ def rename_cols(self, df, name = 'ame_'):
533
+ new_columns = {}
534
+ for col in df.columns:
535
+ if col != 'OBS':
536
+ new_col_name = name + col.replace(" ", "_").lower()
537
+ else:
538
+ new_col_name = col
539
+ new_columns[col] = new_col_name
540
+ return df.rename(columns=new_columns)
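Note that this rewritten rename_cols no longer takes a rename dictionary: it prefixes every column except 'OBS' with the supplied string, lowercases it and replaces spaces with underscores. A hedged sketch (illustrative data; assumes dataprocessing() needs no constructor arguments):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'OBS': ['2024-01-01'], 'Paid Search Spend': [100.0]})
    renamed = dp.rename_cols(df, name='ame_')
    # 'Paid Search Spend' -> 'ame_paid_search_spend'; 'OBS' is left as-is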
466
541
 
467
542
  def merge_new_and_old(self, old_df, old_col, new_df, new_col, cutoff_date, date_col_name='OBS'):
468
543
  """
@@ -572,32 +647,24 @@ class dataprocessing:
572
647
  return merged_df
573
648
 
574
649
  def convert_us_to_uk_dates(self, df, date_col):
575
- import datetime
576
- def fix_date(d):
577
- # Convert datetime objects to string
578
- if isinstance(d, pd.Timestamp) or isinstance(d, datetime.datetime):
579
- d = d.strftime('%m/%d/%Y')
580
-
581
- # Split date string into components
582
- parts = d.split('/')
583
-
584
- # Check for two formats: mm/dd/yyyy and dd/mm/yyyy
585
- if len(parts) == 3:
586
- year, month, day = parts[2], parts[0], parts[1]
587
- # Correct for two-digit years
588
- if len(year) == 2:
589
- year = '20' + year
590
- # Identify dates needing correction: where month > 12 or it follows the yyyy-dd-mm pattern
591
- if int(month) > 12 or (int(day) <= 12 and int(year) > 31):
592
- # Assume year is correct, flip 'month' and 'day' for correction
593
- month, day = day, month
594
- return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
595
- else:
596
- # Handle already correct or non-standard formats cautiously
597
- return d
598
-
599
- # Apply the fix to the specified column
600
- df[date_col] = df[date_col].apply(lambda x: fix_date(x) if not pd.isnull(x) else x)
650
+ """
651
+ Processes the date column of a DataFrame to remove hyphens and slashes,
652
+ and converts it to a datetime object.
653
+
654
+ Parameters:
655
+ df (pd.DataFrame): The DataFrame containing the date column.
656
+ date_col (str): The name of the date column.
657
+
658
+ Returns:
659
+ pd.DataFrame: The DataFrame with the processed date column.
660
+ """
661
+ df[date_col] = df[date_col].str.replace(r'[-/]', '', regex=True)
662
+ df[date_col] = pd.to_datetime(
663
+ df[date_col].str.slice(0, 2) + '/' +
664
+ df[date_col].str.slice(2, 4) + '/' +
665
+ df[date_col].str.slice(4, 8),
666
+ format='%m/%d/%Y'
667
+ )
601
668
  return df
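The replacement implementation assumes zero-padded US-style strings (mm/dd/yyyy or mm-dd-yyyy): it strips the separators and reparses the digits as month, day, year. An illustrative sketch:

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'OBS': ['01/31/2024', '02-14-2024']})
    df = dp.convert_us_to_uk_dates(df, 'OBS')
    # 'OBS' becomes datetime64: 2024-01-31 and 2024-02-14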
602
669
 
603
670
  def combine_sheets(self, all_sheets):
@@ -619,61 +686,2666 @@ class dataprocessing:
619
686
 
620
687
  return combined_df
621
688
 
622
- def dynamic_pivot(self, data_frame, index_col, columns, values_col, fill_value=0):
623
- # Ensure OBS is in datetime format for proper sorting
624
- data_frame[index_col] = pd.to_datetime(data_frame[index_col], dayfirst=True)
689
+ def pivot_table(self, df, index_col, columns, values_col, filters_dict=None, fill_value=0, aggfunc="sum", margins=False, margins_name="Total", datetime_trans_needed=True, date_format="%Y-%m-%d", reverse_header_order=False, fill_missing_weekly_dates=False, week_commencing="W-MON"):
690
+ """
691
+ Provides the ability to create pivot tables, filtering the data down to the rows you want and then pivoting on chosen columns
692
+
693
+ Args:
694
+ df (pandas.DataFrame): The DataFrame containing the data.
695
+ index_col (str): Name of Column for your pivot table to index on
696
+ columns (str): Name of Columns for your pivot table.
697
+ values_col (str): Name of Values Columns for your pivot table.
698
+ filters_dict (dict, optional): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell. Defaults to None
699
+ fill_value (int, optional): The value to replace nan with. Defaults to 0.
700
+ aggfunc (str, optional): The method on which to aggregate the values column. Defaults to sum.
701
+ margins (bool, optional): Whether the pivot table needs a total rows and column. Defaults to False.
702
+ margins_name (str, optional): The name of the Totals columns. Defaults to "Total".
703
+ datetime_trans_needed (bool, optional): Whether the index column needs to be transformed into datetime format. Defaults to True.
704
+ reverse_header_order (bool, optional): Reverses the order of the column headers. Defaults to False.
705
+ fill_missing_weekly_dates (bool, optional): Fills in any weekly missing dates. Defaults to False.
706
+ week_commencing (str,optional): Fills in missing weeks if option is specified. Defaults to 'W-MON'.
707
+
708
+ Returns:
709
+ pandas.DataFrame: The pivot table specified
710
+ """
625
711
 
626
- # Check if values_col is a single column or a list and pivot accordingly
627
- if isinstance(values_col, list):
628
- # If values_col is a list, use .pivot_table() to accommodate multiple values columns
629
- pivoted_df = data_frame.pivot_table(index=index_col, columns=columns, values=values_col, aggfunc='sum')
712
+ # Validate inputs
713
+ if index_col not in df.columns:
714
+ raise ValueError(f"index_col '{index_col}' not found in DataFrame.")
715
+ if columns not in df.columns:
716
+ raise ValueError(f"columns '{columns}' not found in DataFrame.")
717
+ if values_col not in df.columns:
718
+ raise ValueError(f"values_col '{values_col}' not found in DataFrame.")
719
+
720
+ # Apply filters if provided
721
+ if filters_dict:
722
+ df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
630
723
  else:
631
- # For a single value column, use .pivot()
632
- pivoted_df = data_frame.pivot(index=index_col, columns=columns, values=values_col)
633
-
634
- # Handling MultiIndex columns if present, making them a flat structure
724
+ df_filtered = df.copy()
725
+
726
+ # Ensure index column is in datetime format if needed
727
+ if datetime_trans_needed:
728
+ df_filtered[index_col] = pd.to_datetime(df_filtered[index_col], dayfirst=True)
729
+
730
+ # Create the pivot table
731
+ pivoted_df = df_filtered.pivot_table(
732
+ index=index_col,
733
+ columns=columns,
734
+ values=values_col,
735
+ aggfunc=aggfunc,
736
+ margins=margins,
737
+ margins_name=margins_name,
738
+ )
739
+
740
+ # Handle column headers
635
741
  if isinstance(pivoted_df.columns, pd.MultiIndex):
636
- pivoted_df.columns = ['_'.join(map(str, col)).strip() for col in pivoted_df.columns.values]
742
+ pivoted_df.columns = [
743
+ "_".join(reversed(map(str, col)) if reverse_header_order else map(str, col))
744
+ for col in pivoted_df.columns.values
745
+ ]
637
746
  else:
638
747
  pivoted_df.columns = pivoted_df.columns.map(str)
748
+
749
+ # Reset the index
750
+ pivoted_df.reset_index(inplace=True)
751
+
752
+ # Handle sorting and formatting of index column
753
+ if datetime_trans_needed:
754
+ pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col], errors="coerce")
755
+ pivoted_df.sort_values(by=index_col, inplace=True)
756
+ pivoted_df[index_col] = pivoted_df[index_col].dt.strftime(date_format)
757
+
758
+ # Fill missing values
759
+ pivoted_df.fillna(fill_value, inplace=True)
760
+
761
+ # Fill missing weekly dates if specified
762
+ if fill_missing_weekly_dates:
763
+ pivoted_df = self.fill_weekly_date_range(pivoted_df, index_col, freq=week_commencing)
764
+
765
+ return pivoted_df
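A minimal sketch of the new pivot_table without the datetime handling or filters (column names are illustrative; assumes dataprocessing() takes no constructor arguments):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'Region': ['North', 'North', 'South'],
                       'Channel': ['TV', 'Radio', 'TV'],
                       'Value': [100, 50, 75]})
    pivoted = dp.pivot_table(df, index_col='Region', columns='Channel',
                             values_col='Value', datetime_trans_needed=False)
    # one row per Region, one column per Channel, missing combinations filled with 0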
766
+
767
+ def apply_lookup_table_for_columns(self, df, col_names, to_find_dict, if_not_in_dict="Other", new_column_name="Mapping"):
768
+ """
769
+ Creates a new DataFrame column based on a look up table, possibly with multiple columns to look up on (dictionary of substrings to class mappings).
770
+
771
+ Parameters:
772
+ df (pandas.DataFrame): The DataFrame containing the data.
773
+ col_names (list of str): these are the columns which are used for the lookup. One column or several columns can be inputted as a list, provided there is a merged column to lookup on. If there are multiple columns to look up on then a merged column must be inputted as the key of the dictionary of format e.g. col1|col2|col3
774
+ to_find_dict (dict): your look up table, where keys are the values being looked up, and the values are the resulting mappings.
775
+ if_not_in_dict (str, optional): default value if no substring matches are found in the look up table dictionary. Defaults to "Other".
776
+ new_column_name (str, optional): name of new column. Defaults to "Mapping".
777
+
778
+ Returns:
779
+ pandas.DataFrame: DataFrame with a new column containing the look up table results.
780
+ """
781
+
782
+ # Create regex pattern with word boundaries from the dictionary
783
+ regex_pattern = "|".join(r'\b' + re.escape(key) + r'\b' for key in to_find_dict.keys())
784
+
785
+ # Preprocess DataFrame if multiple columns
786
+ if len(col_names) > 1:
787
+ df["Merged"] = df[col_names].astype(str).apply('|'.join, axis=1)
788
+ col_to_use = "Merged"
789
+ else:
790
+ col_to_use = col_names[0]
791
+
792
+ # Extract the first match using the regex pattern
793
+ matches = df[col_to_use].str.extract(f'({regex_pattern})', expand=False, flags=re.IGNORECASE)
794
+
795
+ # Map the matches to the corresponding values in the dictionary
796
+ df[new_column_name] = matches.str.lower().map({k.lower(): v for k, v in to_find_dict.items()}).fillna(if_not_in_dict)
797
+
798
+ # Drop intermediate column if created
799
+ if len(col_names) > 1:
800
+ df.drop(columns=["Merged"], inplace=True)
801
+
802
+ return df
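A hedged sketch of apply_lookup_table_for_columns on a single column (note the word-boundary regex, so keys match whole words; data and names are illustrative):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'Metric': ['media spend', 'link clicks', 'video views']})
    lut = {'spend': 'spd', 'clicks': 'clk'}
    df = dp.apply_lookup_table_for_columns(df, ['Metric'], lut,
                                           if_not_in_dict='Other',
                                           new_column_name='Metric Short')
    # 'Metric Short' -> 'spd', 'clk', 'Other'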
803
+
804
+ def aggregate_daily_to_wc_wide(self, df : pd.DataFrame, date_column : str, group_columns : list[str], sum_columns : list[str], wc : str = 'sun', aggregation : str = 'sum', include_totals : bool = False) -> pd.DataFrame:
805
+ """
806
+ Aggregates daily data into weekly data, starting on a specified day of the week,
807
+ and groups the data by additional specified columns. It aggregates specified numeric columns
808
+ by summing, averaging, or counting them, and pivots the data to create separate columns for each combination
809
+ of the group columns and sum columns. NaN values are replaced with 0 and the index is reset.
810
+ The day column is renamed from 'Day' to 'OBS'.
811
+
812
+ Parameters:
813
+ - df: pandas DataFrame
814
+ The input DataFrame containing daily data.
815
+ - date_column: string
816
+ The name of the column in the DataFrame that contains date information.
817
+ - group_columns: list of strings
818
+ Additional column names to group by along with the weekly grouping.
819
+ - sum_columns: list of strings
820
+ Numeric column names to be aggregated during aggregation.
821
+ - wc: string
822
+ The week commencing day (e.g., 'sun' for Sunday, 'mon' for Monday).
823
+ - aggregation: string, optional (default 'sum')
824
+ Aggregation method, either 'sum', 'average', or 'count'.
825
+ - include_totals: boolean, optional (default False)
826
+ If True, include total columns for each sum_column.
827
+
828
+ Returns:
829
+ - pandas DataFrame
830
+ A new DataFrame with weekly aggregated data. The index is reset,
831
+ and columns represent the grouped and aggregated metrics. The DataFrame
832
+ is in wide format, with separate columns for each combination of
833
+ grouped metrics.
834
+ """
639
835
 
640
- # Reset the pivot before returning
641
- pivoted_df = pivoted_df.reset_index()
836
+ grouped = self.aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns, wc, aggregation)
642
837
 
643
- # Sort by OBS from oldest to newest
644
- pivoted_df[index_col] = pd.to_datetime(pivoted_df[index_col]) # Ensure sorting works correctly
645
- pivoted_df = pivoted_df.sort_values(by=index_col)
838
+ # Pivot the data to wide format
839
+ if group_columns:
840
+ wide_df = grouped.pivot_table(index='OBS',
841
+ columns=group_columns,
842
+ values=sum_columns,
843
+ aggfunc='first')
844
+ # Flatten the multi-level column index and create combined column names
845
+ wide_df.columns = ['_'.join(col).strip() for col in wide_df.columns.values]
846
+ else:
847
+ wide_df = grouped.set_index('OBS')
848
+
849
+ # Fill NaN values with 0
850
+ wide_df = wide_df.fillna(0)
851
+
852
+ # Adding total columns for each unique sum_column, if include_totals is True
853
+ if include_totals:
854
+ for col in sum_columns:
855
+ total_column_name = f'Total {col}'
856
+ if group_columns:
857
+ columns_to_sum = [column for column in wide_df.columns if col in column]
858
+ else:
859
+ columns_to_sum = [col]
860
+ wide_df[total_column_name] = wide_df[columns_to_sum].sum(axis=1)
861
+
862
+ # Reset the index of the final DataFrame
863
+ wide_df = wide_df.reset_index()
864
+
865
+ return wide_df
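A usage sketch for the relocated aggregate_daily_to_wc_wide (it delegates to aggregate_daily_to_wc_long, which is defined elsewhere in the class and not shown in this hunk; data and column names are illustrative):

    import pandas as pd
    dp = dataprocessing()
    daily = pd.DataFrame({'date': pd.date_range('2024-01-01', periods=14, freq='D'),
                          'platform': ['Search', 'Social'] * 7,
                          'cost': [10.0] * 14})
    weekly = dp.aggregate_daily_to_wc_wide(daily, 'date', ['platform'], ['cost'],
                                           wc='mon', aggregation='sum', include_totals=True)
    # expected: one row per week commencing Monday, cost_<platform> columns plus 'Total cost'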
866
+
867
+ def merge_cols_with_seperator(self, df, col_names,seperator='_',output_column_name = "Merged",starting_prefix_str=None,ending_prefix_str=None):
868
+ """
869
+ Creates a new column in the dataframe that merges 2 or more columns together with a "_" separator, possibly to be used for a look up table where multiple columns are being looked up
870
+
871
+ Parameters:
872
+ df (pandas.DataFrame): Dataframe to make changes to.
873
+ col_names (list): List of column names to merge.
874
+ seperator (str, optional): Separator placed between the merged values. Defaults to "_".
875
+ output_column_name (str, optional): Name of column outputted. Defaults to "Merged".
876
+ starting_prefix_str (str, optional): string of optional text to be added before the merged column str value
877
+ ending_prefix_str (str, optional): string of optional text to be added after the merged column str value
878
+
879
+ Raises:
880
+ ValueError: If fewer than two column names are supplied, as there is nothing to merge on.
881
+
882
+ Returns:
883
+ pandas.DataFrame: DataFrame with additional merged column
884
+ """
885
+ # Specify more than one column must be entered
886
+ if len(col_names) < 2:
887
+ raise ValueError("2 or more columns must be specified to merge")
646
888
 
647
- # Convert OBS back to a string in YYYY-MM-DD format for display purposes
648
- pivoted_df[index_col] = pivoted_df[index_col].dt.strftime('%Y-%m-%d')
889
+ # Create a new column with the merged columns
890
+ df[output_column_name] = df[col_names].astype(str).apply(seperator.join, axis=1)
891
+
892
+ # Add string before
893
+ if starting_prefix_str is not None:
894
+ df[output_column_name] = starting_prefix_str + df[output_column_name].astype(str)
649
895
 
650
- # Fill in any NaNs
651
- pivoted_df = pivoted_df.fillna(fill_value)
896
+ # Add string after
897
+ if ending_prefix_str is not None:
898
+ df[output_column_name] = df[output_column_name].astype(str) + ending_prefix_str
899
+
900
+ return df
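A small sketch of merge_cols_with_seperator (illustrative data; the method mutates and returns the same DataFrame):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'Campaign': ['Brand'], 'Product': ['Credit Card']})
    df = dp.merge_cols_with_seperator(df, ['Campaign', 'Product'], seperator='|',
                                      output_column_name='Merged')
    # df['Merged'] -> 'Brand|Credit Card'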
901
+
902
+ def check_sum_of_df_cols_are_equal(self, df_1,df_2,cols_1,cols_2):
903
+ """
904
+ Checks whether the sums of columns from two different dataframes are equal
905
+
906
+ Parameters:
907
+ df_1 (pandas.DataFrame): First dataframe with columns to be summed.
908
+ df_2 (pandas.DataFrame): Second dataframe with columns to be summed.
909
+ cols_1 (list of str): Columns from first dataframe to sum.
910
+ cols_2 (list of str): Columns from second dataframe to sum.
911
+
912
+ Returns:
913
+ Tuple: Answer is a message stating whether the sums are equal (and the difference if not), df_1_sum is the sum of the column/columns in the first dataframe, df_2_sum is the sum of the column/columns in the second dataframe
914
+ """
915
+ # Find the sum of both sets of columns
916
+ df_1_sum = df_1[cols_1].sum().sum()
917
+ df_2_sum = df_2[cols_2].sum().sum()
652
918
 
653
- return pivoted_df
919
+ # Compare the two sums
920
+ if df_1_sum == df_2_sum:
921
+ Answer = "They are equal"
922
+ if df_1_sum != df_2_sum:
923
+ Answer = "They are different by " + str(df_2_sum-df_1_sum)
924
+
925
+ return Answer,df_1_sum,df_2_sum
926
+
927
+ def convert_2_df_cols_to_dict(self, df, key_col, value_col):
928
+ """
929
+ Create a dictionary mapping from two columns of a DataFrame.
930
+
931
+ Parameters:
932
+ df (pd.DataFrame): The DataFrame containing the data.
933
+ key_col (str): The column name to use as keys in the dictionary.
934
+ value_col (str): The column name to use as values in the dictionary.
935
+
936
+ Returns:
937
+ dict: A dictionary with keys from 'key_col' and values from 'value_col'.
938
+ """
939
+ if key_col not in df or value_col not in df:
940
+ raise ValueError("Specified columns are not in the DataFrame")
941
+
942
+ return {df[key_col].iloc[i]: df[value_col].iloc[i] for i in range(len(df))}
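A one-liner sketch of convert_2_df_cols_to_dict, typically used to build a lookup table (illustrative data):

    import pandas as pd
    dp = dataprocessing()
    df = pd.DataFrame({'Campaign': ['uk_search', 'uk_social'], 'Channel': ['Search', 'Social']})
    lut = dp.convert_2_df_cols_to_dict(df, 'Campaign', 'Channel')
    # {'uk_search': 'Search', 'uk_social': 'Social'}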
654
943
 
655
- def classify_within_column(self, df, col_name, to_find_dict, if_not_in_country_dict="Other"):
944
+ def create_FY_and_H_columns(self, df, index_col, start_date, starting_FY,short_format="No",half_years="No",combined_FY_and_H="No"):
945
+ """
946
+ Creates new DataFrame columns containing companies' Financial Year, Half Years and Financial Half years, based on the start date of the first full financial year
947
+
948
+ Parameters:
949
+ df (pandas.DataFrame): Dataframe to operate on.
950
+ index_col (str): Name of the column to use for datetime
951
+ start_date (str): String used to specify the start date of the first full FY, needs to be of format "yyyy-mm-dd" e.g. 2021-11-29
952
+ starting_FY (str): String used to specify which FY the start date refers to, needs to be formatted LONG e.g. FY2021
953
+ short_format (str, optional): String used to specify if short format is desired (e.g. FY21) or if long format is desired (e.g. FY2021). Defaults to "No".
954
+ half_years (str, optional): String used to specify if half year column is desired. Defaults to "No".
955
+ combined_FY_and_H (str, optional): String used to specify is a combined half year and FY column is desired. Defaults to "No".
956
+
957
+ Returns:
958
+ pandas.DataFrame: DataFrame with a new column 'FY' containing the FY as well as, if desired, a half year column and a combined FY half year column.
656
959
  """
657
- Classify entries in a DataFrame column based on a dictionary of substrings to class mappings.
658
960
 
961
+ try:
962
+ start_date = datetime.strptime(start_date, '%Y-%m-%d')
963
+ except ValueError:
964
+ print("Error: Date must be of format yyyy-mm-dd")
965
+ return df
966
+
967
+ df["OBS"] = pd.to_datetime(df[index_col])
968
+ df["OBS as string"] = df["OBS"].dt.strftime("%Y-%m-%d")
969
+
970
+ df[index_col] = pd.to_datetime(df[index_col])
971
+
972
+ start_year = int(starting_FY[2:])
973
+
974
+ def calculate_FY_vectorized(date_series):
975
+ years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
976
+ fy = 'FY' + (start_year + years_since_start).astype(str)
977
+ if short_format == "Yes":
978
+ fy = 'FY' + fy.str[-2:]
979
+ return fy
980
+
981
+ df['FY'] = calculate_FY_vectorized(df[index_col])
982
+
983
+ if half_years == "Yes" or combined_FY_and_H == "Yes":
984
+ def calculate_half_year_vectorized(date_series):
985
+ fy_years_since_start = ((date_series - start_date).dt.days / 364).astype(int)
986
+ fy_start_dates = start_date + fy_years_since_start * pd.DateOffset(years=1)
987
+ fy_end_of_h1 = fy_start_dates + pd.DateOffset(weeks=26) - pd.DateOffset(weeks=1)
988
+ half_year = np.where(date_series <= fy_end_of_h1, 'H1', 'H2')
989
+ return half_year
990
+
991
+ df['Half Years'] = calculate_half_year_vectorized(df[index_col])
992
+
993
+ if combined_FY_and_H == "Yes":
994
+ df['Financial Half Years'] = df['FY'] + ' ' + df['Half Years']
995
+
996
+ return df
997
+
998
+ def keyword_lookup_replacement(self, df, col, replacement_rows, cols_to_merge, replacement_lookup_dict, output_column_name="Updated Column"):
999
+ """
1000
+ This function updates values in a specified column of the DataFrame based on a lookup dictionary.
1001
+ It first merges several columns into a new 'Merged' column, then uses this merged column to determine
1002
+ if replacements are needed based on the dictionary.
1003
+
659
1004
  Parameters:
660
- - df: pandas.DataFrame to operate on.
661
- - col_name: String, name of the column to classify.
662
- - to_find_dict: Dictionary, where keys are substrings to find and values are the corresponding classifications.
663
- - if_not_in_country_dict: String, default classification if no substring matches are found.
1005
+ df (pd.DataFrame): The DataFrame to process.
1006
+ col (str): The name of the column whose values are potentially replaced.
1007
+ replacement_rows (str): The specific value in 'col' to check for replacements.
1008
+ cols_to_merge (list of str): List of column names whose contents will be merged to form a lookup key.
1009
+ replacement_lookup_dict (dict): Dictionary where keys are merged column values and values are the new data to replace in 'col'.
1010
+ output_column_name (str, optional): Name of column outputted. Defaults to "Updated Column".
1011
+
1012
+ Returns:
1013
+ pd.DataFrame: The modified DataFrame with updated values in the specified column.
1014
+ """
1015
+ # Create a merged column from specified columns
1016
+ df["Merged"] = df[cols_to_merge].apply(lambda row: '|'.join(row.values.astype(str)), axis=1)
1017
+
1018
+ # Replace values in the specified column based on the lookup
1019
+ def replace_values(x):
1020
+ if x[col] == replacement_rows:
1021
+ merged_value = x['Merged']
1022
+ if merged_value in replacement_lookup_dict:
1023
+ return replacement_lookup_dict[merged_value]
1024
+ return x[col]
664
1025
 
1026
+ # Apply replacement logic
1027
+ df[output_column_name] = df.apply(replace_values, axis=1)
1028
+
1029
+ # Drop the intermediate 'Merged' column
1030
+ df.drop(columns=['Merged'], inplace=True)
1031
+
1032
+ return df
1033
+
1034
+ def create_new_version_of_col_using_LUT(self, df, keys_col,value_col, dict_for_specific_changes, new_col_name="New Version of Old Col"):
1035
+ """
1036
+ Creates a new column in a dataframe, which takes an old column and uses a lookup table to changes values in the new column to reflect the lookup table.
1037
+ The lookup is based on a column in the dataframe. Can only input one column and output one new column.
1038
+
1039
+ Parameters:
1040
+ df (pandas.DataFrame): The DataFrame containing the data.
1041
+ keys_col (str): The name of the column which the LUT will be refercing to ouput a value.
1042
+ value_col (str): The name of the column which the new column will be based off. If a key in the key column is not found in the LUT, the values from this column are used instead.
1043
+ dict_for_specific_changes (dict): The LUT which the keys_col will be mapped on to find any values that need changing in the new column.
1044
+ new_col_name (str, optional): This is the name of the new column being generated. Defaults to "New Version of Old Col".
1045
+
665
1046
  Returns:
666
- - DataFrame with a new column '<col_name>_mapping' containing the classification results.
1047
+ pandas.DataFrame: DataFrame with a new column which is similar to the old column, except for where changes have been made to reflect the lookup table.
667
1048
  """
668
- # Define the inner function for classification
669
- def find_word_in_string(x_string, to_find_dict=to_find_dict, default_value=if_not_in_country_dict):
670
- x_string_lower = x_string.lower()
671
- for key, value in to_find_dict.items():
672
- if key.lower() in x_string_lower:
673
- return value
674
- return default_value
1049
+
1050
+ # Extract columns to change using new dictionary
1051
+ smaller_df = df[[keys_col,value_col]]
1052
+
1053
+ # Use the new dictionary to create a new LUT
1054
+ smaller_df_with_LUT = self.apply_lookup_table_for_columns(smaller_df,[keys_col,value_col],dict_for_specific_changes)
1055
+
1056
+ # In a new column, keep values from the old column that don't need updating as they are not in the dictionary, and replace values that do need updating with values from the dictionary based on the keys
1057
+ smaller_df_with_LUT["Updated Col"]=smaller_df_with_LUT.apply(lambda x: x['Mapping'] if x['Mapping'] != "Other" else x[value_col],axis=1)
1058
+
1059
+ # Drop the extra unecessary cols
1060
+ smaller_df_with_LUT.drop([keys_col,'Mapping'],axis=1,inplace=True)
1061
+
1062
+ # # Output dataframes as dictionary to be used in a LUT
1063
+ new_dict = self.convert_2_df_cols_to_dict(smaller_df_with_LUT,value_col,"Updated Col")
675
1064
 
676
- # Apply the inner function to the specified column and create a new column with the results
677
- df[col_name + '_mapping'] = df[col_name].apply(find_word_in_string)
1065
+ # # Use new dictionary to create a new version of an old column
1066
+ df_final = self.apply_lookup_table_for_columns(df,[keys_col],new_dict,"other",new_col_name)
678
1067
 
679
- return df
1068
+ return df_final
1069
+
1070
+ def convert_df_wide_2_long(self, df, value_cols, variable_col_name='Stacked', value_col_name='Value'):
1071
+ """
1072
+ Changes a dataframe from wide to long format.
1073
+
1074
+ Args:
1075
+ df (pandas.DataFrame): The DataFrame containing the data.
1076
+ value_cols (list of str or str if only one): List of column names to transform from several columns into one.
1077
+ variable_col_name (str, optional): Name of the new variable column containing the original column names. Defaults to 'Stacked'.
1078
+ value_col_name (str, optional): Name of the new value column containing the data from stacked columns. Defaults to 'Value'.
1079
+
1080
+ Returns:
1081
+ pandas.DataFrame: DataFrame transformed from wide to long format.
1082
+
1083
+ Raises:
1084
+ ValueError: If the number of columns to depivot is less than 2.
1085
+ """
1086
+ # Check length of value_cols is greater than 1
1087
+ if len(value_cols) < 2:
1088
+ raise ValueError("Number of inputs in list must be greater than 1")
1089
+
1090
+ # Find the columns that are not to be depivoted into one column
1091
+ id_vars = [col for col in df.columns if col not in value_cols] # Preserve column order in the DataFrame
1092
+
1093
+ # Melt all columns chosen into one column
1094
+ df_final = pd.melt(df, id_vars=id_vars, value_vars=value_cols, var_name=variable_col_name, value_name=value_col_name)
1095
+
1096
+ # Sort column order to match expected output
1097
+ ordered_columns = id_vars + [variable_col_name, value_col_name]
1098
+ df_final = df_final[ordered_columns]
1099
+
1100
+ return df_final
1101
+
1102
+ def manually_edit_data(self, df, filters_dict, col_to_change, new_value, change_in_existing_df_col="No", new_col_to_change_name='New', manual_edit_col_name=None, add_notes="No", existing_note_col_name=None, note=None):
1103
+ """
1104
+ Allows you to manually update any cell in a dataframe by applying filters and choosing a column to edit
1105
+
1106
+ Args:
1107
+ df (pandas.DataFrame): The DataFrame containing the data.
1108
+ filters_dict (dict): Dictionary of conditions for the boolean mask i.e. what to filter your df on to get to your chosen cell
1109
+ col_to_change (str): String name of column to edit
1110
+ new_value (any): Value of new input for cell
1111
+ change_in_existing_df_col (str, optional): Input of Yes or No to describe whether to make the change in an existing column. Defaults to "No".
1112
+ new_col_to_change_name (str, optional): Name of the new column to copy the column being edited into and to make the change in. Defaults to 'New'.
1113
+ manual_edit_col_name (str, optional): Name of the current manual edits column, if one is not specified it will be created. Defaults to None.
1114
+ add_notes (str, optional): Gives the option to create a new notes column. Defaults to "No".
1115
+ existing_note_col_name (str, optional): If there is an existing notes column this can be specified. Defaults to None.
1116
+ note (str, optional): The string of the note to be added to the column. Defaults to None.
1117
+
1118
+ Raises:
1119
+ TypeError: The column for the column to change can only be specified as one column as it is a string not a list
1120
+ ValueError: You can only input the values of "Yes" or "No" for whether to make the change in existing column
1121
+ ValueError: You can only input the values of "Yes" or "No" for whether to make a new notes column
1122
+
1123
+ Returns:
1124
+ pandas.DataFrame: Dataframe with manual changes added
1125
+ """
1126
+
1127
+ # Raise type error if more than one col is supported
1128
+ if isinstance(col_to_change, list):
1129
+ raise TypeError("Col to change must be specified as a string, not a list")
1130
+
1131
+ # Raises value error if input is invalid for change_in_existing_df_col
1132
+ if change_in_existing_df_col not in ["Yes", "No"]:
1133
+ raise ValueError("Invalid input value for change_in_existing_df_col. Allowed values are: ['Yes', 'No']")
1134
+
1135
+ # Raises value error if input is invalid for add_notes_col
1136
+ if add_notes not in ["Yes", "No"]:
1137
+ raise ValueError("Invalid input value for add_notes. Allowed values are: ['Yes', 'No']")
1138
+
1139
+ # Validate filters_dict format
1140
+ for col, cond in filters_dict.items():
1141
+ if not isinstance(cond, str) or len(cond.split(maxsplit=1)) < 2:
1142
+ raise ValueError(f"Invalid filter condition for column '{col}': '{cond}'. Expected format: 'operator value'")
1143
+
1144
+ # Create the filtered df by applying the conditions
1145
+ df_filtered = self.filter_df_on_multiple_conditions(df, filters_dict)
1146
+
1147
+ # Create a new column to add the changes if desired, else edit in the current chosen column
1148
+ col_to_update = col_to_change if change_in_existing_df_col == "Yes" else new_col_to_change_name
1149
+ if change_in_existing_df_col == "No" and new_col_to_change_name not in df.columns:
1150
+ df = df.copy()
1151
+ df[new_col_to_change_name] = df[col_to_change]
1152
+
1153
+ # Update the new cell in the chosen column
1154
+ df.loc[df_filtered.index, col_to_update] = new_value
1155
+
1156
+ # Add in manual edit column if desired or specify where one already is
1157
+ if manual_edit_col_name:
1158
+ if manual_edit_col_name not in df.columns:
1159
+ df[manual_edit_col_name] = 0
1160
+ df.loc[df_filtered.index, manual_edit_col_name] = 1
1161
+ elif not manual_edit_col_name and 'Manual Changes' not in df.columns:
1162
+ df['Manual Changes'] = 0
1163
+ df.loc[df_filtered.index, 'Manual Changes'] = 1
1164
+
1165
+ # Add note if desired in new column or an existing column
1166
+ if add_notes == "Yes":
1167
+ note_col = existing_note_col_name if existing_note_col_name else 'Notes'
1168
+ if note_col not in df.columns:
1169
+ df[note_col] = None
1170
+ df.loc[df_filtered.index, note_col] = note
1171
+
1172
+ return df
1173
+
1174
+ def format_numbers_with_commas(self, df, decimal_length_chosen=2):
1175
+ """
1176
+ Converts data in numerical format into numbers with commas and a chosen decimal place length.
1177
+
1178
+ Args:
1179
+ df (pandas.DataFrame): The DataFrame containing the data.
1180
+ decimal_length_chosen (int, optional): Number of decimal places. Defaults to 2.
1181
+
1182
+ Returns:
1183
+ pandas.DataFrame: The DataFrame with the chosen updated format.
1184
+ """
1185
+ def format_number_with_commas(x, decimal_length=decimal_length_chosen):
1186
+ if pd.isna(x): # Preserve None/NaN values
1187
+ return pd.NA # Explicitly normalize to pd.NA
1188
+ elif isinstance(x, (int, float)):
1189
+ if decimal_length is not None:
1190
+ format_str = f"{{:,.{decimal_length}f}}"
1191
+ return format_str.format(x)
1192
+ else:
1193
+ return f"{x:,}"
1194
+ else:
1195
+ return x # Return unchanged if not a number
1196
+
1197
+ # Apply formatting column by column
1198
+ formatted_df = df.apply(lambda col: col.map(format_number_with_commas)).fillna(value=pd.NA)
1199
+
1200
+ return formatted_df
1201
+
1202
+ def filter_df_on_multiple_conditions(self, df, filters_dict):
1203
+ """
1204
+ Filter a dataframe based on multiple conditions
1205
+
1206
+ Args:
1207
+ df (pandas.DataFrame): Dataframe to filter on
1208
+ filters_dict (dict): Dictionary with strings as conditions
1209
+
1210
+ Returns:
1211
+ pandas.DataFrame: The filtered DataFrame
1212
+ """
1213
+ mask = pd.Series(True, index=df.index)
1214
+ for col, cond in filters_dict.items():
1215
+ cond = cond.strip()
1216
+ operator, value = cond.split(maxsplit=1)
1217
+
1218
+ # If value is a string condition make sure to check if there are new lines
1219
+ if "'" in value:
1220
+ value = value.strip().strip("'\"")
1221
+ # If not a string e.g. datetime or number condition you need to transform the string into a value
1222
+ else:
1223
+ value = eval(value)
1224
+
1225
+ if operator == "==":
1226
+ temp_mask = (df[col] == value)
1227
+ elif operator == "!=":
1228
+ temp_mask = (df[col] != value)
1229
+ elif operator == ">=":
1230
+ temp_mask = (df[col] >= value)
1231
+ elif operator == "<=":
1232
+ temp_mask = (df[col] <= value)
1233
+ elif operator == ">":
1234
+ temp_mask = (df[col] > value)
1235
+ elif operator == "<":
1236
+ temp_mask = (df[col] < value)
1237
+ mask &= temp_mask
1238
+
1239
+ # Create the filtered df by applying the conditions
1240
+ df_filtered = df[mask]
1241
+
1242
+ return df_filtered
1243
+
1244
+ def read_and_concatenate_files(self, folder_path, file_type='csv'):
1245
+ """
1246
+ Reads all files of a specified type (CSV or XLSX) from a given folder
1247
+ and concatenates them into a single DataFrame.
1248
+
1249
+ Parameters:
1250
+ folder_path (str): The path to the folder containing the files.
1251
+ file_type (str): The type of files to read ('csv' or 'xlsx'). Defaults to 'csv'.
1252
+
1253
+ Returns:
1254
+ pd.DataFrame: A DataFrame containing the concatenated data from all files.
1255
+ """
1256
+
1257
+ # Initialize an empty list to hold dataframes
1258
+ dataframes = []
1259
+
1260
+ # Define file extension based on file_type
1261
+ if file_type == 'csv':
1262
+ extension = '.csv'
1263
+ elif file_type == 'xlsx':
1264
+ extension = '.xlsx'
1265
+ else:
1266
+ raise ValueError("file_type must be either 'csv' or 'xlsx'")
1267
+
1268
+ # Loop through all files in the folder
1269
+ for filename in os.listdir(folder_path):
1270
+ # Check if the file has the correct extension
1271
+ if filename.endswith(extension):
1272
+ file_path = os.path.join(folder_path, filename)
1273
+ # Read the file into a DataFrame
1274
+ if file_type == 'csv':
1275
+ df = pd.read_csv(file_path)
1276
+ elif file_type == 'xlsx':
1277
+ df = pd.read_excel(file_path)
1278
+ # Append the DataFrame to the list
1279
+ dataframes.append(df)
1280
+
1281
+ # Concatenate all DataFrames into a single DataFrame
1282
+ combined_df = pd.concat(dataframes, ignore_index=True)
1283
+
1284
+ return combined_df
1285
+
1286
+ def remove_zero_values(self, data_frame, column_to_filter):
1287
+ """
1288
+ Removes zero values from given columns
1289
+
1290
+ Parameters:
1291
+ df - input data frame
1292
+ column_to_filter - a column to filter out zero values from
1293
+
1294
+ Returns:
1295
+ Pandas data frame without null values
1296
+ """
1297
+
1298
+ #This line removes zero values from given column
1299
+ return data_frame.loc[~(data_frame[column_to_filter] ==0)]
1300
+
1301
+ def upgrade_outdated_packages(self):
1302
+ try:
1303
+ # Get all installed packages
1304
+ installed_packages_result = subprocess.run("pip list --format=json", shell=True, capture_output=True, text=True)
1305
+ installed_packages = json.loads(installed_packages_result.stdout)
1306
+
1307
+ # Get the list of outdated packages
1308
+ outdated_packages_result = subprocess.run("pip list --outdated --format=json", shell=True, capture_output=True, text=True)
1309
+ outdated_packages = json.loads(outdated_packages_result.stdout)
1310
+
1311
+ # Create a set of outdated package names for quick lookup
1312
+ outdated_package_names = {pkg['name'] for pkg in outdated_packages}
1313
+
1314
+ # Upgrade only outdated packages
1315
+ for package in installed_packages:
1316
+ package_name = package['name']
1317
+ if package_name in outdated_package_names:
1318
+ try:
1319
+ print(f"Upgrading package: {package_name}")
1320
+ upgrade_result = subprocess.run(f"pip install --upgrade {package_name}", shell=True, capture_output=True, text=True)
1321
+ if upgrade_result.returncode == 0:
1322
+ print(f"Successfully upgraded {package_name}")
1323
+ else:
1324
+ print(f"Failed to upgrade {package_name}: {upgrade_result.stderr}")
1325
+ except Exception as e:
1326
+ print(f"An error occurred while upgrading {package_name}: {e}")
1327
+ else:
1328
+ print(f"{package_name} is already up to date")
1329
+ except Exception as e:
1330
+ print(f"An error occurred during the upgrade process: {e}")
1331
+
1332
+ def convert_mixed_formats_dates(self, df, column_name):
1333
+ # Convert initial dates to datetime with coercion to handle errors
1334
+ df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
1335
+ df[column_name] = df[column_name].astype(str)
1336
+ corrected_dates = []
1337
+
1338
+ for date_str in df[column_name]:
1339
+ date_str = date_str.replace('-', '').replace('/', '')
1340
+ if len(date_str) == 8:
1341
+ year = date_str[:4]
1342
+ month = date_str[4:6]
1343
+ day = date_str[6:8]
1344
+ if int(day) <= 12:
1345
+ # Swap month and day
1346
+ corrected_date_str = f"{year}-{day}-{month}"
1347
+ else:
1348
+ corrected_date_str = f"{year}-{month}-{day}"
1349
+ # Convert to datetime
1350
+ corrected_date = pd.to_datetime(corrected_date_str, errors='coerce')
1351
+ else:
1352
+ corrected_date = pd.to_datetime(date_str, errors='coerce')
1353
+
1354
+ corrected_dates.append(corrected_date)
1355
+
1356
+ # Check length of the corrected_dates list
1357
+ if len(corrected_dates) != len(df):
1358
+ raise ValueError("Length of corrected_dates does not match the original DataFrame")
1359
+
1360
+ # Assign the corrected dates back to the DataFrame
1361
+ df[column_name] = corrected_dates
1362
+ return df
1363
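# --- Illustrative usage sketch (not part of the package source) ---
# The heuristic above swaps month and day whenever the day part is 12 or less,
# so an unambiguous date like '2024-03-25' is kept while '2024-03-05' (already
# parsed as 5 March) is re-read as 3 May. Assuming the dataprocessing class
# defined in this module:
import pandas as pd

dp = dataprocessing()
dates = pd.DataFrame({"date": ["2024-03-05", "2024-03-25"]})
dates = dp.convert_mixed_formats_dates(dates, "date")
# dates['date'] -> [Timestamp('2024-05-03'), Timestamp('2024-03-25')]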
+
1364
+ def fill_weekly_date_range(self, df, date_column, freq='W-MON'):
1365
+ # Ensure the date column is in datetime format
1366
+ df[date_column] = pd.to_datetime(df[date_column])
1367
+
1368
+ # Generate the full date range with the specified frequency
1369
+ full_date_range = pd.date_range(start=df[date_column].min(), end=df[date_column].max(), freq=freq)
1370
+
1371
+ # Create a new dataframe with the full date range
1372
+ full_date_df = pd.DataFrame({date_column: full_date_range})
1373
+
1374
+ # Merge the original dataframe with the new full date range dataframe
1375
+ df_full = full_date_df.merge(df, on=date_column, how='left')
1376
+
1377
+ # Fill missing values with 0
1378
+ df_full.fillna(0, inplace=True)
1379
+
1380
+ return df_full
1381
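# --- Illustrative usage sketch (not part of the package source) ---
# fill_weekly_date_range pads missing weeks and zero-fills their values:
import pandas as pd

dp = dataprocessing()  # class defined in this module
weekly = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-15"], "spend": [100, 50]})
full = dp.fill_weekly_date_range(weekly, "OBS", freq="W-MON")
# adds a 2024-01-08 row with spend 0.0 between the two observed weeks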
+
1382
+ def add_prefix_and_suffix(self, df, prefix='', suffix='', date_col=None):
1383
+ """
1384
+ Adds a specified prefix and/or suffix to the column names of a DataFrame. Optionally, a column (e.g., a date column) can be excluded.
1385
+
1386
+ Args:
1387
+ df (pd.DataFrame): The DataFrame whose column names will be modified.
1388
+ prefix (str, optional): The prefix to add to each column name. Default is an empty string.
1389
+ suffix (str, optional): The suffix to add to each column name. Default is an empty string.
1390
+ date_col (str, optional): The name of the column to exclude from adding prefix and suffix, typically a date column. Default is None.
1391
+
1392
+ Returns:
1393
+ pd.DataFrame: The DataFrame with updated column names.
1394
+ """
1395
+
1396
+ # If there is no date column
1397
+ if date_col is None:
1398
+ # Add prefixes and suffixes to all columns
1399
+ df.columns = [prefix + col + suffix for col in df.columns]
1400
+ else:
1401
+ # Add prefixes and suffixes to all columns except the date column
1402
+ df.columns = [prefix + col + suffix if col != date_col else col for col in df.columns]
1403
+
1404
+ return df
1405
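# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
spend = pd.DataFrame({"OBS": ["2024-01-01"], "facebook": [10], "tv": [5]})
spend = dp.add_prefix_and_suffix(spend, prefix="media_", suffix="_spend", date_col="OBS")
# columns -> ['OBS', 'media_facebook_spend', 'media_tv_spend']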
+
1406
+ def create_dummies(self, df, date_col=None, dummy_threshold=0, add_total_dummy_col='No', total_col_name='total'):
1407
+ """
1408
+ Creates dummy variables for the DataFrame, converting values greater than the threshold to 1 and others to 0.
1409
+ Optionally adds a total dummy column indicating whether any row contains at least one value greater than the threshold.
1410
+
1411
+ Args:
1412
+ df (pd.DataFrame): The DataFrame to process.
1413
+ date_col (str, optional): The column name to exclude from the dummy conversion, typically a date column. Default is None.
1414
+ dummy_threshold (int, optional): The threshold value; values greater than this become 1, others become 0. Default is 0.
1415
+ add_total_dummy_col (str, optional): If set to any value other than 'No', adds a column that contains the max value (1 or 0) for each row. Default is 'No'.
1416
+ total_col_name (str, optional): The name of the total column to add if add_total_dummy_col is not 'No'. Default is 'total'.
1417
+
1418
+ Returns:
1419
+ pd.DataFrame: The modified DataFrame with dummies applied and optional total column.
1420
+ """
1421
+
1422
+ # If there is no date column
1423
+ if date_col is None:
1424
+ df = df.apply(lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0))
1425
+
1426
+ if add_total_dummy_col != 'No':
1427
+ # Find max value of rows
1428
+ df[total_col_name] = df.max(axis=1)
1429
+
1430
+ # If there is a date column
1431
+ else:
1432
+ # Create dummies for all columns except the date column
1433
+ df.loc[:, df.columns != date_col] = df.loc[:, df.columns != date_col].apply(
1434
+ lambda col: col.map(lambda x: 1 if x > dummy_threshold else 0)
1435
+ )
1436
+
1437
+ if add_total_dummy_col != 'No':
1438
+ # Find max value of rows
1439
+ df[total_col_name] = df.loc[:, df.columns != date_col].max(axis=1)
1440
+
1441
+ return df
1442
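# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
promo = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-08"],
                      "promo_a": [0, 3], "promo_b": [2, 0]})
promo = dp.create_dummies(promo, date_col="OBS", dummy_threshold=0,
                          add_total_dummy_col="Yes", total_col_name="promo_any")
# promo_a -> [0, 1], promo_b -> [1, 0], promo_any -> [1, 1]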
+
1443
+ def replace_substrings(self, df, column, replacements, to_lower=False, new_column=None):
1444
+ """
1445
+ Replaces substrings in a column of a DataFrame based on a dictionary of replacements.
1446
+ Optionally converts the column values to lowercase and allows creating a new column or modifying the existing one.
1447
+
1448
+ Args:
1449
+ df (pd.DataFrame): The DataFrame containing the column to modify.
1450
+ column (str): The column name where the replacements will be made.
1451
+ replacements (dict): A dictionary where keys are substrings to replace and values are the replacement strings.
1452
+ to_lower (bool, optional): If True, the column values will be converted to lowercase before applying replacements. Default is False.
1453
+ new_column (str, optional): If provided, the replacements will be applied to this new column. If None, the existing column will be modified. Default is None.
1454
+
1455
+ Returns:
1456
+ pd.DataFrame: The DataFrame with the specified replacements made, and optionally with lowercase strings.
1457
+ """
1458
+ if new_column is not None:
1459
+ # Create a new column for replacements
1460
+ df[new_column] = df[column]
1461
+ temp_column = new_column
1462
+ else:
1463
+ # Modify the existing column
1464
+ temp_column = column
1465
+
1466
+ # Optionally convert to lowercase
1467
+ if to_lower:
1468
+ df[temp_column] = df[temp_column].str.lower()
1469
+
1470
+ # Apply substring replacements
1471
+ for old, new in replacements.items():
1472
+ df[temp_column] = df[temp_column].str.replace(old, new, regex=False)
1473
+
1474
+ return df
1475
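# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
camps = pd.DataFrame({"campaign": ["FB_Prospecting", "YT_Retargeting"]})
camps = dp.replace_substrings(camps, "campaign", {"fb_": "facebook ", "yt_": "youtube "},
                              to_lower=True, new_column="channel")
# camps['channel'] -> ['facebook prospecting', 'youtube retargeting']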
+
1476
+ def add_total_column(self, df, exclude_col=None, total_col_name='Total'):
1477
+ """
1478
+ Adds a total column to a DataFrame by summing across all columns. Optionally excludes a specified column.
1479
+
1480
+ Args:
1481
+ df (pd.DataFrame): The DataFrame to modify.
1482
+ exclude_col (str, optional): The column name to exclude from the sum. Default is None.
1483
+ total_col_name (str, optional): The name of the new total column. Default is 'Total'.
1484
+
1485
+ Returns:
1486
+ pd.DataFrame: The DataFrame with an added total column.
1487
+ """
1488
+ if exclude_col and exclude_col in df.columns:
1489
+ # Ensure the column to exclude exists before dropping
1490
+ df[total_col_name] = df.drop(columns=[exclude_col], errors='ignore').sum(axis=1)
1491
+ else:
1492
+ # Sum across all columns if no column is specified to exclude
1493
+ df[total_col_name] = df.sum(axis=1)
1494
+
1495
+ return df
1496
+
1497
+ def apply_lookup_table_based_on_substring(self, df, column_name, category_dict, new_col_name='Category', other_label='Other'):
1498
+ """
1499
+ Categorizes text in a specified DataFrame column by applying a lookup table based on substrings.
1500
+
1501
+ Args:
1502
+ df (pd.DataFrame): The DataFrame containing the column to categorize.
1503
+ column_name (str): The name of the column in the DataFrame that contains the text data to categorize.
1504
+ category_dict (dict): A dictionary where keys are substrings to search for in the text and values are the categories to assign when a substring is found.
1505
+ new_col_name (str, optional): The name of the new column to be created in the DataFrame, which will hold the resulting categories. Default is 'Category'.
1506
+ other_label (str, optional): The category assigned when no substring from the dictionary is found in the cell. Default is 'Other'.
1507
+
1508
+ Returns:
1509
+ pd.DataFrame: The original DataFrame with an additional column containing the assigned categories.
1510
+ """
1511
+
1512
+ def categorize_text(text):
1513
+ """
1514
+ Assigns a category to a single text string based on the presence of substrings from a dictionary.
1515
+
1516
+ Args:
1517
+ text (str): The text string to categorize.
1518
+
1519
+ Returns:
1520
+ str: The category assigned based on the first matching substring found in the text. If no
1521
+ matching substring is found, returns other_label.
1522
+ """
1523
+ for key, category in category_dict.items():
1524
+ if key.lower() in text.lower(): # Check if the substring is in the text (case-insensitive)
1525
+ return category
1526
+ return other_label # Default category if no match is found
1527
+
1528
+ # Apply the categorize_text function to each element in the specified column
1529
+ df[new_col_name] = df[column_name].apply(categorize_text)
1530
+ return df
1531
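# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
lines = pd.DataFrame({"campaign": ["Facebook_Prospecting", "TV_Q1_Burst", "Direct mail"]})
lines = dp.apply_lookup_table_based_on_substring(
    lines, "campaign", {"facebook": "Social", "tv": "AV"}, new_col_name="channel_group")
# lines['channel_group'] -> ['Social', 'AV', 'Other']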
+
1532
+ def compare_overlap(self, df1, df2, date_col):
1533
+ """
1534
+ Compare overlapping periods between two DataFrames and provide a summary of total differences.
1535
+
1536
+ Args:
1537
+ df1 (pandas.DataFrame): First DataFrame containing date-based data.
1538
+ df2 (pandas.DataFrame): Second DataFrame containing date-based data.
1539
+ date_col (str): The name of the date column used for aligning data.
1540
+
1541
+ Returns:
1542
+ tuple: A tuple containing the DataFrame of differences and a summary DataFrame with total differences by column.
1543
+ """
1544
+ # Ensure date columns are in datetime format
1545
+ df1[date_col] = pd.to_datetime(df1[date_col])
1546
+ df2[date_col] = pd.to_datetime(df2[date_col])
1547
+
1548
+ # Determine the overlap period
1549
+ start_date = max(df1[date_col].min(), df2[date_col].min())
1550
+ end_date = min(df1[date_col].max(), df2[date_col].max())
1551
+
1552
+ # Filter DataFrames to the overlapping period
1553
+ df1_overlap = df1[(df1[date_col] >= start_date) & (df1[date_col] <= end_date)]
1554
+ df2_overlap = df2[(df2[date_col] >= start_date) & (df2[date_col] <= end_date)]
1555
+
1556
+ # Merge the DataFrames on the date column
1557
+ merged_df = pd.merge(df1_overlap, df2_overlap, on=date_col, suffixes=('_df1', '_df2'))
1558
+
1559
+ # Get common columns, excluding the date column
1560
+ common_cols = [col for col in df1.columns if col != date_col and col in df2.columns]
1561
+
1562
+ # Create a DataFrame for differences
1563
+ diff_df = pd.DataFrame({date_col: merged_df[date_col]})
1564
+
1565
+ total_diff_list = []
1566
+ for col in common_cols:
1567
+ diff_col = f'diff_{col}'
1568
+ diff_df[diff_col] = merged_df[f'{col}_df1'] - merged_df[f'{col}_df2'] # Corrected subtraction order
1569
+
1570
+ # Sum differences for the column
1571
+ total_diff = diff_df[diff_col].sum()
1572
+ total_diff_list.append({'Column': col, 'Total Difference': total_diff})
1573
+
1574
+ # Create summary DataFrame
1575
+ total_diff_df = pd.DataFrame(total_diff_list)
1576
+
1577
+ return diff_df, total_diff_df
1578
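# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
old = pd.DataFrame({"OBS": ["2024-01-01", "2024-01-08"], "spend": [100, 200]})
new = pd.DataFrame({"OBS": ["2024-01-08", "2024-01-15"], "spend": [180, 250]})
diffs, summary = dp.compare_overlap(old, new, "OBS")
# only 2024-01-08 overlaps, so diff_spend = 200 - 180 and the summary
# reports a total difference of 20 for 'spend'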
+
1579
+ def week_commencing_2_week_commencing_conversion_isoweekday(self, df, date_col, week_commencing='mon'):
1580
+ """
1581
+ Convert a DataFrame's date column so that each date is mapped back
1582
+ to the 'week_commencing' day of the *current ISO week*.
1583
+
1584
+ Args:
1585
+ df (pandas.DataFrame): The DataFrame with date-based data.
1586
+ date_col (str): The name of the date column.
1587
+ week_commencing (str): The desired start of the week.
1588
+ ('mon'=Monday, 'tue'=Tuesday, ..., 'sun'=Sunday); note the Thursday key is 'thur'.
1589
+ Uses ISO day numbering (Mon=1, ..., Sun=7).
1590
+
1591
+ Returns:
1592
+ pandas.DataFrame: Original DataFrame with an extra column
1593
+ 'week_start_<week_commencing>' containing the
1594
+ start-of-week date for each row.
1595
+ """
1596
+ # ISO-based dictionary: Monday=1, Tuesday=2, ..., Sunday=7
1597
+ iso_day_dict = {"mon": 1, "tue": 2, "wed": 3, "thur": 4, "fri": 5, "sat": 6, "sun": 7}
1598
+
1599
+ target_day = iso_day_dict[week_commencing]
1600
+
1601
+ def map_to_week_start(date_val):
1602
+ delta = (date_val.isoweekday() - target_day) % 7
1603
+ return date_val - pd.Timedelta(days=delta)
1604
+
1605
+ # Apply the transformation
1606
+ new_col = f"week_start_{week_commencing}"
1607
+ df[new_col] = df[date_col].apply(map_to_week_start)
1608
+
1609
+ return df
1610
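# --- Illustrative worked example (not part of the package source) ---
# With week_commencing='mon' the target ISO day is 1, so a Thursday
# (isoweekday 4) moves back (4 - 1) % 7 = 3 days to that week's Monday:
import pandas as pd

dp = dataprocessing()  # class defined in this module
obs = pd.DataFrame({"date": pd.to_datetime(["2024-01-04"])})  # a Thursday
obs = dp.week_commencing_2_week_commencing_conversion_isoweekday(obs, "date", week_commencing="mon")
# obs['week_start_mon'] -> [Timestamp('2024-01-01')]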
+
1611
+ def plot_chart(self, df, date_col, value_cols, chart_type='line', title='Chart', x_title='Date', y_title='Values', **kwargs):
1612
+ """
1613
+ Plot various types of charts using Plotly.
1614
+
1615
+ Args:
1616
+ df (pandas.DataFrame): DataFrame containing the data.
1617
+ date_col (str): The name of the column with date information.
1618
+ value_cols (list): List of columns to plot.
1619
+ chart_type (str): Type of chart to plot ('line', 'bar', 'scatter', 'histogram', 'pie', 'box', 'heatmap', 'area', 'bubble', 'funnel', 'waterfall', 'contour', 'scatter3d').
1620
+ title (str): Title of the chart.
1621
+ x_title (str): Title of the x-axis.
1622
+ y_title (str): Title of the y-axis.
1623
+ **kwargs: Additional keyword arguments for customization.
1624
+
1625
+ Returns:
1626
+ plotly.graph_objects.Figure: The Plotly figure object.
1627
+ """
1628
+ # Ensure the date column is in datetime format
1629
+ df[date_col] = pd.to_datetime(df[date_col])
1630
+
1631
+ # Initialize the figure
1632
+ fig = go.Figure()
1633
+
1634
+ # Make sure the date column is excluded from the value columns
1635
+ value_cols = [x for x in value_cols if x!=date_col]
1636
+
1637
+ # Add each value column to the plot based on the chart type
1638
+ for col in value_cols:
1639
+ if chart_type == 'line':
1640
+ fig.add_trace(go.Scatter(
1641
+ x=df[date_col],
1642
+ y=df[col],
1643
+ mode='lines',
1644
+ name=col,
1645
+ **kwargs
1646
+ ))
1647
+ elif chart_type == 'bar':
1648
+ fig.add_trace(go.Bar(
1649
+ x=df[date_col],
1650
+ y=df[col],
1651
+ name=col,
1652
+ **kwargs
1653
+ ))
1654
+ elif chart_type == 'scatter':
1655
+ fig.add_trace(go.Scatter(
1656
+ x=df[date_col],
1657
+ y=df[col],
1658
+ mode='markers',
1659
+ name=col,
1660
+ **kwargs
1661
+ ))
1662
+ elif chart_type == 'histogram':
1663
+ fig.add_trace(go.Histogram(
1664
+ x=df[col],
1665
+ name=col,
1666
+ **kwargs
1667
+ ))
1668
+ elif chart_type == 'pie':
1669
+ fig.add_trace(go.Pie(
1670
+ labels=df[date_col], # or another column for labels
1671
+ values=df[col],
1672
+ name=col,
1673
+ **kwargs
1674
+ ))
1675
+ elif chart_type == 'box':
1676
+ fig.add_trace(go.Box(
1677
+ y=df[col],
1678
+ name=col,
1679
+ **kwargs
1680
+ ))
1681
+ elif chart_type == 'heatmap':
1682
+ fig.add_trace(go.Heatmap(
1683
+ z=df.pivot_table(index=date_col, columns=value_cols[0], values=value_cols[1]),
1684
+ x=df[value_cols[0]],
1685
+ y=df[date_col],
1686
+ **kwargs
1687
+ ))
1688
+ elif chart_type == 'area':
1689
+ fig.add_trace(go.Scatter(
1690
+ x=df[date_col],
1691
+ y=df[col],
1692
+ mode='lines', # Use 'lines+markers' if you want markers
1693
+ fill='tozeroy', # Fill the area under the line
1694
+ name=col,
1695
+ **kwargs
1696
+ ))
1697
+ elif chart_type == 'bubble':
1698
+ fig.add_trace(go.Scatter(
1699
+ x=df[value_cols[0]],
1700
+ y=df[value_cols[1]],
1701
+ mode='markers',
1702
+ marker=dict(size=df[value_cols[2]]),
1703
+ name='Bubble Chart',
1704
+ **kwargs
1705
+ ))
1706
+ elif chart_type == 'funnel':
1707
+ fig.add_trace(go.Funnel(
1708
+ y=df[date_col],
1709
+ x=df[col],
1710
+ **kwargs
1711
+ ))
1712
+ elif chart_type == 'waterfall':
1713
+ fig.add_trace(go.Waterfall(
1714
+ x=df[date_col],
1715
+ y=df[col],
1716
+ measure=df[value_cols[1]], # measures like 'increase', 'decrease', 'total'
1717
+ **kwargs
1718
+ ))
1719
+ elif chart_type == 'contour':
1720
+ fig.add_trace(go.Contour(
1721
+ z=df.pivot_table(index=value_cols[0], columns=value_cols[1], values=value_cols[2]),
1722
+ x=df[value_cols[0]],
1723
+ y=df[value_cols[1]],
1724
+ **kwargs
1725
+ ))
1726
+ elif chart_type == 'scatter3d':
1727
+ fig.add_trace(go.Scatter3d(
1728
+ x=df[value_cols[0]],
1729
+ y=df[value_cols[1]],
1730
+ z=df[value_cols[2]],
1731
+ mode='markers',
1732
+ **kwargs
1733
+ ))
1734
+ else:
1735
+ raise ValueError(f"Unsupported chart type: {chart_type}")
1736
+
1737
+ # Update the layout of the figure
1738
+ fig.update_layout(
1739
+ title=title,
1740
+ xaxis_title=x_title,
1741
+ yaxis_title=y_title,
1742
+ legend_title='Series',
1743
+ template='plotly_dark'
1744
+ )
1745
+
1746
+ return fig
1747
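# --- Illustrative usage sketch (not part of the package source) ---
import pandas as pd

dp = dataprocessing()  # class defined in this module
trend = pd.DataFrame({"OBS": pd.date_range("2024-01-01", periods=8, freq="W-MON"),
                      "sales": range(8), "spend": range(8, 0, -1)})
fig = dp.plot_chart(trend, "OBS", ["sales", "spend"], chart_type="line",
                    title="Sales vs Spend", y_title="Units")
fig.show()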
+
1748
+ def plot_two_with_common_cols(self, df1, df2, date_column, same_axis=True):
1749
+ """
1750
+ Plot multiple series from two DataFrames with common columns using a specified date column for the X-axis.
1751
+
1752
+ Args:
1753
+ df1 (pandas.DataFrame): The first DataFrame containing data to plot.
1754
+ df2 (pandas.DataFrame): The second DataFrame containing data to plot.
1755
+ date_column (str): The name of the date column in the DataFrames.
1756
+ same_axis (bool, optional): Whether to plot the series on the same y-axis. Defaults to True.
1757
+
1758
+ Returns:
1759
+ list: A list of Plotly figures generated from the common columns.
1760
+ """
1761
+ # Find common columns between df1 and df2, excluding the date column
1762
+ common_columns = list(set(df1.columns).intersection(set(df2.columns)) - {date_column})
1763
+
1764
+ # Generate col_pairs list for plot_two function
1765
+ col_pairs = [(col, col) for col in common_columns]
1766
+
1767
+ # Loop through the common columns and plot each pair
1768
+ figs = []
1769
+ for col1, col2 in col_pairs:
1770
+ # Call the existing plot_two function
1771
+ fig = self.plot_two(df1, col1, df2, col2, date_column, same_axis=same_axis)
1772
+ figs.append(fig)
1773
+
1774
+ return figs
1775
+
1776
+ ########################################################################################################################################
1777
+ ########################################################################################################################################
1778
+
1779
+ ims_proc = dataprocessing()
1780
+
1781
+ class datapull:
1782
+
1783
+ def help(self):
1784
+ print("This is the help section. The functions in the package are as follows:")
1785
+
1786
+ print("\n1. pull_fred_data")
1787
+ print(" - Description: Get data from FRED by using series id tokens.")
1788
+ print(" - Usage: pull_fred_data(week_commencing, series_id_list)")
1789
+ print(" - Example: pull_fred_data('mon', ['GPDIC1'])")
1790
+
1791
+ print("\n2. pull_boe_data")
1792
+ print(" - Description: Fetch and process Bank of England interest rate data.")
1793
+ print(" - Usage: pull_boe_data(week_commencing)")
1794
+ print(" - Example: pull_boe_data('mon')")
1795
+
1796
+ print("\n3. pull_oecd")
1797
+ print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
1798
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date: '2020-01-01')")
1799
+ print(" - Example: pull_oecd('GBR', 'mon', '2000-01-01')")
1800
+
1801
+ print("\n4. get_google_mobility_data")
1802
+ print(" - Description: Fetch Google Mobility data for the specified country.")
1803
+ print(" - Usage: get_google_mobility_data(country, wc)")
1804
+ print(" - Example: get_google_mobility_data('United Kingdom', 'mon')")
1805
+
1806
+ print("\n5. pull_seasonality")
1807
+ print(" - Description: Generate combined dummy variables for seasonality, trends, and COVID lockdowns.")
1808
+ print(" - Usage: pull_seasonality(week_commencing, start_date, countries)")
1809
+ print(" - Example: pull_seasonality('mon', '2020-01-01', ['US', 'GB'])")
1810
+
1811
+ print("\n6. pull_weather")
1812
+ print(" - Description: Fetch and process historical weather data for the specified country.")
1813
+ print(" - Usage: pull_weather(week_commencing, country)")
1814
+ print(" - Example: pull_weather('mon', 'GBR')")
1815
+
1816
+ print("\n7. pull_macro_ons_uk")
1817
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
1818
+ print(" - Usage: pull_macro_ons_uk(aditional_list, week_commencing, sector)")
1819
+ print(" - Example: pull_macro_ons_uk(['HBOI'], 'mon', 'fast_food')")
1820
+
1821
+ print("\n8. pull_yfinance")
1822
+ print(" - Description: Fetch and process time series data from the Beta ONS API.")
1823
+ print(" - Usage: pull_yfinance(tickers, week_start_day)")
1824
+ print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
1825
+
1826
+ print("\n9. pull_ga")
1827
+ print(" - Description: Pull in GA4 data for geo experiments.")
1828
+ print(" - Usage: pull_ga(credentials_file, property_id, start_date, country, metrics)")
1829
+ print(" - Example: pull_ga('GeoExperiment-31c5f5db2c39.json', '111111111', '2023-10-15', 'United Kingdom', ['totalUsers', 'newUsers'])")
1830
+
1831
+ ############################################################### MACRO ##########################################################################
1832
+
1833
+ def pull_fred_data(self, week_commencing: str = 'mon', series_id_list: list[str] = ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]) -> pd.DataFrame:
1834
+ '''
1835
+ Parameters
1836
+ ----------
1837
+ week_commencing : str
1838
+ specify the day for the week commencing; the default is 'mon' (e.g., 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
1839
+
1840
+ series_id_list : list[str]
1841
+ provide a list with IDs to download data series from FRED (link: https://fred.stlouisfed.org/tags/series?t=id). Default list is
1842
+ ["GPDIC1", "Y057RX1Q020SBEA", "GCEC1"]
1843
+
1844
+ Returns
1845
+ ----------
1846
+ pd.DataFrame
1847
+ Return a data frame with FRED data according to the series IDs provided
1848
+ '''
1849
+ # Fred API
1850
+ fred = Fred(api_key='76f5f8156145fdb8fbaf66f1eb944f8a')
1851
+
1852
+ # Fetch the metadata for each series to get the full names
1853
+ series_names = {series_id: fred.get_series_info(series_id).title for series_id in series_id_list}
1854
+
1855
+ # Download data from series id list
1856
+ fred_series = {series_id: fred.get_series(series_id) for series_id in series_id_list}
1857
+
1858
+ # Data processing
1859
+ date_range = {'OBS': pd.date_range("1950-01-01", datetime.today().strftime('%Y-%m-%d'), freq='d')}
1860
+ fred_series_df = pd.DataFrame(date_range)
1861
+
1862
+ for series_id, series_data in fred_series.items():
1863
+ series_data = series_data.reset_index()
1864
+ series_data.columns = ['OBS', series_names[series_id]] # Use the series name as the column header
1865
+ fred_series_df = pd.merge_asof(fred_series_df, series_data, on='OBS', direction='backward')
1866
+
1867
+ # Handle duplicate columns
1868
+ for col in fred_series_df.columns:
1869
+ if '_x' in col:
1870
+ base_col = col.replace('_x', '')
1871
+ fred_series_df[base_col] = fred_series_df[col].combine_first(fred_series_df[base_col + '_y'])
1872
+ fred_series_df.drop([col, base_col + '_y'], axis=1, inplace=True)
1873
+
1874
+ # Ensure sum_columns are present in the DataFrame
1875
+ sum_columns = [series_names[series_id] for series_id in series_id_list if series_names[series_id] in fred_series_df.columns]
1876
+
1877
+ # Aggregate results by week
1878
+ fred_df_final = ims_proc.aggregate_daily_to_wc_wide(df=fred_series_df,
1879
+ date_column="OBS",
1880
+ group_columns=[],
1881
+ sum_columns=sum_columns,
1882
+ wc=week_commencing,
1883
+ aggregation="average")
1884
+
1885
+ # Remove anything after the instance of any ':' in the column names and rename, except for 'OBS'
1886
+ fred_df_final.columns = ['OBS' if col == 'OBS' else 'macro_' + col.lower().split(':')[0].replace(' ', '_') for col in fred_df_final.columns]
1887
+
1888
+ return fred_df_final
1889
+
1890
+ def pull_boe_data(self, week_commencing="mon", max_retries=5, delay=5):
1891
+ """
1892
+ Fetch and process Bank of England interest rate data.
1893
+
1894
+ Args:
1895
+ week_commencing (str): The starting day of the week for aggregation.
1896
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1897
+ Default is "mon".
1898
+ max_retries (int): Maximum number of retries to fetch data in case of failure. Default is 5.
1899
+ delay (int): Delay in seconds between retry attempts. Default is 5.
1900
+
1901
+ Returns:
1902
+ pd.DataFrame: A DataFrame with weekly aggregated Bank of England interest rates.
1903
+ The 'OBS' column contains the week commencing dates in 'dd/mm/yyyy' format
1904
+ and 'macro_boe_intr_rate' contains the average interest rate for the week.
1905
+ """
1906
+ # Week commencing dictionary
1907
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1908
+
1909
+ # URL of the Bank of England data page
1910
+ url = 'https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp'
1911
+
1912
+ # Retry logic for HTTP request
1913
+ for attempt in range(max_retries):
1914
+ try:
1915
+ # Set up headers to mimic a browser request
1916
+ headers = {
1917
+ "User-Agent": (
1918
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
1919
+ "Chrome/91.0.4472.124 Safari/537.36"
1920
+ )
1921
+ }
1922
+ response = requests.get(url, headers=headers)
1923
+ response.raise_for_status() # Raise an exception for HTTP errors
1924
+ break
1925
+ except requests.exceptions.RequestException as e:
1926
+ print(f"Attempt {attempt + 1} failed: {e}")
1927
+ if attempt < max_retries - 1:
1928
+ time.sleep(delay)
1929
+ else:
1930
+ raise
1931
+
1932
+ # Parse the HTML page
1933
+ soup = BeautifulSoup(response.content, "html.parser")
1934
+
1935
+ # Find the table on the page
1936
+ table = soup.find("table") # Locate the first table
1937
+ table_html = str(table) # Convert table to string
1938
+ df = pd.read_html(StringIO(table_html))[0] # Use StringIO to wrap the table HTML
1939
+
1940
+ # Rename and clean up columns
1941
+ df.rename(columns={"Date Changed": "OBS", "Rate": "macro_boe_intr_rate"}, inplace=True)
1942
+ df["OBS"] = pd.to_datetime(df["OBS"], format="%d %b %y")
1943
+ df.sort_values("OBS", inplace=True)
1944
+
1945
+ # Create a daily date range
1946
+ date_range = pd.date_range(df["OBS"].min(), datetime.today(), freq="D")
1947
+ df_daily = pd.DataFrame(date_range, columns=["OBS"])
1948
+
1949
+ # Adjust each date to the specified week commencing day
1950
+ df_daily["Week_Commencing"] = df_daily["OBS"].apply(
1951
+ lambda x: x - timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
1952
+ )
1953
+
1954
+ # Merge and forward-fill missing rates
1955
+ df_daily = df_daily.merge(df, on="OBS", how="left")
1956
+ df_daily["macro_boe_intr_rate"] = df_daily["macro_boe_intr_rate"].ffill()
1957
+
1958
+ # Group by week commencing and calculate the average rate
1959
+ df_final = df_daily.groupby("Week_Commencing")["macro_boe_intr_rate"].mean().reset_index()
1960
+ df_final["Week_Commencing"] = df_final["Week_Commencing"].dt.strftime('%d/%m/%Y')
1961
+ df_final.rename(columns={"Week_Commencing": "OBS"}, inplace=True)
1962
+
1963
+ return df_final
1964
+
1965
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "2020-01-01") -> pd.DataFrame:
1966
+ """
1967
+ Fetch and process time series data from the OECD API.
1968
+
1969
+ Args:
1970
+ country (str): A 3-letter code for the country of interest (e.g., "GBR", "FRA", "USA", "DEU")
1971
+ week_commencing (str): The starting day of the week for aggregation.
1972
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1973
+ start_date (str): Dataset start date in the format "YYYY-MM-DD"
1974
+
1975
+ Returns:
1976
+ pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
1977
+ commencing dates, and other columns contain the aggregated time series values.
1978
+ """
1979
+
1980
+ def parse_quarter(date_str):
1981
+ """Parses a string in 'YYYY-Q#' format into a datetime object."""
1982
+ year, quarter = date_str.split('-')
1983
+ quarter_number = int(quarter[1])
1984
+ month = (quarter_number - 1) * 3 + 1
1985
+ return pd.Timestamp(f"{year}-{month:02d}-01")
1986
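# Editorial note: parse_quarter maps a quarterly period label to the first day
# of that quarter, e.g. parse_quarter("2023-Q3") -> Timestamp('2023-07-01'),
# since quarter 3 starts in month (3 - 1) * 3 + 1 = 7.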
+
1987
+ # Generate a daily date range from start_date to today
1988
+ date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
1989
+
1990
+ url_details = [
1991
+ ["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
1992
+ ["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
1993
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
1994
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
1995
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
1996
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
1997
+ ["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
1998
+ ["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
1999
+ ["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
2000
+ ["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
2001
+ ["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
2002
+ ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
2003
+ ["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
2004
+ ["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
2005
+ ]
2006
+
2007
+ # Create empty final dataframe
2008
+ oecd_df_final = pd.DataFrame()
2009
+
2010
+ daily_df = pd.DataFrame({'OBS': date_range})
2011
+ value_columns = []
2012
+
2013
+ # Iterate for each variable of interest
2014
+ for series_details in url_details:
2015
+ series = series_details[0]
2016
+ dataset_id = series_details[1]
2017
+ filter = series_details[2]
2018
+ col_name = series_details[3]
2019
+
2020
+ # check if request was successful and determine the most granular data available
2021
+ for freq in ['M', 'Q', 'A']:
2022
+
2023
+ if series in ["UNE_LF_M", "EAR"]:
2024
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
2025
+ elif series in ["B1GQ"]:
2026
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
2027
+ else:
2028
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
2029
+
2030
+ # Make the request to the OECD API for data
2031
+ data_response = requests.get(data_url)
2032
+
2033
+ # Check if the request was successful
2034
+ if data_response.status_code != 200:
2035
+ print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
2036
+ url_test = False
2037
+ continue
2038
+ else:
2039
+ url_test = True
2040
+ break
2041
+
2042
+ # get data for the next variable if url doesn't exist
2043
+ if url_test is False:
2044
+ continue
2045
+
2046
+ root = ET.fromstring(data_response.content)
2047
+
2048
+ # Define namespaces if necessary (the namespace is included in the tags)
2049
+ namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
2050
+
2051
+ # Lists to store the data
2052
+ dates = []
2053
+ values = []
2054
+
2055
+ # Iterate over all <Obs> elements and extract date and value
2056
+ for obs in root.findall('.//generic:Obs', namespaces):
2057
+
2058
+ # Extracting the time period (date)
2059
+ time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
2060
+
2061
+ # Extracting the observation value
2062
+ value = obs.find('.//generic:ObsValue', namespaces).get('value')
2063
+
2064
+ # Storing the data
2065
+ if time_period and value:
2066
+ dates.append(time_period)
2067
+ values.append(float(value)) # Convert value to float
2068
+
2069
+ # Add variable names that were found to a list
2070
+ value_columns.append(col_name)
2071
+
2072
+ # Creating a DataFrame
2073
+ data = pd.DataFrame({'OBS': dates, col_name: values})
2074
+
2075
+ # Convert date strings into datetime format
2076
+ if freq == 'Q':
2077
+ data['OBS'] = data['OBS'].apply(parse_quarter)
2078
+ else:
2079
+ # Monthly and annual data use 'YYYY-MM' period labels
2080
+ data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
2081
+
2082
+ # Sort data by chronological order
2083
+ data.sort_values(by='OBS', inplace=True)
2084
+
2085
+ # Merge the data based on the observation date
2086
+ daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
2087
+
2088
+
2089
+ # Ensure columns are numeric
2090
+ for col in value_columns:
2091
+ if col in daily_df.columns:
2092
+ daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
2093
+ else:
2094
+ print(f"Column {col} not found in daily_df")
2095
+
2096
+ # Aggregate results by week
2097
+ country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
2098
+ date_column="OBS",
2099
+ group_columns=[],
2100
+ sum_columns=value_columns,
2101
+ wc=week_commencing,
2102
+ aggregation="average")
2103
+
2104
+ oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
2105
+
2106
+ return oecd_df_final
2107
+
2108
+ def get_google_mobility_data(self, country="United Kingdom", wc="mon") -> pd.DataFrame:
2109
+ """
2110
+ Fetch Google Mobility data for the specified country.
2111
+
2112
+ Parameters:
2113
+ - country (str): The name of the country for which to fetch data.
+ - wc (str): The week commencing day used for weekly aggregation (e.g., 'mon').
2114
+
2115
+ Returns:
2116
+ - pd.DataFrame: A DataFrame containing the Google Mobility data.
2117
+ """
2118
+ # URL of the Google Mobility Reports CSV file
2119
+ url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"
2120
+
2121
+ # Fetch the CSV file
2122
+ response = requests.get(url)
2123
+ if response.status_code != 200:
2124
+ raise Exception(f"Failed to fetch data: {response.status_code}")
2125
+
2126
+ # Load the CSV file into a pandas DataFrame
2127
+ csv_data = StringIO(response.text)
2128
+ df = pd.read_csv(csv_data, low_memory=False)
2129
+
2130
+ # Filter the DataFrame for the specified country
2131
+ country_df = df[df['country_region'] == country]
2132
+
2133
+ final_covid = ims_proc.aggregate_daily_to_wc_wide(country_df, "date", [], ['retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
2134
+ 'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
2135
+ 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline'], wc, "average")
2136
+
2137
+ final_covid1 = ims_proc.rename_cols(final_covid, 'covid_')
2138
+ return final_covid1
2139
+
2140
+ ############################################################### Seasonality ##########################################################################
2141
+
2142
+ def pull_seasonality(self, week_commencing, start_date, countries):
2143
+ # ---------------------------------------------------------------------
2144
+ # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
2145
+ # ---------------------------------------------------------------------
2146
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
2147
+
2148
+ # ---------------------------------------------------------------------
2149
+ # 1. Create daily date range from start_date to today
2150
+ # ---------------------------------------------------------------------
2151
+ date_range = pd.date_range(
2152
+ start=pd.to_datetime(start_date),
2153
+ end=datetime.today(),
2154
+ freq="D"
2155
+ )
2156
+ df_daily = pd.DataFrame(date_range, columns=["Date"])
2157
+
2158
+ # ---------------------------------------------------------------------
2159
+ # 1.1 Identify "week_start" for each daily row, based on week_commencing
2160
+ # ---------------------------------------------------------------------
2161
+ df_daily['week_start'] = df_daily["Date"].apply(
2162
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
2163
+ )
2164
+
2165
+ # ---------------------------------------------------------------------
2166
+ # 2. Build a weekly index (df_weekly_start) with dummy columns
2167
+ # ---------------------------------------------------------------------
2168
+ df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
2169
+ df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
2170
+
2171
+ # Set index to weekly "start of week"
2172
+ df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
2173
+ df_weekly_start.set_index("Date", inplace=True)
2174
+
2175
+ # Create individual weekly dummies
2176
+ dummy_columns = {}
2177
+ for i in range(len(df_weekly_start)):
2178
+ col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
2179
+ dummy_columns[col_name] = [0] * len(df_weekly_start)
2180
+ dummy_columns[col_name][i] = 1
2181
+
2182
+ df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
2183
+ df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
2184
+
2185
+ # ---------------------------------------------------------------------
2186
+ # 3. Public holidays (daily) from 'holidays' package + each holiday name
2187
+ # ---------------------------------------------------------------------
2188
+ for country in countries:
2189
+ country_holidays = holidays.CountryHoliday(
2190
+ country,
2191
+ years=range(int(start_date[:4]), datetime.today().year + 1)
2192
+ )
2193
+ # Daily indicator: 1 if that date is a holiday
2194
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
2195
+ lambda x: 1 if x in country_holidays else 0
2196
+ )
2197
+ # Create columns for specific holiday names
2198
+ for date_hol, name in country_holidays.items():
2199
+ col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
2200
+ if col_name not in df_daily.columns:
2201
+ df_daily[col_name] = 0
2202
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
2203
+
2204
+ # ---------------------------------------------------------------------
2205
+ # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
2206
+ # We'll add daily columns for each.
2207
+ # ---------------------------------------------------------------------
2208
+ # Initialize columns
2209
+ extra_cols = [
2210
+ "seas_valentines_day",
2211
+ "seas_halloween",
2212
+ "seas_fathers_day_us_uk",
2213
+ "seas_mothers_day_us",
2214
+ "seas_mothers_day_uk",
2215
+ "seas_good_friday",
2216
+ "seas_easter_monday",
2217
+ "seas_black_friday",
2218
+ "seas_cyber_monday",
2219
+ ]
2220
+ for c in extra_cols:
2221
+ df_daily[c] = 0 # default zero
2222
+
2223
+ # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
2224
+ # weekday: Monday=0, Tuesday=1, ... Sunday=6
2225
+ def nth_weekday_of_month(year, month, weekday, nth):
2226
+ """
2227
+ Returns date of the nth <weekday> in <month> of <year>.
2228
+ E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
2229
+ """
2230
+ # 1st day of the month
2231
+ d = datetime(year, month, 1)
2232
+ # What is the weekday of day #1?
2233
+ w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
2234
+ # If we want, e.g. Sunday=6, we see how many days to add
2235
+ delta = (weekday - w) % 7
2236
+ # This is the first <weekday> in that month
2237
+ first_weekday = d + timedelta(days=delta)
2238
+ # Now add 7*(nth-1) days
2239
+ return first_weekday + timedelta(days=7 * (nth-1))
2240
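# Editorial note: for example nth_weekday_of_month(2023, 11, 3, 4) returns
# 2023-11-23 (1 Nov 2023 is a Wednesday, so the first Thursday is 2 Nov and
# the 4th is 23 Nov, US Thanksgiving); Black Friday below is then 24 Nov.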
+
2241
+ def get_good_friday(year):
2242
+ """Good Friday is 2 days before Easter Sunday."""
2243
+ return easter(year) - timedelta(days=2)
2244
+
2245
+ def get_easter_monday(year):
2246
+ """Easter Monday is 1 day after Easter Sunday."""
2247
+ return easter(year) + timedelta(days=1)
2248
+
2249
+ def get_black_friday(year):
2250
+ """
2251
+ Black Friday = day after US Thanksgiving,
2252
+ and US Thanksgiving is the 4th Thursday in November.
2253
+ """
2254
+ # 4th Thursday in November
2255
+ fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
2256
+ return fourth_thursday + timedelta(days=1)
2257
+
2258
+ def get_cyber_monday(year):
2259
+ """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
2260
+ # 4th Thursday in November
2261
+ fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
2262
+ return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
2263
+
2264
+ # Loop over each year in range
2265
+ start_yr = int(start_date[:4])
2266
+ end_yr = datetime.today().year
2267
+
2268
+ for yr in range(start_yr, end_yr + 1):
2269
+ # Valentines = Feb 14
2270
+ valentines_day = datetime(yr, 2, 14)
2271
+ # Halloween = Oct 31
2272
+ halloween_day = datetime(yr, 10, 31)
2273
+ # Father's Day (US & UK) = 3rd Sunday in June
2274
+ fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
2275
+ # Mother's Day US = 2nd Sunday in May
2276
+ mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
2277
+ # Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
2278
+ # Mothering Sunday is three weeks before Easter Sunday, so Easter - 21 days
+ # should itself be a Sunday; the weekday check below guards the edge case.
2286
+ mothering_sunday = easter(yr) - timedelta(days=21)
2287
+ # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
2288
+ while mothering_sunday.weekday() != 6: # Sunday=6
2289
+ mothering_sunday -= timedelta(days=1)
2290
+
2291
+ # Good Friday, Easter Monday
2292
+ gf = get_good_friday(yr)
2293
+ em = get_easter_monday(yr)
2294
+
2295
+ # Black Friday, Cyber Monday
2296
+ bf = get_black_friday(yr)
2297
+ cm = get_cyber_monday(yr)
2298
+
2299
+ # Mark them in df_daily if in range
2300
+ for special_date, col in [
2301
+ (valentines_day, "seas_valentines_day"),
2302
+ (halloween_day, "seas_halloween"),
2303
+ (fathers_day, "seas_fathers_day_us_uk"),
2304
+ (mothers_day_us, "seas_mothers_day_us"),
2305
+ (mothering_sunday, "seas_mothers_day_uk"),
2306
+ (gf, "seas_good_friday"),
2307
+ (em, "seas_easter_monday"),
2308
+ (bf, "seas_black_friday"),
2309
+ (cm, "seas_cyber_monday"),
2310
+ ]:
2311
+ # Convert to pd.Timestamp:
2312
+ special_ts = pd.Timestamp(special_date)
2313
+
2314
+ # Only set if it's within your daily range
2315
+ if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
2316
+ df_daily.loc[df_daily["Date"] == special_ts, col] = 1
2317
+
2318
+ # ---------------------------------------------------------------------
2319
+ # 4. Add daily indicators for last day & last Friday of month
2320
+ # Then aggregate them to weekly level using .max()
2321
+ # ---------------------------------------------------------------------
2322
+ # Last day of month (daily)
2323
+ df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
2324
+ lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
2325
+ )
2326
+
2327
+ # Last Friday of month (daily)
2328
+ def is_last_friday(date):
2329
+ # last day of the month
2330
+ last_day_of_month = date.to_period("M").to_timestamp("M")
2331
+ last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
2332
+ # Determine how many days we go back from the last day to get Friday (weekday=4)
2333
+ if last_day_weekday >= 4:
2334
+ days_to_subtract = last_day_weekday - 4
2335
+ else:
2336
+ days_to_subtract = last_day_weekday + 3
2337
+ last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
2338
+ return 1 if date == last_friday else 0
2339
+
2340
+ df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
2341
+
2342
+ # ---------------------------------------------------------------------
2343
+ # 5. Weekly aggregation for holiday columns & monthly dummies
2344
+ # ---------------------------------------------------------------------
2345
+ # For monthly dummies, create a daily col "Month", then get_dummies
2346
+ df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
2347
+ df_monthly_dummies = pd.get_dummies(
2348
+ df_daily,
2349
+ prefix="seas",
2350
+ columns=["Month"],
2351
+ dtype=int
2352
+ )
2353
+ # Recalculate 'week_start' (already in df_daily, but just to be sure)
2354
+ df_monthly_dummies['week_start'] = df_daily['week_start']
2355
+
2356
+ # Group monthly dummies by .sum() or .mean()—we often spread them across the week
2357
+ df_monthly_dummies = (
2358
+ df_monthly_dummies
2359
+ .groupby('week_start')
2360
+ .sum(numeric_only=True) # sum the daily flags
2361
+ .reset_index()
2362
+ .rename(columns={'week_start': "Date"})
2363
+ .set_index("Date")
2364
+ )
2365
+ # Spread monthly dummies by 7 to distribute across that week
2366
+ monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
2367
+ df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
2368
+
2369
+ # Group holiday & special-day columns by .max() => binary at weekly level
2370
+ df_holidays = (
2371
+ df_daily
2372
+ .groupby('week_start')
2373
+ .max(numeric_only=True) # if any day=1 in that week, entire week=1
2374
+ .reset_index()
2375
+ .rename(columns={'week_start': "Date"})
2376
+ .set_index("Date")
2377
+ )
2378
+
2379
+ # ---------------------------------------------------------------------
2380
+ # 6. Combine weekly start, monthly dummies, holiday flags
2381
+ # ---------------------------------------------------------------------
2382
+ df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
2383
+ df_combined = pd.concat([df_combined, df_holidays], axis=1)
2384
+ df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
2385
+
2386
+ # ---------------------------------------------------------------------
2387
+ # 7. Create weekly dummies for Week of Year & yearly dummies
2388
+ # ---------------------------------------------------------------------
2389
+ df_combined.reset_index(inplace=True)
2390
+ df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
2391
+
2392
+ df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
2393
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
2394
+
2395
+ df_combined["Year"] = df_combined["Date"].dt.year
2396
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
2397
+
2398
+ # ---------------------------------------------------------------------
2399
+ # 8. Add constant & trend
2400
+ # ---------------------------------------------------------------------
2401
+ df_combined["Constant"] = 1
2402
+ df_combined["Trend"] = df_combined.index + 1
2403
+
2404
+ # ---------------------------------------------------------------------
2405
+ # 9. Rename Date -> OBS and return
2406
+ # ---------------------------------------------------------------------
2407
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)
2408
+
2409
+ return df_combined
2410
+
2411
+
2412
+ def pull_weather(self, week_commencing, country) -> pd.DataFrame:
2413
+ import pandas as pd
2414
+ import urllib.request # noqa: F811
2415
+ from datetime import datetime
2416
+ import requests
2417
+ from geopy.geocoders import Nominatim # noqa: F811
2418
+
2419
+ # Week commencing dictionary
2420
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
2421
+
2422
+ # Country dictionary
2423
+ country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}
2424
+
2425
+ # Function to flatten a list of nested lists into a list
2426
+ def flatten_list(nested_list):
2427
+ return [item for sublist in nested_list for item in sublist]
2428
+
2429
+ # Choose country
2430
+ country = country_dict[country]
2431
+
2432
+ # Choose start and end dates
2433
+ start_day = 1
2434
+ start_month = 1
2435
+ start_year = 2014
2436
+ formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
2437
+ today = datetime.now()
2438
+ end_day = today.day
2439
+ end_month = today.month
2440
+ end_year = today.year
2441
+
2442
+ if country == "GB__ASOS":
2443
+ stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
2444
+ "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
2445
+ "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
2446
+ "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
2447
+ "&stations=EGNT"]
2448
+ elif country == "AU__ASOS":
2449
+ stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
2450
+ "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
2451
+ "&stations=YPAD", "&stations=YPPH"]
2452
+ elif country == "USCRN":
2453
+ stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
2454
+ "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
2455
+ "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
2456
+ "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
2457
+ elif country == "DE__ASOS":
2458
+ stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
2459
+ "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
2460
+ "&stations=EDLW", "&stations=EDDM"]
2461
+ elif country == "FR__ASOS":
2462
+ stations = ["&stations=LFPB"]
2463
+ elif country == "Canada":
2464
+ institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
2465
+ "CA_NU_ASOS"]
2466
+ stations_list = [[] for _ in range(5)]
2467
+ stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
2468
+ "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
2469
+
2470
+ stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
2471
+ "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
2472
+ "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
2473
+ "&stations=CYWK", "&stations=CYWK"])
2474
+
2475
+ stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
2476
+ "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
2477
+ "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
2478
+ "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
2479
+ "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
2480
+
2481
+ stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
2482
+ "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
2483
+ "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
2484
+ "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
2485
+
2486
+ stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
2487
+ "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
2488
+ "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
2489
+ "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
2490
+
2491
+ elif country == "ZA__ASOS":
2492
+ cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
2493
+ stations = []
2494
+
2495
+ for city in cities:
2496
+ geolocator = Nominatim(user_agent="MyApp")
2497
+ location = geolocator.geocode(city)
2498
+ stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
2499
+
2500
+ # Temperature
2501
+ if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
2502
+ # We start by making a data frame of the following weather stations
2503
+ station_query = ''.join(stations)
2504
+
2505
+ raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
2506
+ station_query,
2507
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2508
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2509
+ raw_weather = urllib.request.urlopen(raw_weather_list)
2510
+ raw_weather = pd.read_csv(raw_weather)
2511
+
2512
+ # Replace the occurrences of "None" with Missing Value
2513
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2514
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2515
+
2516
+ # Remove any data that isn't temperature-related
2517
+ weather = raw_weather.iloc[:, 0:4]
2518
+
2519
+ weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
2520
+
2521
+ # Estimate mean temperature
2522
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2523
+
2524
+ # Convert Fahrenheit to Celsius for max_temp_f
2525
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2526
+
2527
+ # Convert Fahrenheit to Celsius for min_temp_f
2528
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2529
+
2530
+ # Convert Fahrenheit to Celsius for mean_temp_f
2531
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2532
+
2533
+ # Aggregate the data to week commencing sunday taking the average of the data
2534
+ # Convert the date column to a Date type
2535
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2536
+
2537
+ # Determine the starting chosen day for each date
2538
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2539
+
2540
+ # Group by week_starting and summarize
2541
+ numeric_columns = weather.select_dtypes(include='number').columns
2542
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2543
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2544
+ "min_temp_f": "avg_min_temp_f",
2545
+ "mean_temp_f": "avg_mean_temp_f",
2546
+ "max_temp_c": "avg_max_temp_c",
2547
+ "min_temp_c": "avg_min_temp_c",
2548
+ "mean_temp_c": "avg_mean_temp_c"}, inplace=True)
2549
+ elif country == "Canada":
2550
+ raw_weather = pd.DataFrame()
+ for i in range(len(institute_vector)):
2551
+ station_query_temp = ''.join(flatten_list(stations_list[i]))
2552
+ institute_temp = institute_vector[i]
2553
+ raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
2554
+ station_query_temp,
2555
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2556
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2557
+ raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
2558
+ raw_weather_temp = pd.read_csv(raw_weather_temp)
2559
+
2560
+ if len(raw_weather_temp.index) == 0:
2561
+ continue
2562
+ raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
2563
+
2564
+ # Append this institute's station data
+ raw_weather = pd.concat([raw_weather, raw_weather_temp])
2568
+
2569
+ # Drop error column if it exists
2570
+ if 'ERROR: Invalid network specified' in list(raw_weather.columns):
2571
+ raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
2572
+
2573
+ # Replace none values
2574
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2575
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2576
+ raw_weather["precip_in"].replace("None", 0, inplace=True)
2577
+
2578
+ weather = raw_weather
2579
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2580
+
2581
+ # Estimate mean temperature
2582
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2583
+
2584
+ # Convert Fahrenheit to Celsius for max_temp_f
2585
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2586
+
2587
+ # Convert Fahrenheit to Celsius for min_temp_f
2588
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2589
+
2590
+ # Convert Fahrenheit to Celsius for mean_temp_f
2591
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2592
+
2593
+ # Aggregate the data to the chosen week-commencing day, taking the average
2594
+ # Convert the date column to a Date type
2595
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2596
+
2597
+ # Determine the week-start date for each row
2598
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2599
+
2600
+ # Group by week_starting and summarize
2601
+ numeric_columns = weather.select_dtypes(include='number').columns
2602
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2603
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2604
+ "min_temp_f": "avg_min_temp_f",
2605
+ "mean_temp_f": "avg_mean_temp_f",
2606
+ "max_temp_c": "avg_max_temp_c",
2607
+ "min_temp_c": "avg_min_temp_c",
2608
+ "mean_temp_c": "avg_mean_temp_c",
2609
+ "precip_in": "avg_mean_perc"}, inplace=True)
2610
+ elif country == "ZA__ASOS":
2611
+ weather_data_list = []
2612
+
2613
+ for city in cities:
2614
+ geolocator = Nominatim(user_agent="MyApp")
2615
+ location = geolocator.geocode(city)
2616
+ url = "https://archive-api.open-meteo.com/v1/archive"
2617
+
2618
+ params = {
2619
+ "latitude": location.latitude,
2620
+ "longitude": location.longitude,
2621
+ "start_date": formatted_date,
2622
+ "end_date": today.strftime("%Y-%m-%d"),
2623
+ "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
2624
+ "timezone": "auto"
2625
+ }
2626
+
2627
+ response = requests.get(url, params=params)
2628
+ response_data = response.json()
2629
+
2630
+ daily_data = response_data["daily"]
2631
+ dates = daily_data["time"]
2632
+
2633
+ data = pd.DataFrame({
2634
+ "day": dates,
2635
+ "max_temp_f": daily_data["temperature_2m_max"],
2636
+ "min_temp_f": daily_data["temperature_2m_min"],
2637
+ "precip_in": daily_data["precipitation_sum"]
2638
+ })
2639
+ data["city"] = city
2640
+ weather_data_list.append(data)
2641
+
2642
+ weather = pd.concat(weather_data_list)
2643
+
2644
+ # Convert the date column to a Date type
2645
+ weather["day"] = pd.to_datetime(weather["day"])
2646
+
2647
+ # Replace None values
2648
+ weather["max_temp_f"].replace("None", 0, inplace=True)
2649
+ weather["min_temp_f"].replace("None", 0, inplace=True)
2650
+ weather["precip_in"].replace("None", 0, inplace=True)
2651
+
2652
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2653
+
2654
+ # Estimate mean temperature
2655
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2656
+
2657
+ # Convert Fahrenheit to Celsius for max_temp_f
2658
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2659
+
2660
+ # Convert Fahrenheit to Celsius for min_temp_f
2661
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2662
+
2663
+ # Convert Fahrenheit to Celsius for mean_temp_f
2664
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2665
+
2666
+ # Determine the week-start date for each row
2667
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2668
+
2669
+ # Group by week_starting and summarize
2670
+ numeric_columns = weather.select_dtypes(include='number').columns
2671
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2672
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2673
+ "min_temp_f": "avg_min_temp_f",
2674
+ "mean_temp_f": "avg_mean_temp_f",
2675
+ "max_temp_c": "avg_max_temp_c",
2676
+ "min_temp_c": "avg_min_temp_c",
2677
+ "mean_temp_c": "avg_mean_temp_c",
2678
+ "precip_in": "avg_mean_perc"}, inplace=True)
2679
+
2680
+ else:
2681
+ # Build the station query string for the requested weather stations
2682
+ station_query = ''.join(stations)
2683
+
2684
+ raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
2685
+ station_query,
2686
+ "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
2687
+ "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
2688
+ raw_weather = urllib.request.urlopen(raw_weather_list)
2689
+ raw_weather = pd.read_csv(raw_weather)
2690
+
2691
+ raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
2692
+
2693
+ # Replace occurrences of "None" with 0
2694
+ raw_weather["max_temp_f"].replace("None", 0, inplace=True)
2695
+ raw_weather["min_temp_f"].replace("None", 0, inplace=True)
2696
+ raw_weather["precip_in"].replace("None", 0, inplace=True)
2697
+
2698
+ # Keep the reduced set of columns (date, temperature, precipitation)
2699
+ weather = raw_weather
2700
+
2701
+ weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
2702
+
2703
+ # Estimate mean temperature
2704
+ weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
2705
+
2706
+ # Convert Fahrenheit to Celsius for max_temp_f
2707
+ weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
2708
+
2709
+ # Convert Fahrenheit to Celsius for min_temp_f
2710
+ weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
2711
+
2712
+ # Convert Fahrenheit to Celsius for mean_temp_f
2713
+ weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
2714
+
2715
+ # Aggregate the data to the chosen week-commencing day, taking the average
2716
+ # Convert the date column to a Date type
2717
+ weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
2718
+
2719
+ # Determine the week-start date for each row
2720
+ weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2721
+
2722
+ # Group by week_starting and summarize
2723
+ numeric_columns = weather.select_dtypes(include='number').columns
2724
+ weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
2725
+ weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
2726
+ "min_temp_f": "avg_min_temp_f",
2727
+ "mean_temp_f": "avg_mean_temp_f",
2728
+ "max_temp_c": "avg_max_temp_c",
2729
+ "min_temp_c": "avg_min_temp_c",
2730
+ "mean_temp_c": "avg_mean_temp_c",
2731
+ "precip_in": "avg_mean_perc"}, inplace=True)
2732
+
2733
+ # Rainfall
2734
+ if country == "GB__ASOS":
2735
+ # Define cities and date range
2736
+ cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
2737
+
2738
+ start_date = formatted_date
2739
+ end_date = today.strftime("%Y-%m-%d")
2740
+
2741
+ # Initialize an empty list to store the weather data for each city
2742
+ weather_data_list = []
2743
+
2744
+ # Loop through each city and fetch weather data
2745
+ for city in cities:
2746
+ # Initialize Nominatim API
2747
+ geolocator = Nominatim(user_agent="MyApp")
2748
+ location = geolocator.geocode(city)
2749
+ url = "https://archive-api.open-meteo.com/v1/archive"
2750
+
2751
+ params = {
2752
+ "latitude": location.latitude,
2753
+ "longitude": location.longitude,
2754
+ "start_date": start_date,
2755
+ "end_date": end_date,
2756
+ "daily": "precipitation_sum",
2757
+ "timezone": "auto"
2758
+ }
2759
+
2760
+ response = requests.get(url, params=params)
2761
+ response_data = response.json()
2762
+
2763
+ daily_data = response_data["daily"]["precipitation_sum"]
2764
+ dates = response_data["daily"]["time"]
2765
+
2766
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2767
+ data["city"] = city
2768
+
2769
+ weather_data_list.append(data)
2770
+
2771
+ # Combine all city data into a single data frame
2772
+ all_weather_data = pd.concat(weather_data_list)
2773
+
2774
+ # Convert the date column to a Date type
2775
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2776
+
2777
+ # Set up the week-commencing column
2778
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2779
+
2780
+ # Group by week_starting and summarize
2781
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
2782
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2783
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2784
+
2785
+ # Change index to datetime
2786
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2787
+
2788
+ elif country == "AU__ASOS":
2789
+
2790
+ # Define cities and date range
2791
+ cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
2792
+
2793
+ start_date = formatted_date
2794
+ end_date = today.strftime("%Y-%m-%d")
2795
+
2796
+ # Initialize an empty list to store the weather data for each city
2797
+ weather_data_list = []
2798
+
2799
+ # Loop through each city and fetch weather data
2800
+ for city in cities:
2801
+ # Initialize Nominatim API
2802
+ geolocator = Nominatim(user_agent="MyApp")
2803
+ location = geolocator.geocode(city)
2804
+ url = "https://archive-api.open-meteo.com/v1/archive"
2805
+
2806
+ params = {
2807
+ "latitude": location.latitude,
2808
+ "longitude": location.longitude,
2809
+ "start_date": start_date,
2810
+ "end_date": end_date,
2811
+ "daily": "precipitation_sum",
2812
+ "timezone": "auto"
2813
+ }
2814
+
2815
+ response = requests.get(url, params=params)
2816
+ response_data = response.json()
2817
+
2818
+ daily_data = response_data["daily"]["precipitation_sum"]
2819
+ dates = response_data["daily"]["time"]
2820
+
2821
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2822
+ data["city"] = city
2823
+
2824
+ weather_data_list.append(data)
2825
+
2826
+ # Combine all city data into a single data frame
2827
+ all_weather_data = pd.concat(weather_data_list)
2828
+
2829
+ # Convert the date column to a Date type
2830
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2831
+
2832
+ # Set up the week-commencing column
2833
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2834
+
2835
+ # Group by week_starting and summarize
2836
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
2837
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2838
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2839
+
2840
+ # Change index to datetime
2841
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2842
+
2843
+ elif country == "DE__ASOS":
2844
+
2845
+ # Define cities and date range
2846
+ cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
2847
+
2848
+ start_date = formatted_date
2849
+ end_date = today.strftime("%Y-%m-%d")
2850
+
2851
+ # Initialize an empty list to store the weather data for each city
2852
+ weather_data_list = []
2853
+
2854
+ # Loop through each city and fetch weather data
2855
+ for city in cities:
2856
+ # Initialize Nominatim API
2857
+ geolocator = Nominatim(user_agent="MyApp")
2858
+ location = geolocator.geocode(city)
2859
+ url = "https://archive-api.open-meteo.com/v1/archive"
2860
+
2861
+ params = {
2862
+ "latitude": location.latitude,
2863
+ "longitude": location.longitude,
2864
+ "start_date": start_date,
2865
+ "end_date": end_date,
2866
+ "daily": "precipitation_sum",
2867
+ "timezone": "auto"
2868
+ }
2869
+
2870
+ response = requests.get(url, params=params)
2871
+ response_data = response.json()
2872
+
2873
+ daily_data = response_data["daily"]["precipitation_sum"]
2874
+ dates = response_data["daily"]["time"]
2875
+
2876
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2877
+ data["city"] = city
2878
+
2879
+ weather_data_list.append(data)
2880
+
2881
+ # Combine all city data into a single data frame
2882
+ all_weather_data = pd.concat(weather_data_list)
2883
+
2884
+ # Convert the date column to a Date type
2885
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2886
+
2887
+ # Set up the week-commencing column
2888
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2889
+
2890
+ # Group by week_starting and summarize
2891
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
2892
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2893
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2894
+
2895
+ # Change index to datetime
2896
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2897
+
2898
+ elif country == "FR__ASOS":
2899
+
2900
+ # Define cities and date range
2901
+ cities = ["Paris"]
2902
+
2903
+ start_date = formatted_date
2904
+ end_date = today.strftime("%Y-%m-%d")
2905
+
2906
+ # Initialize an empty list to store the weather data for each city
2907
+ weather_data_list = []
2908
+
2909
+ # Loop through each city and fetch weather data
2910
+ for city in cities:
2911
+ # Initialize Nominatim API
2912
+ geolocator = Nominatim(user_agent="MyApp")
2913
+ location = geolocator.geocode(city)
2914
+ url = "https://archive-api.open-meteo.com/v1/archive"
2915
+
2916
+ params = {
2917
+ "latitude": location.latitude,
2918
+ "longitude": location.longitude,
2919
+ "start_date": start_date,
2920
+ "end_date": end_date,
2921
+ "daily": "precipitation_sum",
2922
+ "timezone": "auto"
2923
+ }
2924
+
2925
+ response = requests.get(url, params=params)
2926
+ response_data = response.json()
2927
+
2928
+ daily_data = response_data["daily"]["precipitation_sum"]
2929
+ dates = response_data["daily"]["time"]
2930
+
2931
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2932
+ data["city"] = city
2933
+
2934
+ weather_data_list.append(data)
2935
+
2936
+ # Combine all city data into a single data frame
2937
+ all_weather_data = pd.concat(weather_data_list)
2938
+
2939
+ # Convert the date column to a Date type
2940
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2941
+
2942
+ # Set up the week-commencing column
2943
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2944
+
2945
+ # Group by week_starting and summarize
2946
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
2947
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2948
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2949
+
2950
+ # Change index to datetime
2951
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
2952
+
2953
+ elif country == "ZA__ASOS":
2954
+ cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
2955
+ start_date = formatted_date
2956
+ end_date = today.strftime("%Y-%m-%d")
2957
+
2958
+ weather_data_list = []
2959
+
2960
+ for city in cities:
2961
+ geolocator = Nominatim(user_agent="MyApp")
2962
+ location = geolocator.geocode(city)
2963
+ url = "https://archive-api.open-meteo.com/v1/archive"
2964
+
2965
+ params = {
2966
+ "latitude": location.latitude,
2967
+ "longitude": location.longitude,
2968
+ "start_date": start_date,
2969
+ "end_date": end_date,
2970
+ "daily": "precipitation_sum",
2971
+ "timezone": "auto"
2972
+ }
2973
+
2974
+ response = requests.get(url, params=params)
2975
+ response_data = response.json()
2976
+
2977
+ daily_data = response_data["daily"]["precipitation_sum"]
2978
+ dates = response_data["daily"]["time"]
2979
+
2980
+ data = pd.DataFrame({"date": dates, "rainfall": daily_data})
2981
+ data["city"] = city
2982
+
2983
+ weather_data_list.append(data)
2984
+
2985
+ # Combine all city data into a single data frame
2986
+ all_weather_data = pd.concat(weather_data_list)
2987
+
2988
+ # Convert the date column to a Date type
2989
+ all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
2990
+
2991
+ # Set up the week-commencing column
2992
+ all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
2993
+
2994
+ # Group by week_starting and summarize
2995
+ numeric_columns = all_weather_data.select_dtypes(include='number').columns
2996
+ weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
2997
+ weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
2998
+
2999
+ # Change index to datetime
3000
+ weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
3001
+
3002
+ # Merge the dataframes
3003
+ if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
3004
+ merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
3005
+ else:
3006
+ merged_df = weekly_avg_temp
3007
+
3008
+ merged_df.reset_index(drop=False, inplace=True)
3009
+ merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)
3010
+
3011
+ final_weather = ims_proc.rename_cols(merged_df, 'seas_')
3012
+
3013
+ return final_weather
3014
+
3015
+ def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
3016
+ """
3017
+ Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
3018
+ aggregates it to weekly averages, and renames variables based on specified rules.
3019
+
3020
+ Parameters:
3021
+ cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
3022
+ week_start_day (str): The day the week starts on; one of 'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'. Defaults to 'mon'.
3023
+ sector (str): The sector whose standard CDIDs are added (currently 'fast_food'); any other value falls back to the default CDID set.
3024
+
3025
+ Returns:
3026
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' (week commencing) column
3027
+ and all series as renamed columns.
3028
+ """
3029
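+ # Illustrative usage sketch ('dp' stands in for an instance of the class defining this method;
+ # the CDID and sector below come from the examples above, not a recommendation):
+ #   ons_df = dp.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")
+ #   ons_df -> 'OBS' column plus one macro_*_uk column per fetched series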
+ # Define CDIDs for sectors and defaults
3030
+ sector_cdids = {
3031
+ "fast_food": ["L7TD", "L78Q", "DOAD"],
3032
+ "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
3033
+ }
3034
+
3035
+ default_cdids = sector_cdids["default"]
3036
+ sector_specific_cdids = sector_cdids.get(sector, [])
3037
+ standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
3038
+
3039
+ # Combine standard CDIDs and additional CDIDs
3040
+ if cdid_list is None:
3041
+ cdid_list = []
3042
+ cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
3043
+
3044
+ base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
3045
+ base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
3046
+ combined_df = pd.DataFrame()
3047
+
3048
+ # Map week start day to pandas weekday convention
3049
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
3050
+ if week_start_day not in days_map:
3051
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
3052
+ week_start = days_map[week_start_day]
3053
+
3054
+ for cdid in cdid_list:
3055
+ try:
3056
+ # Search for the series
3057
+ search_url = f"{base_search_url}{cdid}"
3058
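+ # e.g. for CDID "JP9Z" this resolves to
+ # https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids=JP9Z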
+ search_response = requests.get(search_url)
3059
+ search_response.raise_for_status()
3060
+ search_data = search_response.json()
3061
+
3062
+ items = search_data.get("items", [])
3063
+ if not items:
3064
+ print(f"No data found for CDID: {cdid}")
3065
+ continue
3066
+
3067
+ # Extract series name and latest release URI
3068
+ series_name = items[0].get("title", f"Series_{cdid}")
3069
+ latest_date = max(
3070
+ datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
3071
+ for item in items if "release_date" in item
3072
+ )
3073
+ latest_uri = next(
3074
+ item["uri"] for item in items
3075
+ if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
3076
+ )
3077
+
3078
+ # Fetch the dataset
3079
+ data_url = f"{base_data_url}{latest_uri}"
3080
+ data_response = requests.get(data_url)
3081
+ data_response.raise_for_status()
3082
+ data_json = data_response.json()
3083
+
3084
+ # Detect the frequency and process accordingly
3085
+ if "months" in data_json and data_json["months"]:
3086
+ frequency_key = "months"
3087
+ elif "quarters" in data_json and data_json["quarters"]:
3088
+ frequency_key = "quarters"
3089
+ elif "years" in data_json and data_json["years"]:
3090
+ frequency_key = "years"
3091
+ else:
3092
+ print(f"Unsupported frequency or no data for CDID: {cdid}")
3093
+ continue
3094
+
3095
+ # Prepare the DataFrame
3096
+ df = pd.DataFrame(data_json[frequency_key])
3097
+
3098
+ # Parse the 'date' field based on frequency
3099
+ if frequency_key == "months":
3100
+ df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
3101
+ elif frequency_key == "quarters":
3102
+ def parse_quarter(quarter_str):
3103
+ year, qtr = quarter_str.split(" Q")
3104
+ month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
3105
+ return datetime(int(year), month, 1)
3106
+ df["date"] = df["date"].apply(parse_quarter)
3107
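+ # e.g. "2019 Q4" is parsed to datetime(2019, 10, 1)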
+ elif frequency_key == "years":
3108
+ df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
3109
+
3110
+ df["value"] = pd.to_numeric(df["value"], errors="coerce")
3111
+ df.rename(columns={"value": series_name}, inplace=True)
3112
+
3113
+ # Combine data
3114
+ df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
3115
+ if combined_df.empty:
3116
+ combined_df = df
3117
+ else:
3118
+ combined_df = pd.merge(combined_df, df, on="date", how="outer")
3119
+
3120
+ except requests.exceptions.RequestException as e:
3121
+ print(f"Error fetching data for CDID {cdid}: {e}")
3122
+ except (KeyError, ValueError) as e:
3123
+ print(f"Error processing data for CDID {cdid}: {e}")
3124
+
3125
+ if not combined_df.empty:
3126
+ min_date = combined_df["date"].min()
3127
+ max_date = datetime.today()
3128
+ date_range = pd.date_range(start=min_date, end=max_date, freq='D')
3129
+ daily_df = pd.DataFrame(date_range, columns=['date'])
3130
+ daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
3131
+ daily_df = daily_df.ffill()
3132
+
3133
+ # Aggregate to weekly frequency
3134
+ daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
3135
+ weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()
3136
+
3137
+ def clean_column_name(name):
3138
+ name = re.sub(r"\(.*?\)", "", name)
3139
+ name = re.split(r":", name)[0]
3140
+ name = re.sub(r"\d+", "", name)
3141
+ name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
3142
+ name = re.sub(r"[^\w\s]", "", name)
3143
+ name = name.replace(" ", "_")
3144
+ name = re.sub(r"_+", "_", name)
3145
+ name = name.rstrip("_")
3146
+ return f"macro_{name.lower()}_uk"
3147
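+ # Illustrative renaming (the series title below is hypothetical):
+ # "Gross Domestic Product: chained volume measures" -> "macro_gross_domestic_product_uk"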
+
3148
+ weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
3149
+ weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
3150
+
3151
+ weekly_df = weekly_df.fillna(0)
3152
+
3153
+ return weekly_df
3154
+ else:
3155
+ print("No data available to process.")
3156
+ return pd.DataFrame()
3157
+
3158
+ def pull_yfinance(self, tickers=None, week_start_day="mon"):
3159
+ """
3160
+ Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,
3161
+ aggregates it to weekly averages, and renames variables.
3162
+
3163
+ Parameters:
3164
+ tickers (list): A list of additional stock tickers to fetch (e.g., ['AAPL', 'MSFT']). Defaults to None.
3165
+ week_start_day (str): The day the week starts on; one of 'mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'. Defaults to 'mon'.
3166
+
3167
+ Returns:
3168
+ pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column
3169
+ and aggregated stock data for the specified tickers, with NaN values filled with 0.
3170
+ """
3171
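+ # Illustrative usage sketch ('dp' stands in for an instance of the class defining this method;
+ # the extra ticker is the docstring example, not a recommendation):
+ #   fin_df = dp.pull_yfinance(tickers=["AAPL"], week_start_day="mon")
+ #   fin_df -> 'OBS' column plus one macro_* column per ticker (e.g. macro_ftse for ^FTSE)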
+ # Define default tickers
3172
+ default_tickers = ["^FTSE", "GBPUSD=X", "GBPEUR=X", "^GSPC"]
3173
+
3174
+ # Combine default tickers with additional ones
3175
+ if tickers is None:
3176
+ tickers = []
3177
+ tickers = list(set(default_tickers + tickers)) # Ensure no duplicates
3178
+
3179
+ # Automatically set end_date to today
3180
+ end_date = datetime.today().strftime("%Y-%m-%d")
3181
+
3182
+ # Mapping week start day to pandas weekday convention
3183
+ days_map = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
3184
+ if week_start_day not in days_map:
3185
+ raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
3186
+ week_start = days_map[week_start_day]
3187
+
3188
+ # Fetch data for all tickers without specifying a start date to get all available data
3189
+ data = yf.download(tickers, end=end_date, group_by="ticker", auto_adjust=True)
3190
+
3191
+ # Process the data
3192
+ combined_df = pd.DataFrame()
3193
+ for ticker in tickers:
3194
+ try:
3195
+ # Extract the ticker's data
3196
+ ticker_data = data[ticker] if len(tickers) > 1 else data
3197
+ ticker_data = ticker_data.reset_index()
3198
+
3199
+ # Ensure necessary columns are present
3200
+ if "Close" not in ticker_data.columns:
3201
+ raise ValueError(f"Ticker {ticker} does not have 'Close' price data.")
3202
+
3203
+ # Keep only relevant columns
3204
+ ticker_data = ticker_data[["Date", "Close"]]
3205
+ ticker_data.rename(columns={"Close": ticker}, inplace=True)
3206
+
3207
+ # Merge data
3208
+ if combined_df.empty:
3209
+ combined_df = ticker_data
3210
+ else:
3211
+ combined_df = pd.merge(combined_df, ticker_data, on="Date", how="outer")
3212
+
3213
+ except KeyError:
3214
+ print(f"Data for ticker {ticker} not available.")
3215
+ except Exception as e:
3216
+ print(f"Error processing ticker {ticker}: {e}")
3217
+
3218
+ if not combined_df.empty:
3219
+ # Convert to daily frequency
3220
+ combined_df["Date"] = pd.to_datetime(combined_df["Date"])
3221
+ combined_df.set_index("Date", inplace=True)
3222
+
3223
+ # Fill missing dates
3224
+ min_date = combined_df.index.min()
3225
+ max_date = combined_df.index.max()
3226
+ daily_index = pd.date_range(start=min_date, end=max_date, freq='D')
3227
+ combined_df = combined_df.reindex(daily_index)
3228
+ combined_df.index.name = "Date"
3229
+ combined_df = combined_df.ffill()
3230
+
3231
+ # Aggregate to weekly frequency
3232
+ combined_df["OBS"] = combined_df.index - pd.to_timedelta((combined_df.index.weekday - week_start) % 7, unit="D")
3233
+ weekly_df = combined_df.groupby("OBS").mean(numeric_only=True).reset_index()
3234
+
3235
+ # Fill NaN values with 0
3236
+ weekly_df = weekly_df.fillna(0)
3237
+
3238
+ # Clean column names
3239
+ def clean_column_name(name):
3240
+ name = re.sub(r"[^\w\s]", "", name)
3241
+ return f"macro_{name.lower()}"
3242
+
3243
+ weekly_df.columns = [clean_column_name(col) if col != "OBS" else col for col in weekly_df.columns]
3244
+
3245
+ return weekly_df
3246
+
3247
+ else:
3248
+ print("No data available to process.")
3249
+ return pd.DataFrame()
3250
+
3251
+ def pull_ga(self, credentials_file, property_id, start_date, country, metrics):
3252
+ """
3253
+ Pulls Google Analytics data using the BetaAnalyticsDataClient.
3254
+
3255
+ Parameters:
3256
+ credentials_file (str): Path to the JSON credentials file.
3257
+ property_id (str): Google Analytics property ID.
3258
+ start_date (str): Start date in 'YYYY-MM-DD' format.
3259
+ country (str): Country to filter the data by.
3260
+ metrics (list): List of metrics to retrieve (e.g., ["totalUsers", "sessions"]).
3261
+
3262
+ Returns:
3263
+ pd.DataFrame: A pandas DataFrame containing the fetched data.
3264
+ """
3265
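+ # Illustrative usage sketch ('dp' stands in for an instance of the class defining this method;
+ # the credentials path, property ID and country are placeholders):
+ #   ga_df = dp.pull_ga("ga_credentials.json", "123456789", "2023-01-01", "United Kingdom",
+ #                      ["totalUsers", "sessions"])
+ #   ga_df -> one row per date/city combination with a float column per requested metric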
+ try:
3266
+ end_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
3267
+
3268
+ if not os.path.exists(credentials_file):
3269
+ raise FileNotFoundError(f"Credentials file '{credentials_file}' not found.")
3270
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_file
3271
+
3272
+ try:
3273
+ client = BetaAnalyticsDataClient()
3274
+ except DefaultCredentialsError as e:
3275
+ raise DefaultCredentialsError(
3276
+ f"Failed to initialize Google Analytics client: {e}"
3277
+ )
3278
+
3279
+ def format_report(request):
3280
+ response = client.run_report(request)
3281
+ # Row index
3282
+ row_index_names = [header.name for header in response.dimension_headers]
3283
+ row_header = []
3284
+ for i in range(len(row_index_names)):
3285
+ row_header.append([row.dimension_values[i].value for row in response.rows])
3286
+
3287
+ row_index_named = pd.MultiIndex.from_arrays(np.array(row_header), names=np.array(row_index_names))
3288
+ # Row flat data
3289
+ metric_names = [header.name for header in response.metric_headers]
3290
+ data_values = []
3291
+ for i in range(len(metric_names)):
3292
+ data_values.append([row.metric_values[i].value for row in response.rows])
3293
+
3294
+ output = pd.DataFrame(data=np.transpose(np.array(data_values, dtype='f')),
3295
+ index=row_index_named, columns=metric_names)
3296
+ return output
3297
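+ # format_report returns a DataFrame indexed by the requested dimensions (date, city)
+ # with one float column per requested metric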
+
3298
+ all_dfs = []
3299
+ offset_value = 0
3300
+ batch_size = 100000
3301
+
3302
+ while True:
3303
+ metric_objects = [Metric(name=metric) for metric in metrics]
3304
+
3305
+ request = RunReportRequest(
3306
+ property='properties/' + property_id,
3307
+ dimensions=[Dimension(name="date"), Dimension(name="city")],
3308
+ metrics=metric_objects,
3309
+ order_bys=[OrderBy(dimension={'dimension_name': 'date'}),
3310
+ OrderBy(dimension={'dimension_name': 'city'})],
3311
+ date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
3312
+ limit=batch_size,
3313
+ offset=offset_value,
3314
+ dimension_filter=FilterExpression(
3315
+ and_group=FilterExpressionList(
3316
+ expressions=[
3317
+ FilterExpression(
3318
+ filter=Filter(
3319
+ field_name="country",
3320
+ string_filter=Filter.StringFilter(value=country),
3321
+ )
3322
+ ),
3323
+ ]
3324
+ )
3325
+ )
3326
+ )
3327
+
3328
+ df = format_report(request)
3329
+ if df.empty:
3330
+ break
3331
+
3332
+ df = df.reset_index()
3333
+ df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
3334
+ all_dfs.append(df)
3335
+ offset_value += batch_size
3336
+
3337
+ if not all_dfs:
3338
+ return pd.DataFrame()
3339
+
3340
+ final_df = pd.concat(all_dfs, ignore_index=True)
3341
+ return final_df
3342
+
3343
+ except FileNotFoundError as e:
3344
+ logging.error(f"FileNotFoundError: {e}")
3345
+ raise
3346
+ except DefaultCredentialsError as e:
3347
+ logging.error(f"DefaultCredentialsError: {e}")
3348
+ raise
3349
+ except Exception as e:
3350
+ logging.error(f"An unexpected error occurred: {e}")
3351
+ raise