imsciences-0.9.6.2-py3-none-any.whl → imsciences-0.9.6.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of imsciences might be problematic.
- imsciences/pull.py +509 -303
- {imsciences-0.9.6.2.dist-info → imsciences-0.9.6.4.dist-info}/METADATA +1 -1
- imsciences-0.9.6.4.dist-info/PKG-INFO-TomG-HP-290722 +355 -0
- imsciences-0.9.6.4.dist-info/RECORD +12 -0
- imsciences-0.9.6.2.dist-info/RECORD +0 -11
- {imsciences-0.9.6.2.dist-info → imsciences-0.9.6.4.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.6.2.dist-info → imsciences-0.9.6.4.dist-info}/WHEEL +0 -0
- {imsciences-0.9.6.2.dist-info → imsciences-0.9.6.4.dist-info}/top_level.txt +0 -0
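The bulk of the change is a rewrite of pull_seasonality and pull_macro_ons_uk inside imsciences/pull.py (diff below). For orientation only, here is a minimal usage sketch based on the method signatures visible in the diff; the import path and the no-argument datapull() constructor are assumptions and are not confirmed by this diff, and the example argument values are taken from the docstrings shown below.

# Hypothetical usage sketch: import path and constructor are assumptions;
# only the method names and signatures appear in the diff itself.
from imsciences.pull import datapull

dp = datapull()

# Weekly seasonality features: week-commencing day, start date, holiday countries.
seas = dp.pull_seasonality("mon", "2022-01-03", ["GB", "US"])

# ONS macro series: optional extra CDIDs, week start day, and sector(s).
macro = dp.pull_macro_ons_uk(cdid_list=["JP9Z"], week_start_day="mon", sector="fast_food")

print(seas.head())   # 'OBS' column plus seas_* dummies, Constant and Trend
print(macro.head())  # 'OBS' column plus macro_*_uk series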
imsciences/pull.py
CHANGED
@@ -380,265 +380,368 @@ class datapull:
     ############################################################### Seasonality ##########################################################################

     def pull_seasonality(self, week_commencing, start_date, countries):
-
-
-        # ---------------------------------------------------------------------
-        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
-
-        # ---------------------------------------------------------------------
-        # 1. Create daily date range from start_date to today
-        # ---------------------------------------------------------------------
-        date_range = pd.date_range(
-            start=pd.to_datetime(start_date),
-            end=datetime.today(),
-            freq="D"
-        )
-        df_daily = pd.DataFrame(date_range, columns=["Date"])
+        """
+        Generates a DataFrame with weekly seasonality features.

-
-
-
-
-            lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
-        )
+        Args:
+            week_commencing (str): The starting day of the week ('mon', 'tue', ..., 'sun').
+            start_date (str): The start date in 'YYYY-MM-DD' format.
+            countries (list): A list of country codes (e.g., ['GB', 'US']) for holidays.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Returns:
+            pd.DataFrame: A DataFrame indexed by week start date, containing various
+                          seasonal dummy variables, holidays, trend, and constant.
+                          The date column is named 'OBS'.
+        """
+        # ---------------------------------------------------------------------
+        # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
+        # ---------------------------------------------------------------------
+        day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
+        if week_commencing not in day_dict:
+            raise ValueError(f"Invalid week_commencing value: {week_commencing}. Use one of {list(day_dict.keys())}")
+
+        # ---------------------------------------------------------------------
+        # 1. Create daily date range from start_date to today
+        # ---------------------------------------------------------------------
+        try:
+            start_dt = pd.to_datetime(start_date)
+        except ValueError:
+            raise ValueError(f"Invalid start_date format: {start_date}. Use 'YYYY-MM-DD'")
+
+        end_dt = datetime.today()
+        # Ensure end date is not before start date
+        if end_dt < start_dt:
+            end_dt = start_dt + timedelta(days=1) # Or handle as error if preferred
+
+        date_range = pd.date_range(
+            start=start_dt,
+            end=end_dt,
+            freq="D"
         )
-
-
-
+        df_daily = pd.DataFrame(date_range, columns=["Date"])
+
+        # ---------------------------------------------------------------------
+        # 1.1 Identify "week_start" for each daily row, based on week_commencing
+        # ---------------------------------------------------------------------
+        start_day_int = day_dict[week_commencing]
+        df_daily['week_start'] = df_daily["Date"].apply(
+            lambda x: x - pd.Timedelta(days=(x.weekday() - start_day_int) % 7)
         )
-            # Create columns for specific holiday names
-            for date_hol, name in country_holidays.items():
-                col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
-                if col_name not in df_daily.columns:
-                    df_daily[col_name] = 0
-                df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
-
-        # ---------------------------------------------------------------------
-        # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
-        # We'll add daily columns for each.
-        # ---------------------------------------------------------------------
-        # Initialize columns
-        extra_cols = [
-            "seas_valentines_day",
-            "seas_halloween",
-            "seas_fathers_day_us_uk",
-            "seas_mothers_day_us",
-            "seas_mothers_day_uk",
-            "seas_good_friday",
-            "seas_easter_monday",
-            "seas_black_friday",
-            "seas_cyber_monday",
-        ]
-        for c in extra_cols:
-            df_daily[c] = 0 # default zero
-
-        # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
-        # weekday: Monday=0, Tuesday=1, ... Sunday=6
-        def nth_weekday_of_month(year, month, weekday, nth):
-            """
-            Returns date of the nth <weekday> in <month> of <year>.
-            E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
-            """
-            # 1st day of the month
-            d = datetime(year, month, 1)
-            # What is the weekday of day #1?
-            w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
-            # If we want, e.g. Sunday=6, we see how many days to add
-            delta = (weekday - w) % 7
-            # This is the first <weekday> in that month
-            first_weekday = d + timedelta(days=delta)
-            # Now add 7*(nth-1) days
-            return first_weekday + timedelta(days=7 * (nth-1))
-
-        def get_good_friday(year):
-            """Good Friday is 2 days before Easter Sunday."""
-            return easter(year) - timedelta(days=2)
-
-        def get_easter_monday(year):
-            """Easter Monday is 1 day after Easter Sunday."""
-            return easter(year) + timedelta(days=1)
-
-        def get_black_friday(year):
-            """
-            Black Friday = day after US Thanksgiving,
-            and US Thanksgiving is the 4th Thursday in November.
-            """
-            # 4th Thursday in November
-            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
-            return fourth_thursday + timedelta(days=1)
-
-        def get_cyber_monday(year):
-            """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
-            # 4th Thursday in November
-            fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
-            return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
-
-        # Loop over each year in range
-        start_yr = int(start_date[:4])
-        end_yr = datetime.today().year
-
-        for yr in range(start_yr, end_yr + 1):
-            # Valentines = Feb 14
-            valentines_day = datetime(yr, 2, 14)
-            # Halloween = Oct 31
-            halloween_day = datetime(yr, 10, 31)
-            # Father's Day (US & UK) = 3rd Sunday in June
-            fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
-            # Mother's Day US = 2nd Sunday in May
-            mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
-            mothering_sunday = easter(yr) - timedelta(days=21)
-            # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
-            while mothering_sunday.weekday() != 6: # Sunday=6
-                mothering_sunday -= timedelta(days=1)
-
-            # Good Friday, Easter Monday
-            gf = get_good_friday(yr)
-            em = get_easter_monday(yr)
-
-            # Black Friday, Cyber Monday
-            bf = get_black_friday(yr)
-            cm = get_cyber_monday(yr)
-
-            # Mark them in df_daily if in range
-            for special_date, col in [
-                (valentines_day, "seas_valentines_day"),
-                (halloween_day, "seas_halloween"),
-                (fathers_day, "seas_fathers_day_us_uk"),
-                (mothers_day_us, "seas_mothers_day_us"),
-                (mothering_sunday, "seas_mothers_day_uk"),
-                (gf, "seas_good_friday"),
-                (em, "seas_easter_monday"),
-                (bf, "seas_black_friday"),
-                (cm, "seas_cyber_monday"),
-            ]:
-                # Convert to pd.Timestamp:
-                special_ts = pd.Timestamp(special_date)
-
-                # Only set if it's within your daily range
-                if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
-                    df_daily.loc[df_daily["Date"] == special_ts, col] = 1
-
-        # ---------------------------------------------------------------------
-        # 4. Add daily indicators for last day & last Friday of month
-        # Then aggregate them to weekly level using .max()
-        # ---------------------------------------------------------------------
-        # Last day of month (daily)
-        df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
-            lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
-        )
-
-        # Last Friday of month (daily)
-        def is_last_friday(date):
-            # last day of the month
-            last_day_of_month = date.to_period("M").to_timestamp("M")
-            last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
-            # Determine how many days we go back from the last day to get Friday (weekday=4)
-            if last_day_weekday >= 4:
-                days_to_subtract = last_day_weekday - 4
-            else:
-                days_to_subtract = last_day_weekday + 3
-            last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
-            return 1 if date == last_friday else 0
-
-        df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
-
-        # ---------------------------------------------------------------------
-        # 5. Weekly aggregation for holiday columns & monthly dummies
-        # ---------------------------------------------------------------------
-        # For monthly dummies, create a daily col "Month", then get_dummies
-        df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-        df_monthly_dummies = pd.get_dummies(
-            df_daily,
-            prefix="seas",
-            columns=["Month"],
-            dtype=int
-        )
-        # Recalculate 'week_start' (already in df_daily, but just to be sure)
-        df_monthly_dummies['week_start'] = df_daily['week_start']
-
-        # Group monthly dummies by .sum() or .mean()—we often spread them across the week
-        df_monthly_dummies = (
-            df_monthly_dummies
-            .groupby('week_start')
-            .sum(numeric_only=True) # sum the daily flags
-            .reset_index()
-            .rename(columns={'week_start': "Date"})
-            .set_index("Date")
-        )
-        # Spread monthly dummies by 7 to distribute across that week
-        monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
-        df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
-
-        # Group holiday & special-day columns by .max() => binary at weekly level
-        df_holidays = (
-            df_daily
-            .groupby('week_start')
-            .max(numeric_only=True) # if any day=1 in that week, entire week=1
-            .reset_index()
-            .rename(columns={'week_start': "Date"})
-            .set_index("Date")
-        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # ---------------------------------------------------------------------
+        # 1.2 Calculate ISO week number for each DAY (for later aggregation)
+        # Also calculate Year for each DAY to handle year transitions correctly
+        # ---------------------------------------------------------------------
+        df_daily['iso_week_daily'] = df_daily['Date'].dt.isocalendar().week.astype(int)
+        df_daily['iso_year_daily'] = df_daily['Date'].dt.isocalendar().year.astype(int)
+
+
+        # ---------------------------------------------------------------------
+        # 2. Build a weekly index (df_weekly_start) based on unique week_start dates
+        # ---------------------------------------------------------------------
+        df_weekly_start = df_daily[['week_start']].drop_duplicates().sort_values('week_start').reset_index(drop=True)
+        df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
+        df_weekly_start.set_index("Date", inplace=True)
+
+        # Create individual weekly dummies (optional, uncomment if needed)
+        # dummy_columns = {}
+        # for i, date_index in enumerate(df_weekly_start.index):
+        # col_name = f"dum_{date_index.strftime('%Y_%m_%d')}"
+        # dummy_columns[col_name] = [0] * len(df_weekly_start)
+        # dummy_columns[col_name][i] = 1
+        # df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
+        # df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
+
+
+        # ---------------------------------------------------------------------
+        # 3. Public holidays (daily) from 'holidays' package + each holiday name
+        # ---------------------------------------------------------------------
+        start_year = start_dt.year
+        end_year = end_dt.year
+        years_range = range(start_year, end_year + 1)
+
+        for country in countries:
+            try:
+                country_holidays = holidays.CountryHoliday(
+                    country,
+                    years=years_range,
+                    observed=False # Typically you want the actual date, not observed substitute
+                )
+                # Handle cases like UK where specific subdivisions might be needed for some holidays
+                # Example: if country == 'GB': country_holidays.observed = True # If observed are needed
+            except KeyError:
+                print(f"Warning: Country code '{country}' not found in holidays library. Skipping.")
+                continue # Skip to next country
+
+            # Daily indicator: 1 if that date is a holiday
+            df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
+                lambda x: 1 if x in country_holidays else 0
+            )
+            # Create columns for specific holiday names
+            for date_hol, name in sorted(country_holidays.items()): # Sort for consistent column order
+                # Clean name: lower, replace space with underscore, remove non-alphanumeric (except underscore)
+                clean_name = ''.join(c for c in name if c.isalnum() or c == ' ').strip().replace(' ', '_').lower()
+                clean_name = clean_name.replace('_(observed)', '').replace("'", "") # specific cleaning
+                col_name = f"seas_{clean_name}_{country.lower()}"
+
+                # Only create column if the holiday occurs within the df_daily date range
+                if pd.Timestamp(date_hol).year in years_range:
+                    if col_name not in df_daily.columns:
+                        df_daily[col_name] = 0
+                    # Ensure date_hol is within the actual daily range before assigning
+                    if (pd.Timestamp(date_hol) >= df_daily["Date"].min()) and (pd.Timestamp(date_hol) <= df_daily["Date"].max()):
+                        df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
+
+        # ---------------------------------------------------------------------
+        # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
+        # ---------------------------------------------------------------------
+        extra_cols = [
+            "seas_valentines_day",
+            "seas_halloween",
+            "seas_fathers_day_us_uk", # Note: UK/US is 3rd Sun Jun, others vary
+            "seas_mothers_day_us", # Note: US is 2nd Sun May
+            "seas_mothers_day_uk", # Note: UK Mothering Sunday varies with Easter
+            "seas_good_friday",
+            "seas_easter_monday",
+            "seas_black_friday", # US-centric, but globally adopted
+            "seas_cyber_monday", # US-centric, but globally adopted
+        ]
+        for c in extra_cols:
+            if c not in df_daily.columns: # Avoid overwriting if already created by holidays pkg
+                df_daily[c] = 0
+
+        # Helper: nth_weekday_of_month(year, month, weekday, nth)
+        def nth_weekday_of_month(year, month, weekday, nth):
+            d = datetime(year, month, 1)
+            w = d.weekday()
+            delta = (weekday - w + 7) % 7 # Ensure positive delta
+            first_weekday = d + timedelta(days=delta)
+            target_date = first_weekday + timedelta(days=7 * (nth - 1))
+            # Check if the calculated date is still in the same month
+            if target_date.month == month:
+                return target_date
+            else:
+                # This can happen if nth is too large (e.g., 5th Friday)
+                # Return the last occurrence of that weekday in the month instead
+                return target_date - timedelta(days=7)
+
+
+        def get_good_friday(year):
+            return easter(year) - timedelta(days=2)
+
+        def get_easter_monday(year):
+            return easter(year) + timedelta(days=1)
+
+        def get_black_friday(year):
+            # US Thanksgiving is 4th Thursday in November (weekday=3)
+            thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
+            return thanksgiving + timedelta(days=1)
+
+        def get_cyber_monday(year):
+            # Monday after US Thanksgiving
+            thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
+            return thanksgiving + timedelta(days=4)
+
+        def get_mothering_sunday_uk(year):
+            # Fourth Sunday in Lent (3 weeks before Easter Sunday)
+            # Lent starts on Ash Wednesday, 46 days before Easter.
+            # Easter Sunday is day 0. Sunday before is -7, etc.
+            # 4th Sunday in Lent is 3 weeks before Easter.
+            return easter(year) - timedelta(days=21)
+
+
+        # Loop over each year in range
+        for yr in range(start_year, end_year + 1):
+            try: # Wrap calculations in try-except for robustness
+                # Valentines = Feb 14
+                valentines_day = datetime(yr, 2, 14)
+                # Halloween = Oct 31
+                halloween_day = datetime(yr, 10, 31)
+                # Father's Day (US & UK) = 3rd Sunday (6) in June
+                fathers_day = nth_weekday_of_month(yr, 6, 6, 3)
+                # Mother's Day US = 2nd Sunday (6) in May
+                mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
+                # Mother's Day UK (Mothering Sunday)
+                mothering_sunday = get_mothering_sunday_uk(yr)
+
+                # Good Friday, Easter Monday
+                gf = get_good_friday(yr)
+                em = get_easter_monday(yr)
+
+                # Black Friday, Cyber Monday
+                bf = get_black_friday(yr)
+                cm = get_cyber_monday(yr)
+
+                # Mark them in df_daily if in range
+                special_days_map = [
+                    (valentines_day, "seas_valentines_day"),
+                    (halloween_day, "seas_halloween"),
+                    (fathers_day, "seas_fathers_day_us_uk"),
+                    (mothers_day_us, "seas_mothers_day_us"),
+                    (mothering_sunday,"seas_mothers_day_uk"),
+                    (gf, "seas_good_friday"),
+                    (em, "seas_easter_monday"),
+                    (bf, "seas_black_friday"),
+                    (cm, "seas_cyber_monday"),
+                ]
+
+                for special_date, col in special_days_map:
+                    if special_date is not None: # nth_weekday_of_month can return None edge cases
+                        special_ts = pd.Timestamp(special_date)
+                        # Only set if it's within the daily range AND column exists
+                        if (special_ts >= df_daily["Date"].min()) and \
+                           (special_ts <= df_daily["Date"].max()) and \
+                           (col in df_daily.columns):
+                            df_daily.loc[df_daily["Date"] == special_ts, col] = 1
+            except Exception as e:
+                print(f"Warning: Could not calculate special days for year {yr}: {e}")
+
+
+        # ---------------------------------------------------------------------
+        # 4. Add daily indicators for last day & last Friday of month
+        # ---------------------------------------------------------------------
+        df_daily["is_last_day_of_month"] = df_daily["Date"].dt.is_month_end
+
+        def is_last_friday(date):
+            # Check if it's a Friday first
+            if date.weekday() != 4: # Friday is 4
+                return 0
+            # Check if next Friday is in the next month
+            next_friday = date + timedelta(days=7)
+            return 1 if next_friday.month != date.month else 0
+
+        df_daily["is_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
+
+        # Rename for clarity prefix
+        df_daily.rename(columns={
+            "is_last_day_of_month": "seas_last_day_of_month",
+            "is_last_friday_of_month": "seas_last_friday_of_month"
+        }, inplace=True)
+
+
+        # ---------------------------------------------------------------------
+        # 5. Weekly aggregation
+        # ---------------------------------------------------------------------
+
+        # --- Aggregate flags using MAX (1 if any day in week is flagged) ---
+        # Select only columns that are indicators/flags (intended for max aggregation)
+        flag_cols = [col for col in df_daily.columns if col.startswith('seas_') or col.startswith('is_')]
+        # Ensure 'week_start' is present for grouping
+        df_to_agg = df_daily[['week_start'] + flag_cols]
+
+        df_weekly_flags = (
+            df_to_agg
+            .groupby('week_start')
+            .max() # if any day=1 in that week, entire week=1
+            .reset_index()
+            .rename(columns={'week_start': "Date"})
+            .set_index("Date")
+        )

+        # --- Aggregate Week Number using MODE ---
+        # Define aggregation function for mode (handling potential multi-modal cases by taking the first)
+        def get_mode(x):
+            modes = pd.Series.mode(x)
+            return modes[0] if not modes.empty else np.nan # Return first mode or NaN
+
+        df_weekly_iso_week_year = (
+            df_daily[['week_start', 'iso_week_daily', 'iso_year_daily']]
+            .groupby('week_start')
+            .agg(
+                # Find the most frequent week number and year within the group
+                Week=('iso_week_daily', get_mode),
+                Year=('iso_year_daily', get_mode)
+            )
+            .reset_index()
+            .rename(columns={'week_start': 'Date'})
+            .set_index('Date')
+        )
+        # Convert Week/Year back to integer type after aggregation
+        df_weekly_iso_week_year['Week'] = df_weekly_iso_week_year['Week'].astype(int)
+        df_weekly_iso_week_year['Year'] = df_weekly_iso_week_year['Year'].astype(int)
+
+
+        # --- Monthly dummies (spread evenly across week) ---
+        df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+        df_monthly_dummies_daily = pd.get_dummies(
+            df_daily[["week_start", "Month"]], # Only need these columns
+            prefix="seas_month",
+            columns=["Month"],
+            dtype=float # Use float for division
+        )
+        # Sum daily dummies within the week
+        df_monthly_dummies_summed = df_monthly_dummies_daily.groupby('week_start').sum()
+        # Divide by number of days in that specific week group (usually 7, except potentially start/end)
+        days_in_week = df_daily.groupby('week_start').size()
+        df_weekly_monthly_dummies = df_monthly_dummies_summed.div(days_in_week, axis=0)
+
+        # Reset index to merge
+        df_weekly_monthly_dummies.reset_index(inplace=True)
+        df_weekly_monthly_dummies.rename(columns={'week_start': 'Date'}, inplace=True)
+        df_weekly_monthly_dummies.set_index('Date', inplace=True)
+
+
+        # ---------------------------------------------------------------------
+        # 6. Combine all weekly components
+        # ---------------------------------------------------------------------
+        # Start with the basic weekly index
+        df_combined = df_weekly_start.copy()
+
+        # Join the other aggregated DataFrames
+        df_combined = df_combined.join(df_weekly_flags, how='left')
+        df_combined = df_combined.join(df_weekly_iso_week_year, how='left')
+        df_combined = df_combined.join(df_weekly_monthly_dummies, how='left')
+
+        # Fill potential NaNs created by joins (e.g., if a flag column didn't exist) with 0
+        # Exclude 'Week' and 'Year' which should always be present
+        cols_to_fill = df_combined.columns.difference(['Week', 'Year'])
+        df_combined[cols_to_fill] = df_combined[cols_to_fill].fillna(0)
+
+        # Ensure correct types for flag columns (int)
+        for col in df_weekly_flags.columns:
+            if col in df_combined.columns:
+                df_combined[col] = df_combined[col].astype(int)
+
+        # Ensure correct types for month columns (float)
+        for col in df_weekly_monthly_dummies.columns:
+            if col in df_combined.columns:
+                df_combined[col] = df_combined[col].astype(float)
+
+
+        # ---------------------------------------------------------------------
+        # 7. Create weekly dummies for Week of Year & yearly dummies from aggregated cols
+        # ---------------------------------------------------------------------
+        df_combined.reset_index(inplace=True) # 'Date', 'Week', 'Year' become columns
+
+        # Create dummies from the aggregated 'Week' column
+        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int, prefix_sep='_')
+
+        # Create dummies from the aggregated 'Year' column
+        df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int, prefix_sep='_')
+
+        # ---------------------------------------------------------------------
+        # 8. Add constant & trend
+        # ---------------------------------------------------------------------
+        df_combined["Constant"] = 1
+        df_combined.reset_index(drop=True, inplace=True) # Ensure index is 0, 1, 2... for trend
+        df_combined["Trend"] = df_combined.index + 1
+
+        # ---------------------------------------------------------------------
+        # 9. Rename Date -> OBS and select final columns
+        # ---------------------------------------------------------------------
+        df_combined.rename(columns={"Date": "OBS"}, inplace=True)
+
+        # Reorder columns - OBS first, then Constant, Trend, then seasonal features
+        cols_order = ['OBS', 'Constant', 'Trend'] + \
+                     sorted([col for col in df_combined.columns if col.startswith('seas_')]) + \
+                     sorted([col for col in df_combined.columns if col.startswith('dum_')]) # If individual week dummies were enabled
+
+        # Filter out columns not in the desired order list (handles case where dum_ cols are off)
+        final_cols = [col for col in cols_order if col in df_combined.columns]
+        df_combined = df_combined[final_cols]
+
+        return df_combined
+
     def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
         """
         Pull weather data for a given week-commencing day and one or more country codes.
@@ -1171,20 +1274,22 @@ class datapull:

     def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
         """
-        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+        Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
         aggregates it to weekly averages, and renames variables based on specified rules.

         Parameters:
-            cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
-            week_start_day (str): The day the week starts on (
-            sector (str): The sector for which the standard CDIDs are fetched
+            cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+            week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+            sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+                                            (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).

         Returns:
-            pd.DataFrame: A DataFrame with weekly frequency, containing
-
+            pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+                          and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+                          Returns an empty DataFrame if no data is fetched or processed.
         """
         # Define CDIDs for sectors and defaults
-
+        sector_cdids_map = {
             "fast_food": ["L7TD", "L78Q", "DOAD"],
             "clothing_footwear": ["D7BW","D7GO","CHBJ"],
             "fuel": ["A9FS","L7FP","CHOL"],
@@ -1192,14 +1297,29 @@ class datapull:
             "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
         }

-        default_cdids =
-        sector_specific_cdids =
-        standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+        default_cdids = sector_cdids_map["default"]
+        sector_specific_cdids = [] # Initialize empty list for sector CDIDs

-        #
+        if sector: # Check if sector is not None or empty
+            if isinstance(sector, str):
+                # If it's a single string, wrap it in a list
+                sector_list = [sector]
+            elif isinstance(sector, list):
+                # If it's already a list, use it directly
+                sector_list = sector
+            else:
+                raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+            # Iterate through the list of sectors and collect their CDIDs
+            for sec in sector_list:
+                sector_specific_cdids.extend(sector_cdids_map.get(sec, [])) # Use extend to add items from the list
+
+        standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Combine default and selected sector CDIDs, ensure uniqueness
+
+        # Combine standard CDIDs and any additional user-provided CDIDs
         if cdid_list is None:
             cdid_list = []
-
+        final_cdid_list = list(set(standard_cdids + cdid_list)) # Ensure uniqueness in the final list

         base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
         base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
@@ -1207,41 +1327,57 @@ class datapull:

         # Map week start day to pandas weekday convention
         days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
-        if week_start_day not in days_map:
+        if week_start_day.lower() not in days_map:
             raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
-        week_start = days_map[week_start_day]
+        week_start = days_map[week_start_day.lower()] # Use lower() for case-insensitivity

-        for cdid in
+        for cdid in final_cdid_list: # Use the final combined list
             try:
                 # Search for the series
                 search_url = f"{base_search_url}{cdid}"
-                search_response = requests.get(search_url)
+                search_response = requests.get(search_url, timeout=30) # Add timeout
                 search_response.raise_for_status()
                 search_data = search_response.json()

                 items = search_data.get("items", [])
                 if not items:
-                    print(f"No data found for CDID: {cdid}")
+                    print(f"Warning: No data found for CDID: {cdid}")
                     continue

                 # Extract series name and latest release URI
-
-
-
-
-
-
-
-
-
+                # Find the item with the most recent release_date
+                latest_item = None
+                latest_date = None
+                for item in items:
+                    if "release_date" in item:
+                        try:
+                            # Ensure timezone awareness for comparison
+                            current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                            if latest_date is None or current_date > latest_date:
+                                latest_date = current_date
+                                latest_item = item
+                        except ValueError:
+                            print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+                            continue # Skip this item if date is invalid
+
+                if latest_item is None:
+                    print(f"Warning: No valid release date found for CDID: {cdid}")
+                    continue
+
+                series_name = latest_item.get("title", f"Series_{cdid}") # Use title from the latest item
+                latest_uri = latest_item.get("uri")
+                if not latest_uri:
+                    print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+                    continue

                 # Fetch the dataset
                 data_url = f"{base_data_url}{latest_uri}"
-                data_response = requests.get(data_url)
+                data_response = requests.get(data_url, timeout=30) # Add timeout
                 data_response.raise_for_status()
                 data_json = data_response.json()

                 # Detect the frequency and process accordingly
+                frequency_key = None
                 if "months" in data_json and data_json["months"]:
                     frequency_key = "months"
                 elif "quarters" in data_json and data_json["quarters"]:
@@ -1249,72 +1385,142 @@ class datapull:
                 elif "years" in data_json and data_json["years"]:
                     frequency_key = "years"
                 else:
-                    print(f"Unsupported frequency or no data for CDID: {cdid}")
+                    print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
                     continue

                 # Prepare the DataFrame
+                if not data_json[frequency_key]: # Check if the list of values is empty
+                    print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+                    continue
+
                 df = pd.DataFrame(data_json[frequency_key])

-                #
-                if
-
-
-                    def parse_quarter(quarter_str):
-                        year, qtr = quarter_str.split(" Q")
-                        month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
-                        return datetime(int(year), month, 1)
-                    df["date"] = df["date"].apply(parse_quarter)
-                elif frequency_key == "years":
-                    df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                # Check if essential columns exist
+                if "date" not in df.columns or "value" not in df.columns:
+                    print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+                    continue

+                # Parse the 'date' field based on frequency
+                try:
+                    if frequency_key == "months":
+                        # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+                        df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                    elif frequency_key == "quarters":
+                        def parse_quarter(quarter_str):
+                            try:
+                                year, qtr = quarter_str.split(" Q")
+                                month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                                return datetime(int(year), month, 1)
+                            except (ValueError, KeyError):
+                                return pd.NaT # Return Not a Time for parsing errors
+                        df["date"] = df["date"].apply(parse_quarter)
+                    elif frequency_key == "years":
+                        df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+                except Exception as e:
+                    print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+                    continue # Skip this series if date parsing fails
+
+                # Coerce value to numeric, handle potential errors
                 df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+                # Drop rows where date or value parsing failed
+                df.dropna(subset=["date", "value"], inplace=True)
+
+                if df.empty:
+                    print(f"Warning: No valid data points after processing for CDID: {cdid}")
+                    continue
+
                 df.rename(columns={"value": series_name}, inplace=True)

                 # Combine data
-
+                df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True) # Explicitly select columns
                 if combined_df.empty:
-                    combined_df =
+                    combined_df = df_subset
                 else:
-
+                    # Use outer merge to keep all dates, sort afterwards
+                    combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")

             except requests.exceptions.RequestException as e:
                 print(f"Error fetching data for CDID {cdid}: {e}")
-            except (KeyError, ValueError) as e:
+            except (KeyError, ValueError, TypeError) as e: # Added TypeError
                 print(f"Error processing data for CDID {cdid}: {e}")
+            except Exception as e: # Catch unexpected errors
+                print(f"An unexpected error occurred for CDID {cdid}: {e}")
+

         if not combined_df.empty:
+            # Sort by date after merging to ensure correct forward fill
+            combined_df.sort_values(by="date", inplace=True)
+            combined_df.reset_index(drop=True, inplace=True)
+
+            # Create a complete daily date range
             min_date = combined_df["date"].min()
-            max_date
+            # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+            max_date = pd.Timestamp(datetime.today().date()) # Use today's date, timezone-naive
+
+            if pd.isna(min_date):
+                print("Error: Minimum date is NaT, cannot create date range.")
+                return pd.DataFrame()
+
+            # Make sure min_date is not NaT before creating the range
             date_range = pd.date_range(start=min_date, end=max_date, freq='D')
             daily_df = pd.DataFrame(date_range, columns=['date'])
+
+            # Merge with original data and forward fill
             daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
             daily_df = daily_df.ffill()

+            # Drop rows before the first valid data point after ffill
+            first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+            if pd.notna(first_valid_index):
+                daily_df = daily_df.loc[first_valid_index:]
+            else:
+                print("Warning: No valid data points found after forward filling.")
+                return pd.DataFrame() # Return empty if ffill results in no data
+
+
             # Aggregate to weekly frequency
-
+            # Ensure 'date' column is datetime type before dt accessor
+            daily_df['date'] = pd.to_datetime(daily_df['date'])
+            daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D') # Corrected logic for week start
+            # Group by week_commencing and calculate mean for numeric columns only
             weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()

+
             def clean_column_name(name):
+                # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
                 name = re.sub(r"\(.*?\)", "", name)
+                # Take only the part before the first colon if present
                 name = re.split(r":", name)[0]
-
+                # Remove digits
+                #name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
+                # Remove specific words like 'annual', 'rate' case-insensitively
                 name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+                # Remove non-alphanumeric characters (except underscore and space)
                 name = re.sub(r"[^\w\s]", "", name)
+                # Replace spaces with underscores
+                name = name.strip() # Remove leading/trailing whitespace
                 name = name.replace(" ", "_")
+                # Replace multiple underscores with a single one
                 name = re.sub(r"_+", "_", name)
+                # Remove trailing underscores
                 name = name.rstrip("_")
+                # Add prefix and suffix
                 return f"macro_{name.lower()}_uk"

+            # Apply cleaning function to relevant columns
             weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
-            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+            weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True) # Rename week commencing col

-
+            # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+            # Consider if 0 is the appropriate fill value for your use case
+            # weekly_df = weekly_df.fillna(0)

             return weekly_df
         else:
-            print("No data
+            print("No data successfully fetched or processed.")
             return pd.DataFrame()
-
+
     def pull_yfinance(self, tickers=None, week_start_day="mon"):
         """
         Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,