imsciences 0.9.6.2-py3-none-any.whl → 0.9.6.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences might be problematic.
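
Editor's note: the headline change in this diff is a rewrite of datapull.pull_seasonality in imsciences/pull.py. As a minimal sketch of how the revised function might be called; the import path and argument values below are assumptions inferred from the changed file path and the new docstring, not a documented example:

    # Hypothetical usage sketch; 'from imsciences.pull import datapull' is
    # inferred from the file path imsciences/pull.py, not confirmed here.
    from imsciences.pull import datapull

    dp = datapull()
    seas = dp.pull_seasonality(
        week_commencing="mon",    # one of 'mon' ... 'sun', per the new docstring
        start_date="2020-01-01",  # 'YYYY-MM-DD', per the new docstring
        countries=["GB", "US"],   # country codes passed to the holidays package
    )
    print(seas[["OBS", "Constant", "Trend"]].head())  # 'OBS' holds the week start date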

imsciences/pull.py CHANGED
@@ -380,265 +380,368 @@ class datapull:
  ############################################################### Seasonality ##########################################################################

  def pull_seasonality(self, week_commencing, start_date, countries):
-     # ---------------------------------------------------------------------
-     # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
-     # ---------------------------------------------------------------------
-     day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
-
-     # ---------------------------------------------------------------------
-     # 1. Create daily date range from start_date to today
-     # ---------------------------------------------------------------------
-     date_range = pd.date_range(
-         start=pd.to_datetime(start_date),
-         end=datetime.today(),
-         freq="D"
-     )
-     df_daily = pd.DataFrame(date_range, columns=["Date"])
+     """
+     Generates a DataFrame with weekly seasonality features.

-     # ---------------------------------------------------------------------
-     # 1.1 Identify "week_start" for each daily row, based on week_commencing
-     # ---------------------------------------------------------------------
-     df_daily['week_start'] = df_daily["Date"].apply(
-         lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
-     )
+     Args:
+         week_commencing (str): The starting day of the week ('mon', 'tue', ..., 'sun').
+         start_date (str): The start date in 'YYYY-MM-DD' format.
+         countries (list): A list of country codes (e.g., ['GB', 'US']) for holidays.

-     # ---------------------------------------------------------------------
-     # 2. Build a weekly index (df_weekly_start) with dummy columns
-     # ---------------------------------------------------------------------
-     df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
-     df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
-
-     # Set index to weekly "start of week"
-     df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
-     df_weekly_start.set_index("Date", inplace=True)
-
-     # Create individual weekly dummies
-     dummy_columns = {}
-     for i in range(len(df_weekly_start)):
-         col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
-         dummy_columns[col_name] = [0] * len(df_weekly_start)
-         dummy_columns[col_name][i] = 1
-
-     df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
-     df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
-
-     # ---------------------------------------------------------------------
-     # 3. Public holidays (daily) from 'holidays' package + each holiday name
-     # ---------------------------------------------------------------------
-     for country in countries:
-         country_holidays = holidays.CountryHoliday(
-             country,
-             years=range(int(start_date[:4]), datetime.today().year + 1)
+     Returns:
+         pd.DataFrame: A DataFrame indexed by week start date, containing various
+             seasonal dummy variables, holidays, trend, and constant.
+             The date column is named 'OBS'.
+     """
+     # ---------------------------------------------------------------------
+     # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
+     # ---------------------------------------------------------------------
+     day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
+     if week_commencing not in day_dict:
+         raise ValueError(f"Invalid week_commencing value: {week_commencing}. Use one of {list(day_dict.keys())}")
+
+     # ---------------------------------------------------------------------
+     # 1. Create daily date range from start_date to today
+     # ---------------------------------------------------------------------
+     try:
+         start_dt = pd.to_datetime(start_date)
+     except ValueError:
+         raise ValueError(f"Invalid start_date format: {start_date}. Use 'YYYY-MM-DD'")
+
+     end_dt = datetime.today()
+     # Ensure end date is not before start date
+     if end_dt < start_dt:
+         end_dt = start_dt + timedelta(days=1) # Or handle as error if preferred
+
+     date_range = pd.date_range(
+         start=start_dt,
+         end=end_dt,
+         freq="D"
      )
-         # Daily indicator: 1 if that date is a holiday
-         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
-             lambda x: 1 if x in country_holidays else 0
+     df_daily = pd.DataFrame(date_range, columns=["Date"])
+
+     # ---------------------------------------------------------------------
+     # 1.1 Identify "week_start" for each daily row, based on week_commencing
+     # ---------------------------------------------------------------------
+     start_day_int = day_dict[week_commencing]
+     df_daily['week_start'] = df_daily["Date"].apply(
+         lambda x: x - pd.Timedelta(days=(x.weekday() - start_day_int) % 7)
      )
-         # Create columns for specific holiday names
-         for date_hol, name in country_holidays.items():
-             col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
-             if col_name not in df_daily.columns:
-                 df_daily[col_name] = 0
-             df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
-
-     # ---------------------------------------------------------------------
-     # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
-     # We'll add daily columns for each.
-     # ---------------------------------------------------------------------
-     # Initialize columns
-     extra_cols = [
-         "seas_valentines_day",
-         "seas_halloween",
-         "seas_fathers_day_us_uk",
-         "seas_mothers_day_us",
-         "seas_mothers_day_uk",
-         "seas_good_friday",
-         "seas_easter_monday",
-         "seas_black_friday",
-         "seas_cyber_monday",
-     ]
-     for c in extra_cols:
-         df_daily[c] = 0 # default zero
-
-     # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
-     # weekday: Monday=0, Tuesday=1, ... Sunday=6
-     def nth_weekday_of_month(year, month, weekday, nth):
-         """
-         Returns date of the nth <weekday> in <month> of <year>.
-         E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
-         """
-         # 1st day of the month
-         d = datetime(year, month, 1)
-         # What is the weekday of day #1?
-         w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
-         # If we want, e.g. Sunday=6, we see how many days to add
-         delta = (weekday - w) % 7
-         # This is the first <weekday> in that month
-         first_weekday = d + timedelta(days=delta)
-         # Now add 7*(nth-1) days
-         return first_weekday + timedelta(days=7 * (nth-1))
-
-     def get_good_friday(year):
-         """Good Friday is 2 days before Easter Sunday."""
-         return easter(year) - timedelta(days=2)
-
-     def get_easter_monday(year):
-         """Easter Monday is 1 day after Easter Sunday."""
-         return easter(year) + timedelta(days=1)
-
-     def get_black_friday(year):
-         """
-         Black Friday = day after US Thanksgiving,
-         and US Thanksgiving is the 4th Thursday in November.
-         """
-         # 4th Thursday in November
-         fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
-         return fourth_thursday + timedelta(days=1)
-
-     def get_cyber_monday(year):
-         """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
-         # 4th Thursday in November
-         fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
-         return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
-
-     # Loop over each year in range
-     start_yr = int(start_date[:4])
-     end_yr = datetime.today().year
-
-     for yr in range(start_yr, end_yr + 1):
-         # Valentines = Feb 14
-         valentines_day = datetime(yr, 2, 14)
-         # Halloween = Oct 31
-         halloween_day = datetime(yr, 10, 31)
-         # Father's Day (US & UK) = 3rd Sunday in June
-         fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
-         # Mother's Day US = 2nd Sunday in May
-         mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
-         mothering_sunday = easter(yr) - timedelta(days=21)
-         # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
-         while mothering_sunday.weekday() != 6: # Sunday=6
-             mothering_sunday -= timedelta(days=1)
-
-         # Good Friday, Easter Monday
-         gf = get_good_friday(yr)
-         em = get_easter_monday(yr)
-
-         # Black Friday, Cyber Monday
-         bf = get_black_friday(yr)
-         cm = get_cyber_monday(yr)
-
-         # Mark them in df_daily if in range
-         for special_date, col in [
-             (valentines_day, "seas_valentines_day"),
-             (halloween_day, "seas_halloween"),
-             (fathers_day, "seas_fathers_day_us_uk"),
-             (mothers_day_us, "seas_mothers_day_us"),
-             (mothering_sunday, "seas_mothers_day_uk"),
-             (gf, "seas_good_friday"),
-             (em, "seas_easter_monday"),
-             (bf, "seas_black_friday"),
-             (cm, "seas_cyber_monday"),
-         ]:
-             # Convert to pd.Timestamp:
-             special_ts = pd.Timestamp(special_date)
-
-             # Only set if it's within your daily range
-             if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
-                 df_daily.loc[df_daily["Date"] == special_ts, col] = 1
-
-     # ---------------------------------------------------------------------
-     # 4. Add daily indicators for last day & last Friday of month
-     # Then aggregate them to weekly level using .max()
-     # ---------------------------------------------------------------------
-     # Last day of month (daily)
-     df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
-         lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
-     )
-
-     # Last Friday of month (daily)
-     def is_last_friday(date):
-         # last day of the month
-         last_day_of_month = date.to_period("M").to_timestamp("M")
-         last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
-         # Determine how many days we go back from the last day to get Friday (weekday=4)
-         if last_day_weekday >= 4:
-             days_to_subtract = last_day_weekday - 4
-         else:
-             days_to_subtract = last_day_weekday + 3
-         last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
-         return 1 if date == last_friday else 0
-
-     df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
-
-     # ---------------------------------------------------------------------
-     # 5. Weekly aggregation for holiday columns & monthly dummies
-     # ---------------------------------------------------------------------
-     # For monthly dummies, create a daily col "Month", then get_dummies
-     df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
-     df_monthly_dummies = pd.get_dummies(
-         df_daily,
-         prefix="seas",
-         columns=["Month"],
-         dtype=int
-     )
-     # Recalculate 'week_start' (already in df_daily, but just to be sure)
-     df_monthly_dummies['week_start'] = df_daily['week_start']
-
-     # Group monthly dummies by .sum() or .mean()—we often spread them across the week
-     df_monthly_dummies = (
-         df_monthly_dummies
-         .groupby('week_start')
-         .sum(numeric_only=True) # sum the daily flags
-         .reset_index()
-         .rename(columns={'week_start': "Date"})
-         .set_index("Date")
-     )
-     # Spread monthly dummies by 7 to distribute across that week
-     monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
-     df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
-
-     # Group holiday & special-day columns by .max() => binary at weekly level
-     df_holidays = (
-         df_daily
-         .groupby('week_start')
-         .max(numeric_only=True) # if any day=1 in that week, entire week=1
-         .reset_index()
-         .rename(columns={'week_start': "Date"})
-         .set_index("Date")
-     )

-     # ---------------------------------------------------------------------
-     # 6. Combine weekly start, monthly dummies, holiday flags
-     # ---------------------------------------------------------------------
-     df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
-     df_combined = pd.concat([df_combined, df_holidays], axis=1)
-     df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
-
-     # ---------------------------------------------------------------------
-     # 7. Create weekly dummies for Week of Year & yearly dummies
-     # ---------------------------------------------------------------------
-     df_combined.reset_index(inplace=True)
-     df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
-
-     df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
-     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
-
-     df_combined["Year"] = df_combined["Date"].dt.year
-     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
-
-     # ---------------------------------------------------------------------
-     # 8. Add constant & trend
-     # ---------------------------------------------------------------------
-     df_combined["Constant"] = 1
-     df_combined["Trend"] = df_combined.index + 1
-
-     # ---------------------------------------------------------------------
-     # 9. Rename Date -> OBS and return
-     # ---------------------------------------------------------------------
-     df_combined.rename(columns={"Date": "OBS"}, inplace=True)
-
-     return df_combined
+     # ---------------------------------------------------------------------
+     # 1.2 Calculate ISO week number for each DAY (for later aggregation)
+     # Also calculate Year for each DAY to handle year transitions correctly
+     # ---------------------------------------------------------------------
+     df_daily['iso_week_daily'] = df_daily['Date'].dt.isocalendar().week.astype(int)
+     df_daily['iso_year_daily'] = df_daily['Date'].dt.isocalendar().year.astype(int)
+
+
+     # ---------------------------------------------------------------------
+     # 2. Build a weekly index (df_weekly_start) based on unique week_start dates
+     # ---------------------------------------------------------------------
+     df_weekly_start = df_daily[['week_start']].drop_duplicates().sort_values('week_start').reset_index(drop=True)
+     df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
+     df_weekly_start.set_index("Date", inplace=True)
+
+     # Create individual weekly dummies (optional, uncomment if needed)
+     # dummy_columns = {}
+     # for i, date_index in enumerate(df_weekly_start.index):
+     #     col_name = f"dum_{date_index.strftime('%Y_%m_%d')}"
+     #     dummy_columns[col_name] = [0] * len(df_weekly_start)
+     #     dummy_columns[col_name][i] = 1
+     # df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
+     # df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
+
+
+     # ---------------------------------------------------------------------
+     # 3. Public holidays (daily) from 'holidays' package + each holiday name
+     # ---------------------------------------------------------------------
+     start_year = start_dt.year
+     end_year = end_dt.year
+     years_range = range(start_year, end_year + 1)
+
+     for country in countries:
+         try:
+             country_holidays = holidays.CountryHoliday(
+                 country,
+                 years=years_range,
+                 observed=False # Typically you want the actual date, not observed substitute
+             )
+             # Handle cases like UK where specific subdivisions might be needed for some holidays
+             # Example: if country == 'GB': country_holidays.observed = True # If observed are needed
+         except KeyError:
+             print(f"Warning: Country code '{country}' not found in holidays library. Skipping.")
+             continue # Skip to next country
+
+         # Daily indicator: 1 if that date is a holiday
+         df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
+             lambda x: 1 if x in country_holidays else 0
+         )
+         # Create columns for specific holiday names
+         for date_hol, name in sorted(country_holidays.items()): # Sort for consistent column order
+             # Clean name: lower, replace space with underscore, remove non-alphanumeric (except underscore)
+             clean_name = ''.join(c for c in name if c.isalnum() or c == ' ').strip().replace(' ', '_').lower()
+             clean_name = clean_name.replace('_(observed)', '').replace("'", "") # specific cleaning
+             col_name = f"seas_{clean_name}_{country.lower()}"
+
+             # Only create column if the holiday occurs within the df_daily date range
+             if pd.Timestamp(date_hol).year in years_range:
+                 if col_name not in df_daily.columns:
+                     df_daily[col_name] = 0
+                 # Ensure date_hol is within the actual daily range before assigning
+                 if (pd.Timestamp(date_hol) >= df_daily["Date"].min()) and (pd.Timestamp(date_hol) <= df_daily["Date"].max()):
+                     df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
+
+     # ---------------------------------------------------------------------
+     # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
+     # ---------------------------------------------------------------------
+     extra_cols = [
+         "seas_valentines_day",
+         "seas_halloween",
+         "seas_fathers_day_us_uk", # Note: UK/US is 3rd Sun Jun, others vary
+         "seas_mothers_day_us", # Note: US is 2nd Sun May
+         "seas_mothers_day_uk", # Note: UK Mothering Sunday varies with Easter
+         "seas_good_friday",
+         "seas_easter_monday",
+         "seas_black_friday", # US-centric, but globally adopted
+         "seas_cyber_monday", # US-centric, but globally adopted
+     ]
+     for c in extra_cols:
+         if c not in df_daily.columns: # Avoid overwriting if already created by holidays pkg
+             df_daily[c] = 0
+
+     # Helper: nth_weekday_of_month(year, month, weekday, nth)
+     def nth_weekday_of_month(year, month, weekday, nth):
+         d = datetime(year, month, 1)
+         w = d.weekday()
+         delta = (weekday - w + 7) % 7 # Ensure positive delta
+         first_weekday = d + timedelta(days=delta)
+         target_date = first_weekday + timedelta(days=7 * (nth - 1))
+         # Check if the calculated date is still in the same month
+         if target_date.month == month:
+             return target_date
+         else:
+             # This can happen if nth is too large (e.g., 5th Friday)
+             # Return the last occurrence of that weekday in the month instead
+             return target_date - timedelta(days=7)
+
+
+     def get_good_friday(year):
+         return easter(year) - timedelta(days=2)
+
+     def get_easter_monday(year):
+         return easter(year) + timedelta(days=1)
+
+     def get_black_friday(year):
+         # US Thanksgiving is 4th Thursday in November (weekday=3)
+         thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
+         return thanksgiving + timedelta(days=1)
+
+     def get_cyber_monday(year):
+         # Monday after US Thanksgiving
+         thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
+         return thanksgiving + timedelta(days=4)
+
+     def get_mothering_sunday_uk(year):
+         # Fourth Sunday in Lent (3 weeks before Easter Sunday)
+         # Lent starts on Ash Wednesday, 46 days before Easter.
+         # Easter Sunday is day 0. Sunday before is -7, etc.
+         # 4th Sunday in Lent is 3 weeks before Easter.
+         return easter(year) - timedelta(days=21)
+
+
+     # Loop over each year in range
+     for yr in range(start_year, end_year + 1):
+         try: # Wrap calculations in try-except for robustness
+             # Valentines = Feb 14
+             valentines_day = datetime(yr, 2, 14)
+             # Halloween = Oct 31
+             halloween_day = datetime(yr, 10, 31)
+             # Father's Day (US & UK) = 3rd Sunday (6) in June
+             fathers_day = nth_weekday_of_month(yr, 6, 6, 3)
+             # Mother's Day US = 2nd Sunday (6) in May
+             mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
+             # Mother's Day UK (Mothering Sunday)
+             mothering_sunday = get_mothering_sunday_uk(yr)
+
+             # Good Friday, Easter Monday
+             gf = get_good_friday(yr)
+             em = get_easter_monday(yr)
+
+             # Black Friday, Cyber Monday
+             bf = get_black_friday(yr)
+             cm = get_cyber_monday(yr)
+
+             # Mark them in df_daily if in range
+             special_days_map = [
+                 (valentines_day, "seas_valentines_day"),
+                 (halloween_day, "seas_halloween"),
+                 (fathers_day, "seas_fathers_day_us_uk"),
+                 (mothers_day_us, "seas_mothers_day_us"),
+                 (mothering_sunday,"seas_mothers_day_uk"),
+                 (gf, "seas_good_friday"),
+                 (em, "seas_easter_monday"),
+                 (bf, "seas_black_friday"),
+                 (cm, "seas_cyber_monday"),
+             ]
+
+             for special_date, col in special_days_map:
+                 if special_date is not None: # nth_weekday_of_month can return None edge cases
+                     special_ts = pd.Timestamp(special_date)
+                     # Only set if it's within the daily range AND column exists
+                     if (special_ts >= df_daily["Date"].min()) and \
+                        (special_ts <= df_daily["Date"].max()) and \
+                        (col in df_daily.columns):
+                         df_daily.loc[df_daily["Date"] == special_ts, col] = 1
+         except Exception as e:
+             print(f"Warning: Could not calculate special days for year {yr}: {e}")
+
+
+     # ---------------------------------------------------------------------
+     # 4. Add daily indicators for last day & last Friday of month
+     # ---------------------------------------------------------------------
+     df_daily["is_last_day_of_month"] = df_daily["Date"].dt.is_month_end
+
+     def is_last_friday(date):
+         # Check if it's a Friday first
+         if date.weekday() != 4: # Friday is 4
+             return 0
+         # Check if next Friday is in the next month
+         next_friday = date + timedelta(days=7)
+         return 1 if next_friday.month != date.month else 0
+
+     df_daily["is_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
+
+     # Rename for clarity prefix
+     df_daily.rename(columns={
+         "is_last_day_of_month": "seas_last_day_of_month",
+         "is_last_friday_of_month": "seas_last_friday_of_month"
+     }, inplace=True)
+
+
+     # ---------------------------------------------------------------------
+     # 5. Weekly aggregation
+     # ---------------------------------------------------------------------
+
+     # --- Aggregate flags using MAX (1 if any day in week is flagged) ---
+     # Select only columns that are indicators/flags (intended for max aggregation)
+     flag_cols = [col for col in df_daily.columns if col.startswith('seas_') or col.startswith('is_')]
+     # Ensure 'week_start' is present for grouping
+     df_to_agg = df_daily[['week_start'] + flag_cols]
+
+     df_weekly_flags = (
+         df_to_agg
+         .groupby('week_start')
+         .max() # if any day=1 in that week, entire week=1
+         .reset_index()
+         .rename(columns={'week_start': "Date"})
+         .set_index("Date")
+     )

+     # --- Aggregate Week Number using MODE ---
+     # Define aggregation function for mode (handling potential multi-modal cases by taking the first)
+     def get_mode(x):
+         modes = pd.Series.mode(x)
+         return modes[0] if not modes.empty else np.nan # Return first mode or NaN
+
+     df_weekly_iso_week_year = (
+         df_daily[['week_start', 'iso_week_daily', 'iso_year_daily']]
+         .groupby('week_start')
+         .agg(
+             # Find the most frequent week number and year within the group
+             Week=('iso_week_daily', get_mode),
+             Year=('iso_year_daily', get_mode)
+         )
+         .reset_index()
+         .rename(columns={'week_start': 'Date'})
+         .set_index('Date')
+     )
+     # Convert Week/Year back to integer type after aggregation
+     df_weekly_iso_week_year['Week'] = df_weekly_iso_week_year['Week'].astype(int)
+     df_weekly_iso_week_year['Year'] = df_weekly_iso_week_year['Year'].astype(int)
+
+
+     # --- Monthly dummies (spread evenly across week) ---
+     df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+     df_monthly_dummies_daily = pd.get_dummies(
+         df_daily[["week_start", "Month"]], # Only need these columns
+         prefix="seas_month",
+         columns=["Month"],
+         dtype=float # Use float for division
+     )
+     # Sum daily dummies within the week
+     df_monthly_dummies_summed = df_monthly_dummies_daily.groupby('week_start').sum()
+     # Divide by number of days in that specific week group (usually 7, except potentially start/end)
+     days_in_week = df_daily.groupby('week_start').size()
+     df_weekly_monthly_dummies = df_monthly_dummies_summed.div(days_in_week, axis=0)
+
+     # Reset index to merge
+     df_weekly_monthly_dummies.reset_index(inplace=True)
+     df_weekly_monthly_dummies.rename(columns={'week_start': 'Date'}, inplace=True)
+     df_weekly_monthly_dummies.set_index('Date', inplace=True)
+
+
+     # ---------------------------------------------------------------------
+     # 6. Combine all weekly components
+     # ---------------------------------------------------------------------
+     # Start with the basic weekly index
+     df_combined = df_weekly_start.copy()
+
+     # Join the other aggregated DataFrames
+     df_combined = df_combined.join(df_weekly_flags, how='left')
+     df_combined = df_combined.join(df_weekly_iso_week_year, how='left')
+     df_combined = df_combined.join(df_weekly_monthly_dummies, how='left')
+
+     # Fill potential NaNs created by joins (e.g., if a flag column didn't exist) with 0
+     # Exclude 'Week' and 'Year' which should always be present
+     cols_to_fill = df_combined.columns.difference(['Week', 'Year'])
+     df_combined[cols_to_fill] = df_combined[cols_to_fill].fillna(0)
+
+     # Ensure correct types for flag columns (int)
+     for col in df_weekly_flags.columns:
+         if col in df_combined.columns:
+             df_combined[col] = df_combined[col].astype(int)
+
+     # Ensure correct types for month columns (float)
+     for col in df_weekly_monthly_dummies.columns:
+         if col in df_combined.columns:
+             df_combined[col] = df_combined[col].astype(float)
+
+
+     # ---------------------------------------------------------------------
+     # 7. Create weekly dummies for Week of Year & yearly dummies from aggregated cols
+     # ---------------------------------------------------------------------
+     df_combined.reset_index(inplace=True) # 'Date', 'Week', 'Year' become columns
+
+     # Create dummies from the aggregated 'Week' column
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int, prefix_sep='_')
+
+     # Create dummies from the aggregated 'Year' column
+     df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int, prefix_sep='_')
+
+     # ---------------------------------------------------------------------
+     # 8. Add constant & trend
+     # ---------------------------------------------------------------------
+     df_combined["Constant"] = 1
+     df_combined.reset_index(drop=True, inplace=True) # Ensure index is 0, 1, 2... for trend
+     df_combined["Trend"] = df_combined.index + 1
+
+     # ---------------------------------------------------------------------
+     # 9. Rename Date -> OBS and select final columns
+     # ---------------------------------------------------------------------
+     df_combined.rename(columns={"Date": "OBS"}, inplace=True)
+
+     # Reorder columns - OBS first, then Constant, Trend, then seasonal features
+     cols_order = ['OBS', 'Constant', 'Trend'] + \
+         sorted([col for col in df_combined.columns if col.startswith('seas_')]) + \
+         sorted([col for col in df_combined.columns if col.startswith('dum_')]) # If individual week dummies were enabled
+
+     # Filter out columns not in the desired order list (handles case where dum_ cols are off)
+     final_cols = [col for col in cols_order if col in df_combined.columns]
+     df_combined = df_combined[final_cols]
+
+     return df_combined
+

  def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
      """
      Pull weather data for a given week-commencing day and one or more country codes.
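
Editor's note: the remaining hunks revise pull_macro_ons_uk, whose docstring now accepts 'sector' as either a string or a list and renames the output date column to 'OBS'. A minimal sketch of a call under the revised signature (the import path, CDID and sector values are illustrative assumptions drawn from the docstring and the sector map shown below):

    # Hypothetical usage sketch mirroring the revised docstring; not a
    # documented example from the package itself.
    from imsciences.pull import datapull

    dp = datapull()
    macro = dp.pull_macro_ons_uk(
        cdid_list=["JP9Z"],            # extra ONS CDIDs on top of the defaults
        week_start_day="mon",
        sector=["fast_food", "fuel"],  # a list of sectors is now accepted
    )
    print(macro.head())                # weekly rows keyed by the 'OBS' column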
@@ -1171,20 +1274,22 @@ class datapull:

  def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
      """
-     Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
+     Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
      aggregates it to weekly averages, and renames variables based on specified rules.

      Parameters:
-         cdid_list (list): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
-         week_start_day (str): The day the week starts on (e.g., 'Monday', 'Sunday').
-         sector (str): The sector for which the standard CDIDs are fetched (e.g., 'fast_food', 'retail').
+         cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
+         week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
+         sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
+             (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).

      Returns:
-         pd.DataFrame: A DataFrame with weekly frequency, containing a 'week_commencing' column
-         and all series as renamed columns.
+         pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
+             and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+             Returns an empty DataFrame if no data is fetched or processed.
      """
      # Define CDIDs for sectors and defaults
-     sector_cdids = {
+     sector_cdids_map = {
          "fast_food": ["L7TD", "L78Q", "DOAD"],
          "clothing_footwear": ["D7BW","D7GO","CHBJ"],
          "fuel": ["A9FS","L7FP","CHOL"],
@@ -1192,14 +1297,29 @@ class datapull:
          "default": ["D7G7", "MGSX", "UKPOP", "IHYQ", "YBEZ", "MS77"],
      }

-     default_cdids = sector_cdids["default"]
-     sector_specific_cdids = sector_cdids.get(sector, [])
-     standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Avoid duplicates
+     default_cdids = sector_cdids_map["default"]
+     sector_specific_cdids = [] # Initialize empty list for sector CDIDs

-     # Combine standard CDIDs and additional CDIDs
+     if sector: # Check if sector is not None or empty
+         if isinstance(sector, str):
+             # If it's a single string, wrap it in a list
+             sector_list = [sector]
+         elif isinstance(sector, list):
+             # If it's already a list, use it directly
+             sector_list = sector
+         else:
+             raise TypeError("`sector` parameter must be a string or a list of strings.")
+
+         # Iterate through the list of sectors and collect their CDIDs
+         for sec in sector_list:
+             sector_specific_cdids.extend(sector_cdids_map.get(sec, [])) # Use extend to add items from the list
+
+     standard_cdids = list(set(default_cdids + sector_specific_cdids)) # Combine default and selected sector CDIDs, ensure uniqueness
+
+     # Combine standard CDIDs and any additional user-provided CDIDs
      if cdid_list is None:
          cdid_list = []
-     cdid_list = list(set(standard_cdids + cdid_list)) # Avoid duplicates
+     final_cdid_list = list(set(standard_cdids + cdid_list)) # Ensure uniqueness in the final list

      base_search_url = "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
      base_data_url = "https://api.beta.ons.gov.uk/v1/data?uri="
@@ -1207,41 +1327,57 @@ class datapull:

      # Map week start day to pandas weekday convention
      days_map = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
-     if week_start_day not in days_map:
+     if week_start_day.lower() not in days_map:
          raise ValueError("Invalid week start day. Choose from: " + ", ".join(days_map.keys()))
-     week_start = days_map[week_start_day]
+     week_start = days_map[week_start_day.lower()] # Use lower() for case-insensitivity

-     for cdid in cdid_list:
+     for cdid in final_cdid_list: # Use the final combined list
          try:
              # Search for the series
              search_url = f"{base_search_url}{cdid}"
-             search_response = requests.get(search_url)
+             search_response = requests.get(search_url, timeout=30) # Add timeout
              search_response.raise_for_status()
              search_data = search_response.json()

              items = search_data.get("items", [])
              if not items:
-                 print(f"No data found for CDID: {cdid}")
+                 print(f"Warning: No data found for CDID: {cdid}")
                  continue

              # Extract series name and latest release URI
-             series_name = items[0].get("title", f"Series_{cdid}")
-             latest_date = max(
-                 datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
-                 for item in items if "release_date" in item
-             )
-             latest_uri = next(
-                 item["uri"] for item in items
-                 if "release_date" in item and datetime.fromisoformat(item["release_date"].replace("Z", "+00:00")) == latest_date
-             )
+             # Find the item with the most recent release_date
+             latest_item = None
+             latest_date = None
+             for item in items:
+                 if "release_date" in item:
+                     try:
+                         # Ensure timezone awareness for comparison
+                         current_date = datetime.fromisoformat(item["release_date"].replace("Z", "+00:00"))
+                         if latest_date is None or current_date > latest_date:
+                             latest_date = current_date
+                             latest_item = item
+                     except ValueError:
+                         print(f"Warning: Could not parse release_date '{item['release_date']}' for CDID {cdid}")
+                         continue # Skip this item if date is invalid
+
+             if latest_item is None:
+                 print(f"Warning: No valid release date found for CDID: {cdid}")
+                 continue
+
+             series_name = latest_item.get("title", f"Series_{cdid}") # Use title from the latest item
+             latest_uri = latest_item.get("uri")
+             if not latest_uri:
+                 print(f"Warning: No URI found for the latest release of CDID: {cdid}")
+                 continue

              # Fetch the dataset
              data_url = f"{base_data_url}{latest_uri}"
-             data_response = requests.get(data_url)
+             data_response = requests.get(data_url, timeout=30) # Add timeout
              data_response.raise_for_status()
              data_json = data_response.json()

              # Detect the frequency and process accordingly
+             frequency_key = None
              if "months" in data_json and data_json["months"]:
                  frequency_key = "months"
              elif "quarters" in data_json and data_json["quarters"]:
@@ -1249,72 +1385,142 @@ class datapull:
              elif "years" in data_json and data_json["years"]:
                  frequency_key = "years"
              else:
-                 print(f"Unsupported frequency or no data for CDID: {cdid}")
+                 print(f"Warning: Unsupported frequency or no data values found for CDID: {cdid} at URI {latest_uri}")
                  continue

              # Prepare the DataFrame
+             if not data_json[frequency_key]: # Check if the list of values is empty
+                 print(f"Warning: Empty data list for frequency '{frequency_key}' for CDID: {cdid}")
+                 continue
+
              df = pd.DataFrame(data_json[frequency_key])

-             # Parse the 'date' field based on frequency
-             if frequency_key == "months":
-                 df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
-             elif frequency_key == "quarters":
-                 def parse_quarter(quarter_str):
-                     year, qtr = quarter_str.split(" Q")
-                     month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
-                     return datetime(int(year), month, 1)
-                 df["date"] = df["date"].apply(parse_quarter)
-             elif frequency_key == "years":
-                 df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+             # Check if essential columns exist
+             if "date" not in df.columns or "value" not in df.columns:
+                 print(f"Warning: Missing 'date' or 'value' column for CDID: {cdid}")
+                 continue

+             # Parse the 'date' field based on frequency
+             try:
+                 if frequency_key == "months":
+                     # Handles "YYYY Mon" format (e.g., "2023 FEB") - adjust if format differs
+                     df["date"] = pd.to_datetime(df["date"], format="%Y %b", errors="coerce")
+                 elif frequency_key == "quarters":
+                     def parse_quarter(quarter_str):
+                         try:
+                             year, qtr = quarter_str.split(" Q")
+                             month = {"1": 1, "2": 4, "3": 7, "4": 10}[qtr]
+                             return datetime(int(year), month, 1)
+                         except (ValueError, KeyError):
+                             return pd.NaT # Return Not a Time for parsing errors
+                     df["date"] = df["date"].apply(parse_quarter)
+                 elif frequency_key == "years":
+                     df["date"] = pd.to_datetime(df["date"], format="%Y", errors="coerce")
+             except Exception as e:
+                 print(f"Error parsing date for CDID {cdid} with frequency {frequency_key}: {e}")
+                 continue # Skip this series if date parsing fails
+
+             # Coerce value to numeric, handle potential errors
              df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+             # Drop rows where date or value parsing failed
+             df.dropna(subset=["date", "value"], inplace=True)
+
+             if df.empty:
+                 print(f"Warning: No valid data points after processing for CDID: {cdid}")
+                 continue
+
              df.rename(columns={"value": series_name}, inplace=True)

              # Combine data
-             df = df.loc[:, ["date", series_name]].dropna().reset_index(drop=True)
+             df_subset = df.loc[:, ["date", series_name]].reset_index(drop=True) # Explicitly select columns
              if combined_df.empty:
-                 combined_df = df
+                 combined_df = df_subset
              else:
-                 combined_df = pd.merge(combined_df, df, on="date", how="outer")
+                 # Use outer merge to keep all dates, sort afterwards
+                 combined_df = pd.merge(combined_df, df_subset, on="date", how="outer")

          except requests.exceptions.RequestException as e:
              print(f"Error fetching data for CDID {cdid}: {e}")
-         except (KeyError, ValueError) as e:
+         except (KeyError, ValueError, TypeError) as e: # Added TypeError
              print(f"Error processing data for CDID {cdid}: {e}")
+         except Exception as e: # Catch unexpected errors
+             print(f"An unexpected error occurred for CDID {cdid}: {e}")
+

      if not combined_df.empty:
+         # Sort by date after merging to ensure correct forward fill
+         combined_df.sort_values(by="date", inplace=True)
+         combined_df.reset_index(drop=True, inplace=True)
+
+         # Create a complete daily date range
          min_date = combined_df["date"].min()
-         max_date = datetime.today()
+         # Ensure max_date is timezone-naive if min_date is, or consistent otherwise
+         max_date = pd.Timestamp(datetime.today().date()) # Use today's date, timezone-naive
+
+         if pd.isna(min_date):
+             print("Error: Minimum date is NaT, cannot create date range.")
+             return pd.DataFrame()
+
+         # Make sure min_date is not NaT before creating the range
          date_range = pd.date_range(start=min_date, end=max_date, freq='D')
          daily_df = pd.DataFrame(date_range, columns=['date'])
+
+         # Merge with original data and forward fill
          daily_df = pd.merge(daily_df, combined_df, on="date", how="left")
          daily_df = daily_df.ffill()

+         # Drop rows before the first valid data point after ffill
+         first_valid_index = daily_df.dropna(subset=daily_df.columns.difference(['date'])).index.min()
+         if pd.notna(first_valid_index):
+             daily_df = daily_df.loc[first_valid_index:]
+         else:
+             print("Warning: No valid data points found after forward filling.")
+             return pd.DataFrame() # Return empty if ffill results in no data
+
+
          # Aggregate to weekly frequency
-         daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start) % 7, unit='D')
+         # Ensure 'date' column is datetime type before dt accessor
+         daily_df['date'] = pd.to_datetime(daily_df['date'])
+         daily_df["week_commencing"] = daily_df["date"] - pd.to_timedelta((daily_df["date"].dt.weekday - week_start + 7) % 7, unit='D') # Corrected logic for week start
+         # Group by week_commencing and calculate mean for numeric columns only
          weekly_df = daily_df.groupby("week_commencing").mean(numeric_only=True).reset_index()

+
          def clean_column_name(name):
+             # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
              name = re.sub(r"\(.*?\)", "", name)
+             # Take only the part before the first colon if present
              name = re.split(r":", name)[0]
-             name = re.sub(r"\d+", "", name)
+             # Remove digits
+             #name = re.sub(r"\d+", "", name) # Reconsider removing all digits, might be needed for some series
+             # Remove specific words like 'annual', 'rate' case-insensitively
              name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+             # Remove non-alphanumeric characters (except underscore and space)
              name = re.sub(r"[^\w\s]", "", name)
+             # Replace spaces with underscores
+             name = name.strip() # Remove leading/trailing whitespace
              name = name.replace(" ", "_")
+             # Replace multiple underscores with a single one
              name = re.sub(r"_+", "_", name)
+             # Remove trailing underscores
              name = name.rstrip("_")
+             # Add prefix and suffix
              return f"macro_{name.lower()}_uk"

+         # Apply cleaning function to relevant columns
          weekly_df.columns = [clean_column_name(col) if col != "week_commencing" else col for col in weekly_df.columns]
-         weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True)
+         weekly_df.rename(columns={"week_commencing": "OBS"}, inplace=True) # Rename week commencing col

-         weekly_df = weekly_df.fillna(0)
+         # Optional: Fill remaining NaNs (e.g., at the beginning if ffill didn't cover) with 0
+         # Consider if 0 is the appropriate fill value for your use case
+         # weekly_df = weekly_df.fillna(0)

          return weekly_df
      else:
-         print("No data available to process.")
+         print("No data successfully fetched or processed.")
          return pd.DataFrame()
-
+

  def pull_yfinance(self, tickers=None, week_start_day="mon"):
      """
      Fetches stock data for multiple tickers from Yahoo Finance, converts it to daily frequency,