imsciences 0.9.6.3__tar.gz → 0.9.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of imsciences has been flagged as potentially problematic; see the package registry's advisory page for details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.9.6.3
3
+ Version: 0.9.6.5
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -380,265 +380,366 @@ class datapull:
380
380
  ############################################################### Seasonality ##########################################################################
381
381
 
382
382
  def pull_seasonality(self, week_commencing, start_date, countries):
383
- # ---------------------------------------------------------------------
384
- # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
385
- # ---------------------------------------------------------------------
386
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
387
-
388
- # ---------------------------------------------------------------------
389
- # 1. Create daily date range from start_date to today
390
- # ---------------------------------------------------------------------
391
- date_range = pd.date_range(
392
- start=pd.to_datetime(start_date),
393
- end=datetime.today(),
394
- freq="D"
395
- )
396
- df_daily = pd.DataFrame(date_range, columns=["Date"])
383
+ """
384
+ Generates a DataFrame with weekly seasonality features.
397
385
 
398
- # ---------------------------------------------------------------------
399
- # 1.1 Identify "week_start" for each daily row, based on week_commencing
400
- # ---------------------------------------------------------------------
401
- df_daily['week_start'] = df_daily["Date"].apply(
402
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
403
- )
386
+ Args:
387
+ week_commencing (str): The starting day of the week ('mon', 'tue', ..., 'sun').
388
+ start_date (str): The start date in 'YYYY-MM-DD' format.
389
+ countries (list): A list of country codes (e.g., ['GB', 'US']) for holidays.
404
390
 
405
- # ---------------------------------------------------------------------
406
- # 2. Build a weekly index (df_weekly_start) with dummy columns
407
- # ---------------------------------------------------------------------
408
- df_weekly_start = df_daily[['week_start']].drop_duplicates().reset_index(drop=True)
409
- df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
410
-
411
- # Set index to weekly "start of week"
412
- df_weekly_start.index = np.arange(1, len(df_weekly_start) + 1)
413
- df_weekly_start.set_index("Date", inplace=True)
414
-
415
- # Create individual weekly dummies
416
- dummy_columns = {}
417
- for i in range(len(df_weekly_start)):
418
- col_name = f"dum_{df_weekly_start.index[i].strftime('%Y_%m_%d')}"
419
- dummy_columns[col_name] = [0] * len(df_weekly_start)
420
- dummy_columns[col_name][i] = 1
421
-
422
- df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
423
- df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
424
-
425
- # ---------------------------------------------------------------------
426
- # 3. Public holidays (daily) from 'holidays' package + each holiday name
427
- # ---------------------------------------------------------------------
428
- for country in countries:
429
- country_holidays = holidays.CountryHoliday(
430
- country,
431
- years=range(int(start_date[:4]), datetime.today().year + 1)
391
+ Returns:
392
+ pd.DataFrame: A DataFrame indexed by week start date, containing various
393
+ seasonal dummy variables, holidays, trend, and constant.
394
+ The date column is named 'OBS'.
395
+ """
396
+ # ---------------------------------------------------------------------
397
+ # 0. Setup: dictionary for 'week_commencing' to Python weekday() integer
398
+ # ---------------------------------------------------------------------
399
+ day_dict = {"mon": 0, "tue": 1, "wed": 2, "thu": 3, "fri": 4, "sat": 5, "sun": 6}
400
+ if week_commencing not in day_dict:
401
+ raise ValueError(f"Invalid week_commencing value: {week_commencing}. Use one of {list(day_dict.keys())}")
402
+
403
+ # ---------------------------------------------------------------------
404
+ # 1. Create daily date range from start_date to today
405
+ # ---------------------------------------------------------------------
406
+ try:
407
+ start_dt = pd.to_datetime(start_date)
408
+ except ValueError:
409
+ raise ValueError(f"Invalid start_date format: {start_date}. Use 'YYYY-MM-DD'")
410
+
411
+ end_dt = datetime.today()
412
+ # Ensure end date is not before start date
413
+ if end_dt < start_dt:
414
+ end_dt = start_dt + timedelta(days=1) # Or handle as error if preferred
415
+
416
+ date_range = pd.date_range(
417
+ start=start_dt,
418
+ end=end_dt,
419
+ freq="D"
432
420
  )
433
- # Daily indicator: 1 if that date is a holiday
434
- df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
435
- lambda x: 1 if x in country_holidays else 0
421
+ df_daily = pd.DataFrame(date_range, columns=["Date"])
422
+
423
+ # ---------------------------------------------------------------------
424
+ # 1.1 Identify "week_start" for each daily row, based on week_commencing
425
+ # ---------------------------------------------------------------------
426
+ start_day_int = day_dict[week_commencing]
427
+ df_daily['week_start'] = df_daily["Date"].apply(
428
+ lambda x: x - pd.Timedelta(days=(x.weekday() - start_day_int) % 7)
436
429
  )
437
- # Create columns for specific holiday names
438
- for date_hol, name in country_holidays.items():
439
- col_name = f"seas_{name.replace(' ', '_').lower()}_{country.lower()}"
440
- if col_name not in df_daily.columns:
441
- df_daily[col_name] = 0
442
- df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
443
-
444
- # ---------------------------------------------------------------------
445
- # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
446
- # We'll add daily columns for each.
447
- # ---------------------------------------------------------------------
448
- # Initialize columns
449
- extra_cols = [
450
- "seas_valentines_day",
451
- "seas_halloween",
452
- "seas_fathers_day_us_uk",
453
- "seas_mothers_day_us",
454
- "seas_mothers_day_uk",
455
- "seas_good_friday",
456
- "seas_easter_monday",
457
- "seas_black_friday",
458
- "seas_cyber_monday",
459
- ]
460
- for c in extra_cols:
461
- df_daily[c] = 0 # default zero
462
-
463
- # Helper: nth_weekday_of_month(year, month, weekday, nth=1 => first, 2 => second, etc.)
464
- # weekday: Monday=0, Tuesday=1, ... Sunday=6
465
- def nth_weekday_of_month(year, month, weekday, nth):
466
- """
467
- Returns date of the nth <weekday> in <month> of <year>.
468
- E.g. nth_weekday_of_month(2023, 6, 6, 3) => 3rd Sunday of June 2023.
469
- """
470
- # 1st day of the month
471
- d = datetime(year, month, 1)
472
- # What is the weekday of day #1?
473
- w = d.weekday() # Monday=0, Tuesday=1, ... Sunday=6
474
- # If we want, e.g. Sunday=6, we see how many days to add
475
- delta = (weekday - w) % 7
476
- # This is the first <weekday> in that month
477
- first_weekday = d + timedelta(days=delta)
478
- # Now add 7*(nth-1) days
479
- return first_weekday + timedelta(days=7 * (nth-1))
480
-
481
- def get_good_friday(year):
482
- """Good Friday is 2 days before Easter Sunday."""
483
- return easter(year) - timedelta(days=2)
484
-
485
- def get_easter_monday(year):
486
- """Easter Monday is 1 day after Easter Sunday."""
487
- return easter(year) + timedelta(days=1)
488
-
489
- def get_black_friday(year):
490
- """
491
- Black Friday = day after US Thanksgiving,
492
- and US Thanksgiving is the 4th Thursday in November.
493
- """
494
- # 4th Thursday in November
495
- fourth_thursday = nth_weekday_of_month(year, 11, 3, 4) # weekday=3 => Thursday
496
- return fourth_thursday + timedelta(days=1)
497
-
498
- def get_cyber_monday(year):
499
- """Cyber Monday = Monday after US Thanksgiving, i.e. 4 days after 4th Thursday in Nov."""
500
- # 4th Thursday in November
501
- fourth_thursday = nth_weekday_of_month(year, 11, 3, 4)
502
- return fourth_thursday + timedelta(days=4) # Monday after Thanksgiving
503
-
504
- # Loop over each year in range
505
- start_yr = int(start_date[:4])
506
- end_yr = datetime.today().year
507
-
508
- for yr in range(start_yr, end_yr + 1):
509
- # Valentines = Feb 14
510
- valentines_day = datetime(yr, 2, 14)
511
- # Halloween = Oct 31
512
- halloween_day = datetime(yr, 10, 31)
513
- # Father's Day (US & UK) = 3rd Sunday in June
514
- fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
515
- # Mother's Day US = 2nd Sunday in May
516
- mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
517
- mothering_sunday = easter(yr) - timedelta(days=21)
518
- # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
519
- while mothering_sunday.weekday() != 6: # Sunday=6
520
- mothering_sunday -= timedelta(days=1)
521
-
522
- # Good Friday, Easter Monday
523
- gf = get_good_friday(yr)
524
- em = get_easter_monday(yr)
525
-
526
- # Black Friday, Cyber Monday
527
- bf = get_black_friday(yr)
528
- cm = get_cyber_monday(yr)
529
-
530
- # Mark them in df_daily if in range
531
- for special_date, col in [
532
- (valentines_day, "seas_valentines_day"),
533
- (halloween_day, "seas_halloween"),
534
- (fathers_day, "seas_fathers_day_us_uk"),
535
- (mothers_day_us, "seas_mothers_day_us"),
536
- (mothering_sunday, "seas_mothers_day_uk"),
537
- (gf, "seas_good_friday"),
538
- (em, "seas_easter_monday"),
539
- (bf, "seas_black_friday"),
540
- (cm, "seas_cyber_monday"),
541
- ]:
542
- # Convert to pd.Timestamp:
543
- special_ts = pd.Timestamp(special_date)
544
-
545
- # Only set if it's within your daily range
546
- if (special_ts >= df_daily["Date"].min()) and (special_ts <= df_daily["Date"].max()):
547
- df_daily.loc[df_daily["Date"] == special_ts, col] = 1
548
-
549
- # ---------------------------------------------------------------------
550
- # 4. Add daily indicators for last day & last Friday of month
551
- # Then aggregate them to weekly level using .max()
552
- # ---------------------------------------------------------------------
553
- # Last day of month (daily)
554
- df_daily["seas_last_day_of_month"] = df_daily["Date"].apply(
555
- lambda d: 1 if d == d.to_period("M").to_timestamp("M") else 0
556
- )
557
-
558
- # Last Friday of month (daily)
559
- def is_last_friday(date):
560
- # last day of the month
561
- last_day_of_month = date.to_period("M").to_timestamp("M")
562
- last_day_weekday = last_day_of_month.weekday() # Monday=0,...Sunday=6
563
- # Determine how many days we go back from the last day to get Friday (weekday=4)
564
- if last_day_weekday >= 4:
565
- days_to_subtract = last_day_weekday - 4
566
- else:
567
- days_to_subtract = last_day_weekday + 3
568
- last_friday = last_day_of_month - pd.Timedelta(days=days_to_subtract)
569
- return 1 if date == last_friday else 0
570
-
571
- df_daily["seas_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
572
-
573
- # ---------------------------------------------------------------------
574
- # 5. Weekly aggregation for holiday columns & monthly dummies
575
- # ---------------------------------------------------------------------
576
- # For monthly dummies, create a daily col "Month", then get_dummies
577
- df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
578
- df_monthly_dummies = pd.get_dummies(
579
- df_daily,
580
- prefix="seas",
581
- columns=["Month"],
582
- dtype=int
583
- )
584
- # Recalculate 'week_start' (already in df_daily, but just to be sure)
585
- df_monthly_dummies['week_start'] = df_daily['week_start']
586
-
587
- # Group monthly dummies by .sum() or .mean()—we often spread them across the week
588
- df_monthly_dummies = (
589
- df_monthly_dummies
590
- .groupby('week_start')
591
- .sum(numeric_only=True) # sum the daily flags
592
- .reset_index()
593
- .rename(columns={'week_start': "Date"})
594
- .set_index("Date")
595
- )
596
- # Spread monthly dummies by 7 to distribute across that week
597
- monthly_cols = [c for c in df_monthly_dummies.columns if c.startswith("seas_month_")]
598
- df_monthly_dummies[monthly_cols] = df_monthly_dummies[monthly_cols] / 7
599
-
600
- # Group holiday & special-day columns by .max() => binary at weekly level
601
- df_holidays = (
602
- df_daily
603
- .groupby('week_start')
604
- .max(numeric_only=True) # if any day=1 in that week, entire week=1
605
- .reset_index()
606
- .rename(columns={'week_start': "Date"})
607
- .set_index("Date")
608
- )
609
430
 
610
- # ---------------------------------------------------------------------
611
- # 6. Combine weekly start, monthly dummies, holiday flags
612
- # ---------------------------------------------------------------------
613
- df_combined = pd.concat([df_weekly_start, df_monthly_dummies], axis=1)
614
- df_combined = pd.concat([df_combined, df_holidays], axis=1)
615
- df_combined = df_combined.loc[:, ~df_combined.columns.duplicated()]
616
-
617
- # ---------------------------------------------------------------------
618
- # 7. Create weekly dummies for Week of Year & yearly dummies
619
- # ---------------------------------------------------------------------
620
- df_combined.reset_index(inplace=True)
621
- df_combined.rename(columns={"index": "old_index"}, inplace=True) # just in case
622
-
623
- df_combined["Week"] = df_combined["Date"].dt.isocalendar().week
624
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int)
625
-
626
- df_combined["Year"] = df_combined["Date"].dt.year
627
- df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int)
628
-
629
- # ---------------------------------------------------------------------
630
- # 8. Add constant & trend
631
- # ---------------------------------------------------------------------
632
- df_combined["Constant"] = 1
633
- df_combined["Trend"] = df_combined.index + 1
634
-
635
- # ---------------------------------------------------------------------
636
- # 9. Rename Date -> OBS and return
637
- # ---------------------------------------------------------------------
638
- df_combined.rename(columns={"Date": "OBS"}, inplace=True)
639
-
640
- return df_combined
431
+ # ---------------------------------------------------------------------
432
+ # 1.2 Calculate ISO week number for each DAY (for later aggregation)
433
+ # Also calculate Year for each DAY to handle year transitions correctly
434
+ # ---------------------------------------------------------------------
435
+ df_daily['iso_week_daily'] = df_daily['Date'].dt.isocalendar().week.astype(int)
436
+ df_daily['iso_year_daily'] = df_daily['Date'].dt.isocalendar().year.astype(int)
437
+
438
+
439
+ # ---------------------------------------------------------------------
440
+ # 2. Build a weekly index (df_weekly_start) based on unique week_start dates
441
+ # ---------------------------------------------------------------------
442
+ df_weekly_start = df_daily[['week_start']].drop_duplicates().sort_values('week_start').reset_index(drop=True)
443
+ df_weekly_start.rename(columns={'week_start': "Date"}, inplace=True)
444
+ df_weekly_start.set_index("Date", inplace=True)
445
+
446
+ # Create individual weekly dummies (optional, uncomment if needed)
447
+ dummy_columns = {}
448
+ for i, date_index in enumerate(df_weekly_start.index):
449
+ col_name = f"dum_{date_index.strftime('%Y_%m_%d')}"
450
+ dummy_columns[col_name] = [0] * len(df_weekly_start)
451
+ dummy_columns[col_name][i] = 1
452
+ df_dummies = pd.DataFrame(dummy_columns, index=df_weekly_start.index)
453
+ df_weekly_start = pd.concat([df_weekly_start, df_dummies], axis=1)
454
+
455
+ # ---------------------------------------------------------------------
456
+ # 3. Public holidays (daily) from 'holidays' package + each holiday name
457
+ # ---------------------------------------------------------------------
458
+ start_year = start_dt.year
459
+ end_year = end_dt.year
460
+ years_range = range(start_year, end_year + 1)
461
+
462
+ for country in countries:
463
+ try:
464
+ country_holidays = holidays.CountryHoliday(
465
+ country,
466
+ years=years_range,
467
+ observed=False # Typically you want the actual date, not observed substitute
468
+ )
469
+ # Handle cases like UK where specific subdivisions might be needed for some holidays
470
+ # Example: if country == 'GB': country_holidays.observed = True # If observed are needed
471
+ except KeyError:
472
+ print(f"Warning: Country code '{country}' not found in holidays library. Skipping.")
473
+ continue # Skip to next country
474
+
475
+ # Daily indicator: 1 if that date is a holiday
476
+ df_daily[f"seas_holiday_{country.lower()}"] = df_daily["Date"].apply(
477
+ lambda x: 1 if x in country_holidays else 0
478
+ )
479
+ # Create columns for specific holiday names
480
+ for date_hol, name in sorted(country_holidays.items()): # Sort for consistent column order
481
+ # Clean name: lower, replace space with underscore, remove non-alphanumeric (except underscore)
482
+ clean_name = ''.join(c for c in name if c.isalnum() or c == ' ').strip().replace(' ', '_').lower()
483
+ clean_name = clean_name.replace('_(observed)', '').replace("'", "") # specific cleaning
484
+ col_name = f"seas_{clean_name}_{country.lower()}"
485
+
486
+ # Only create column if the holiday occurs within the df_daily date range
487
+ if pd.Timestamp(date_hol).year in years_range:
488
+ if col_name not in df_daily.columns:
489
+ df_daily[col_name] = 0
490
+ # Ensure date_hol is within the actual daily range before assigning
491
+ if (pd.Timestamp(date_hol) >= df_daily["Date"].min()) and (pd.Timestamp(date_hol) <= df_daily["Date"].max()):
492
+ df_daily.loc[df_daily["Date"] == pd.Timestamp(date_hol), col_name] = 1
493
+
494
+ # ---------------------------------------------------------------------
495
+ # 3.1 Additional Special Days (Father's Day, Mother's Day, etc.)
496
+ # ---------------------------------------------------------------------
497
+ extra_cols = [
498
+ "seas_valentines_day",
499
+ "seas_halloween",
500
+ "seas_fathers_day_us_uk", # Note: UK/US is 3rd Sun Jun, others vary
501
+ "seas_mothers_day_us", # Note: US is 2nd Sun May
502
+ "seas_mothers_day_uk", # Note: UK Mothering Sunday varies with Easter
503
+ "seas_good_friday",
504
+ "seas_easter_monday",
505
+ "seas_black_friday", # US-centric, but globally adopted
506
+ "seas_cyber_monday", # US-centric, but globally adopted
507
+ ]
508
+ for c in extra_cols:
509
+ if c not in df_daily.columns: # Avoid overwriting if already created by holidays pkg
510
+ df_daily[c] = 0
511
+
512
+ # Helper: nth_weekday_of_month(year, month, weekday, nth)
513
+ def nth_weekday_of_month(year, month, weekday, nth):
514
+ d = datetime(year, month, 1)
515
+ w = d.weekday()
516
+ delta = (weekday - w + 7) % 7 # Ensure positive delta
517
+ first_weekday = d + timedelta(days=delta)
518
+ target_date = first_weekday + timedelta(days=7 * (nth - 1))
519
+ # Check if the calculated date is still in the same month
520
+ if target_date.month == month:
521
+ return target_date
522
+ else:
523
+ # This can happen if nth is too large (e.g., 5th Friday)
524
+ # Return the last occurrence of that weekday in the month instead
525
+ return target_date - timedelta(days=7)
526
+
527
+
528
+ def get_good_friday(year):
529
+ return easter(year) - timedelta(days=2)
530
+
531
+ def get_easter_monday(year):
532
+ return easter(year) + timedelta(days=1)
533
+
534
+ def get_black_friday(year):
535
+ # US Thanksgiving is 4th Thursday in November (weekday=3)
536
+ thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
537
+ return thanksgiving + timedelta(days=1)
538
+
539
+ def get_cyber_monday(year):
540
+ # Monday after US Thanksgiving
541
+ thanksgiving = nth_weekday_of_month(year, 11, 3, 4)
542
+ return thanksgiving + timedelta(days=4)
543
+
544
+ def get_mothering_sunday_uk(year):
545
+ # Fourth Sunday in Lent (3 weeks before Easter Sunday)
546
+ # Lent starts on Ash Wednesday, 46 days before Easter.
547
+ # Easter Sunday is day 0. Sunday before is -7, etc.
548
+ # 4th Sunday in Lent is 3 weeks before Easter.
549
+ return easter(year) - timedelta(days=21)
550
+
551
+
552
+ # Loop over each year in range
553
+ for yr in range(start_year, end_year + 1):
554
+ try: # Wrap calculations in try-except for robustness
555
+ # Valentines = Feb 14
556
+ valentines_day = datetime(yr, 2, 14)
557
+ # Halloween = Oct 31
558
+ halloween_day = datetime(yr, 10, 31)
559
+ # Father's Day (US & UK) = 3rd Sunday (6) in June
560
+ fathers_day = nth_weekday_of_month(yr, 6, 6, 3)
561
+ # Mother's Day US = 2nd Sunday (6) in May
562
+ mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
563
+ # Mother's Day UK (Mothering Sunday)
564
+ mothering_sunday = get_mothering_sunday_uk(yr)
565
+
566
+ # Good Friday, Easter Monday
567
+ gf = get_good_friday(yr)
568
+ em = get_easter_monday(yr)
569
+
570
+ # Black Friday, Cyber Monday
571
+ bf = get_black_friday(yr)
572
+ cm = get_cyber_monday(yr)
573
+
574
+ # Mark them in df_daily if in range
575
+ special_days_map = [
576
+ (valentines_day, "seas_valentines_day"),
577
+ (halloween_day, "seas_halloween"),
578
+ (fathers_day, "seas_fathers_day_us_uk"),
579
+ (mothers_day_us, "seas_mothers_day_us"),
580
+ (mothering_sunday,"seas_mothers_day_uk"),
581
+ (gf, "seas_good_friday"),
582
+ (em, "seas_easter_monday"),
583
+ (bf, "seas_black_friday"),
584
+ (cm, "seas_cyber_monday"),
585
+ ]
586
+
587
+ for special_date, col in special_days_map:
588
+ if special_date is not None: # nth_weekday_of_month can return None edge cases
589
+ special_ts = pd.Timestamp(special_date)
590
+ # Only set if it's within the daily range AND column exists
591
+ if (special_ts >= df_daily["Date"].min()) and \
592
+ (special_ts <= df_daily["Date"].max()) and \
593
+ (col in df_daily.columns):
594
+ df_daily.loc[df_daily["Date"] == special_ts, col] = 1
595
+ except Exception as e:
596
+ print(f"Warning: Could not calculate special days for year {yr}: {e}")
597
+
598
+
599
+ # ---------------------------------------------------------------------
600
+ # 4. Add daily indicators for last day & last Friday of month
601
+ # ---------------------------------------------------------------------
602
+ df_daily["is_last_day_of_month"] = df_daily["Date"].dt.is_month_end
603
+
604
+ def is_last_friday(date):
605
+ # Check if it's a Friday first
606
+ if date.weekday() != 4: # Friday is 4
607
+ return 0
608
+ # Check if next Friday is in the next month
609
+ next_friday = date + timedelta(days=7)
610
+ return 1 if next_friday.month != date.month else 0
611
+
612
+ df_daily["is_last_friday_of_month"] = df_daily["Date"].apply(is_last_friday)
613
+
614
+ # Rename for clarity prefix
615
+ df_daily.rename(columns={
616
+ "is_last_day_of_month": "seas_last_day_of_month",
617
+ "is_last_friday_of_month": "seas_last_friday_of_month"
618
+ }, inplace=True)
619
+
620
+
621
+ # ---------------------------------------------------------------------
622
+ # 5. Weekly aggregation
623
+ # ---------------------------------------------------------------------
624
+
625
+ # Select only columns that are indicators/flags (intended for max aggregation)
626
+ flag_cols = [col for col in df_daily.columns if col.startswith('seas_') or col.startswith('is_')]
627
+ # Ensure 'week_start' is present for grouping
628
+ df_to_agg = df_daily[['week_start'] + flag_cols]
629
+
630
+ df_weekly_flags = (
631
+ df_to_agg
632
+ .groupby('week_start')
633
+ .max() # if any day=1 in that week, entire week=1
634
+ .reset_index()
635
+ .rename(columns={'week_start': "Date"})
636
+ .set_index("Date")
637
+ )
641
638
 
639
+ # --- Aggregate Week Number using MODE ---
640
+ # Define aggregation function for mode (handling potential multi-modal cases by taking the first)
641
+ def get_mode(x):
642
+ modes = pd.Series.mode(x)
643
+ return modes[0] if not modes.empty else np.nan # Return first mode or NaN
644
+
645
+ df_weekly_iso_week_year = (
646
+ df_daily[['week_start', 'iso_week_daily', 'iso_year_daily']]
647
+ .groupby('week_start')
648
+ .agg(
649
+ # Find the most frequent week number and year within the group
650
+ Week=('iso_week_daily', get_mode),
651
+ Year=('iso_year_daily', get_mode)
652
+ )
653
+ .reset_index()
654
+ .rename(columns={'week_start': 'Date'})
655
+ .set_index('Date')
656
+ )
657
+ # Convert Week/Year back to integer type after aggregation
658
+ df_weekly_iso_week_year['Week'] = df_weekly_iso_week_year['Week'].astype(int)
659
+ df_weekly_iso_week_year['Year'] = df_weekly_iso_week_year['Year'].astype(int)
660
+
661
+
662
+ # --- Monthly dummies (spread evenly across week) ---
663
+ df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
664
+ df_monthly_dummies_daily = pd.get_dummies(
665
+ df_daily[["week_start", "Month"]], # Only need these columns
666
+ prefix="seas_month",
667
+ columns=["Month"],
668
+ dtype=float # Use float for division
669
+ )
670
+ # Sum daily dummies within the week
671
+ df_monthly_dummies_summed = df_monthly_dummies_daily.groupby('week_start').sum()
672
+ # Divide by number of days in that specific week group (usually 7, except potentially start/end)
673
+ days_in_week = df_daily.groupby('week_start').size()
674
+ df_weekly_monthly_dummies = df_monthly_dummies_summed.div(days_in_week, axis=0)
675
+
676
+ # Reset index to merge
677
+ df_weekly_monthly_dummies.reset_index(inplace=True)
678
+ df_weekly_monthly_dummies.rename(columns={'week_start': 'Date'}, inplace=True)
679
+ df_weekly_monthly_dummies.set_index('Date', inplace=True)
680
+
681
+
682
+ # ---------------------------------------------------------------------
683
+ # 6. Combine all weekly components
684
+ # ---------------------------------------------------------------------
685
+ # Start with the basic weekly index
686
+ df_combined = df_weekly_start.copy()
687
+
688
+ # Join the other aggregated DataFrames
689
+ df_combined = df_combined.join(df_weekly_flags, how='left')
690
+ df_combined = df_combined.join(df_weekly_iso_week_year, how='left')
691
+ df_combined = df_combined.join(df_weekly_monthly_dummies, how='left')
692
+
693
+ # Fill potential NaNs created by joins (e.g., if a flag column didn't exist) with 0
694
+ # Exclude 'Week' and 'Year' which should always be present
695
+ cols_to_fill = df_combined.columns.difference(['Week', 'Year'])
696
+ df_combined[cols_to_fill] = df_combined[cols_to_fill].fillna(0)
697
+
698
+ # Ensure correct types for flag columns (int)
699
+ for col in df_weekly_flags.columns:
700
+ if col in df_combined.columns:
701
+ df_combined[col] = df_combined[col].astype(int)
702
+
703
+ # Ensure correct types for month columns (float)
704
+ for col in df_weekly_monthly_dummies.columns:
705
+ if col in df_combined.columns:
706
+ df_combined[col] = df_combined[col].astype(float)
707
+
708
+
709
+ # ---------------------------------------------------------------------
710
+ # 7. Create weekly dummies for Week of Year & yearly dummies from aggregated cols
711
+ # ---------------------------------------------------------------------
712
+ df_combined.reset_index(inplace=True) # 'Date', 'Week', 'Year' become columns
713
+
714
+ # Create dummies from the aggregated 'Week' column
715
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Week"], dtype=int, prefix_sep='_')
716
+
717
+ # Create dummies from the aggregated 'Year' column
718
+ df_combined = pd.get_dummies(df_combined, prefix="seas", columns=["Year"], dtype=int, prefix_sep='_')
719
+
720
+ # ---------------------------------------------------------------------
721
+ # 8. Add constant & trend
722
+ # ---------------------------------------------------------------------
723
+ df_combined["Constant"] = 1
724
+ df_combined.reset_index(drop=True, inplace=True) # Ensure index is 0, 1, 2... for trend
725
+ df_combined["Trend"] = df_combined.index + 1
726
+
727
+ # ---------------------------------------------------------------------
728
+ # 9. Rename Date -> OBS and select final columns
729
+ # ---------------------------------------------------------------------
730
+ df_combined.rename(columns={"Date": "OBS"}, inplace=True)
731
+
732
+ # Reorder columns - OBS first, then Constant, Trend, then seasonal features
733
+ cols_order = ['OBS', 'Constant', 'Trend'] + \
734
+ sorted([col for col in df_combined.columns if col.startswith('seas_')]) + \
735
+ sorted([col for col in df_combined.columns if col.startswith('dum_')]) # If individual week dummies were enabled
736
+
737
+ # Filter out columns not in the desired order list (handles case where dum_ cols are off)
738
+ final_cols = [col for col in cols_order if col in df_combined.columns]
739
+ df_combined = df_combined[final_cols]
740
+
741
+ return df_combined
742
+
642
743
  def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
643
744
  """
644
745
  Pull weather data for a given week-commencing day and one or more country codes.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.9.6.3
3
+ Version: 0.9.6.5
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
8
8
  return f.read()
9
9
  return ''
10
10
 
11
- VERSION = '0.9.6.3'
11
+ VERSION = '0.9.6.5'
12
12
  DESCRIPTION = 'IMS Data Processing Package'
13
13
  LONG_DESCRIPTION = read_md('README.md')
14
14
 
File without changes
File without changes
File without changes