imsciences 0.9.5.4__py3-none-any.whl → 0.9.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


imsciences/pull.py CHANGED
@@ -11,6 +11,8 @@ from bs4 import BeautifulSoup
11
11
  import yfinance as yf
12
12
  import holidays
13
13
  from dateutil.easter import easter
14
+ import urllib.request
15
+ from geopy.geocoders import Nominatim
14
16
 
15
17
  from imsciences.mmm import dataprocessing
16
18
 
@@ -48,8 +50,8 @@ class datapull:
48
50
 
49
51
  print("\n6. pull_weather")
50
52
  print(" - Description: Fetch and process historical weather data for the specified country.")
51
- print(" - Usage: pull_weather(week_commencing, country)")
52
- print(" - Example: pull_weather('mon', 'GBR')")
53
+ print(" - Usage: pull_weather(week_commencing, start_date, country)")
54
+ print(" - Example: pull_weather('mon', '2020-01-01', ['GBR'])")
53
55
 
54
56
  print("\n7. pull_macro_ons_uk")
55
57
  print(" - Description: Fetch and process time series data from the Beta ONS API.")
@@ -60,6 +62,11 @@ class datapull:
60
62
  print(" - Description: Fetch and process time series data from the Beta ONS API.")
61
63
  print(" - Usage: pull_yfinance(tickers, week_start_day)")
62
64
  print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
65
+
66
+ print("\n9. pull_sports_events")
67
+ print(" - Description: Pull a veriety of sports events primaraly football and rugby.")
68
+ print(" - Usage: pull_sports_events(start_date, week_commencing)")
69
+ print(" - Example: pull_sports_events('2020-01-01', 'mon')")
63
70
 
64
71
  ############################################################### MACRO ##########################################################################
65
72
 
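For orientation, a minimal usage sketch of the two methods whose help text is updated above (assuming datapull is imported from imsciences.pull and can be instantiated with no arguments, which this diff does not show; output columns carry the seas_ prefixes described later in the file):

    from imsciences.pull import datapull

    dp = datapull()
    # Weekly GB weather from 2020 onwards, weeks commencing Monday
    weather_df = dp.pull_weather("mon", "2020-01-01", ["GB"])
    # Weekly 0/1 dummies for major football, rugby and NFL events
    events_df = dp.pull_sports_events("2020-01-01", "mon")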
@@ -507,15 +514,6 @@ class datapull:
507
514
  fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
508
515
  # Mother's Day US = 2nd Sunday in May
509
516
  mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
510
- # Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
511
- # We can approximate as: Easter Sunday - 21 days
512
- # BUT we also must ensure it's actually Sunday
513
- # (the 4th Sunday in Lent can shift. We'll do the official approach below.)
514
- # Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
515
- # But that might overshoot if Lent started mid-week.
516
- # Let's do a quick approach:
517
- # Officially: Mothering Sunday = 3 weeks before Easter Sunday (the 4th Sunday is Easter Sunday itself).
518
- # So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
519
517
  mothering_sunday = easter(yr) - timedelta(days=21)
520
518
  # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
521
519
  while mothering_sunday.weekday() != 6: # Sunday=6
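The comment block removed above derived the rule that survives in the code: Mothering Sunday is taken as Easter Sunday minus 21 days, shifted back to a Sunday if needed. A quick standalone check for a recent year, using the same dateutil.easter helper the module already imports:

    from datetime import timedelta
    from dateutil.easter import easter

    # Easter Sunday 2024 fell on 31 March; 21 days earlier is 10 March 2024,
    # a Sunday, which was indeed Mothering Sunday in the UK that year.
    print(easter(2024) - timedelta(days=21))  # 2024-03-10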
@@ -641,610 +639,536 @@ class datapull:
641
639
 
642
640
  return df_combined
643
641
 
644
-
645
- def pull_weather(self, week_commencing, country) -> pd.DataFrame:
646
- import pandas as pd
647
- import urllib.request # noqa: F811
648
- from datetime import datetime
649
- import requests
650
- from geopy.geocoders import Nominatim # noqa: F811
651
-
652
- # Week commencing dictionary
642
+ def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
643
+ """
644
+ Pull weather data for a given week-commencing day and one or more country codes.
645
+
646
+ LOGIC:
647
+ 1) For non-US countries (AU, GB, DE, CA, ZA):
648
+ - Mesonet => max_temp_f, min_temp_f -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', etc.
649
+ - Open-Meteo => precipitation_sum => 'avg_rain_sum', snowfall_sum => 'avg_snow_sum'.
650
+ - Merge, then rename columns with prefix 'seas_{country}_'.
651
+
652
+ 2) For the US:
653
+ - We have multiple <STATE>_ASOS networks (e.g. CA_ASOS, TX_ASOS).
654
+ - For each state, fetch from Mesonet => max_temp_f, min_temp_f, precip_in, snow_in -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', 'avg_rain_sum', 'avg_snow_sum', etc.
655
+ - Rename columns for each state with prefix 'seas_us_{state}_'.
656
+ - Merge all states (and countries) into a single DataFrame.
657
+
658
+ :param week_commencing: A string in {"mon","tue","wed","thur","fri","sat","sun"}.
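+ :param start_date: Date string "YYYY-MM-DD"; Mesonet requests start on this date, Open-Meteo requests start on 1 January of the same year.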
659
+ :param country_codes: A list of 2-letter country codes or a single string, e.g. ["GB","US"].
660
+ :return: A single Pandas DataFrame with weekly-aggregated data for all requested countries.
661
+ """
662
+ # ------------------------------------------------------------------ #
663
+ # 0) Handle either a single code or list of codes
664
+ # ------------------------------------------------------------------ #
665
+ if isinstance(country_codes, str):
666
+ country_codes = [country_codes]
667
+ elif not isinstance(country_codes, (list, tuple)):
668
+ raise ValueError("country_codes must be a list/tuple or a single string.")
669
+
670
+ # --- Setup / Constants --- #
653
671
  day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
672
+ # Map each 2-letter code to a key
673
+ country_dict = {
674
+ "US": "US_STATES",
675
+ "CA": "Canada",
676
+ "AU": "AU__ASOS",
677
+ "GB": "GB__ASOS",
678
+ "DE": "DE__ASOS",
679
+ "ZA": "ZA__ASOS"
680
+ }
654
681
 
655
- # Country dictionary
656
- country_dict = {"AUS": "AU__ASOS", "GBR": "GB__ASOS", "USA": "USCRN", "DEU": "DE__ASOS", "CAN": "Canada", "ZAF": "ZA__ASOS"}
682
+ # Station-based countries for Mesonet
683
+ station_map = {
684
+ "GB__ASOS": [
685
+ "&stations=EGCC", "&stations=EGNM", "&stations=EGBB", "&stations=EGSH",
686
+ "&stations=EGFF", "&stations=EGHI", "&stations=EGLC", "&stations=EGHQ",
687
+ "&stations=EGAC", "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
688
+ "&stations=EGNT"
689
+ ],
690
+ "AU__ASOS": [
691
+ "&stations=YPDN", "&stations=YBCS", "&stations=YBBN", "&stations=YSSY",
692
+ "&stations=YSSY", "&stations=YMEN", "&stations=YPAD", "&stations=YPPH"
693
+ ],
694
+ "DE__ASOS": [
695
+ "&stations=EDDL", "&stations=EDDH", "&stations=EDDB", "&stations=EDDN",
696
+ "&stations=EDDF", "&stations=EDDK", "&stations=EDLW", "&stations=EDDM"
697
+ ],
698
+ # Example: if ZA is also station-based, add it here.
699
+ "ZA__ASOS": [
700
+ # If you know the station codes, add them here:
701
+ # e.g. "&stations=FACT", "&stations=FAJS", ...
702
+ ],
703
+ # "FR__ASOS" if you need France, etc.
704
+ }
657
705
 
658
- # Function to flatten a list of nested lists into a list
659
- def flatten_list(nested_list):
660
- return [item for sublist in nested_list for item in sublist]
706
+ # Non-US countries that also fetch RAIN & SNOW from Open-Meteo
707
+ rainfall_city_map = {
708
+ "GB__ASOS": [
709
+ "Manchester", "Leeds", "Birmingham", "London","Glasgow",
710
+ ],
711
+ "AU__ASOS": [
712
+ "Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"
713
+ ],
714
+ "DE__ASOS": [
715
+ "Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"
716
+ ],
717
+ "ZA__ASOS": [
718
+ "Johannesburg", "Cape Town", "Durban", "Pretoria"
719
+ ],
720
+ }
661
721
 
662
- # Choose country
663
- country = country_dict[country]
722
+ # Canada sub-networks
723
+ institute_vector = [
724
+ "CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS", "CA_NU_ASOS"
725
+ ]
726
+ stations_list_canada = [
727
+ [
728
+ "&stations=CYQM", "&stations=CERM", "&stations=CZCR",
729
+ "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"
730
+ ],
731
+ [
732
+ "&stations=CWZZ", "&stations=CYDP", "&stations=CYMH", "&stations=CYAY",
733
+ "&stations=CWDO", "&stations=CXTP", "&stations=CYJT", "&stations=CYYR",
734
+ "&stations=CZUM", "&stations=CYWK", "&stations=CYWK"
735
+ ],
736
+ [
737
+ "&stations=CYHI", "&stations=CZCP", "&stations=CWLI", "&stations=CWND",
738
+ "&stations=CXTV", "&stations=CYVL", "&stations=CYCO", "&stations=CXDE",
739
+ "&stations=CYWE", "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
740
+ "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"
741
+ ],
742
+ [
743
+ "&stations=CWEF", "&stations=CXIB", "&stations=CYQY", "&stations=CYPD",
744
+ "&stations=CXNP", "&stations=CXMY", "&stations=CYAW", "&stations=CWKG",
745
+ "&stations=CWVU", "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"
746
+ ],
747
+ [
748
+ "&stations=CYLT", "&stations=CWEU", "&stations=CWGZ", "&stations=CYIO",
749
+ "&stations=CXSE", "&stations=CYCB", "&stations=CWIL", "&stations=CXWB",
750
+ "&stations=CYZS", "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"
751
+ ]
752
+ ]
664
753
 
665
- # Choose start and end dates
666
- start_day = 1
667
- start_month = 1
668
- start_year = 2014
669
- formatted_date = datetime(start_year, start_month, start_day).strftime("%Y-%m-%d")
754
+ # US states and stations - each sub-network
755
+ us_state_networks = {
756
+ state: f"{state}_ASOS" for state in [
757
+ "AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", "IA", "ID", "IL", "IN",
758
+ "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND",
759
+ "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD",
760
+ "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"
761
+ ]
762
+ }
763
+
764
+ us_stations_map = {
765
+ "AL_ASOS": ["&stations=BHM", "&stations=HSV", "&stations=MGM", "&stations=MOB", "&stations=TCL"],
766
+ "AR_ASOS": ["&stations=LIT", "&stations=FSM", "&stations=TXK", "&stations=HOT", "&stations=FYV"],
767
+ "AZ_ASOS": ["&stations=PHX", "&stations=TUS", "&stations=FLG", "&stations=YUM", "&stations=PRC"],
768
+ "CA_ASOS": ["&stations=LAX", "&stations=SAN", "&stations=SJC", "&stations=SFO", "&stations=FAT"],
769
+ "CO_ASOS": ["&stations=DEN", "&stations=COS", "&stations=GJT", "&stations=PUB", "&stations=ASE"],
770
+ "CT_ASOS": ["&stations=BDL", "&stations=HVN", "&stations=BDR", "&stations=GON", "&stations=HFD"],
771
+ "DE_ASOS": ["&stations=ILG", "&stations=GED", "&stations=DOV"],
772
+ "FL_ASOS": ["&stations=MIA", "&stations=TPA", "&stations=ORL", "&stations=JAX", "&stations=TLH"],
773
+ "GA_ASOS": ["&stations=ATL", "&stations=SAV", "&stations=CSG", "&stations=MCN", "&stations=AGS"],
774
+ "IA_ASOS": ["&stations=DSM", "&stations=CID", "&stations=DBQ", "&stations=ALO", "&stations=SUX"],
775
+ "ID_ASOS": ["&stations=BOI", "&stations=IDA", "&stations=PIH", "&stations=SUN", "&stations=COE"],
776
+ "IL_ASOS": ["&stations=ORD", "&stations=MDW", "&stations=PIA", "&stations=SPI", "&stations=MLI"],
777
+ "IN_ASOS": ["&stations=IND", "&stations=FWA", "&stations=SBN", "&stations=EVV", "&stations=HUF"],
778
+ "KS_ASOS": ["&stations=ICT", "&stations=FOE", "&stations=GCK", "&stations=HYS", "&stations=SLN"],
779
+ "KY_ASOS": ["&stations=SDF", "&stations=LEX", "&stations=CVG", "&stations=PAH", "&stations=BWG"],
780
+ "LA_ASOS": ["&stations=MSY", "&stations=SHV", "&stations=LFT", "&stations=BTR", "&stations=MLU"],
781
+ "MA_ASOS": ["&stations=BOS", "&stations=ORH", "&stations=HYA", "&stations=ACK", "&stations=BED"],
782
+ "MD_ASOS": ["&stations=BWI", "&stations=MTN", "&stations=SBY", "&stations=HGR", "&stations=ADW"],
783
+ "ME_ASOS": ["&stations=PWM", "&stations=BGR", "&stations=CAR", "&stations=PQI", "&stations=RKD"],
784
+ "MI_ASOS": ["&stations=DTW", "&stations=GRR", "&stations=FNT", "&stations=LAN", "&stations=MKG"],
785
+ "MN_ASOS": ["&stations=MSP", "&stations=DLH", "&stations=RST", "&stations=STC", "&stations=INL"],
786
+ "MO_ASOS": ["&stations=STL", "&stations=MCI", "&stations=SGF", "&stations=COU", "&stations=JLN"],
787
+ "MS_ASOS": ["&stations=JAN", "&stations=GPT", "&stations=MEI", "&stations=PIB", "&stations=GLH"],
788
+ "MT_ASOS": ["&stations=BIL", "&stations=MSO", "&stations=GTF", "&stations=HLN", "&stations=BZN"],
789
+ "NC_ASOS": ["&stations=CLT", "&stations=RDU", "&stations=GSO", "&stations=ILM", "&stations=AVL"],
790
+ "ND_ASOS": ["&stations=BIS", "&stations=FAR", "&stations=GFK", "&stations=ISN", "&stations=JMS"],
791
+ "NE_ASOS": ["&stations=OMA"],
792
+ "NH_ASOS": ["&stations=MHT", "&stations=PSM", "&stations=CON", "&stations=LEB", "&stations=ASH"],
793
+ "NJ_ASOS": ["&stations=EWR", "&stations=ACY", "&stations=TTN", "&stations=MMU", "&stations=TEB"],
794
+ "NM_ASOS": ["&stations=ABQ", "&stations=SAF", "&stations=ROW", "&stations=HOB", "&stations=FMN"],
795
+ "NV_ASOS": ["&stations=LAS"],
796
+ "NY_ASOS": ["&stations=JFK", "&stations=LGA", "&stations=BUF", "&stations=ALB", "&stations=SYR"],
797
+ "OH_ASOS": ["&stations=CMH"],
798
+ "OK_ASOS": ["&stations=OKC", "&stations=TUL", "&stations=LAW", "&stations=SWO", "&stations=PNC"],
799
+ "OR_ASOS": ["&stations=PDX"],
800
+ "PA_ASOS": ["&stations=PHL", "&stations=PIT", "&stations=ERI", "&stations=MDT", "&stations=AVP"],
801
+ "RI_ASOS": ["&stations=PVD", "&stations=WST", "&stations=UUU"],
802
+ "SC_ASOS": ["&stations=CHS", "&stations=CAE", "&stations=GSP", "&stations=MYR", "&stations=FLO"],
803
+ "SD_ASOS": ["&stations=FSD", "&stations=RAP", "&stations=PIR", "&stations=ABR", "&stations=YKN"],
804
+ "TN_ASOS": ["&stations=BNA", "&stations=MEM", "&stations=TYS", "&stations=CHA", "&stations=TRI"],
805
+ "TX_ASOS": ["&stations=DFW", "&stations=IAH", "&stations=AUS", "&stations=SAT", "&stations=ELP"],
806
+ "UT_ASOS": ["&stations=SLC", "&stations=OGD", "&stations=PVU", "&stations=SGU", "&stations=CNY"],
807
+ "VA_ASOS": ["&stations=DCA", "&stations=RIC", "&stations=ROA", "&stations=ORF", "&stations=SHD"],
808
+ "VT_ASOS": ["&stations=BTV", "&stations=MPV", "&stations=RUT", "&stations=VSF", "&stations=MVL"],
809
+ "WA_ASOS": ["&stations=SEA", "&stations=GEG", "&stations=TIW", "&stations=VUO", "&stations=BFI"],
810
+ "WI_ASOS": ["&stations=MKE", "&stations=MSN", "&stations=GRB", "&stations=EAU", "&stations=LSE"],
811
+ "WV_ASOS": ["&stations=CRW", "&stations=CKB", "&stations=HTS", "&stations=MGW", "&stations=BKW"],
812
+ "WY_ASOS": ["&stations=CPR", "&stations=JAC", "&stations=SHR", "&stations=COD", "&stations=RKS"],
813
+ }
814
+ # --- Date setup --- #
815
+ date_object = datetime.strptime(start_date, "%Y-%m-%d")
816
+ start_day = date_object.day
817
+ start_month = date_object.month
818
+ start_year = date_object.year
819
+ formatted_date = f"{start_year:04d}-01-01" # "2000-01-01"
670
820
  today = datetime.now()
671
- end_day = today.day
672
- end_month = today.month
673
- end_year = today.year
674
-
675
- if country == "GB__ASOS":
676
- stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
677
- "&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
678
- "&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
679
- "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
680
- "&stations=EGNT"]
681
- elif country == "AU__ASOS":
682
- stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
683
- "&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
684
- "&stations=YPAD", "&stations=YPPH"]
685
- elif country == "USCRN":
686
- stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
687
- "&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
688
- "&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
689
- "&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
690
- elif country == "DE__ASOS":
691
- stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
692
- "&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
693
- "&stations=EDLW", "&stations=EDDM"]
694
- elif country == "FR__ASOS":
695
- stations = ["&stations=LFPB"]
696
- elif country == "Canada":
697
- institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
698
- "CA_NU_ASOS"]
699
- stations_list = [[] for _ in range(5)]
700
- stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
701
- "&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
702
-
703
- stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
704
- "&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
705
- "&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
706
- "&stations=CYWK", "&stations=CYWK"])
707
-
708
- stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
709
- "&stations=CWND", "&stations=CXTV", "&stations=CYVL",
710
- "&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
711
- "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
712
- "&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
713
-
714
- stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
715
- "&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
716
- "&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
717
- "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
718
-
719
- stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
720
- "&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
721
- "&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
722
- "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
723
-
724
- elif country == "ZA__ASOS":
725
- cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
726
- stations = []
821
+ end_day, end_month, end_year = today.day, today.month, today.year
727
822
 
728
- for city in cities:
729
- geolocator = Nominatim(user_agent="MyApp")
730
- location = geolocator.geocode(city)
731
- stations.append(f"&latitude={location.latitude}&longitude={location.longitude}")
823
+ # ------------------------------------------------------------------ #
824
+ # Utility functions
825
+ # ------------------------------------------------------------------ #
826
+ def convert_f_to_c(series_f: pd.Series) -> pd.Series:
827
+ """Convert Fahrenheit to Celsius."""
828
+ return (series_f - 32) * 5.0 / 9.0
829
+
830
+ def fetch_mesonet_data(network: str, stations: list) -> pd.DataFrame:
831
+ """Fetch station-based data (daily) from Iowa Mesonet."""
832
+ import csv
732
833
 
733
- # Temperature
734
- if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
735
- # We start by making a data frame of the following weather stations
736
834
  station_query = ''.join(stations)
835
+ url = (
836
+ "https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
837
+ f"network={network}{station_query}"
838
+ f"&year1={start_year}&month1={start_month}&day1={start_day}"
839
+ f"&year2={end_year}&month2={end_month}&day2={end_day}"
840
+ )
841
+ with urllib.request.urlopen(url) as f:
842
+ df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
843
+ return df
844
+
845
+ def fetch_canada_data() -> pd.DataFrame:
846
+ """Canada uses multiple sub-networks. Combine them all."""
847
+ import csv
848
+ final_df = pd.DataFrame()
849
+ for i, institute_temp in enumerate(institute_vector):
850
+ station_query_temp = ''.join(stations_list_canada[i])
851
+ mesonet_url = (
852
+ "https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
853
+ f"network={institute_temp}{station_query_temp}"
854
+ f"&year1={start_year}&month1={start_month}&day1={start_day}"
855
+ f"&year2={end_year}&month2={end_month}&day2={end_day}"
856
+ )
857
+ with urllib.request.urlopen(mesonet_url) as f:
858
+ temp_df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
737
859
 
738
- raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
739
- station_query,
740
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
741
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
742
- raw_weather = urllib.request.urlopen(raw_weather_list)
743
- raw_weather = pd.read_csv(raw_weather)
744
-
745
- # Replace the occurrences of "None" with Missing Value
746
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
747
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
748
-
749
- # Remove any data that isn't temperature-related
750
- weather = raw_weather.iloc[:, 0:4]
751
-
752
- weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
753
-
754
- # Estimate mean temperature
755
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
756
-
757
- # Convert Fahrenheit to Celsius for max_temp_f
758
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
759
-
760
- # Convert Fahrenheit to Celsius for min_temp_f
761
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
762
-
763
- # Convert Fahrenheit to Celsius for mean_temp_f
764
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
765
-
766
- # Aggregate the data to week commencing sunday taking the average of the data
767
- # Convert the date column to a Date type
768
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
769
-
770
- # Determine the starting chosen day for each date
771
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
772
-
773
- # Group by week_starting and summarize
774
- numeric_columns = weather.select_dtypes(include='number').columns
775
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
776
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
777
- "min_temp_f": "avg_min_temp_f",
778
- "mean_temp_f": "avg_mean_temp_f",
779
- "max_temp_c": "avg_max_temp_c",
780
- "min_temp_c": "avg_min_temp_c",
781
- "mean_temp_c": "avg_mean_temp_c"}, inplace=True)
782
- elif country == "Canada":
783
- for i in range(len(institute_vector)):
784
- station_query_temp = ''.join(flatten_list(stations_list[i]))
785
- institute_temp = institute_vector[i]
786
- raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
787
- station_query_temp,
788
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
789
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
790
- raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
791
- raw_weather_temp = pd.read_csv(raw_weather_temp)
792
-
793
- if len(raw_weather_temp.index) == 0:
794
- continue
795
- raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
860
+ if not temp_df.empty:
861
+ final_df = pd.concat([final_df, temp_df], ignore_index=True)
862
+ return final_df
796
863
 
797
- if i == 1:
798
- raw_weather = raw_weather_temp
799
- else:
800
- raw_weather = pd.concat([raw_weather, raw_weather_temp])
801
-
802
- # Drop error column if it exists
803
- if 'ERROR: Invalid network specified' in list(raw_weather.columns):
804
- raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
805
-
806
- # Replace none values
807
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
808
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
809
- raw_weather["precip_in"].replace("None", 0, inplace=True)
810
-
811
- weather = raw_weather
812
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
813
-
814
- # Estimate mean temperature
815
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
816
-
817
- # Convert Fahrenheit to Celsius for max_temp_f
818
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
819
-
820
- # Convert Fahrenheit to Celsius for min_temp_f
821
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
822
-
823
- # Convert Fahrenheit to Celsius for mean_temp_f
824
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
825
-
826
- # Aggregate the data to week commencing sunday taking the average of the data
827
- # Convert the date column to a Date type
828
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
829
-
830
- # Determine the starting chosen day for each date
831
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
832
-
833
- # Group by week_starting and summarize
834
- numeric_columns = weather.select_dtypes(include='number').columns
835
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
836
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
837
- "min_temp_f": "avg_min_temp_f",
838
- "mean_temp_f": "avg_mean_temp_f",
839
- "max_temp_c": "avg_max_temp_c",
840
- "min_temp_c": "avg_min_temp_c",
841
- "mean_temp_c": "avg_mean_temp_c",
842
- "precip_in": "avg_mean_perc"}, inplace=True)
843
- elif country == "ZA__ASOS":
864
+ def fetch_openmeteo_rain_snow(cities: list) -> pd.DataFrame:
865
+ """
866
+ Fetch daily precipitation_sum (rain) and snowfall_sum (snow) from Open-Meteo.
867
+ Returns columns: ["date", "rain_sum", "snow_sum", "city"] for each day.
868
+ We'll then do a weekly aggregator that yields avg_rain_sum, avg_snow_sum.
869
+ """
844
870
  weather_data_list = []
871
+ geolocator = Nominatim(user_agent="MyApp")
845
872
 
846
873
  for city in cities:
847
- geolocator = Nominatim(user_agent="MyApp")
848
- location = geolocator.geocode(city)
849
- url = "https://archive-api.open-meteo.com/v1/archive"
874
+ loc = geolocator.geocode(city)
875
+ if not loc:
876
+ print(f"Could not find location for {city}, skipping.")
877
+ continue
850
878
 
879
+ url = "https://archive-api.open-meteo.com/v1/archive"
851
880
  params = {
852
- "latitude": location.latitude,
853
- "longitude": location.longitude,
881
+ "latitude": loc.latitude,
882
+ "longitude": loc.longitude,
854
883
  "start_date": formatted_date,
855
884
  "end_date": today.strftime("%Y-%m-%d"),
856
- "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
885
+ "daily": "precipitation_sum,snowfall_sum",
857
886
  "timezone": "auto"
858
887
  }
888
+ resp = requests.get(url, params=params)
889
+ if resp.status_code != 200:
890
+ print(f"[ERROR] open-meteo returned status {resp.status_code} for city={city}")
891
+ continue
892
+ try:
893
+ data_json = resp.json()
894
+ except ValueError:
895
+ print(f"[ERROR] invalid JSON from open-meteo for city={city}")
896
+ continue
859
897
 
860
- response = requests.get(url, params=params)
861
- response_data = response.json()
862
-
863
- daily_data = response_data["daily"]
864
- dates = daily_data["time"]
898
+ daily_block = data_json.get("daily", {})
899
+ if not {"time", "precipitation_sum", "snowfall_sum"}.issubset(daily_block.keys()):
900
+ print(f"[ERROR] missing required keys in open-meteo for city={city}")
901
+ continue
865
902
 
866
- data = pd.DataFrame({
867
- "day": dates,
868
- "max_temp_f": daily_data["temperature_2m_max"],
869
- "min_temp_f": daily_data["temperature_2m_min"],
870
- "precip_in": daily_data["precipitation_sum"]
903
+ df_temp = pd.DataFrame({
904
+ "date": daily_block["time"],
905
+ "rain_sum": daily_block["precipitation_sum"],
906
+ "snow_sum": daily_block["snowfall_sum"]
871
907
  })
872
- data["city"] = city
873
- weather_data_list.append(data)
874
-
875
- weather = pd.concat(weather_data_list)
876
-
877
- # Convert the date column to a Date type
878
- weather["day"] = pd.to_datetime(weather["day"])
879
-
880
- # Replace None values
881
- weather["max_temp_f"].replace("None", 0, inplace=True)
882
- weather["min_temp_f"].replace("None", 0, inplace=True)
883
- weather["precip_in"].replace("None", 0, inplace=True)
884
-
885
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
886
-
887
- # Estimate mean temperature
888
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
889
-
890
- # Convert Fahrenheit to Celsius for max_temp_f
891
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
892
-
893
- # Convert Fahrenheit to Celsius for min_temp_f
894
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
895
-
896
- # Convert Fahrenheit to Celsius for mean_temp_f
897
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
898
-
899
- # Determine the starting chosen day for each date
900
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
901
-
902
- # Group by week_starting and summarize
903
- numeric_columns = weather.select_dtypes(include='number').columns
904
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
905
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
906
- "min_temp_f": "avg_min_temp_f",
907
- "mean_temp_f": "avg_mean_temp_f",
908
- "max_temp_c": "avg_max_temp_c",
909
- "min_temp_c": "avg_min_temp_c",
910
- "mean_temp_c": "avg_mean_temp_c",
911
- "precip_in": "avg_mean_perc"}, inplace=True)
912
-
913
- else:
914
- # We start by making a data frame of the following weather stations
915
- station_query = ''.join(stations)
916
-
917
- raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
918
- station_query,
919
- "&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
920
- "&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
921
- raw_weather = urllib.request.urlopen(raw_weather_list)
922
- raw_weather = pd.read_csv(raw_weather)
923
-
924
- raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
925
-
926
- # Replace the occurrences of "None" with Missing Value
927
- raw_weather["max_temp_f"].replace("None", 0, inplace=True)
928
- raw_weather["min_temp_f"].replace("None", 0, inplace=True)
929
- raw_weather["precip_in"].replace("None", 0, inplace=True)
930
-
931
- # Remove any data that isn't temperature-related
932
- weather = raw_weather
933
-
934
- weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
935
-
936
- # Estimate mean temperature
937
- weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
938
-
939
- # Convert Fahrenheit to Celsius for max_temp_f
940
- weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
941
-
942
- # Convert Fahrenheit to Celsius for min_temp_f
943
- weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
944
-
945
- # Convert Fahrenheit to Celsius for mean_temp_f
946
- weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
947
-
948
- # Aggregate the data to week commencing sunday taking the average of the data
949
- # Convert the date column to a Date type
950
- weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
951
-
952
- # Determine the starting chosen day for each date
953
- weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
954
-
955
- # Group by week_starting and summarize
956
- numeric_columns = weather.select_dtypes(include='number').columns
957
- weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
958
- weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
959
- "min_temp_f": "avg_min_temp_f",
960
- "mean_temp_f": "avg_mean_temp_f",
961
- "max_temp_c": "avg_max_temp_c",
962
- "min_temp_c": "avg_min_temp_c",
963
- "mean_temp_c": "avg_mean_temp_c",
964
- "precip_in": "avg_mean_perc"}, inplace=True)
965
-
966
- # Rainfall
967
- if country == "GB__ASOS":
968
- # Define cities and date range
969
- cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
970
-
971
- start_date = formatted_date
972
- end_date = today.strftime("%Y-%m-%d")
973
-
974
- # Initialize an empty list to store the weather data for each city
975
- weather_data_list = []
976
-
977
- # Loop through each city and fetch weather data
978
- for city in cities:
979
- # Initialize Nominatim API
980
- geolocator = Nominatim(user_agent="MyApp")
981
- location = geolocator.geocode(city)
982
- url = "https://archive-api.open-meteo.com/v1/archive"
983
-
984
- params = {
985
- "latitude": location.latitude,
986
- "longitude": location.longitude,
987
- "start_date": start_date,
988
- "end_date": end_date,
989
- "daily": "precipitation_sum",
990
- "timezone": "auto"
991
- }
992
-
993
- response = requests.get(url, params=params)
994
- response_data = response.json()
995
-
996
- daily_data = response_data["daily"]["precipitation_sum"]
997
- dates = response_data["daily"]["time"]
998
-
999
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1000
- data["city"] = city
1001
-
1002
- weather_data_list.append(data)
1003
-
1004
- # Combine all city data into a single data frame
1005
- all_weather_data = pd.concat(weather_data_list)
1006
-
1007
- # Convert the date column to a Date type
1008
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1009
-
1010
- # Set week commencing col up
1011
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1012
-
1013
- # Group by week_starting and summarize
1014
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
1015
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1016
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1017
-
1018
- # Change index to datetime
1019
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1020
-
1021
- elif country == "AU__ASOS":
1022
-
1023
- # Define cities and date range
1024
- cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
1025
-
1026
- start_date = formatted_date
1027
- end_date = today.strftime("%Y-%m-%d")
1028
-
1029
- # Initialize an empty list to store the weather data for each city
1030
- weather_data_list = []
1031
-
1032
- # Loop through each city and fetch weather data
1033
- for city in cities:
1034
- # Initialize Nominatim API
1035
- geolocator = Nominatim(user_agent="MyApp")
1036
- location = geolocator.geocode(city)
1037
- url = "https://archive-api.open-meteo.com/v1/archive"
1038
-
1039
- params = {
1040
- "latitude": location.latitude,
1041
- "longitude": location.longitude,
1042
- "start_date": start_date,
1043
- "end_date": end_date,
1044
- "daily": "precipitation_sum",
1045
- "timezone": "auto"
1046
- }
1047
-
1048
- response = requests.get(url, params=params)
1049
- response_data = response.json()
1050
-
1051
- daily_data = response_data["daily"]["precipitation_sum"]
1052
- dates = response_data["daily"]["time"]
1053
-
1054
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1055
- data["city"] = city
1056
-
1057
- weather_data_list.append(data)
1058
-
1059
- # Combine all city data into a single data frame
1060
- all_weather_data = pd.concat(weather_data_list)
1061
-
1062
- # Convert the date column to a Date type
1063
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
908
+ df_temp["city"] = city
909
+ weather_data_list.append(df_temp)
1064
910
 
1065
- # Set week commencing col up
1066
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1067
-
1068
- # Group by week_starting and summarize
1069
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
1070
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1071
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1072
-
1073
- # Change index to datetime
1074
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1075
-
1076
- elif country == "DE__ASOS":
1077
-
1078
- # Define cities and date range
1079
- cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
1080
-
1081
- start_date = formatted_date
1082
- end_date = today.strftime("%Y-%m-%d")
1083
-
1084
- # Initialize an empty list to store the weather data for each city
1085
- weather_data_list = []
1086
-
1087
- # Loop through each city and fetch weather data
1088
- for city in cities:
1089
- # Initialize Nominatim API
1090
- geolocator = Nominatim(user_agent="MyApp")
1091
- location = geolocator.geocode(city)
1092
- url = "https://archive-api.open-meteo.com/v1/archive"
1093
-
1094
- params = {
1095
- "latitude": location.latitude,
1096
- "longitude": location.longitude,
1097
- "start_date": start_date,
1098
- "end_date": end_date,
1099
- "daily": "precipitation_sum",
1100
- "timezone": "auto"
1101
- }
1102
-
1103
- response = requests.get(url, params=params)
1104
- response_data = response.json()
1105
-
1106
- daily_data = response_data["daily"]["precipitation_sum"]
1107
- dates = response_data["daily"]["time"]
1108
-
1109
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1110
- data["city"] = city
1111
-
1112
- weather_data_list.append(data)
1113
-
1114
- # Combine all city data into a single data frame
1115
- all_weather_data = pd.concat(weather_data_list)
1116
-
1117
- # Convert the date column to a Date type
1118
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1119
-
1120
- # Set week commencing col up
1121
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1122
-
1123
- # Group by week_starting and summarize
1124
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
1125
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1126
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1127
-
1128
- # Change index to datetime
1129
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1130
-
1131
- elif country == "FR__ASOS":
1132
-
1133
- # Define cities and date range
1134
- cities = ["Paris"]
1135
-
1136
- start_date = formatted_date
1137
- end_date = today.strftime("%Y-%m-%d")
1138
-
1139
- # Initialize an empty list to store the weather data for each city
1140
- weather_data_list = []
1141
-
1142
- # Loop through each city and fetch weather data
1143
- for city in cities:
1144
- # Initialize Nominatim API
1145
- geolocator = Nominatim(user_agent="MyApp")
1146
- location = geolocator.geocode(city)
1147
- url = "https://archive-api.open-meteo.com/v1/archive"
1148
-
1149
- params = {
1150
- "latitude": location.latitude,
1151
- "longitude": location.longitude,
1152
- "start_date": start_date,
1153
- "end_date": end_date,
1154
- "daily": "precipitation_sum",
1155
- "timezone": "auto"
1156
- }
1157
-
1158
- response = requests.get(url, params=params)
1159
- response_data = response.json()
1160
-
1161
- daily_data = response_data["daily"]["precipitation_sum"]
1162
- dates = response_data["daily"]["time"]
1163
-
1164
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1165
- data["city"] = city
1166
-
1167
- weather_data_list.append(data)
1168
-
1169
- # Combine all city data into a single data frame
1170
- all_weather_data = pd.concat(weather_data_list)
1171
-
1172
- # Convert the date column to a Date type
1173
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1174
-
1175
- # Set week commencing col up
1176
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1177
-
1178
- # Group by week_starting and summarize
1179
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
1180
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1181
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1182
-
1183
- # Change index to datetime
1184
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1185
-
1186
- elif country == "ZA__ASOS":
1187
- cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
1188
- start_date = formatted_date
1189
- end_date = today.strftime("%Y-%m-%d")
1190
-
1191
- weather_data_list = []
1192
-
1193
- for city in cities:
1194
- geolocator = Nominatim(user_agent="MyApp")
1195
- location = geolocator.geocode(city)
1196
- url = "https://archive-api.open-meteo.com/v1/archive"
911
+ if weather_data_list:
912
+ return pd.concat(weather_data_list, ignore_index=True)
913
+ else:
914
+ return pd.DataFrame()
1197
915
 
1198
- params = {
1199
- "latitude": location.latitude,
1200
- "longitude": location.longitude,
1201
- "start_date": start_date,
1202
- "end_date": end_date,
1203
- "daily": "precipitation_sum",
1204
- "timezone": "auto"
1205
- }
916
+ def weekly_aggregate_temp_mesonet(df: pd.DataFrame) -> pd.DataFrame:
917
+ """
918
+ For NON-US mesonet data, we only keep max_temp_f, min_temp_f,
919
+ then compute mean_temp_f, plus Celsius, and do weekly average.
920
+ """
921
+ import pandas as pd
922
+
923
+ # Convert day col
924
+ if "day" not in df.columns:
925
+ return pd.DataFrame()
926
+
927
+ # Only keep relevant columns
928
+ keep_cols = []
929
+ for c in ["day", "max_temp_f", "min_temp_f"]:
930
+ if c in df.columns:
931
+ keep_cols.append(c)
932
+ df = df[keep_cols].copy()
933
+
934
+ # Convert "None" => numeric
935
+ for c in ["max_temp_f", "min_temp_f"]:
936
+ if c in df.columns:
937
+ df[c] = df[c].replace("None", pd.NA)
938
+ df[c] = pd.to_numeric(df[c], errors="coerce")
939
+
940
+ df["day"] = pd.to_datetime(df["day"], errors="coerce")
941
+ df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
942
+ df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
943
+ df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
944
+ df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
945
+
946
+ # Group by "week_starting"
947
+ df["week_starting"] = df["day"].apply(
948
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
949
+ if pd.notnull(x) else pd.NaT
950
+ )
951
+ numeric_cols = df.select_dtypes(include='number').columns
952
+ weekly = df.groupby("week_starting")[numeric_cols].mean()
953
+
954
+ # Rename columns
955
+ rename_map = {
956
+ "max_temp_f": "avg_max_temp_f",
957
+ "min_temp_f": "avg_min_temp_f",
958
+ "mean_temp_f": "avg_mean_temp_f",
959
+ "max_temp_c": "avg_max_temp_c",
960
+ "min_temp_c": "avg_min_temp_c",
961
+ "mean_temp_c": "avg_mean_temp_c",
962
+ }
963
+ weekly.rename(columns=rename_map, inplace=True)
964
+
965
+ # Return as a DataFrame w/ index = week_starting
966
+ return weekly
967
+
968
+ def weekly_aggregate_rain_snow_openmeteo(df: pd.DataFrame) -> pd.DataFrame:
969
+ """
970
+ For NON-US, from open-meteo, we have daily columns 'date','rain_sum','snow_sum'.
971
+ We'll do weekly average of each. -> 'avg_rain_sum', 'avg_snow_sum'.
972
+ """
973
+ import pandas as pd
974
+ if "date" not in df.columns:
975
+ return pd.DataFrame()
976
+
977
+ df["date"] = pd.to_datetime(df["date"], errors="coerce")
978
+ df["week_starting"] = df["date"].apply(
979
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
980
+ if pd.notnull(x) else pd.NaT
981
+ )
1206
982
 
1207
- response = requests.get(url, params=params)
1208
- response_data = response.json()
983
+ # Convert to numeric
984
+ for c in ["rain_sum", "snow_sum"]:
985
+ if c in df.columns:
986
+ df[c] = pd.to_numeric(df[c], errors="coerce")
1209
987
 
1210
- daily_data = response_data["daily"]["precipitation_sum"]
1211
- dates = response_data["daily"]["time"]
988
+ numeric_cols = df.select_dtypes(include='number').columns
989
+ weekly = df.groupby("week_starting")[numeric_cols].mean()
1212
990
 
1213
- data = pd.DataFrame({"date": dates, "rainfall": daily_data})
1214
- data["city"] = city
991
+ rename_map = {
992
+ "rain_sum": "avg_rain_sum",
993
+ "snow_sum": "avg_snow_sum"
994
+ }
995
+ weekly.rename(columns=rename_map, inplace=True)
996
+ return weekly
1215
997
 
1216
- weather_data_list.append(data)
998
+ def weekly_aggregate_us(df: pd.DataFrame) -> pd.DataFrame:
999
+ """
1000
+ For US Mesonet data (per state), we keep max_temp_f, min_temp_f, precip_in, snow_in,
1001
+ then compute mean_temp_f & convert to celsius, group weekly.
1002
+ We'll rename:
1003
+ max_temp_f -> avg_max_temp_f
1004
+ min_temp_f -> avg_min_temp_f
1005
+ mean_temp_f -> avg_mean_temp_f
1006
+ precip_in -> avg_rain_sum
1007
+ snow_in -> avg_snow_sum
1008
+ """
1009
+ import pandas as pd
1010
+ if "day" not in df.columns:
1011
+ return pd.DataFrame()
1012
+
1013
+ # Convert day
1014
+ df["day"] = pd.to_datetime(df["day"], errors="coerce")
1015
+
1016
+ # Convert "None" => numeric
1017
+ for c in ["max_temp_f", "min_temp_f", "precip_in", "snow_in"]:
1018
+ if c in df.columns:
1019
+ df[c] = df[c].replace("None", pd.NA)
1020
+ df[c] = pd.to_numeric(df[c], errors="coerce")
1021
+
1022
+ # Compute mean_temp_f, celsius
1023
+ df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
1024
+ df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
1025
+ df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
1026
+ df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
1027
+
1028
+ # Weekly grouping
1029
+ df["week_starting"] = df["day"].apply(
1030
+ lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
1031
+ if pd.notnull(x) else pd.NaT
1032
+ )
1033
+ numeric_cols = df.select_dtypes(include='number').columns
1034
+ weekly = df.groupby("week_starting")[numeric_cols].mean()
1035
+
1036
+ rename_map = {
1037
+ "max_temp_f": "avg_max_temp_f",
1038
+ "min_temp_f": "avg_min_temp_f",
1039
+ "mean_temp_f": "avg_mean_temp_f",
1040
+ "max_temp_c": "avg_max_temp_c",
1041
+ "min_temp_c": "avg_min_temp_c",
1042
+ "mean_temp_c": "avg_mean_temp_c",
1043
+ "precip_in": "avg_rain_sum",
1044
+ "snow_in": "avg_snow_sum"
1045
+ }
1046
+ weekly.rename(columns=rename_map, inplace=True)
1047
+ return weekly
1048
+
1049
+ def rename_with_prefix(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
1050
+ """Rename all columns except 'week_starting' or 'OBS' with the given prefix."""
1051
+ df2 = df.copy()
1052
+ new_cols = {}
1053
+ for col in df2.columns:
1054
+ if col not in ["week_starting", "OBS"]:
1055
+ new_cols[col] = prefix + col
1056
+ df2.rename(columns=new_cols, inplace=True)
1057
+ return df2
1058
+
1059
+ # ------------------------------------------------------------------ #
1060
+ # The final combined DataFrame
1061
+ # ------------------------------------------------------------------ #
1062
+ combined_df = pd.DataFrame()
1217
1063
 
1218
- # Combine all city data into a single data frame
1219
- all_weather_data = pd.concat(weather_data_list)
1064
+ # ------------------------------------------------------------------ #
1065
+ # 1) Loop over each requested country
1066
+ # ------------------------------------------------------------------ #
1067
+ for country_code in country_codes:
1068
+ net = country_dict.get(country_code)
1069
+ if net is None:
1070
+ print(f"Warning: Invalid country_code '{country_code}' – skipping.")
1071
+ continue
1220
1072
 
1221
- # Convert the date column to a Date type
1222
- all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
1073
+ # =========================
1074
+ # 2) Special Logic for US
1075
+ # =========================
1076
+ if net == "US_STATES":
1077
+ for state_code, network_code in us_state_networks.items():
1078
+ stations = us_stations_map.get(network_code, [])
1079
+ if not stations:
1080
+ print(f"[DEBUG] No stations for {network_code}, skipping.")
1081
+ continue
1082
+
1083
+ raw_df = fetch_mesonet_data(network_code, stations)
1084
+ if raw_df.empty:
1085
+ print(f"[DEBUG] DataFrame empty for {network_code}, skipping.")
1086
+ continue
1087
+
1088
+ weekly_state = weekly_aggregate_us(raw_df)
1089
+ if weekly_state.empty:
1090
+ print(f"[DEBUG] Aggregated weekly DataFrame empty for {network_code}, skipping.")
1091
+ continue
1092
+
1093
+ weekly_state.reset_index(inplace=True)
1094
+ weekly_state.rename(columns={"week_starting": "OBS"}, inplace=True)
1095
+
1096
+ # Now rename columns with prefix: seas_us_{statecode}_
1097
+ prefix = f"seas_us_{state_code.lower()}_"
1098
+ weekly_state = rename_with_prefix(weekly_state, prefix)
1099
+
1100
+ # Merge into combined
1101
+ if combined_df.empty:
1102
+ combined_df = weekly_state
1103
+ else:
1104
+ combined_df = pd.merge(combined_df, weekly_state, on="OBS", how="outer")
1105
+
1106
+ # Done with the US. Move on to the next country in the loop
1107
+ continue
1223
1108
 
1224
- # Set week commencing col up
1225
- all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1109
+ # =======================================
1110
+ # 3) Logic for Non-US (AU, GB, DE, CA, ZA)
1111
+ # =======================================
1112
+ # A) Fetch temperature data from Mesonet
1113
+ if net == "Canada":
1114
+ raw_temp = fetch_canada_data()
1115
+ else:
1116
+ # e.g. "GB__ASOS", "AU__ASOS", "DE__ASOS", "ZA__ASOS" (if added)
1117
+ stations = station_map.get(net, [])
1118
+ if not stations and net != "ZA__ASOS":
1119
+ # If we have no stations for net and it's not ZA,
1120
+ # there's no data. (If ZA has stations, add them above.)
1121
+ raw_temp = pd.DataFrame()
1122
+ else:
1123
+ raw_temp = fetch_mesonet_data(net, stations)
1124
+
1125
+ weekly_temp = pd.DataFrame()
1126
+ if not raw_temp.empty:
1127
+ # For these countries, we only keep max_temp_f, min_temp_f, mean_temp_f
1128
+ weekly_temp = weekly_aggregate_temp_mesonet(raw_temp)
1129
+
1130
+ # B) Fetch rain+snow from Open-Meteo (only if we have an entry in rainfall_city_map)
1131
+ weekly_precip = pd.DataFrame()
1132
+ if net in rainfall_city_map:
1133
+ city_list = rainfall_city_map[net]
1134
+ df_rain_snow = fetch_openmeteo_rain_snow(city_list)
1135
+ if not df_rain_snow.empty:
1136
+ weekly_precip = weekly_aggregate_rain_snow_openmeteo(df_rain_snow)
1137
+
1138
+ # C) Merge the temperature data + precip/snow data on the weekly index
1139
+ if not weekly_temp.empty and not weekly_precip.empty:
1140
+ merged_df = pd.merge(weekly_temp, weekly_precip, left_index=True, right_index=True, how="outer")
1141
+ elif not weekly_temp.empty:
1142
+ merged_df = weekly_temp
1143
+ else:
1144
+ merged_df = weekly_precip
1226
1145
 
1227
- # Group by week_starting and summarize
1228
- numeric_columns = all_weather_data.select_dtypes(include='number').columns
1229
- weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
1230
- weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
1146
+ if merged_df.empty:
1147
+ print(f"No data retrieved for country: {country_code}")
1148
+ continue
1231
1149
 
1232
- # Change index to datetime
1233
- weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
1150
+ # D) Convert index -> a column OBS
1151
+ merged_df.reset_index(inplace=True)
1152
+ merged_df.rename(columns={"week_starting": "OBS"}, inplace=True)
1234
1153
 
1235
- # Merge the dataframes
1236
- if country in ["AU__ASOS", "DE__ASOS", "FR__ASOS", "GB__ASOS", "ZA__ASOS"]:
1237
- merged_df = weekly_avg_rain.merge(weekly_avg_temp, on="week_starting")
1238
- else:
1239
- merged_df = weekly_avg_temp
1154
+ # E) Rename with prefix = "seas_{country_code}_"
1155
+ prefix = f"seas_{country_code.lower()}_"
1156
+ merged_df = rename_with_prefix(merged_df, prefix)
1240
1157
 
1241
- merged_df.reset_index(drop=False, inplace=True)
1242
- merged_df.rename(columns={'week_starting': 'OBS'}, inplace=True)
1158
+ # F) Merge into combined_df
1159
+ if combined_df.empty:
1160
+ combined_df = merged_df
1161
+ else:
1162
+ combined_df = pd.merge(combined_df, merged_df, on="OBS", how="outer")
1243
1163
 
1244
- final_weather = ims_proc.rename_cols(merged_df, 'seas_')
1164
+ # ------------------------------------------------------------------ #
1165
+ # 4) Sort final by OBS (optional)
1166
+ # ------------------------------------------------------------------ #
1167
+ if not combined_df.empty:
1168
+ combined_df.sort_values(by="OBS", inplace=True)
1245
1169
 
1246
- return final_weather
1247
-
1170
+ return combined_df
1171
+
1248
1172
  def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
1249
1173
  """
1250
1174
  Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
@@ -1481,3 +1405,228 @@ class datapull:
1481
1405
  print("No data available to process.")
1482
1406
  return pd.DataFrame()
1483
1407
 
1408
+ def pull_sports_events(self, start_date="2020-01-01", week_commencing="mon"):
1409
+ """
1410
+ Combines scraping logic for:
1411
+ - UEFA Champions League and NFL from TheSportsDB (website-scraping approach)
1412
+ - FIFA World Cup, UEFA Euro, Rugby World Cup, Six Nations (via TheSportsDB API)
1413
+
1414
+ Returns a single merged DataFrame with all event dummy variables.
1415
+ """
1416
+
1417
+ ############################################################
1418
+ # 1) SCRAPE UEFA CHAMPIONS LEAGUE & NFL (TheSportsDB website scraping)
1419
+ ############################################################
1420
+ def scrape_sports_events(start_date=start_date, week_commencing=week_commencing):
1421
+ sports = {
1422
+ "uefa_champions_league": {
1423
+ "league_id": "4480",
1424
+ "seasons_url": "https://www.thesportsdb.com/league/4480-UEFA-Champions-League?a=1#allseasons",
1425
+ "season_url_template": "https://www.thesportsdb.com/season/4480-UEFA-Champions-League/{season}&all=1&view=",
1426
+ "round_filters": ["quarter", "semi", "final"]
1427
+ },
1428
+ "nfl": {
1429
+ "league_id": "4391",
1430
+ "seasons_url": "https://www.thesportsdb.com/league/4391-NFL?a=1#allseasons",
1431
+ "season_url_template": "https://www.thesportsdb.com/season/4391-NFL/{season}&all=1&view=",
1432
+ "round_filters": ["quarter", "semi", "final"]
1433
+ }
1434
+ }
1435
+
1436
+ headers = {"User-Agent": "Mozilla/5.0"}
1437
+ start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
1438
+
1439
+ # Create a full date range DataFrame
1440
+ full_date_range = pd.date_range(start=start_date, end=pd.to_datetime("today"))
1441
+ time_series_df = pd.DataFrame({"date": full_date_range})
1442
+ time_series_df["seas_uefa_champions_league"] = 0
1443
+ time_series_df["seas_nfl"] = 0
1444
+
1445
+ for sport, details in sports.items():
1446
+ # Get available seasons
1447
+ response = requests.get(details["seasons_url"], headers=headers)
1448
+ if response.status_code != 200:
1449
+ continue # Skip this sport if the request fails
1450
+
1451
+ soup = BeautifulSoup(response.text, "html.parser")
1452
+
1453
+ # Extract season names
1454
+ seasons = []
1455
+ for link in soup.find_all("a", href=True):
1456
+ href = link["href"]
1457
+ if "season" in href and sport.replace("_", "-") in href.lower():
1458
+ season_name = href.split("/")[-1] # e.g. "2023-2024"
1459
+ try:
1460
+ season_start_year = int(season_name.split("-")[0])
1461
+ season_start_date = datetime(season_start_year, 1, 1)
1462
+ if season_start_date >= start_date_dt:
1463
+ seasons.append(season_name)
1464
+ except ValueError:
1465
+ continue
1466
+
1467
+ # Scrape matches for filtered seasons
1468
+ filtered_matches = []
1469
+ for season in seasons:
1470
+ season_url = details["season_url_template"].format(season=season)
1471
+ season_response = requests.get(season_url, headers=headers)
1472
+ if season_response.status_code != 200:
1473
+ continue
1474
+
1475
+ season_soup = BeautifulSoup(season_response.text, "html.parser")
1476
+ for row in season_soup.find_all("tr"):
1477
+ cols = row.find_all("td")
1478
+ if len(cols) >= 5:
1479
+ match_date = cols[0].text.strip()
1480
+ round_name = cols[1].text.strip().lower()
1481
+ try:
1482
+ match_date_dt = datetime.strptime(match_date, "%d %b %y")
1483
+ if (match_date_dt >= start_date_dt
1484
+ and any(r in round_name for r in details["round_filters"])):
1485
+ filtered_matches.append(match_date_dt)
1486
+ except ValueError:
1487
+ continue
1488
+
1489
+ # Convert matches into time series format
1490
+ df_sport = pd.DataFrame({"date": filtered_matches})
1491
+ if df_sport.empty:
1492
+ continue
1493
+
1494
+ col_name = "seas_nfl" if sport == "nfl" else "seas_uefa_champions_league"
1495
+ time_series_df.loc[time_series_df["date"].isin(df_sport["date"]), col_name] = 1
1496
+
1497
+ # Aggregate by week commencing
1498
+ day_offsets = {
1499
+ 'mon': 'W-MON',
1500
+ 'tues': 'W-TUE',
1501
+ 'wed': 'W-WED',
1502
+ 'thurs': 'W-THU',
1503
+ 'fri': 'W-FRI',
1504
+ 'sat': 'W-SAT',
1505
+ 'sun': 'W-SUN'
1506
+ }
1507
+ if week_commencing.lower() not in day_offsets:
1508
+ raise ValueError(f"Invalid week_commencing value: {week_commencing}. Must be one of {list(day_offsets.keys())}.")
1509
+
1510
+ time_series_df = (time_series_df
1511
+ .set_index("date")
1512
+ .resample(day_offsets[week_commencing.lower()])
1513
+ .max()
1514
+ .reset_index())
1515
+
1516
+ time_series_df.rename(columns={"date": "OBS"}, inplace=True)
1517
+ time_series_df.fillna(0, inplace=True)
1518
+
1519
+ return time_series_df
1520
+
1521
+ ############################################################
1522
+ # 2) FETCH FIFA WORLD CUP, UEFA EURO, RUGBY WORLD CUP, SIX NATIONS (TheSportsDB API)
1523
+ ############################################################
1524
+ def fetch_events(start_date=start_date, week_commencing=week_commencing):
1525
+ # Initialize date range
1526
+ start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
1527
+ end_date_obj = datetime.today()
1528
+ date_range = pd.date_range(start=start_date_obj, end=end_date_obj)
1529
+ df = pd.DataFrame({'OBS': date_range}).set_index('OBS')
1530
+
1531
+ # Define columns for sports
1532
+ event_columns = {
1533
+ 'seas_fifa_world_cup': {
1534
+ 'league_id': 4429, 'start_year': 1950, 'interval': 4
1535
+ },
1536
+ 'seas_uefa_european_championship': {
1537
+ 'league_id': 4502, 'start_year': 1960, 'interval': 4, 'extra_years': [2021]
1538
+ },
1539
+ 'seas_rugby_world_cup': {
1540
+ 'league_id': 4574, 'start_year': 1987, 'interval': 4
1541
+ },
1542
+ 'seas_six_nations': {
1543
+ 'league_id': 4714, 'start_year': 2000, 'interval': 1
1544
+ },
1545
+ }
1546
+
1547
+ # Initialize columns
1548
+ for col in event_columns.keys():
1549
+ df[col] = 0
1550
+
1551
+ def fetch_league_events(league_id, column_name, start_year, interval, extra_years=None):
1552
+ extra_years = extra_years or []
1553
+ # Fetch seasons
1554
+ seasons_url = f"https://www.thesportsdb.com/api/v1/json/3/search_all_seasons.php?id={league_id}"
1555
+ seasons_response = requests.get(seasons_url)
1556
+ if seasons_response.status_code != 200:
1557
+ return # Skip on failure
1558
+
1559
+ seasons_data = seasons_response.json().get('seasons', [])
1560
+ for season in seasons_data:
1561
+ season_name = season.get('strSeason', '')
1562
+ if not season_name.isdigit():
1563
+ continue
1564
+
1565
+ year = int(season_name)
1566
+ # Check if the year is valid for this competition
1567
+ if year in extra_years or (year >= start_year and (year - start_year) % interval == 0):
1568
+ # Fetch events
1569
+ events_url = f"https://www.thesportsdb.com/api/v1/json/3/eventsseason.php?id={league_id}&s={season_name}"
1570
+ events_response = requests.get(events_url)
1571
+ if events_response.status_code != 200:
1572
+ continue
1573
+
1574
+ events_data = events_response.json().get('events', [])
1575
+ for event in events_data:
1576
+ event_date_str = event.get('dateEvent')
1577
+ if event_date_str:
1578
+ event_date = datetime.strptime(event_date_str, '%Y-%m-%d')
1579
+ if event_date in df.index:
1580
+ df.loc[event_date, column_name] = 1
1581
+
1582
+ # Fetch events for all defined leagues
1583
+ for column_name, params in event_columns.items():
1584
+ fetch_league_events(
1585
+ league_id=params['league_id'],
1586
+ column_name=column_name,
1587
+ start_year=params['start_year'],
1588
+ interval=params['interval'],
1589
+ extra_years=params.get('extra_years', [])
1590
+ )
1591
+
1592
+ # Resample by week
1593
+ day_offsets = {
1594
+ 'mon': 'W-MON',
1595
+ 'tues': 'W-TUE',
1596
+ 'wed': 'W-WED',
1597
+ 'thurs': 'W-THU',
1598
+ 'fri': 'W-FRI',
1599
+ 'sat': 'W-SAT',
1600
+ 'sun': 'W-SUN'
1601
+ }
1602
+
1603
+ if week_commencing.lower() not in day_offsets:
1604
+ raise ValueError(
1605
+ f"Invalid week_commencing value: {week_commencing}. "
1606
+ f"Must be one of {list(day_offsets.keys())}."
1607
+ )
1608
+
1609
+ df = df.resample(day_offsets[week_commencing.lower()]).max()
1610
+ df = df.reset_index()
1611
+ return df
1612
+
1613
+ ###################################################
1614
+ # 3) CALL BOTH, THEN MERGE ON "OBS" & FILL WITH 0s
1615
+ ###################################################
1616
+ df_uefa_nfl = scrape_sports_events(start_date, week_commencing)
1617
+ df_other_events = fetch_events(start_date, week_commencing)
1618
+
1619
+ # Merge on "OBS" column (outer join to preserve all dates in range)
1620
+ final_df = pd.merge(df_uefa_nfl, df_other_events, on='OBS', how='outer')
1621
+
1622
+ # Fill any NaNs with 0 for event columns
1623
+ # (Only fill numeric columns or everything except 'OBS')
1624
+ for col in final_df.columns:
1625
+ if col != 'OBS':
1626
+ final_df[col] = final_df[col].fillna(0)
1627
+
1628
+ # Sort by date just in case
1629
+ final_df.sort_values(by='OBS', inplace=True)
1630
+ final_df.reset_index(drop=True, inplace=True)
1631
+
1632
+ return final_df
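A standalone sketch of the weekly aggregation step used in pull_sports_events (note that this method spells some week_commencing keys 'tues' and 'thurs', unlike pull_weather's 'tue' and 'thur'): daily 0/1 event dummies are resampled onto a weekly anchor and collapsed with max(), so a week is flagged 1 if any of its days had a qualifying event.

    import pandas as pd

    # Daily dummy series; Super Bowl LVIII fell on Sunday 11 February 2024.
    daily = pd.DataFrame(
        {"seas_nfl": 0},
        index=pd.date_range("2024-02-05", "2024-02-18", name="OBS"),
    )
    daily.loc["2024-02-11", "seas_nfl"] = 1

    # Same resample rule the code above uses for week_commencing="mon".
    weekly = daily.resample("W-MON").max().reset_index()
    print(weekly)  # the week containing 11 Feb carries seas_nfl == 1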