imsciences-0.9.5.4-py3-none-any.whl → imsciences-0.9.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of imsciences might be problematic.
- imsciences/pull.py +726 -577
- imsciences/unittesting.py +0 -1
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/METADATA +25 -22
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/RECORD +8 -8
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/LICENSE.txt +0 -0
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/PKG-INFO-IMS-24Ltp-3 +0 -0
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/WHEEL +0 -0
- {imsciences-0.9.5.4.dist-info → imsciences-0.9.5.5.dist-info}/top_level.txt +0 -0
imsciences/pull.py CHANGED
@@ -11,6 +11,8 @@ from bs4 import BeautifulSoup
 import yfinance as yf
 import holidays
 from dateutil.easter import easter
+import urllib.request
+from geopy.geocoders import Nominatim
 
 from imsciences.mmm import dataprocessing
 
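The two new imports serve the rewritten weather puller later in this diff: `urllib.request` fetches daily CSVs from the Iowa Mesonet endpoint, and geopy's `Nominatim` geocoder turns city names into coordinates for the Open-Meteo archive API. A minimal sketch of that geocoding pattern, mirroring the calls added in this diff (network access required; "Manchester" is just one of the cities listed in the new city map):

```python
# Minimal sketch of the Nominatim geocoding pattern used by the new weather code.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="MyApp")  # same user-agent string as in the diff
loc = geolocator.geocode("Manchester")
if loc:
    print(loc.latitude, loc.longitude)  # fed to Open-Meteo as latitude/longitude
```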
@@ -48,8 +50,8 @@ class datapull:
 
 print("\n6. pull_weather")
 print(" - Description: Fetch and process historical weather data for the specified country.")
-print(" - Usage: pull_weather(week_commencing, country)")
-print(" - Example: pull_weather('mon', 'GBR')")
+print(" - Usage: pull_weather(week_commencing, start_date, country)")
+print(" - Example: pull_weather('mon', '2020-01-01', ['GBR'])")
 
 print("\n7. pull_macro_ons_uk")
 print(" - Description: Fetch and process time series data from the Beta ONS API.")
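The updated usage strings reflect the new signature `pull_weather(week_commencing, start_date, country)` introduced later in this diff, which now accepts a start date and one or more country codes. Note that the printed example passes 'GBR' while the country map added further down keys on two-letter codes such as 'GB', so the sketch below uses 'GB'. A hypothetical usage sketch (how the `datapull` class is instantiated is an assumption, not shown in this diff):

```python
# Hypothetical usage sketch; the method name and arguments come from this diff,
# but instantiation of the datapull class is an assumption.
from imsciences.pull import datapull

dp = datapull()
# Weekly weather aggregates, weeks commencing Monday, history from 2020-01-01.
weather = dp.pull_weather("mon", "2020-01-01", ["GB"])
print(weather.head())
```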
@@ -60,6 +62,11 @@ class datapull:
 print(" - Description: Fetch and process time series data from the Beta ONS API.")
 print(" - Usage: pull_yfinance(tickers, week_start_day)")
 print(" - Example: pull_yfinance(['^FTMC', '^IXIC'], 'mon')")
+
+print("\n9. pull_sports_events")
+print(" - Description: Pull a veriety of sports events primaraly football and rugby.")
+print(" - Usage: pull_sports_events(start_date, week_commencing)")
+print(" - Example: pull_sports_events('2020-01-01', 'mon')")
 
 ############################################################### MACRO ##########################################################################
 
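As with pull_weather above, a hedged usage sketch for the new sports helper, mirroring the printed example (the merged DataFrame of event dummies is what the function's docstring, added at the end of this diff, says it returns):

```python
# Hypothetical usage sketch based on the printed example above; instantiation
# of the datapull class is an assumption.
from imsciences.pull import datapull

dp = datapull()
# Football and rugby event dummies from 2020-01-01, weeks commencing Monday.
events = dp.pull_sports_events("2020-01-01", "mon")
print(events.head())
```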
@@ -507,15 +514,6 @@ class datapull:
 fathers_day = nth_weekday_of_month(yr, 6, 6, 3) # Sunday=6
 # Mother's Day US = 2nd Sunday in May
 mothers_day_us = nth_weekday_of_month(yr, 5, 6, 2)
-# Mother's Day UK: 4th Sunday in Lent => "Mothering Sunday"
-# We can approximate as: Easter Sunday - 21 days
-# BUT we also must ensure it's actually Sunday
-# (the 4th Sunday in Lent can shift. We'll do the official approach below.)
-# Another approach: Easter Sunday - 7 * (4 weeks) is the 4th Sunday prior to Easter.
-# But that might overshoot if Lent started mid-week.
-# Let's do a quick approach:
-# Officially: Mothering Sunday = 3 weeks before Easter Sunday (the 4th Sunday is Easter Sunday itself).
-# So Easter - 21 days should be the Sunday, but let's confirm with weekday check.
 mothering_sunday = easter(yr) - timedelta(days=21)
 # If for some reason that's not a Sunday (rare corner cases), shift to Sunday:
 while mothering_sunday.weekday() != 6: # Sunday=6
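The nine comment lines removed here explained the rule that the retained code still applies: Mothering Sunday is taken as Easter Sunday minus 21 days, with a weekday check as a safeguard. A standalone check of that rule, using the same dateutil import as pull.py (the direction of the shift inside the `while` loop is assumed here, since the loop body falls outside this hunk):

```python
# Standalone check of the Mothering Sunday rule kept in the code above.
from datetime import timedelta
from dateutil.easter import easter

yr = 2024
mothering_sunday = easter(yr) - timedelta(days=21)
while mothering_sunday.weekday() != 6:  # Sunday == 6; shift direction assumed
    mothering_sunday -= timedelta(days=1)
print(mothering_sunday)  # 2024-03-10; Easter 2024 fell on 2024-03-31
```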
@@ -641,610 +639,536 @@ class datapull:
|
|
|
641
639
|
|
|
642
640
|
return df_combined
|
|
643
641
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
642
|
+
def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
|
|
643
|
+
"""
|
|
644
|
+
Pull weather data for a given week-commencing day and one or more country codes.
|
|
645
|
+
|
|
646
|
+
LOGIC:
|
|
647
|
+
1) For non-US countries (AU, GB, DE, CA, ZA):
|
|
648
|
+
- Mesonet => max_temp_f, min_temp_f -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', etc.
|
|
649
|
+
- Open-Meteo => precipitation_sum => 'avg_rain_sum', snowfall_sum => 'avg_snow_sum'.
|
|
650
|
+
- Merge, then rename columns with prefix 'seas_{country}_'.
|
|
651
|
+
|
|
652
|
+
2) For the US:
|
|
653
|
+
- We have multiple <STATE>_ASOS networks (e.g. CA_ASOS, TX_ASOS).
|
|
654
|
+
- For each state, fetch from Mesonet => max_temp_f, min_temp_f, precip_in, snow_in -> compute mean_temp_f -> weekly average => 'avg_max_temp_f', 'avg_rain_sum', 'avg_snow_sum', etc.
|
|
655
|
+
- Rename columns for each state with prefix 'seas_us_{state}_'.
|
|
656
|
+
- Merge all states (and countries) into a single DataFrame.
|
|
657
|
+
|
|
658
|
+
:param week_commencing: A string in {"mon","tue","wed","thur","fri","sat","sun"}.
|
|
659
|
+
:param country_codes: A list of 2-letter country codes or a single string, e.g. ["GB","US"].
|
|
660
|
+
:return: A single Pandas DataFrame with weekly-aggregated data for all requested countries.
|
|
661
|
+
"""
|
|
662
|
+
# ------------------------------------------------------------------ #
|
|
663
|
+
# 0) Handle either a single code or list of codes
|
|
664
|
+
# ------------------------------------------------------------------ #
|
|
665
|
+
if isinstance(country_codes, str):
|
|
666
|
+
country_codes = [country_codes]
|
|
667
|
+
elif not isinstance(country_codes, (list, tuple)):
|
|
668
|
+
raise ValueError("country_codes must be a list/tuple or a single string.")
|
|
669
|
+
|
|
670
|
+
# --- Setup / Constants --- #
|
|
653
671
|
day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
|
|
672
|
+
# Map each 2-letter code to a key
|
|
673
|
+
country_dict = {
|
|
674
|
+
"US": "US_STATES",
|
|
675
|
+
"CA": "Canada",
|
|
676
|
+
"AU": "AU__ASOS",
|
|
677
|
+
"GB": "GB__ASOS",
|
|
678
|
+
"DE": "DE__ASOS",
|
|
679
|
+
"ZA": "ZA__ASOS"
|
|
680
|
+
}
|
|
654
681
|
|
|
655
|
-
#
|
|
656
|
-
|
|
682
|
+
# Station-based countries for Mesonet
|
|
683
|
+
station_map = {
|
|
684
|
+
"GB__ASOS": [
|
|
685
|
+
"&stations=EGCC", "&stations=EGNM", "&stations=EGBB", "&stations=EGSH",
|
|
686
|
+
"&stations=EGFF", "&stations=EGHI", "&stations=EGLC", "&stations=EGHQ",
|
|
687
|
+
"&stations=EGAC", "&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
|
|
688
|
+
"&stations=EGNT"
|
|
689
|
+
],
|
|
690
|
+
"AU__ASOS": [
|
|
691
|
+
"&stations=YPDN", "&stations=YBCS", "&stations=YBBN", "&stations=YSSY",
|
|
692
|
+
"&stations=YSSY", "&stations=YMEN", "&stations=YPAD", "&stations=YPPH"
|
|
693
|
+
],
|
|
694
|
+
"DE__ASOS": [
|
|
695
|
+
"&stations=EDDL", "&stations=EDDH", "&stations=EDDB", "&stations=EDDN",
|
|
696
|
+
"&stations=EDDF", "&stations=EDDK", "&stations=EDLW", "&stations=EDDM"
|
|
697
|
+
],
|
|
698
|
+
# Example: if ZA is also station-based, add it here.
|
|
699
|
+
"ZA__ASOS": [
|
|
700
|
+
# If you know the station codes, add them here:
|
|
701
|
+
# e.g. "&stations=FACT", "&stations=FAJS", ...
|
|
702
|
+
],
|
|
703
|
+
# "FR__ASOS" if you need France, etc.
|
|
704
|
+
}
|
|
657
705
|
|
|
658
|
-
#
|
|
659
|
-
|
|
660
|
-
|
|
706
|
+
# Non-US countries that also fetch RAIN & SNOW from Open-Meteo
|
|
707
|
+
rainfall_city_map = {
|
|
708
|
+
"GB__ASOS": [
|
|
709
|
+
"Manchester", "Leeds", "Birmingham", "London","Glasgow",
|
|
710
|
+
],
|
|
711
|
+
"AU__ASOS": [
|
|
712
|
+
"Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"
|
|
713
|
+
],
|
|
714
|
+
"DE__ASOS": [
|
|
715
|
+
"Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"
|
|
716
|
+
],
|
|
717
|
+
"ZA__ASOS": [
|
|
718
|
+
"Johannesburg", "Cape Town", "Durban", "Pretoria"
|
|
719
|
+
],
|
|
720
|
+
}
|
|
661
721
|
|
|
662
|
-
#
|
|
663
|
-
|
|
722
|
+
# Canada sub-networks
|
|
723
|
+
institute_vector = [
|
|
724
|
+
"CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS", "CA_NU_ASOS"
|
|
725
|
+
]
|
|
726
|
+
stations_list_canada = [
|
|
727
|
+
[
|
|
728
|
+
"&stations=CYQM", "&stations=CERM", "&stations=CZCR",
|
|
729
|
+
"&stations=CZBF", "&stations=CYFC", "&stations=CYCX"
|
|
730
|
+
],
|
|
731
|
+
[
|
|
732
|
+
"&stations=CWZZ", "&stations=CYDP", "&stations=CYMH", "&stations=CYAY",
|
|
733
|
+
"&stations=CWDO", "&stations=CXTP", "&stations=CYJT", "&stations=CYYR",
|
|
734
|
+
"&stations=CZUM", "&stations=CYWK", "&stations=CYWK"
|
|
735
|
+
],
|
|
736
|
+
[
|
|
737
|
+
"&stations=CYHI", "&stations=CZCP", "&stations=CWLI", "&stations=CWND",
|
|
738
|
+
"&stations=CXTV", "&stations=CYVL", "&stations=CYCO", "&stations=CXDE",
|
|
739
|
+
"&stations=CYWE", "&stations=CYLK", "&stations=CWID", "&stations=CYRF",
|
|
740
|
+
"&stations=CXYH", "&stations=CYWY", "&stations=CWMT"
|
|
741
|
+
],
|
|
742
|
+
[
|
|
743
|
+
"&stations=CWEF", "&stations=CXIB", "&stations=CYQY", "&stations=CYPD",
|
|
744
|
+
"&stations=CXNP", "&stations=CXMY", "&stations=CYAW", "&stations=CWKG",
|
|
745
|
+
"&stations=CWVU", "&stations=CXLB", "&stations=CWSA", "&stations=CWRN"
|
|
746
|
+
],
|
|
747
|
+
[
|
|
748
|
+
"&stations=CYLT", "&stations=CWEU", "&stations=CWGZ", "&stations=CYIO",
|
|
749
|
+
"&stations=CXSE", "&stations=CYCB", "&stations=CWIL", "&stations=CXWB",
|
|
750
|
+
"&stations=CYZS", "&stations=CWJC", "&stations=CYFB", "&stations=CWUW"
|
|
751
|
+
]
|
|
752
|
+
]
|
|
664
753
|
|
|
665
|
-
#
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
754
|
+
# US states and stations - each sub-network
|
|
755
|
+
us_state_networks = {
|
|
756
|
+
state: f"{state}_ASOS" for state in [
|
|
757
|
+
"AL", "AR", "AZ", "CA", "CO", "CT", "DE", "FL", "GA", "IA", "ID", "IL", "IN",
|
|
758
|
+
"KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND",
|
|
759
|
+
"NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD",
|
|
760
|
+
"TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"
|
|
761
|
+
]
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
us_stations_map = {
|
|
765
|
+
"AL_ASOS": ["&stations=BHM", "&stations=HSV", "&stations=MGM", "&stations=MOB", "&stations=TCL"],
|
|
766
|
+
"AR_ASOS": ["&stations=LIT", "&stations=FSM", "&stations=TXK", "&stations=HOT", "&stations=FYV"],
|
|
767
|
+
"AZ_ASOS": ["&stations=PHX", "&stations=TUS", "&stations=FLG", "&stations=YUM", "&stations=PRC"],
|
|
768
|
+
"CA_ASOS": ["&stations=LAX", "&stations=SAN", "&stations=SJC", "&stations=SFO", "&stations=FAT"],
|
|
769
|
+
"CO_ASOS": ["&stations=DEN", "&stations=COS", "&stations=GJT", "&stations=PUB", "&stations=ASE"],
|
|
770
|
+
"CT_ASOS": ["&stations=BDL", "&stations=HVN", "&stations=BDR", "&stations=GON", "&stations=HFD"],
|
|
771
|
+
"DE_ASOS": ["&stations=ILG", "&stations=GED", "&stations=DOV"],
|
|
772
|
+
"FL_ASOS": ["&stations=MIA", "&stations=TPA", "&stations=ORL", "&stations=JAX", "&stations=TLH"],
|
|
773
|
+
"GA_ASOS": ["&stations=ATL", "&stations=SAV", "&stations=CSG", "&stations=MCN", "&stations=AGS"],
|
|
774
|
+
"IA_ASOS": ["&stations=DSM", "&stations=CID", "&stations=DBQ", "&stations=ALO", "&stations=SUX"],
|
|
775
|
+
"ID_ASOS": ["&stations=BOI", "&stations=IDA", "&stations=PIH", "&stations=SUN", "&stations=COE"],
|
|
776
|
+
"IL_ASOS": ["&stations=ORD", "&stations=MDW", "&stations=PIA", "&stations=SPI", "&stations=MLI"],
|
|
777
|
+
"IN_ASOS": ["&stations=IND", "&stations=FWA", "&stations=SBN", "&stations=EVV", "&stations=HUF"],
|
|
778
|
+
"KS_ASOS": ["&stations=ICT", "&stations=FOE", "&stations=GCK", "&stations=HYS", "&stations=SLN"],
|
|
779
|
+
"KY_ASOS": ["&stations=SDF", "&stations=LEX", "&stations=CVG", "&stations=PAH", "&stations=BWG"],
|
|
780
|
+
"LA_ASOS": ["&stations=MSY", "&stations=SHV", "&stations=LFT", "&stations=BTR", "&stations=MLU"],
|
|
781
|
+
"MA_ASOS": ["&stations=BOS", "&stations=ORH", "&stations=HYA", "&stations=ACK", "&stations=BED"],
|
|
782
|
+
"MD_ASOS": ["&stations=BWI", "&stations=MTN", "&stations=SBY", "&stations=HGR", "&stations=ADW"],
|
|
783
|
+
"ME_ASOS": ["&stations=PWM", "&stations=BGR", "&stations=CAR", "&stations=PQI", "&stations=RKD"],
|
|
784
|
+
"MI_ASOS": ["&stations=DTW", "&stations=GRR", "&stations=FNT", "&stations=LAN", "&stations=MKG"],
|
|
785
|
+
"MN_ASOS": ["&stations=MSP", "&stations=DLH", "&stations=RST", "&stations=STC", "&stations=INL"],
|
|
786
|
+
"MO_ASOS": ["&stations=STL", "&stations=MCI", "&stations=SGF", "&stations=COU", "&stations=JLN"],
|
|
787
|
+
"MS_ASOS": ["&stations=JAN", "&stations=GPT", "&stations=MEI", "&stations=PIB", "&stations=GLH"],
|
|
788
|
+
"MT_ASOS": ["&stations=BIL", "&stations=MSO", "&stations=GTF", "&stations=HLN", "&stations=BZN"],
|
|
789
|
+
"NC_ASOS": ["&stations=CLT", "&stations=RDU", "&stations=GSO", "&stations=ILM", "&stations=AVL"],
|
|
790
|
+
"ND_ASOS": ["&stations=BIS", "&stations=FAR", "&stations=GFK", "&stations=ISN", "&stations=JMS"],
|
|
791
|
+
"NE_ASOS": ["&stations=OMA"],
|
|
792
|
+
"NH_ASOS": ["&stations=MHT", "&stations=PSM", "&stations=CON", "&stations=LEB", "&stations=ASH"],
|
|
793
|
+
"NJ_ASOS": ["&stations=EWR", "&stations=ACY", "&stations=TTN", "&stations=MMU", "&stations=TEB"],
|
|
794
|
+
"NM_ASOS": ["&stations=ABQ", "&stations=SAF", "&stations=ROW", "&stations=HOB", "&stations=FMN"],
|
|
795
|
+
"NV_ASOS": ["&stations=LAS"],
|
|
796
|
+
"NY_ASOS": ["&stations=JFK", "&stations=LGA", "&stations=BUF", "&stations=ALB", "&stations=SYR"],
|
|
797
|
+
"OH_ASOS": ["&stations=CMH"],
|
|
798
|
+
"OK_ASOS": ["&stations=OKC", "&stations=TUL", "&stations=LAW", "&stations=SWO", "&stations=PNC"],
|
|
799
|
+
"OR_ASOS": ["&stations=PDX"],
|
|
800
|
+
"PA_ASOS": ["&stations=PHL", "&stations=PIT", "&stations=ERI", "&stations=MDT", "&stations=AVP"],
|
|
801
|
+
"RI_ASOS": ["&stations=PVD", "&stations=WST", "&stations=UUU"],
|
|
802
|
+
"SC_ASOS": ["&stations=CHS", "&stations=CAE", "&stations=GSP", "&stations=MYR", "&stations=FLO"],
|
|
803
|
+
"SD_ASOS": ["&stations=FSD", "&stations=RAP", "&stations=PIR", "&stations=ABR", "&stations=YKN"],
|
|
804
|
+
"TN_ASOS": ["&stations=BNA", "&stations=MEM", "&stations=TYS", "&stations=CHA", "&stations=TRI"],
|
|
805
|
+
"TX_ASOS": ["&stations=DFW", "&stations=IAH", "&stations=AUS", "&stations=SAT", "&stations=ELP"],
|
|
806
|
+
"UT_ASOS": ["&stations=SLC", "&stations=OGD", "&stations=PVU", "&stations=SGU", "&stations=CNY"],
|
|
807
|
+
"VA_ASOS": ["&stations=DCA", "&stations=RIC", "&stations=ROA", "&stations=ORF", "&stations=SHD"],
|
|
808
|
+
"VT_ASOS": ["&stations=BTV", "&stations=MPV", "&stations=RUT", "&stations=VSF", "&stations=MVL"],
|
|
809
|
+
"WA_ASOS": ["&stations=SEA", "&stations=GEG", "&stations=TIW", "&stations=VUO", "&stations=BFI"],
|
|
810
|
+
"WI_ASOS": ["&stations=MKE", "&stations=MSN", "&stations=GRB", "&stations=EAU", "&stations=LSE"],
|
|
811
|
+
"WV_ASOS": ["&stations=CRW", "&stations=CKB", "&stations=HTS", "&stations=MGW", "&stations=BKW"],
|
|
812
|
+
"WY_ASOS": ["&stations=CPR", "&stations=JAC", "&stations=SHR", "&stations=COD", "&stations=RKS"],
|
|
813
|
+
}
|
|
814
|
+
# --- Date setup --- #
|
|
815
|
+
date_object = datetime.strptime(start_date, "%Y-%m-%d")
|
|
816
|
+
start_day = date_object.day
|
|
817
|
+
start_month = date_object.month
|
|
818
|
+
start_year = date_object.year
|
|
819
|
+
formatted_date = f"{start_year:04d}-01-01" # "2000-01-01"
|
|
670
820
|
today = datetime.now()
|
|
671
|
-
end_day = today.day
|
|
672
|
-
end_month = today.month
|
|
673
|
-
end_year = today.year
|
|
674
|
-
|
|
675
|
-
if country == "GB__ASOS":
|
|
676
|
-
stations = ["&stations=EGCC", "&stations=EGNM", "&stations=EGBB",
|
|
677
|
-
"&stations=EGSH", "&stations=EGFF", "&stations=EGHI",
|
|
678
|
-
"&stations=EGLC", "&stations=EGHQ", "&stations=EGAC",
|
|
679
|
-
"&stations=EGPF", "&stations=EGGD", "&stations=EGPE",
|
|
680
|
-
"&stations=EGNT"]
|
|
681
|
-
elif country == "AU__ASOS":
|
|
682
|
-
stations = ["&stations=YPDN", "&stations=YBCS", "&stations=YBBN",
|
|
683
|
-
"&stations=YSSY", "&stations=YSSY", "&stations=YMEN",
|
|
684
|
-
"&stations=YPAD", "&stations=YPPH"]
|
|
685
|
-
elif country == "USCRN":
|
|
686
|
-
stations = ["&stations=64756", "&stations=64758", "&stations=03761", "&stations=54797", # North
|
|
687
|
-
"&stations=53968", "&stations=53960", "&stations=54932", "&stations=13301", # Midwest
|
|
688
|
-
"&stations=64756", "&stations=64756", "&stations=92821", "&stations=63862", # South
|
|
689
|
-
"&stations=53152", "&stations=93245", "&stations=04138", "&stations=04237"] # West
|
|
690
|
-
elif country == "DE__ASOS":
|
|
691
|
-
stations = ["&stations=EDDL", "&stations=EDDH", "&stations=EDDB",
|
|
692
|
-
"&stations=EDDN", "&stations=EDDF", "&stations=EDDK",
|
|
693
|
-
"&stations=EDLW", "&stations=EDDM"]
|
|
694
|
-
elif country == "FR__ASOS":
|
|
695
|
-
stations = ["&stations=LFPB"]
|
|
696
|
-
elif country == "Canada":
|
|
697
|
-
institute_vector = ["CA_NB_ASOS", "CA_NF_ASOS", "CA_NT_ASOS", "CA_NS_ASOS",
|
|
698
|
-
"CA_NU_ASOS"]
|
|
699
|
-
stations_list = [[] for _ in range(5)]
|
|
700
|
-
stations_list[0].append(["&stations=CYQM", "&stations=CERM", "&stations=CZCR",
|
|
701
|
-
"&stations=CZBF", "&stations=CYFC", "&stations=CYCX"])
|
|
702
|
-
|
|
703
|
-
stations_list[1].append(["&stations=CWZZ", "&stations=CYDP", "&stations=CYMH",
|
|
704
|
-
"&stations=CYAY", "&stations=CWDO", "&stations=CXTP",
|
|
705
|
-
"&stations=CYJT", "&stations=CYYR", "&stations=CZUM",
|
|
706
|
-
"&stations=CYWK", "&stations=CYWK"])
|
|
707
|
-
|
|
708
|
-
stations_list[2].append(["&stations=CYHI", "&stations=CZCP", "&stations=CWLI",
|
|
709
|
-
"&stations=CWND", "&stations=CXTV", "&stations=CYVL",
|
|
710
|
-
"&stations=CYCO", "&stations=CXDE", "&stations=CYWE",
|
|
711
|
-
"&stations=CYLK", "&stations=CWID", "&stations=CYRF",
|
|
712
|
-
"&stations=CXYH", "&stations=CYWY", "&stations=CWMT"])
|
|
713
|
-
|
|
714
|
-
stations_list[3].append(["&stations=CWEF", "&stations=CXIB", "&stations=CYQY",
|
|
715
|
-
"&stations=CYPD", "&stations=CXNP", "&stations=CXMY",
|
|
716
|
-
"&stations=CYAW", "&stations=CWKG", "&stations=CWVU",
|
|
717
|
-
"&stations=CXLB", "&stations=CWSA", "&stations=CWRN"])
|
|
718
|
-
|
|
719
|
-
stations_list[4].append(["&stations=CYLT", "&stations=CWEU", "&stations=CWGZ",
|
|
720
|
-
"&stations=CYIO", "&stations=CXSE", "&stations=CYCB",
|
|
721
|
-
"&stations=CWIL", "&stations=CXWB", "&stations=CYZS",
|
|
722
|
-
"&stations=CWJC", "&stations=CYFB", "&stations=CWUW"])
|
|
723
|
-
|
|
724
|
-
elif country == "ZA__ASOS":
|
|
725
|
-
cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
|
|
726
|
-
stations = []
|
|
821
|
+
end_day, end_month, end_year = today.day, today.month, today.year
|
|
727
822
|
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
823
|
+
# ------------------------------------------------------------------ #
|
|
824
|
+
# Utility functions
|
|
825
|
+
# ------------------------------------------------------------------ #
|
|
826
|
+
def convert_f_to_c(series_f: pd.Series) -> pd.Series:
|
|
827
|
+
"""Convert Fahrenheit to Celsius."""
|
|
828
|
+
return (series_f - 32) * 5.0 / 9.0
|
|
829
|
+
|
|
830
|
+
def fetch_mesonet_data(network: str, stations: list) -> pd.DataFrame:
|
|
831
|
+
"""Fetch station-based data (daily) from Iowa Mesonet."""
|
|
832
|
+
import csv
|
|
732
833
|
|
|
733
|
-
# Temperature
|
|
734
|
-
if country in ["GB__ASOS", "AU__ASOS", "DE__ASOS", "FR__ASOS"]:
|
|
735
|
-
# We start by making a data frame of the following weather stations
|
|
736
834
|
station_query = ''.join(stations)
|
|
835
|
+
url = (
|
|
836
|
+
"https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
|
|
837
|
+
f"network={network}{station_query}"
|
|
838
|
+
f"&year1={start_year}&month1={start_month}&day1={start_day}"
|
|
839
|
+
f"&year2={end_year}&month2={end_month}&day2={end_day}"
|
|
840
|
+
)
|
|
841
|
+
with urllib.request.urlopen(url) as f:
|
|
842
|
+
df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
|
|
843
|
+
return df
|
|
844
|
+
|
|
845
|
+
def fetch_canada_data() -> pd.DataFrame:
|
|
846
|
+
"""Canada uses multiple sub-networks. Combine them all."""
|
|
847
|
+
import csv
|
|
848
|
+
final_df = pd.DataFrame()
|
|
849
|
+
for i, institute_temp in enumerate(institute_vector):
|
|
850
|
+
station_query_temp = ''.join(stations_list_canada[i])
|
|
851
|
+
mesonet_url = (
|
|
852
|
+
"https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?"
|
|
853
|
+
f"network={institute_temp}{station_query_temp}"
|
|
854
|
+
f"&year1={start_year}&month1={start_month}&day1={start_day}"
|
|
855
|
+
f"&year2={end_year}&month2={end_month}&day2={end_day}"
|
|
856
|
+
)
|
|
857
|
+
with urllib.request.urlopen(mesonet_url) as f:
|
|
858
|
+
temp_df = pd.read_csv(f, dtype=str, quoting=csv.QUOTE_ALL)
|
|
737
859
|
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
742
|
-
raw_weather = urllib.request.urlopen(raw_weather_list)
|
|
743
|
-
raw_weather = pd.read_csv(raw_weather)
|
|
744
|
-
|
|
745
|
-
# Replace the occurrences of "None" with Missing Value
|
|
746
|
-
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
747
|
-
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
748
|
-
|
|
749
|
-
# Remove any data that isn't temperature-related
|
|
750
|
-
weather = raw_weather.iloc[:, 0:4]
|
|
751
|
-
|
|
752
|
-
weather[["max_temp_f", "min_temp_f"]] = weather[["max_temp_f", "min_temp_f"]].apply(pd.to_numeric)
|
|
753
|
-
|
|
754
|
-
# Estimate mean temperature
|
|
755
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
756
|
-
|
|
757
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
758
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
759
|
-
|
|
760
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
761
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
762
|
-
|
|
763
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
764
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
765
|
-
|
|
766
|
-
# Aggregate the data to week commencing sunday taking the average of the data
|
|
767
|
-
# Convert the date column to a Date type
|
|
768
|
-
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
769
|
-
|
|
770
|
-
# Determine the starting chosen day for each date
|
|
771
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
772
|
-
|
|
773
|
-
# Group by week_starting and summarize
|
|
774
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
775
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
776
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
777
|
-
"min_temp_f": "avg_min_temp_f",
|
|
778
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
779
|
-
"max_temp_c": "avg_max_temp_c",
|
|
780
|
-
"min_temp_c": "avg_min_temp_c",
|
|
781
|
-
"mean_temp_c": "avg_mean_temp_c"}, inplace=True)
|
|
782
|
-
elif country == "Canada":
|
|
783
|
-
for i in range(len(institute_vector)):
|
|
784
|
-
station_query_temp = ''.join(flatten_list(stations_list[i]))
|
|
785
|
-
institute_temp = institute_vector[i]
|
|
786
|
-
raw_weather_temp = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", institute_temp,
|
|
787
|
-
station_query_temp,
|
|
788
|
-
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
789
|
-
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
790
|
-
raw_weather_temp = urllib.request.urlopen(raw_weather_temp)
|
|
791
|
-
raw_weather_temp = pd.read_csv(raw_weather_temp)
|
|
792
|
-
|
|
793
|
-
if len(raw_weather_temp.index) == 0:
|
|
794
|
-
continue
|
|
795
|
-
raw_weather_temp = raw_weather_temp[['station', 'day', 'max_temp_f', 'min_temp_f', 'precip_in']]
|
|
860
|
+
if not temp_df.empty:
|
|
861
|
+
final_df = pd.concat([final_df, temp_df], ignore_index=True)
|
|
862
|
+
return final_df
|
|
796
863
|
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
if 'ERROR: Invalid network specified' in list(raw_weather.columns):
|
|
804
|
-
raw_weather.drop('ERROR: Invalid network specified', axis=1, inplace=True)
|
|
805
|
-
|
|
806
|
-
# Replace none values
|
|
807
|
-
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
808
|
-
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
809
|
-
raw_weather["precip_in"].replace("None", 0, inplace=True)
|
|
810
|
-
|
|
811
|
-
weather = raw_weather
|
|
812
|
-
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
813
|
-
|
|
814
|
-
# Estimate mean temperature
|
|
815
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
816
|
-
|
|
817
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
818
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
819
|
-
|
|
820
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
821
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
822
|
-
|
|
823
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
824
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
825
|
-
|
|
826
|
-
# Aggregate the data to week commencing sunday taking the average of the data
|
|
827
|
-
# Convert the date column to a Date type
|
|
828
|
-
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
829
|
-
|
|
830
|
-
# Determine the starting chosen day for each date
|
|
831
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
832
|
-
|
|
833
|
-
# Group by week_starting and summarize
|
|
834
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
835
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
836
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
837
|
-
"min_temp_f": "avg_min_temp_f",
|
|
838
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
839
|
-
"max_temp_c": "avg_max_temp_c",
|
|
840
|
-
"min_temp_c": "avg_min_temp_c",
|
|
841
|
-
"mean_temp_c": "avg_mean_temp_c",
|
|
842
|
-
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
843
|
-
elif country == "ZA__ASOS":
|
|
864
|
+
def fetch_openmeteo_rain_snow(cities: list) -> pd.DataFrame:
|
|
865
|
+
"""
|
|
866
|
+
Fetch daily precipitation_sum (rain) and snowfall_sum (snow) from Open-Meteo.
|
|
867
|
+
Returns columns: ["date", "rain_sum", "snow_sum", "city"] for each day.
|
|
868
|
+
We'll then do a weekly aggregator that yields avg_rain_sum, avg_snow_sum.
|
|
869
|
+
"""
|
|
844
870
|
weather_data_list = []
|
|
871
|
+
geolocator = Nominatim(user_agent="MyApp")
|
|
845
872
|
|
|
846
873
|
for city in cities:
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
874
|
+
loc = geolocator.geocode(city)
|
|
875
|
+
if not loc:
|
|
876
|
+
print(f"Could not find location for {city}, skipping.")
|
|
877
|
+
continue
|
|
850
878
|
|
|
879
|
+
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
851
880
|
params = {
|
|
852
|
-
"latitude":
|
|
853
|
-
"longitude":
|
|
881
|
+
"latitude": loc.latitude,
|
|
882
|
+
"longitude": loc.longitude,
|
|
854
883
|
"start_date": formatted_date,
|
|
855
884
|
"end_date": today.strftime("%Y-%m-%d"),
|
|
856
|
-
"daily": "
|
|
885
|
+
"daily": "precipitation_sum,snowfall_sum",
|
|
857
886
|
"timezone": "auto"
|
|
858
887
|
}
|
|
888
|
+
resp = requests.get(url, params=params)
|
|
889
|
+
if resp.status_code != 200:
|
|
890
|
+
print(f"[ERROR] open-meteo returned status {resp.status_code} for city={city}")
|
|
891
|
+
continue
|
|
892
|
+
try:
|
|
893
|
+
data_json = resp.json()
|
|
894
|
+
except ValueError:
|
|
895
|
+
print(f"[ERROR] invalid JSON from open-meteo for city={city}")
|
|
896
|
+
continue
|
|
859
897
|
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
dates = daily_data["time"]
|
|
898
|
+
daily_block = data_json.get("daily", {})
|
|
899
|
+
if not {"time", "precipitation_sum", "snowfall_sum"}.issubset(daily_block.keys()):
|
|
900
|
+
print(f"[ERROR] missing required keys in open-meteo for city={city}")
|
|
901
|
+
continue
|
|
865
902
|
|
|
866
|
-
|
|
867
|
-
"
|
|
868
|
-
"
|
|
869
|
-
"
|
|
870
|
-
"precip_in": daily_data["precipitation_sum"]
|
|
903
|
+
df_temp = pd.DataFrame({
|
|
904
|
+
"date": daily_block["time"],
|
|
905
|
+
"rain_sum": daily_block["precipitation_sum"],
|
|
906
|
+
"snow_sum": daily_block["snowfall_sum"]
|
|
871
907
|
})
|
|
872
|
-
|
|
873
|
-
weather_data_list.append(
|
|
874
|
-
|
|
875
|
-
weather = pd.concat(weather_data_list)
|
|
876
|
-
|
|
877
|
-
# Convert the date column to a Date type
|
|
878
|
-
weather["day"] = pd.to_datetime(weather["day"])
|
|
879
|
-
|
|
880
|
-
# Replace None values
|
|
881
|
-
weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
882
|
-
weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
883
|
-
weather["precip_in"].replace("None", 0, inplace=True)
|
|
884
|
-
|
|
885
|
-
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
886
|
-
|
|
887
|
-
# Estimate mean temperature
|
|
888
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
889
|
-
|
|
890
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
891
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
892
|
-
|
|
893
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
894
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
895
|
-
|
|
896
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
897
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
898
|
-
|
|
899
|
-
# Determine the starting chosen day for each date
|
|
900
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
901
|
-
|
|
902
|
-
# Group by week_starting and summarize
|
|
903
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
904
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
905
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
906
|
-
"min_temp_f": "avg_min_temp_f",
|
|
907
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
908
|
-
"max_temp_c": "avg_max_temp_c",
|
|
909
|
-
"min_temp_c": "avg_min_temp_c",
|
|
910
|
-
"mean_temp_c": "avg_mean_temp_c",
|
|
911
|
-
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
912
|
-
|
|
913
|
-
else:
|
|
914
|
-
# We start by making a data frame of the following weather stations
|
|
915
|
-
station_query = ''.join(stations)
|
|
916
|
-
|
|
917
|
-
raw_weather_list = ''.join(["https://mesonet.agron.iastate.edu/cgi-bin/request/daily.py?network=", country,
|
|
918
|
-
station_query,
|
|
919
|
-
"&year1=", str(start_year), "&month1=", str(start_month), "&day1=", str(start_day),
|
|
920
|
-
"&year2=", str(end_year), "&month2=", str(end_month), "&day2=", str(end_day)])
|
|
921
|
-
raw_weather = urllib.request.urlopen(raw_weather_list)
|
|
922
|
-
raw_weather = pd.read_csv(raw_weather)
|
|
923
|
-
|
|
924
|
-
raw_weather = raw_weather[['day', 'max_temp_f', 'min_temp_f', 'precip_in']]
|
|
925
|
-
|
|
926
|
-
# Replace the occurrences of "None" with Missing Value
|
|
927
|
-
raw_weather["max_temp_f"].replace("None", 0, inplace=True)
|
|
928
|
-
raw_weather["min_temp_f"].replace("None", 0, inplace=True)
|
|
929
|
-
raw_weather["precip_in"].replace("None", 0, inplace=True)
|
|
930
|
-
|
|
931
|
-
# Remove any data that isn't temperature-related
|
|
932
|
-
weather = raw_weather
|
|
933
|
-
|
|
934
|
-
weather[["max_temp_f", "min_temp_f", "precip_in"]] = weather[["max_temp_f", "min_temp_f", "precip_in"]].apply(pd.to_numeric)
|
|
935
|
-
|
|
936
|
-
# Estimate mean temperature
|
|
937
|
-
weather["mean_temp_f"] = (weather["max_temp_f"] + weather["min_temp_f"]) / 2
|
|
938
|
-
|
|
939
|
-
# Convert Fahrenheit to Celsius for max_temp_f
|
|
940
|
-
weather["max_temp_c"] = (weather["max_temp_f"] - 32) * 5 / 9
|
|
941
|
-
|
|
942
|
-
# Convert Fahrenheit to Celsius for min_temp_f
|
|
943
|
-
weather["min_temp_c"] = (weather["min_temp_f"] - 32) * 5 / 9
|
|
944
|
-
|
|
945
|
-
# Convert Fahrenheit to Celsius for mean_temp_f
|
|
946
|
-
weather["mean_temp_c"] = (weather["mean_temp_f"] - 32) * 5 / 9
|
|
947
|
-
|
|
948
|
-
# Aggregate the data to week commencing sunday taking the average of the data
|
|
949
|
-
# Convert the date column to a Date type
|
|
950
|
-
weather["day"] = pd.to_datetime(weather["day"], format="%Y-%m-%d")
|
|
951
|
-
|
|
952
|
-
# Determine the starting chosen day for each date
|
|
953
|
-
weather['week_starting'] = weather["day"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
954
|
-
|
|
955
|
-
# Group by week_starting and summarize
|
|
956
|
-
numeric_columns = weather.select_dtypes(include='number').columns
|
|
957
|
-
weekly_avg_temp = weather.groupby("week_starting")[numeric_columns].mean()
|
|
958
|
-
weekly_avg_temp.rename(columns={"max_temp_f": "avg_max_temp_f",
|
|
959
|
-
"min_temp_f": "avg_min_temp_f",
|
|
960
|
-
"mean_temp_f": "avg_mean_temp_f",
|
|
961
|
-
"max_temp_c": "avg_max_temp_c",
|
|
962
|
-
"min_temp_c": "avg_min_temp_c",
|
|
963
|
-
"mean_temp_c": "avg_mean_temp_c",
|
|
964
|
-
"precip_in": "avg_mean_perc"}, inplace=True)
|
|
965
|
-
|
|
966
|
-
# Rainfall
|
|
967
|
-
if country == "GB__ASOS":
|
|
968
|
-
# Define cities and date range
|
|
969
|
-
cities = ["Manchester", "Leeds", "Birmingham", "Norwich", "Cardiff", "Southampton", "London", "Newquay", "Belfast", "Glasgow", "Bristol", "Newcastle"]
|
|
970
|
-
|
|
971
|
-
start_date = formatted_date
|
|
972
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
973
|
-
|
|
974
|
-
# Initialize an empty list to store the weather data for each city
|
|
975
|
-
weather_data_list = []
|
|
976
|
-
|
|
977
|
-
# Loop through each city and fetch weather data
|
|
978
|
-
for city in cities:
|
|
979
|
-
# Initialize Nominatim API
|
|
980
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
981
|
-
location = geolocator.geocode(city)
|
|
982
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
983
|
-
|
|
984
|
-
params = {
|
|
985
|
-
"latitude": location.latitude,
|
|
986
|
-
"longitude": location.longitude,
|
|
987
|
-
"start_date": start_date,
|
|
988
|
-
"end_date": end_date,
|
|
989
|
-
"daily": "precipitation_sum",
|
|
990
|
-
"timezone": "auto"
|
|
991
|
-
}
|
|
992
|
-
|
|
993
|
-
response = requests.get(url, params=params)
|
|
994
|
-
response_data = response.json()
|
|
995
|
-
|
|
996
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
997
|
-
dates = response_data["daily"]["time"]
|
|
998
|
-
|
|
999
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
1000
|
-
data["city"] = city
|
|
1001
|
-
|
|
1002
|
-
weather_data_list.append(data)
|
|
1003
|
-
|
|
1004
|
-
# Combine all city data into a single data frame
|
|
1005
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
1006
|
-
|
|
1007
|
-
# Convert the date column to a Date type
|
|
1008
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
1009
|
-
|
|
1010
|
-
# Set week commencing col up
|
|
1011
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
1012
|
-
|
|
1013
|
-
# Group by week_starting and summarize
|
|
1014
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
1015
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
1016
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1017
|
-
|
|
1018
|
-
# Change index to datetime
|
|
1019
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
1020
|
-
|
|
1021
|
-
elif country == "AU__ASOS":
|
|
1022
|
-
|
|
1023
|
-
# Define cities and date range
|
|
1024
|
-
cities = ["Darwin", "Cairns", "Brisbane", "Sydney", "Melbourne", "Adelaide", "Perth"]
|
|
1025
|
-
|
|
1026
|
-
start_date = formatted_date
|
|
1027
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
1028
|
-
|
|
1029
|
-
# Initialize an empty list to store the weather data for each city
|
|
1030
|
-
weather_data_list = []
|
|
1031
|
-
|
|
1032
|
-
# Loop through each city and fetch weather data
|
|
1033
|
-
for city in cities:
|
|
1034
|
-
# Initialize Nominatim API
|
|
1035
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
1036
|
-
location = geolocator.geocode(city)
|
|
1037
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
1038
|
-
|
|
1039
|
-
params = {
|
|
1040
|
-
"latitude": location.latitude,
|
|
1041
|
-
"longitude": location.longitude,
|
|
1042
|
-
"start_date": start_date,
|
|
1043
|
-
"end_date": end_date,
|
|
1044
|
-
"daily": "precipitation_sum",
|
|
1045
|
-
"timezone": "auto"
|
|
1046
|
-
}
|
|
1047
|
-
|
|
1048
|
-
response = requests.get(url, params=params)
|
|
1049
|
-
response_data = response.json()
|
|
1050
|
-
|
|
1051
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
1052
|
-
dates = response_data["daily"]["time"]
|
|
1053
|
-
|
|
1054
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
1055
|
-
data["city"] = city
|
|
1056
|
-
|
|
1057
|
-
weather_data_list.append(data)
|
|
1058
|
-
|
|
1059
|
-
# Combine all city data into a single data frame
|
|
1060
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
1061
|
-
|
|
1062
|
-
# Convert the date column to a Date type
|
|
1063
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
908
|
+
df_temp["city"] = city
|
|
909
|
+
weather_data_list.append(df_temp)
|
|
1064
910
|
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
1070
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
1071
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1072
|
-
|
|
1073
|
-
# Change index to datetime
|
|
1074
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
1075
|
-
|
|
1076
|
-
elif country == "DE__ASOS":
|
|
1077
|
-
|
|
1078
|
-
# Define cities and date range
|
|
1079
|
-
cities = ["Dortmund", "Düsseldorf", "Frankfurt", "Munich", "Cologne", "Berlin", "Hamburg", "Nuernberg"]
|
|
1080
|
-
|
|
1081
|
-
start_date = formatted_date
|
|
1082
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
1083
|
-
|
|
1084
|
-
# Initialize an empty list to store the weather data for each city
|
|
1085
|
-
weather_data_list = []
|
|
1086
|
-
|
|
1087
|
-
# Loop through each city and fetch weather data
|
|
1088
|
-
for city in cities:
|
|
1089
|
-
# Initialize Nominatim API
|
|
1090
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
1091
|
-
location = geolocator.geocode(city)
|
|
1092
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
1093
|
-
|
|
1094
|
-
params = {
|
|
1095
|
-
"latitude": location.latitude,
|
|
1096
|
-
"longitude": location.longitude,
|
|
1097
|
-
"start_date": start_date,
|
|
1098
|
-
"end_date": end_date,
|
|
1099
|
-
"daily": "precipitation_sum",
|
|
1100
|
-
"timezone": "auto"
|
|
1101
|
-
}
|
|
1102
|
-
|
|
1103
|
-
response = requests.get(url, params=params)
|
|
1104
|
-
response_data = response.json()
|
|
1105
|
-
|
|
1106
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
1107
|
-
dates = response_data["daily"]["time"]
|
|
1108
|
-
|
|
1109
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
1110
|
-
data["city"] = city
|
|
1111
|
-
|
|
1112
|
-
weather_data_list.append(data)
|
|
1113
|
-
|
|
1114
|
-
# Combine all city data into a single data frame
|
|
1115
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
1116
|
-
|
|
1117
|
-
# Convert the date column to a Date type
|
|
1118
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
1119
|
-
|
|
1120
|
-
# Set week commencing col up
|
|
1121
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
1122
|
-
|
|
1123
|
-
# Group by week_starting and summarize
|
|
1124
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
1125
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
1126
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1127
|
-
|
|
1128
|
-
# Change index to datetime
|
|
1129
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
1130
|
-
|
|
1131
|
-
elif country == "FR__ASOS":
|
|
1132
|
-
|
|
1133
|
-
# Define cities and date range
|
|
1134
|
-
cities = ["Paris"]
|
|
1135
|
-
|
|
1136
|
-
start_date = formatted_date
|
|
1137
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
1138
|
-
|
|
1139
|
-
# Initialize an empty list to store the weather data for each city
|
|
1140
|
-
weather_data_list = []
|
|
1141
|
-
|
|
1142
|
-
# Loop through each city and fetch weather data
|
|
1143
|
-
for city in cities:
|
|
1144
|
-
# Initialize Nominatim API
|
|
1145
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
1146
|
-
location = geolocator.geocode(city)
|
|
1147
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
1148
|
-
|
|
1149
|
-
params = {
|
|
1150
|
-
"latitude": location.latitude,
|
|
1151
|
-
"longitude": location.longitude,
|
|
1152
|
-
"start_date": start_date,
|
|
1153
|
-
"end_date": end_date,
|
|
1154
|
-
"daily": "precipitation_sum",
|
|
1155
|
-
"timezone": "auto"
|
|
1156
|
-
}
|
|
1157
|
-
|
|
1158
|
-
response = requests.get(url, params=params)
|
|
1159
|
-
response_data = response.json()
|
|
1160
|
-
|
|
1161
|
-
daily_data = response_data["daily"]["precipitation_sum"]
|
|
1162
|
-
dates = response_data["daily"]["time"]
|
|
1163
|
-
|
|
1164
|
-
data = pd.DataFrame({"date": dates, "rainfall": daily_data})
|
|
1165
|
-
data["city"] = city
|
|
1166
|
-
|
|
1167
|
-
weather_data_list.append(data)
|
|
1168
|
-
|
|
1169
|
-
# Combine all city data into a single data frame
|
|
1170
|
-
all_weather_data = pd.concat(weather_data_list)
|
|
1171
|
-
|
|
1172
|
-
# Convert the date column to a Date type
|
|
1173
|
-
all_weather_data["date"] = pd.to_datetime(all_weather_data["date"])
|
|
1174
|
-
|
|
1175
|
-
# Set week commencing col up
|
|
1176
|
-
all_weather_data['week_starting'] = all_weather_data["date"].apply(lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
|
|
1177
|
-
|
|
1178
|
-
# Group by week_starting and summarize
|
|
1179
|
-
numeric_columns = all_weather_data.select_dtypes(include='number').columns
|
|
1180
|
-
weekly_avg_rain = all_weather_data.groupby("week_starting")[numeric_columns].mean()
|
|
1181
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1182
|
-
|
|
1183
|
-
# Change index to datetime
|
|
1184
|
-
weekly_avg_rain.index = pd.to_datetime(weekly_avg_rain.index)
|
|
1185
|
-
|
|
1186
|
-
elif country == "ZA__ASOS":
|
|
1187
|
-
cities = ["Johannesburg", "Cape Town", "Durban", "Pretoria"]
|
|
1188
|
-
start_date = formatted_date
|
|
1189
|
-
end_date = today.strftime("%Y-%m-%d")
|
|
1190
|
-
|
|
1191
|
-
weather_data_list = []
|
|
1192
|
-
|
|
1193
|
-
for city in cities:
|
|
1194
|
-
geolocator = Nominatim(user_agent="MyApp")
|
|
1195
|
-
location = geolocator.geocode(city)
|
|
1196
|
-
url = "https://archive-api.open-meteo.com/v1/archive"
|
|
911
|
+
if weather_data_list:
|
|
912
|
+
return pd.concat(weather_data_list, ignore_index=True)
|
|
913
|
+
else:
|
|
914
|
+
return pd.DataFrame()
|
|
1197
915
|
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
916
|
+
def weekly_aggregate_temp_mesonet(df: pd.DataFrame) -> pd.DataFrame:
|
|
917
|
+
"""
|
|
918
|
+
For NON-US mesonet data, we only keep max_temp_f, min_temp_f,
|
|
919
|
+
then compute mean_temp_f, plus Celsius, and do weekly average.
|
|
920
|
+
"""
|
|
921
|
+
import pandas as pd
|
|
922
|
+
|
|
923
|
+
# Convert day col
|
|
924
|
+
if "day" not in df.columns:
|
|
925
|
+
return pd.DataFrame()
|
|
926
|
+
|
|
927
|
+
# Only keep relevant columns
|
|
928
|
+
keep_cols = []
|
|
929
|
+
for c in ["day", "max_temp_f", "min_temp_f"]:
|
|
930
|
+
if c in df.columns:
|
|
931
|
+
keep_cols.append(c)
|
|
932
|
+
df = df[keep_cols].copy()
|
|
933
|
+
|
|
934
|
+
# Convert "None" => numeric
|
|
935
|
+
for c in ["max_temp_f", "min_temp_f"]:
|
|
936
|
+
if c in df.columns:
|
|
937
|
+
df[c] = df[c].replace("None", pd.NA)
|
|
938
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
939
|
+
|
|
940
|
+
df["day"] = pd.to_datetime(df["day"], errors="coerce")
|
|
941
|
+
df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
|
|
942
|
+
df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
|
|
943
|
+
df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
|
|
944
|
+
df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
|
|
945
|
+
|
|
946
|
+
# Group by "week_starting"
|
|
947
|
+
df["week_starting"] = df["day"].apply(
|
|
948
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
949
|
+
if pd.notnull(x) else pd.NaT
|
|
950
|
+
)
|
|
951
|
+
numeric_cols = df.select_dtypes(include='number').columns
|
|
952
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
953
|
+
|
|
954
|
+
# Rename columns
|
|
955
|
+
rename_map = {
|
|
956
|
+
"max_temp_f": "avg_max_temp_f",
|
|
957
|
+
"min_temp_f": "avg_min_temp_f",
|
|
958
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
959
|
+
"max_temp_c": "avg_max_temp_c",
|
|
960
|
+
"min_temp_c": "avg_min_temp_c",
|
|
961
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
962
|
+
}
|
|
963
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
964
|
+
|
|
965
|
+
# Return as a DataFrame w/ index = week_starting
|
|
966
|
+
return weekly
|
|
967
|
+
|
|
968
|
+
def weekly_aggregate_rain_snow_openmeteo(df: pd.DataFrame) -> pd.DataFrame:
|
|
969
|
+
"""
|
|
970
|
+
For NON-US, from open-meteo, we have daily columns 'date','rain_sum','snow_sum'.
|
|
971
|
+
We'll do weekly average of each. -> 'avg_rain_sum', 'avg_snow_sum'.
|
|
972
|
+
"""
|
|
973
|
+
import pandas as pd
|
|
974
|
+
if "date" not in df.columns:
|
|
975
|
+
return pd.DataFrame()
|
|
976
|
+
|
|
977
|
+
df["date"] = pd.to_datetime(df["date"], errors="coerce")
|
|
978
|
+
df["week_starting"] = df["date"].apply(
|
|
979
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
980
|
+
if pd.notnull(x) else pd.NaT
|
|
981
|
+
)
|
|
1206
982
|
|
|
1207
|
-
|
|
1208
|
-
|
|
983
|
+
# Convert to numeric
|
|
984
|
+
for c in ["rain_sum", "snow_sum"]:
|
|
985
|
+
if c in df.columns:
|
|
986
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
1209
987
|
|
|
1210
|
-
|
|
1211
|
-
|
|
988
|
+
numeric_cols = df.select_dtypes(include='number').columns
|
|
989
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
1212
990
|
|
|
1213
|
-
|
|
1214
|
-
|
|
991
|
+
rename_map = {
|
|
992
|
+
"rain_sum": "avg_rain_sum",
|
|
993
|
+
"snow_sum": "avg_snow_sum"
|
|
994
|
+
}
|
|
995
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
996
|
+
return weekly
|
|
1215
997
|
|
|
1216
|
-
|
|
998
|
+
def weekly_aggregate_us(df: pd.DataFrame) -> pd.DataFrame:
|
|
999
|
+
"""
|
|
1000
|
+
For US Mesonet data (per state), we keep max_temp_f, min_temp_f, precip_in, snow_in,
|
|
1001
|
+
then compute mean_temp_f & convert to celsius, group weekly.
|
|
1002
|
+
We'll rename:
|
|
1003
|
+
max_temp_f -> avg_max_temp_f
|
|
1004
|
+
min_temp_f -> avg_min_temp_f
|
|
1005
|
+
mean_temp_f -> avg_mean_temp_f
|
|
1006
|
+
precip_in -> avg_rain_sum
|
|
1007
|
+
snow_in -> avg_snow_sum
|
|
1008
|
+
"""
|
|
1009
|
+
import pandas as pd
|
|
1010
|
+
if "day" not in df.columns:
|
|
1011
|
+
return pd.DataFrame()
|
|
1012
|
+
|
|
1013
|
+
# Convert day
|
|
1014
|
+
df["day"] = pd.to_datetime(df["day"], errors="coerce")
|
|
1015
|
+
|
|
1016
|
+
# Convert "None" => numeric
|
|
1017
|
+
for c in ["max_temp_f", "min_temp_f", "precip_in", "snow_in"]:
|
|
1018
|
+
if c in df.columns:
|
|
1019
|
+
df[c] = df[c].replace("None", pd.NA)
|
|
1020
|
+
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
1021
|
+
|
|
1022
|
+
# Compute mean_temp_f, celsius
|
|
1023
|
+
df["mean_temp_f"] = (df["max_temp_f"] + df["min_temp_f"]) / 2
|
|
1024
|
+
df["max_temp_c"] = convert_f_to_c(df["max_temp_f"])
|
|
1025
|
+
df["min_temp_c"] = convert_f_to_c(df["min_temp_f"])
|
|
1026
|
+
df["mean_temp_c"] = convert_f_to_c(df["mean_temp_f"])
|
|
1027
|
+
|
|
1028
|
+
# Weekly grouping
|
|
1029
|
+
df["week_starting"] = df["day"].apply(
|
|
1030
|
+
lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7)
|
|
1031
|
+
if pd.notnull(x) else pd.NaT
|
|
1032
|
+
)
|
|
1033
|
+
numeric_cols = df.select_dtypes(include='number').columns
|
|
1034
|
+
weekly = df.groupby("week_starting")[numeric_cols].mean()
|
|
1035
|
+
|
|
1036
|
+
rename_map = {
|
|
1037
|
+
"max_temp_f": "avg_max_temp_f",
|
|
1038
|
+
"min_temp_f": "avg_min_temp_f",
|
|
1039
|
+
"mean_temp_f": "avg_mean_temp_f",
|
|
1040
|
+
"max_temp_c": "avg_max_temp_c",
|
|
1041
|
+
"min_temp_c": "avg_min_temp_c",
|
|
1042
|
+
"mean_temp_c": "avg_mean_temp_c",
|
|
1043
|
+
"precip_in": "avg_rain_sum",
|
|
1044
|
+
"snow_in": "avg_snow_sum"
|
|
1045
|
+
}
|
|
1046
|
+
weekly.rename(columns=rename_map, inplace=True)
|
|
1047
|
+
return weekly
|
|
1048
|
+
|
|
1049
|
+
def rename_with_prefix(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
|
|
1050
|
+
"""Rename all columns except 'week_starting' or 'OBS' with the given prefix."""
|
|
1051
|
+
df2 = df.copy()
|
|
1052
|
+
new_cols = {}
|
|
1053
|
+
for col in df2.columns:
|
|
1054
|
+
if col not in ["week_starting", "OBS"]:
|
|
1055
|
+
new_cols[col] = prefix + col
|
|
1056
|
+
df2.rename(columns=new_cols, inplace=True)
|
|
1057
|
+
return df2
|
|
1058
|
+
|
|
1059
|
+
# ------------------------------------------------------------------ #
|
|
1060
|
+
# The final combined DataFrame
|
|
1061
|
+
# ------------------------------------------------------------------ #
|
|
1062
|
+
combined_df = pd.DataFrame()
|
|
1217
1063
|
|
|
1218
|
-
|
|
1219
|
-
|
|
1064
|
+
# ------------------------------------------------------------------ #
|
|
1065
|
+
# 1) Loop over each requested country
|
|
1066
|
+
# ------------------------------------------------------------------ #
|
|
1067
|
+
for country_code in country_codes:
|
|
1068
|
+
net = country_dict.get(country_code)
|
|
1069
|
+
if net is None:
|
|
1070
|
+
print(f"Warning: Invalid country_code '{country_code}' – skipping.")
|
|
1071
|
+
continue
|
|
1220
1072
|
|
|
1221
|
-
#
|
|
1222
|
-
|
|
1073
|
+
# =========================
|
|
1074
|
+
# 2) Special Logic for US
|
|
1075
|
+
# =========================
|
|
1076
|
+
if net == "US_STATES":
|
|
1077
|
+
for state_code, network_code in us_state_networks.items():
|
|
1078
|
+
stations = us_stations_map.get(network_code, [])
|
|
1079
|
+
if not stations:
|
|
1080
|
+
print(f"[DEBUG] No stations for {network_code}, skipping.")
|
|
1081
|
+
continue
|
|
1082
|
+
|
|
1083
|
+
raw_df = fetch_mesonet_data(network_code, stations)
|
|
1084
|
+
if raw_df.empty:
|
|
1085
|
+
print(f"[DEBUG] DataFrame empty for {network_code}, skipping.")
|
|
1086
|
+
continue
|
|
1087
|
+
|
|
1088
|
+
weekly_state = weekly_aggregate_us(raw_df)
|
|
1089
|
+
if weekly_state.empty:
|
|
1090
|
+
print(f"[DEBUG] Aggregated weekly DataFrame empty for {network_code}, skipping.")
|
|
1091
|
+
continue
|
|
1092
|
+
|
|
1093
|
+
weekly_state.reset_index(inplace=True)
|
|
1094
|
+
weekly_state.rename(columns={"week_starting": "OBS"}, inplace=True)
|
|
1095
|
+
|
|
1096
|
+
# Now rename columns with prefix: seas_us_{statecode}_
|
|
1097
|
+
prefix = f"seas_us_{state_code.lower()}_"
|
|
1098
|
+
weekly_state = rename_with_prefix(weekly_state, prefix)
|
|
1099
|
+
|
|
1100
|
+
# Merge into combined
|
|
1101
|
+
if combined_df.empty:
|
|
1102
|
+
combined_df = weekly_state
|
|
1103
|
+
else:
|
|
1104
|
+
combined_df = pd.merge(combined_df, weekly_state, on="OBS", how="outer")
|
|
1105
|
+
|
|
1106
|
+
# Done with the US. Move on to the next country in the loop
|
|
1107
|
+
continue
|
|
1223
1108
|
|
|
1224
|
-
#
|
|
1225
|
-
|
|
1109
|
+
# =======================================
|
|
1110
|
+
# 3) Logic for Non-US (AU, GB, DE, CA, ZA)
|
|
1111
|
+
# =======================================
|
|
1112
|
+
# A) Fetch temperature data from Mesonet
|
|
1113
|
+
if net == "Canada":
|
|
1114
|
+
raw_temp = fetch_canada_data()
|
|
1115
|
+
else:
|
|
1116
|
+
# e.g. "GB__ASOS", "AU__ASOS", "DE__ASOS", "ZA__ASOS" (if added)
|
|
1117
|
+
stations = station_map.get(net, [])
|
|
1118
|
+
if not stations and net != "ZA__ASOS":
|
|
1119
|
+
# If we have no stations for net and it's not ZA,
|
|
1120
|
+
# there's no data. (If ZA has stations, add them above.)
|
|
1121
|
+
raw_temp = pd.DataFrame()
|
|
1122
|
+
else:
|
|
1123
|
+
raw_temp = fetch_mesonet_data(net, stations)
|
|
1124
|
+
|
|
1125
|
+
weekly_temp = pd.DataFrame()
|
|
1126
|
+
if not raw_temp.empty:
|
|
1127
|
+
# For these countries, we only keep max_temp_f, min_temp_f, mean_temp_f
|
|
1128
|
+
weekly_temp = weekly_aggregate_temp_mesonet(raw_temp)
|
|
1129
|
+
|
|
1130
|
+
# B) Fetch rain+snow from Open-Meteo (only if we have an entry in rainfall_city_map)
|
|
1131
|
+
weekly_precip = pd.DataFrame()
|
|
1132
|
+
if net in rainfall_city_map:
|
|
1133
|
+
city_list = rainfall_city_map[net]
|
|
1134
|
+
df_rain_snow = fetch_openmeteo_rain_snow(city_list)
|
|
1135
|
+
if not df_rain_snow.empty:
|
|
1136
|
+
weekly_precip = weekly_aggregate_rain_snow_openmeteo(df_rain_snow)
|
|
1137
|
+
|
|
1138
|
+
# C) Merge the temperature data + precip/snow data on the weekly index
|
|
1139
|
+
if not weekly_temp.empty and not weekly_precip.empty:
|
|
1140
|
+
merged_df = pd.merge(weekly_temp, weekly_precip, left_index=True, right_index=True, how="outer")
|
|
1141
|
+
elif not weekly_temp.empty:
|
|
1142
|
+
merged_df = weekly_temp
|
|
1143
|
+
else:
|
|
1144
|
+
merged_df = weekly_precip
|
|
1226
1145
|
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
weekly_avg_rain.rename(columns={"rainfall": "avg_rainfall"}, inplace=True)
|
|
1146
|
+
if merged_df.empty:
|
|
1147
|
+
print(f"No data retrieved for country: {country_code}")
|
|
1148
|
+
continue
|
|
1231
1149
|
|
|
1232
|
-
#
|
|
1233
|
-
|
|
1150
|
+
# D) Convert index -> a column OBS
|
|
1151
|
+
merged_df.reset_index(inplace=True)
|
|
1152
|
+
merged_df.rename(columns={"week_starting": "OBS"}, inplace=True)
|
|
1234
1153
|
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
merged_df =
|
|
1238
|
-
else:
|
|
1239
|
-
merged_df = weekly_avg_temp
|
|
1154
|
+
# E) Rename with prefix = "seas_{country_code}_"
|
|
1155
|
+
prefix = f"seas_{country_code.lower()}_"
|
|
1156
|
+
merged_df = rename_with_prefix(merged_df, prefix)
|
|
1240
1157
|
|
|
1241
|
-
|
|
1242
|
-
|
|
1158
|
+
# F) Merge into combined_df
|
|
1159
|
+
if combined_df.empty:
|
|
1160
|
+
combined_df = merged_df
|
|
1161
|
+
else:
|
|
1162
|
+
combined_df = pd.merge(combined_df, merged_df, on="OBS", how="outer")
|
|
1243
1163
|
|
|
1244
|
-
|
|
1164
|
+
# ------------------------------------------------------------------ #
|
|
1165
|
+
# 4) Sort final by OBS (optional)
|
|
1166
|
+
# ------------------------------------------------------------------ #
|
|
1167
|
+
if not combined_df.empty:
|
|
1168
|
+
combined_df.sort_values(by="OBS", inplace=True)
|
|
1245
1169
|
|
|
1246
|
-
return
|
|
1247
|
-
|
|
1170
|
+
return combined_df
|
|
1171
|
+
|
|
1248
1172
|
def pull_macro_ons_uk(self, cdid_list=None, week_start_day="mon", sector=None):
|
|
1249
1173
|
"""
|
|
1250
1174
|
Fetches time series data for multiple CDIDs from the ONS API, converts it to daily frequency,
|
|
@@ -1481,3 +1405,228 @@ class datapull:
 print("No data available to process.")
 return pd.DataFrame()
 
+def pull_sports_events(self, start_date="2020-01-01", week_commencing="mon"):
+"""
+Combines scraping logic for:
+- UEFA Champions League and NFL from TheSportsDB (website-scraping approach)
+- FIFA World Cup, UEFA Euro, Rugby World Cup, Six Nations (via TheSportsDB API)
+
+Returns a single merged DataFrame with all event dummy variables.
+"""
+
+############################################################
+# 1) SCRAPE UEFA CHAMPIONS LEAGUE & NFL (YOUR FIRST FUNCTION)
+############################################################
+def scrape_sports_events(start_date=start_date, week_commencing=week_commencing):
+sports = {
+"uefa_champions_league": {
+"league_id": "4480",
+"seasons_url": "https://www.thesportsdb.com/league/4480-UEFA-Champions-League?a=1#allseasons",
+"season_url_template": "https://www.thesportsdb.com/season/4480-UEFA-Champions-League/{season}&all=1&view=",
+"round_filters": ["quarter", "semi", "final"]
+},
+"nfl": {
+"league_id": "4391",
+"seasons_url": "https://www.thesportsdb.com/league/4391-NFL?a=1#allseasons",
+"season_url_template": "https://www.thesportsdb.com/season/4391-NFL/{season}&all=1&view=",
+"round_filters": ["quarter", "semi", "final"]
+}
+}
+
+headers = {"User-Agent": "Mozilla/5.0"}
+start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
+
+# Create a full date range DataFrame
+full_date_range = pd.date_range(start=start_date, end=pd.to_datetime("today"))
+time_series_df = pd.DataFrame({"date": full_date_range})
+time_series_df["seas_uefa_champions_league"] = 0
+time_series_df["seas_nfl"] = 0
+
+for sport, details in sports.items():
+# Get available seasons
+response = requests.get(details["seasons_url"], headers=headers)
+if response.status_code != 200:
+continue # Skip this sport if the request fails
+
+soup = BeautifulSoup(response.text, "html.parser")
+
+# Extract season names
+seasons = []
+for link in soup.find_all("a", href=True):
+href = link["href"]
+if "season" in href and sport.replace("_", "-") in href.lower():
+season_name = href.split("/")[-1] # e.g. "2023-2024"
+try:
+season_start_year = int(season_name.split("-")[0])
+season_start_date = datetime(season_start_year, 1, 1)
+if season_start_date >= start_date_dt:
+seasons.append(season_name)
+except ValueError:
+continue
+
+# Scrape matches for filtered seasons
+filtered_matches = []
+for season in seasons:
+season_url = details["season_url_template"].format(season=season)
+season_response = requests.get(season_url, headers=headers)
+if season_response.status_code != 200:
+continue
+
+season_soup = BeautifulSoup(season_response.text, "html.parser")
+for row in season_soup.find_all("tr"):
+cols = row.find_all("td")
+if len(cols) >= 5:
+match_date = cols[0].text.strip()
+round_name = cols[1].text.strip().lower()
+try:
+match_date_dt = datetime.strptime(match_date, "%d %b %y")
+if (match_date_dt >= start_date_dt
+and any(r in round_name for r in details["round_filters"])):
+filtered_matches.append(match_date_dt)
+except ValueError:
+continue
+
+# Convert matches into time series format
|
|
1490
|
+
df_sport = pd.DataFrame({"date": filtered_matches})
|
|
1491
|
+
if df_sport.empty:
|
|
1492
|
+
continue
|
|
1493
|
+
|
|
1494
|
+
col_name = "seas_nfl" if sport == "nfl" else "seas_uefa_champions_league"
|
|
1495
|
+
time_series_df.loc[time_series_df["date"].isin(df_sport["date"]), col_name] = 1
|
|
1496
|
+
|
|
1497
|
+
# Aggregate by week commencing
|
|
1498
|
+
day_offsets = {
|
|
1499
|
+
'mon': 'W-MON',
|
|
1500
|
+
'tues': 'W-TUE',
|
|
1501
|
+
'wed': 'W-WED',
|
|
1502
|
+
'thurs': 'W-THU',
|
|
1503
|
+
'fri': 'W-FRI',
|
|
1504
|
+
'sat': 'W-SAT',
|
|
1505
|
+
'sun': 'W-SUN'
|
|
1506
|
+
}
|
|
1507
|
+
if week_commencing.lower() not in day_offsets:
|
|
1508
|
+
raise ValueError(f"Invalid week_commencing value: {week_commencing}. Must be one of {list(day_offsets.keys())}.")
|
|
1509
|
+
|
|
1510
|
+
time_series_df = (time_series_df
|
|
1511
|
+
.set_index("date")
|
|
1512
|
+
.resample(day_offsets[week_commencing.lower()])
|
|
1513
|
+
.max()
|
|
1514
|
+
.reset_index())
|
|
1515
|
+
|
|
1516
|
+
time_series_df.rename(columns={"date": "OBS"}, inplace=True)
|
|
1517
|
+
time_series_df.fillna(0, inplace=True)
|
|
1518
|
+
|
|
1519
|
+
return time_series_df
|
|
1520
|
+
|
|
1521
|
+
############################################################
|
|
1522
|
+
# 2) FETCH FIFA WC, UEFA EURO, RUGBY, SIX NATIONS (2ND FUNC)
|
|
1523
|
+
############################################################
|
|
1524
|
+
def fetch_events(start_date=start_date, week_commencing=week_commencing):
|
|
1525
|
+
# Initialize date range
|
|
1526
|
+
start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
|
|
1527
|
+
end_date_obj = datetime.today()
|
|
1528
|
+
date_range = pd.date_range(start=start_date_obj, end=end_date_obj)
|
|
1529
|
+
df = pd.DataFrame({'OBS': date_range}).set_index('OBS')
|
|
1530
|
+
|
|
1531
|
+
# Define columns for sports
|
|
1532
|
+
event_columns = {
|
|
1533
|
+
'seas_fifa_world_cup': {
|
|
1534
|
+
'league_id': 4429, 'start_year': 1950, 'interval': 4
|
|
1535
|
+
},
|
|
1536
|
+
'seas_uefa_european_championship': {
|
|
1537
|
+
'league_id': 4502, 'start_year': 1960, 'interval': 4, 'extra_years': [2021]
|
|
1538
|
+
},
|
|
1539
|
+
'seas_rugby_world_cup': {
|
|
1540
|
+
'league_id': 4574, 'start_year': 1987, 'interval': 4
|
|
1541
|
+
},
|
|
1542
|
+
'seas_six_nations': {
|
|
1543
|
+
'league_id': 4714, 'start_year': 2000, 'interval': 1
|
|
1544
|
+
},
|
|
1545
|
+
}
|
|
1546
|
+
|
|
1547
|
+
# Initialize columns
|
|
1548
|
+
for col in event_columns.keys():
|
|
1549
|
+
df[col] = 0
|
|
1550
|
+
|
|
1551
|
+
def fetch_league_events(league_id, column_name, start_year, interval, extra_years=None):
|
|
1552
|
+
extra_years = extra_years or []
|
|
1553
|
+
# Fetch seasons
|
|
1554
|
+
seasons_url = f"https://www.thesportsdb.com/api/v1/json/3/search_all_seasons.php?id={league_id}"
|
|
1555
|
+
seasons_response = requests.get(seasons_url)
|
|
1556
|
+
if seasons_response.status_code != 200:
|
|
1557
|
+
return # Skip on failure
|
|
1558
|
+
|
|
1559
|
+
seasons_data = seasons_response.json().get('seasons', [])
|
|
1560
|
+
for season in seasons_data:
|
|
1561
|
+
season_name = season.get('strSeason', '')
|
|
1562
|
+
if not season_name.isdigit():
|
|
1563
|
+
continue
|
|
1564
|
+
|
|
1565
|
+
year = int(season_name)
|
|
1566
|
+
# Check if the year is valid for this competition
|
|
1567
|
+
if year in extra_years or (year >= start_year and (year - start_year) % interval == 0):
|
|
1568
|
+
# Fetch events
|
|
1569
|
+
events_url = f"https://www.thesportsdb.com/api/v1/json/3/eventsseason.php?id={league_id}&s={season_name}"
|
|
1570
|
+
events_response = requests.get(events_url)
|
|
1571
|
+
if events_response.status_code != 200:
|
|
1572
|
+
continue
|
|
1573
|
+
|
|
1574
|
+
events_data = events_response.json().get('events', [])
|
|
1575
|
+
for event in events_data:
|
|
1576
|
+
event_date_str = event.get('dateEvent')
|
|
1577
|
+
if event_date_str:
|
|
1578
|
+
event_date = datetime.strptime(event_date_str, '%Y-%m-%d')
|
|
1579
|
+
if event_date in df.index:
|
|
1580
|
+
df.loc[event_date, column_name] = 1
|
|
1581
|
+
|
|
1582
|
+
# Fetch events for all defined leagues
|
|
1583
|
+
for column_name, params in event_columns.items():
|
|
1584
|
+
fetch_league_events(
|
|
1585
|
+
league_id=params['league_id'],
|
|
1586
|
+
column_name=column_name,
|
|
1587
|
+
start_year=params['start_year'],
|
|
1588
|
+
interval=params['interval'],
|
|
1589
|
+
extra_years=params.get('extra_years', [])
|
|
1590
|
+
)
|
|
1591
|
+
|
|
1592
|
+
# Resample by week
|
|
1593
|
+
day_offsets = {
|
|
1594
|
+
'mon': 'W-MON',
|
|
1595
|
+
'tues': 'W-TUE',
|
|
1596
|
+
'wed': 'W-WED',
|
|
1597
|
+
'thurs': 'W-THU',
|
|
1598
|
+
'fri': 'W-FRI',
|
|
1599
|
+
'sat': 'W-SAT',
|
|
1600
|
+
'sun': 'W-SUN'
|
|
1601
|
+
}
|
|
1602
|
+
|
|
1603
|
+
if week_commencing.lower() not in day_offsets:
|
|
1604
|
+
raise ValueError(
|
|
1605
|
+
f"Invalid week_commencing value: {week_commencing}. "
|
|
1606
|
+
f"Must be one of {list(day_offsets.keys())}."
|
|
1607
|
+
)
|
|
1608
|
+
|
|
1609
|
+
df = df.resample(day_offsets[week_commencing.lower()]).max()
|
|
1610
|
+
df = df.reset_index()
|
|
1611
|
+
return df
|
|
1612
|
+
|
|
1613
|
+
###################################################
|
|
1614
|
+
# 3) CALL BOTH, THEN MERGE ON "OBS" & FILL WITH 0s
|
|
1615
|
+
###################################################
|
|
1616
|
+
df_uefa_nfl = scrape_sports_events(start_date, week_commencing)
|
|
1617
|
+
df_other_events = fetch_events(start_date, week_commencing)
|
|
1618
|
+
|
|
1619
|
+
# Merge on "OBS" column (outer join to preserve all dates in range)
|
|
1620
|
+
final_df = pd.merge(df_uefa_nfl, df_other_events, on='OBS', how='outer')
|
|
1621
|
+
|
|
1622
|
+
# Fill any NaNs with 0 for event columns
|
|
1623
|
+
# (Only fill numeric columns or everything except 'OBS')
|
|
1624
|
+
for col in final_df.columns:
|
|
1625
|
+
if col != 'OBS':
|
|
1626
|
+
final_df[col] = final_df[col].fillna(0)
|
|
1627
|
+
|
|
1628
|
+
# Sort by date just in case
|
|
1629
|
+
final_df.sort_values(by='OBS', inplace=True)
|
|
1630
|
+
final_df.reset_index(drop=True, inplace=True)
|
|
1631
|
+
|
|
1632
|
+
return final_df
|
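Both inner helpers in the new pull_sports_events method finish with the same weekly aggregation: daily 0/1 event flags are resampled to the requested week-commencing frequency and collapsed with max(), so a week is flagged whenever any of its days carried a qualifying fixture. Below is a minimal standalone sketch of that step on synthetic dates, followed by a commented, hypothetical call into the package; the no-argument datapull() construction in that comment is an assumption, not something this diff confirms.

import pandas as pd

# Daily dummy frame standing in for the frames built inside pull_sports_events.
daily = pd.DataFrame(
    {"seas_nfl": 0},
    index=pd.date_range("2024-01-01", "2024-01-28", freq="D"),
)
daily.loc[pd.Timestamp("2024-01-13"), "seas_nfl"] = 1  # synthetic play-off date
daily.loc[pd.Timestamp("2024-01-14"), "seas_nfl"] = 1

# 'mon' maps to 'W-MON' in the diff above; max() marks the whole week.
weekly = (daily
          .resample("W-MON")
          .max()
          .reset_index()
          .rename(columns={"index": "OBS"}))
print(weekly)

# Hypothetical usage of the new method (assumes datapull() takes no arguments):
# events = datapull().pull_sports_events(start_date="2020-01-01", week_commencing="mon")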