imsciences 0.6.1.1__tar.gz → 0.6.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/PKG-INFO +1 -1
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences/datafunctions.py +179 -361
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences.egg-info/PKG-INFO +1 -1
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/setup.py +1 -1
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/README.md +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences/__init__.py +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences.egg-info/SOURCES.txt +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences.egg-info/dependency_links.txt +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences.egg-info/requires.txt +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences.egg-info/top_level.txt +0 -0
- {imsciences-0.6.1.1 → imsciences-0.6.1.3}/setup.cfg +0 -0
{imsciences-0.6.1.1 → imsciences-0.6.1.3}/imsciences/datafunctions.py

@@ -19,6 +19,7 @@ import requests
 from geopy.geocoders import Nominatim
 import subprocess
 import json
+import xml.etree.ElementTree as ET
 
 class dataprocessing:
 
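The only import added in this release is xml.etree.ElementTree, which the rewritten pull_oecd (further down) uses to walk namespace-qualified <generic:Obs> elements in SDMX-ML responses. A minimal sketch of that parsing pattern follows; the XML snippet is invented for illustration, and only the 'generic' namespace URI is taken from the new code:

import xml.etree.ElementTree as ET

# Invented sample shaped like an SDMX-ML "generic data" response; only the
# 'generic' namespace URI comes from the new pull_oecd code.
SAMPLE = (
    '<DataSet xmlns:generic="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic">'
    '<generic:Obs>'
    '<generic:ObsDimension value="2020-01"/>'
    '<generic:ObsValue value="101.3"/>'
    '</generic:Obs>'
    '</DataSet>'
)

# The prefix in the XPath expressions is resolved through this mapping;
# the import alone is not enough without it.
namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}

root = ET.fromstring(SAMPLE)
for obs in root.findall('.//generic:Obs', namespaces):
    period = obs.find('.//generic:ObsDimension', namespaces).get('value')
    value = obs.find('.//generic:ObsValue', namespaces).get('value')
    print(period, value)  # -> 2020-01 101.3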
@@ -1335,9 +1336,9 @@ class datapull:
         print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
 
         print("\n4. pull_oecd")
-        print(" - Description: Fetch macroeconomic data from OECD
-        print(" - Usage:
-        print(" - Example:
+        print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
+        print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')")
+        print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
 
         print("\n5. get_google_mobility_data")
         print(" - Description: Fetch Google Mobility data for the specified country.")
@@ -1353,6 +1354,11 @@ class datapull:
         print(" - Description: Fetch and process historical weather data for the specified country.")
         print(" - Usage: pull_weather(week_commencing, country)")
         print(" - Example: pull_weather('mon', 'GBR')")
+
+        print("\n8. pull_covid_data")
+        print(" - Description: Get covid pandemic data for the country of interest.")
+        print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
+        print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
 
         ############################################################### MACRO ##########################################################################
 
@@ -1579,369 +1585,148 @@ class datapull:
 
         return ons_df_final
 
-    def
-
-
+    def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+        """
+        Fetch and process time series data from the OECD API.
 
-
-
+        Args:
+            country (str): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
+            week_commencing (str): The starting day of the week for aggregation.
+                                   Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+            start_date (str): Dataset start date in the format "YYYY-MM-DD"
 
-        [... old lines 1589-1595 truncated in the diff view ...]
-        quarter =
-        [... old lines 1597-1626 truncated in the diff view ...]
-        #
-        [... old lines 1628-1655 truncated in the diff view ...]
-        #
-        [... old lines 1657-1682 truncated in the diff view ...]
-        #
-        [... old lines 1684-1691 truncated in the diff view ...]
-            Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Consumer_Price_Index_Inflation_df_M = data[0]['CPALTT01']['GY']
-            Consumer_Price_Index_Inflation_df_M.rename('consumer_price_index_inflation', inplace=True)
-            Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
-
-        # For GDP Index Smoothed
-        # For countries with no data
-        if country in ['NLD', 'CHE', 'NZL', 'SWE', 'NOR']:
-            GDP_Index_Smoothed_df_M = pd.DataFrame()
-            GDP_Index_Smoothed_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in []:
-            GDP_Index_Smoothed_df_Q = data[1]['LORSGPRT']['STSA']
-            GDP_Index_Smoothed_df_Q.rename('gdp_index_smoothed', inplace=True)
-            GDP_Index_Smoothed_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            GDP_Index_Smoothed_df_M = data[0]['LORSGPRT']['STSA']
-            GDP_Index_Smoothed_df_M.rename('gdp_index_smoothed', inplace=True)
-            GDP_Index_Smoothed_df_Q = pd.DataFrame()
-
-        # For Harmonised Unemployment Index
-        # For countries with no data
-        if country in ['IND', 'CHE', 'ZAF', 'CHN']:
-            Harmonised_Unemployment_Index_df_M = pd.DataFrame()
-            Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in ['NZL']:
-            Harmonised_Unemployment_Index_df_Q = data[1]['LRHUTTTT']['STSA']
-            Harmonised_Unemployment_Index_df_Q.rename('harmonised_unemployment_index', inplace=True)
-            Harmonised_Unemployment_Index_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Harmonised_Unemployment_Index_df_M = data[0]['LRHUTTTT']['STSA']
-            Harmonised_Unemployment_Index_df_M.rename('harmonised_unemployment_index', inplace=True)
-            Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
-
-        # For hourly earnings index manufacturing
-        # For countries with no data
-        if country in ['IND', 'CHE', 'ZAF', 'CHN']:
-            Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
-            Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in ['FRA', 'DEU', 'ESP', 'AUS', 'NZL', 'KOR', 'NOR']:
-            Hourly_Earnings_Index_Manufacturing_df_Q = data[1]['LCEAMN01']['IXOBSA']
-            Hourly_Earnings_Index_Manufacturing_df_Q.rename('hourly_earnings_index_manufacturing', inplace=True)
-            Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Hourly_Earnings_Index_Manufacturing_df_M = data[0]['LCEAMN01']['IXOBSA']
-            Hourly_Earnings_Index_Manufacturing_df_M.rename('hourly_earnings_index_manufacturing', inplace=True)
-            Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
-
-        # For Short Term Interest Rate
-        # For countries with no data
-        if country in []:
-            Short_Term_Interest_Rate_df_M = pd.DataFrame()
-            Short_Term_Interest_Rate_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in []:
-            Short_Term_Interest_Rate_df_Q = data[1]['IR3TIB01']['ST']
-            Short_Term_Interest_Rate_df_Q.rename('short_term_interest_rate', inplace=True)
-            Short_Term_Interest_Rate_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Short_Term_Interest_Rate_df_M = data[0]['IR3TIB01']['ST']
-            Short_Term_Interest_Rate_df_M.rename('short_term_interest_rate', inplace=True)
-            Short_Term_Interest_Rate_df_Q = pd.DataFrame()
-
-        # For Industrial Product Growth on Previous Period
-        # For countries with no data
-        if country in ['ZAF', 'CHN']:
-            Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
-            Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in ['AUS', 'NZL']:
-            Industrial_Product_Growth_on_Previous_Period_df_Q = data[1]['PRINTO01']['GPSA']
-            Industrial_Product_Growth_on_Previous_Period_df_Q.rename('industrial_product_growth_on_previous_period', inplace=True)
-            Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Industrial_Product_Growth_on_Previous_Period_df_M = data[0]['PRINTO01']['GPSA']
-            Industrial_Product_Growth_on_Previous_Period_df_M.rename('industrial_product_growth_on_previous_period', inplace=True)
-            Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
-
-        # For Industrial Production Index
-        # For countries with no data
-        if country in ['ZAF', 'CHN']:
-            Industrial_Production_Index_df_M = pd.DataFrame()
-            Industrial_Production_Index_df_Q = pd.DataFrame()
-        # For countries with quarterly data
-        elif country in ['AUS', 'NZL']:
-            Industrial_Production_Index_df_Q = data[1]['PRINTO01']['IXOBSA']
-            Industrial_Production_Index_df_Q.rename('industrial_production_index', inplace=True)
-            Industrial_Production_Index_df_M = pd.DataFrame()
-        # For countries with monthly data
-        else:
-            Industrial_Production_Index_df_M = data[0]['PRINTO01']['IXOBSA']
-            Industrial_Production_Index_df_M.rename('industrial_production_index', inplace=True)
-            Industrial_Production_Index_df_Q = pd.DataFrame()
-
-        # Create monthly macroeconomic dataframe
-        all_dfs_list_M = [Consumer_Confidence_Index_df_M,
-                          Consumer_Price_Index_Cost_Of_Living_df_M,
-                          Consumer_Price_Index_Inflation_df_M,
-                          GDP_Index_Smoothed_df_M,
-                          Harmonised_Unemployment_Index_df_M,
-                          Hourly_Earnings_Index_Manufacturing_df_M,
-                          Short_Term_Interest_Rate_df_M,
-                          Industrial_Product_Growth_on_Previous_Period_df_M,
-                          Industrial_Production_Index_df_M]
-
-        # Check if any dataframes are empty and if there are remove them
-        all_dfs_list_M = [df for df in all_dfs_list_M if not df.empty]
-        cif_Macroeconomic_df_M = pd.concat(all_dfs_list_M, axis=1)
-
-        # Create quarterly macroeconomic dataframe
-        all_dfs_list_Q = [Consumer_Confidence_Index_df_Q,
-                          Consumer_Price_Index_Cost_Of_Living_df_Q,
-                          Consumer_Price_Index_Inflation_df_Q,
-                          GDP_Index_Smoothed_df_Q,
-                          Harmonised_Unemployment_Index_df_Q,
-                          Hourly_Earnings_Index_Manufacturing_df_Q,
-                          Short_Term_Interest_Rate_df_Q,
-                          Industrial_Product_Growth_on_Previous_Period_df_Q,
-                          Industrial_Production_Index_df_Q]
-
-        # Check if any dataframes are empty and if there are remove them
-        all_dfs_list_Q = [df for df in all_dfs_list_Q if not df.empty]
-        if all_dfs_list_Q != []:
-            macroeconomic_monthly_df_Q = pd.concat(all_dfs_list_Q, axis=1)
-        else:
-            macroeconomic_monthly_df_Q = pd.DataFrame()
-
-        # For USD GBP Exchange Rate
-        # If it's the UK add this series else don't
-        if countries_list[index] == 'GBR':
-            USD_GBP_Exchange_Rate_df = pd.read_csv(
-                'https://stats.oecd.org/SDMX-JSON/data/MEI_FIN/CCUS.' + countries_list[index] + '.M/OECD?contentType=csv')
-            USD_GBP_Exchange_Rate_df.head()
-            USD_GBP_Exchange_Rate_df_pivot = pd.pivot_table(USD_GBP_Exchange_Rate_df, values='Value', index='TIME',
-                                                            columns='Subject')
-            USD_GBP_Exchange_Rate_df_pivot_final = USD_GBP_Exchange_Rate_df_pivot.loc["2015-01":]
-            USD_GBP_Exchange_Rate_df_pivot_final.rename(
-                columns={'Currency exchange rates, monthly average': 'usd_gbp_exchange_rate'}, inplace=True)
-
-            # Create final monthly dataframe
-            macroeconomic_monthly_df_M = pd.concat([cif_Macroeconomic_df_M, USD_GBP_Exchange_Rate_df_pivot_final], axis=1)
-        else:
-            # Create final monthly dataframe
-            macroeconomic_monthly_df_M = cif_Macroeconomic_df_M
-
-        # Create the final W/C Sunday dataframe
-        # For monthly data
-        macroeconomic_monthly_df_M['Date'] = macroeconomic_monthly_df_M.index
-        df_M = macroeconomic_monthly_df_M.set_index(pd.to_datetime(macroeconomic_monthly_df_M['Date'])).drop(columns='Date')
-        df_M.fillna(method="ffill", inplace=True)
-        df_M.reset_index(inplace=True)
-
-        daily_records = []
-        # Iterate over each row in the DataFrame
-        for _, row in df_M.iterrows():
-            # Calculate the number of days in the month
-            num_days = calendar.monthrange(row["Date"].year, row["Date"].month)[1]
-            # Create a new record for each day of the month
-            for day in range(1, num_days + 1):
-                daily_row = row.copy()
-                daily_row["Date"] = row["Date"].replace(day=day)
-                daily_records.append(daily_row)
-
-        # Convert the list of daily records into a DataFrame
-        daily_df = pd.DataFrame(daily_records)
-
-        # Extend dataframe to include the current data if needed
-        datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
-        extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
-        q = pd.Series(datelist, name="Date")
-        s = pd.DataFrame(extended_data, columns=list(df_M.columns[1:]))
-        extended_daily_df = pd.concat([q, s], axis=1)
-        extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
-
-        # Create a week commencing column
-        extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
-        extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
-            lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-        extended_daily_df.drop("Date", axis=1, inplace=True)
-        extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
-
-        # Take a weekly average
-        macroeconomic_weekly_df_M = extended_daily_df.groupby('Date').mean()
-
-        # For quarterly data
-        # If there are quarterly datasets
-        if all_dfs_list_Q != []:
-            macroeconomic_monthly_df_Q['Date'] = macroeconomic_monthly_df_Q.index
-            df_Q = macroeconomic_monthly_df_Q.set_index(pd.to_datetime(macroeconomic_monthly_df_Q['Date'])).drop(
-                columns='Date')
-            df_Q.fillna(method="ffill", inplace=True)
-            df_Q.reset_index(inplace=True)
-
-            daily_records = []
-            for _, row in df_Q.iterrows():
-                year = row["Date"].year
-                month = row["Date"].month
-                day = row["Date"].day
-                last_date = get_last_day_of_the_quarter(datetime(year, month, day).date())
-                all_days = pd.date_range(row["Date"], last_date, freq="D")
-
-                # Create a new record for each day of the quarter
-                for day in all_days:
-                    daily_row = row.copy()
-                    daily_row["Date"] = row["Date"].replace(day=day.day, month=day.month)
-                    daily_records.append(daily_row)
-
-            # Convert the list of daily records into a DataFrame
-            daily_df = pd.DataFrame(daily_records)
-
-            # Extend dataframe to include data up to today
-            datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
-            extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
-            q = pd.Series(datelist, name="Date")
-            s = pd.DataFrame(extended_data, columns=list(df_Q.columns[1:]))
-            extended_daily_df = pd.concat([q, s], axis=1)
-            extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
-
-            # Create a week commencing column
-            extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
-            extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
-                lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
-            extended_daily_df.drop("Date", axis=1, inplace=True)
-            extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
-
-            # Take a weekly average
-            macroeconomic_weekly_df_Q = extended_daily_df.groupby('Date').mean()
-
-        # Merge the two datasets together
-        if all_dfs_list_Q != []:
-            macroeconomic_weekly_df = macroeconomic_weekly_df_M.merge(macroeconomic_weekly_df_Q, left_index=True,
-                                                                      right_index=True)
-        # If there are no quarterly datasets
+        Returns:
+            pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
+                          commencing dates, and other columns contain the aggregated time series values.
+        """
+
+        def parse_quarter(date_str):
+            """Parses a string in 'YYYY-Q#' format into a datetime object."""
+            year, quarter = date_str.split('-')
+            quarter_number = int(quarter[1])
+            month = (quarter_number - 1) * 3 + 1
+            return pd.Timestamp(f"{year}-{month:02d}-01")
+
+        # Generate a date range from the start date to today
+        date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
+
+        url_details = [
+            ["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
+            ["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
+            ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
+            ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
+            ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
+            ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
+            ["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
+            ["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
+            ["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
+            ["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
+            ["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
+            ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
+            ["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
+            ["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
+        ]
+
+        # Create empty final dataframe
+        oecd_df_final = pd.DataFrame()
+
+        daily_df = pd.DataFrame({'OBS': date_range})
+        value_columns = []
+
+        # Iterate for each variable of interest
+        for series_details in url_details:
+            series = series_details[0]
+            dataset_id = series_details[1]
+            filter = series_details[2]
+            col_name = series_details[3]
+
+            # Check which request succeeds to determine the most granular data available
+            for freq in ['M', 'Q', 'A']:
+
+                if series in ["UNE_LF_M", "EAR"]:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
+                elif series in ["B1GQ"]:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
+                else:
+                    data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
+
+                # Make the request to the OECD API for data
+                data_response = requests.get(data_url)
+
+                # Check if the request was successful
+                if data_response.status_code != 200:
+                    print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
+                    url_test = False
+                    continue
+                else:
+                    url_test = True
+                    break
+
+            # Move on to the next variable if no URL worked
+            if url_test == False:
+                continue
+
+            root = ET.fromstring(data_response.content)
+
+            # Define namespaces if necessary (the namespace is included in the tags)
+            namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
+
+            # Lists to store the data
+            dates = []
+            values = []
+
+            # Iterate over all <Obs> elements and extract date and value
+            for obs in root.findall('.//generic:Obs', namespaces):
+
+                # Extracting the time period (date)
+                time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
+
+                # Extracting the observation value
+                value = obs.find('.//generic:ObsValue', namespaces).get('value')
+
+                # Storing the data
+                if time_period and value:
+                    dates.append(time_period)
+                    values.append(float(value))  # Convert value to float
+
+            # Add variable names that were found to a list
+            value_columns.append(col_name)
+
+            # Creating a DataFrame
+            data = pd.DataFrame({'OBS': dates, col_name: values})
+
+            # Convert date strings into datetime format
+            if freq == 'Q':
+                data['OBS'] = data['OBS'].apply(parse_quarter)
             else:
-
+                # Parse 'YYYY-MM' period strings
+                data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
+
+            # Sort data by chronological order
+            data.sort_values(by='OBS', inplace=True)
+
+            # Merge the data based on the observation date
+            daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
 
-        # Change datetime format
-        macroeconomic_weekly_df.index = macroeconomic_weekly_df.index.strftime('%d/%m/%Y')
 
-
-
-
+        # Ensure columns are numeric
+        for col in value_columns:
+            if col in daily_df.columns:
+                daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
+            else:
+                print(f"Column {col} not found in daily_df")
+
+        # Aggregate results by week
+        country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
+                                                         date_column="OBS",
+                                                         group_columns=[],
+                                                         sum_columns=value_columns,
+                                                         wc=week_commencing,
+                                                         aggregation="average")
+
+        oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
 
-        return
+        return oecd_df_final
 
     def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
         """
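The rewrite replaces roughly 360 lines of per-indicator branching with a single loop: each series is requested at the finest frequency the API will serve (monthly, then quarterly, then annual), its periods are parsed to timestamps (quarters via parse_quarter), the observations are as-of merged onto a daily calendar so every day carries the latest published value, and the daily frame is then averaged into week-commencing buckets. Below is a self-contained sketch of that resampling pattern, using invented numbers and a plain Monday-commencing groupby in place of the package's ims_proc.aggregate_daily_to_wc_wide helper:

import pandas as pd

def parse_quarter(date_str):
    # Mirrors the helper in the diff: 'YYYY-Q#' -> first day of that quarter.
    year, quarter = date_str.split('-')
    month = (int(quarter[1]) - 1) * 3 + 1
    return pd.Timestamp(f"{year}-{month:02d}-01")

# Invented quarterly observations standing in for an OECD series.
obs = pd.DataFrame({'OBS': ['2024-Q1', '2024-Q2'],
                    'macro_gdp_growth_yoy': [0.4, 0.6]})
obs['OBS'] = obs['OBS'].apply(parse_quarter)
obs.sort_values(by='OBS', inplace=True)  # merge_asof requires sorted keys

# Daily calendar; direction='backward' carries each quarterly value forward
# across all the days that follow it, as in the new pull_oecd.
daily = pd.DataFrame({'OBS': pd.date_range('2024-01-01', '2024-06-30', freq='D')})
daily = pd.merge_asof(daily, obs, on='OBS', direction='backward')

# Week-commencing-Monday average, standing in for aggregate_daily_to_wc_wide.
daily['week_start'] = daily['OBS'] - pd.to_timedelta(daily['OBS'].dt.weekday, unit='D')
weekly = daily.groupby('week_start', as_index=False)['macro_gdp_growth_yoy'].mean()
print(weekly)  # 0.4 for Q1 weeks, then 0.6 once April weeks begin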
@@ -2708,4 +2493,37 @@ class datapull:
 
         final_weather = ims_proc.rename_cols(merged_df, 'seas_')
 
-        return final_weather
+        return final_weather
+
+    def pull_covid_data(folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
+        """
+        Get covid pandemic data for the country of interest.
+
+        Args:
+            folder_path (str): A string containing the local location of the OneDrive folder.
+                               Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
+                               The file location within the MasterDrive of the worldwide covid data is:
+                               MasterDrive/Central Database/Covid/oxford-government-response.csv
+            country (str): A string containing the country of interest (e.g. "GB", "FR")
+            week_commencing (str): The starting day of the week for aggregation.
+                                   Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+
+        Returns:
+            pd.DataFrame: A DataFrame containing weekly aggregated covid response data for the country of interest.
+                          The 'OBS' column contains the week commencing dates.
+        """
+
+        df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
+
+        country_df = df[df['location_key']==country]
+        country_df.rename(columns={'date': 'OBS'}, inplace=True)
+        country_df.drop('location_key', axis=1, inplace=True)
+
+        agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
+
+        covid_df = ims_proc.rename_cols(agg_df, 'covid_')
+
+        covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
+
+        return covid_df
+
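A hedged usage sketch for the new method; the import path follows the package layout in the file list above, and the folder_path value repeats the docstring's example, to be adjusted locally. Note that pull_covid_data is defined inside class datapull without a self parameter, so calling it on an instance would pass the instance as folder_path; calling it on the class avoids that:

from imsciences.datafunctions import datapull

covid_df = datapull.pull_covid_data(
    folder_path='C:/Users/--username--/OneDrive - im-sciences.com',
    country='GB',
    week_commencing='mon',
)
print(covid_df.head())  # weekly 'OBS' dates plus covid_-prefixed response columns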
Files without changes: README.md, imsciences/__init__.py, imsciences.egg-info/SOURCES.txt, imsciences.egg-info/dependency_links.txt, imsciences.egg-info/requires.txt, imsciences.egg-info/top_level.txt, setup.cfg.