imsciences 0.6.1.1__tar.gz → 0.6.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.1.1
+ Version: 0.6.1.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -19,6 +19,7 @@ import requests
  from geopy.geocoders import Nominatim
  import subprocess
  import json
+ import xml.etree.ElementTree as ET
 
  class dataprocessing:
 
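Note: the only functional change in this hunk is the new xml.etree.ElementTree import; the rewritten pull_oecd later in this diff uses it to parse SDMX-XML responses from the OECD API. A minimal sketch of that parsing pattern (the XML fragment below is invented for illustration; the namespace URI is the one the new code registers):

import xml.etree.ElementTree as ET

# Invented SDMX-style fragment; real OECD responses are far larger
xml_fragment = (
    '<generic:Obs xmlns:generic="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic">'
    '<generic:ObsDimension value="2020-01"/>'
    '<generic:ObsValue value="1.5"/>'
    '</generic:Obs>'
)
namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}

obs = ET.fromstring(xml_fragment)
print(obs.find('generic:ObsDimension', namespaces).get('value'))  # 2020-01
print(obs.find('generic:ObsValue', namespaces).get('value'))      # 1.5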
@@ -1335,9 +1336,9 @@ class datapull:
  print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
 
  print("\n4. pull_oecd")
- print(" - Description: Fetch macroeconomic data from OECD and other sources for a specified country.")
- print(" - Usage: pull_macro(country='GBR', week_commencing='mon')")
- print(" - Example: pull_macro('GBR', 'mon')")
+ print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')")
+ print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
 
  print("\n5. get_google_mobility_data")
  print(" - Description: Fetch Google Mobility data for the specified country.")
@@ -1353,6 +1354,11 @@ class datapull:
  print(" - Description: Fetch and process historical weather data for the specified country.")
  print(" - Usage: pull_weather(week_commencing, country)")
  print(" - Example: pull_weather('mon', 'GBR')")
+
+ print("\n8. pull_covid_data")
+ print(" - Description: Get covid pandemic data for the country of interest.")
+ print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
+ print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
 
  ############################################################### MACRO ##########################################################################
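Note: taken together, the updated help text corresponds to calls like the sketch below. It assumes datapull can be instantiated with no arguments, which this diff does not show:

pull = datapull()

# Weekly OECD macro series for the UK, back to 1950 where available
oecd_df = pull.pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')

# Weekly Oxford government-response (covid) data for Great Britain
covid_df = pull.pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')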
1358
1364
 
@@ -1579,369 +1585,148 @@ class datapull:
 
  return ons_df_final
 
- def pull_macro(self, country: str = "GBR", week_commencing: str = "mon"):
- # Change country input to list
- countries_list = [country]
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
+ """
+ Fetch and process time series data from the OECD API.
 
- # Check if the data wants to be inputted at any other week commencing date
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
+ Args:
+ country (str): A 3-letter code for the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
+ week_commencing (str): The starting day of the week for aggregation.
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+ start_date (str): Dataset start date in the format "YYYY-MM-DD"
 
- # Two useful functions for quarterly data
- # Define a function to get quarterly data
- def get_quarter(p_date: datetime.date) -> int:
- return (p_date.month - 1) // 3 + 1
-
- # Define a function to get the last day of the quarter
- def get_last_day_of_the_quarter(p_date: datetime.date):
- quarter = get_quarter(p_date)
- return datetime(p_date.year + 3 * quarter // 12, 3 * quarter % 12 + 1, 1) + pd.Timedelta(days=-1)
-
- # For the monthly data
- data_M, subjects_M, measures_M = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
- 'PRINTO01'],
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
- frequency='M', startDate='2015-01')
- data_M = data_M.stack(level=[0, -1, -2]).reset_index()
-
- data_Q, subjects_Q, measures_Q = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
- 'PRINTO01'],
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
- frequency='Q', startDate='2015-01')
-
- data_Q = data_Q.stack(level=[0, -1, -2]).reset_index()
-
- # Create a data frame dictionary to store your monthly data frames
- DataFrameDict_M = {elem: pd.DataFrame() for elem in countries_list}
- for key in DataFrameDict_M.keys():
- DataFrameDict_M[key] = data_M[:][data_M.country == key]
-
- # Create a data frame dictionary to store your quarterly data frames
- DataFrameDict_Q = {elem: pd.DataFrame() for elem in countries_list}
- for key in DataFrameDict_Q.keys():
- DataFrameDict_Q[key] = data_Q[:][data_Q.country == key]
-
- # Create a monthly list of the dataframes to iterate through
- countries_df_list_M = []
- for i in countries_list:
- df = pd.DataFrame(DataFrameDict_M[i])
- df.rename(columns={0: 'Values'}, inplace=True)
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
- countries_df_list_M.append(df)
-
- # Create a quarterly list of the dataframes to iterate through
- countries_df_list_Q = []
- for i in countries_list:
- df = pd.DataFrame(DataFrameDict_Q[i])
- df.rename(columns={0: 'Values'}, inplace=True)
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
- countries_df_list_Q.append(df)
-
- combined_countries_df_list = list(zip(countries_df_list_M, countries_df_list_Q))
-
- # Loop through and create dataframes for every country
- for index, data in enumerate(combined_countries_df_list):
- # Find country being extracted
- country = countries_list[index]
- print(country)
-
- # For consumer confidence
- # For countries with no data
- if country in ['CAN', 'IND', 'NOR']:
- Consumer_Confidence_Index_df_M = pd.DataFrame()
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in []:
- Consumer_Confidence_Index_df_Q = data[1]['CSCICP03']['IXNSA']
- Consumer_Confidence_Index_df_Q.rename('consumer_confidence_index', inplace=True)
- Consumer_Confidence_Index_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Consumer_Confidence_Index_df_M = data[0]['CSCICP03']['IXNSA']
- Consumer_Confidence_Index_df_M.rename('consumer_confidence_index', inplace=True)
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
-
- # For consumer prices for COST OF LIVING
- # For countries with no data
- if country in []:
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['AUS', 'NZL']:
- Consumer_Price_Index_Cost_Of_Living_df_Q = data[1]['CPALTT01']['IXNB']
- Consumer_Price_Index_Cost_Of_Living_df_Q.rename('consumer_price_index_cost_of_living', inplace=True)
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Consumer_Price_Index_Cost_Of_Living_df_M = data[0]['CPALTT01']['IXNB']
- Consumer_Price_Index_Cost_Of_Living_df_M.rename('consumer_price_index_cost_of_living', inplace=True)
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
-
- # For consumer prices FOR INFLATION
- # For countries with no data
- if country in []:
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['AUS', 'NZL']:
- Consumer_Price_Index_Inflation_df_Q = data[1]['CPALTT01']['GY']
- Consumer_Price_Index_Inflation_df_Q.rename('consumer_price_index_inflation', inplace=True)
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Consumer_Price_Index_Inflation_df_M = data[0]['CPALTT01']['GY']
- Consumer_Price_Index_Inflation_df_M.rename('consumer_price_index_inflation', inplace=True)
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
-
- # For GDP Index Smoothed
- # For countries with no data
- if country in ['NLD', 'CHE', 'NZL', 'SWE', 'NOR']:
- GDP_Index_Smoothed_df_M = pd.DataFrame()
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in []:
- GDP_Index_Smoothed_df_Q = data[1]['LORSGPRT']['STSA']
- GDP_Index_Smoothed_df_Q.rename('gdp_index_smoothed', inplace=True)
- GDP_Index_Smoothed_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- GDP_Index_Smoothed_df_M = data[0]['LORSGPRT']['STSA']
- GDP_Index_Smoothed_df_M.rename('gdp_index_smoothed', inplace=True)
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
-
- # For Harmonised Unemployment Index
- # For countries with no data
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['NZL']:
- Harmonised_Unemployment_Index_df_Q = data[1]['LRHUTTTT']['STSA']
- Harmonised_Unemployment_Index_df_Q.rename('harmonised_unemployment_index', inplace=True)
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Harmonised_Unemployment_Index_df_M = data[0]['LRHUTTTT']['STSA']
- Harmonised_Unemployment_Index_df_M.rename('harmonised_unemployment_index', inplace=True)
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
-
- # For hourly earnings index manufacturing
- # For countries with no data
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['FRA', 'DEU', 'ESP', 'AUS', 'NZL', 'KOR', 'NOR']:
- Hourly_Earnings_Index_Manufacturing_df_Q = data[1]['LCEAMN01']['IXOBSA']
- Hourly_Earnings_Index_Manufacturing_df_Q.rename('hourly_earnings_index_manufacturing', inplace=True)
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Hourly_Earnings_Index_Manufacturing_df_M = data[0]['LCEAMN01']['IXOBSA']
- Hourly_Earnings_Index_Manufacturing_df_M.rename('hourly_earnings_index_manufacturing', inplace=True)
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
-
- # For Short Term Interest Rate
- # For countries with no data
- if country in []:
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in []:
- Short_Term_Interest_Rate_df_Q = data[1]['IR3TIB01']['ST']
- Short_Term_Interest_Rate_df_Q.rename('short_term_interest_rate', inplace=True)
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Short_Term_Interest_Rate_df_M = data[0]['IR3TIB01']['ST']
- Short_Term_Interest_Rate_df_M.rename('short_term_interest_rate', inplace=True)
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
-
- # For Industrial Product Growth on Previous Period
- # For countries with no data
- if country in ['ZAF', 'CHN']:
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['AUS', 'NZL']:
- Industrial_Product_Growth_on_Previous_Period_df_Q = data[1]['PRINTO01']['GPSA']
- Industrial_Product_Growth_on_Previous_Period_df_Q.rename('industrial_product_growth_on_previous_period', inplace=True)
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Industrial_Product_Growth_on_Previous_Period_df_M = data[0]['PRINTO01']['GPSA']
- Industrial_Product_Growth_on_Previous_Period_df_M.rename('industrial_product_growth_on_previous_period', inplace=True)
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
-
- # For Industrial Production Index
- # For countries with no data
- if country in ['ZAF', 'CHN']:
- Industrial_Production_Index_df_M = pd.DataFrame()
- Industrial_Production_Index_df_Q = pd.DataFrame()
- # For countries with quarterly data
- elif country in ['AUS', 'NZL']:
- Industrial_Production_Index_df_Q = data[1]['PRINTO01']['IXOBSA']
- Industrial_Production_Index_df_Q.rename('industrial_production_index', inplace=True)
- Industrial_Production_Index_df_M = pd.DataFrame()
- # For countries with monthly data
- else:
- Industrial_Production_Index_df_M = data[0]['PRINTO01']['IXOBSA']
- Industrial_Production_Index_df_M.rename('industrial_production_index', inplace=True)
- Industrial_Production_Index_df_Q = pd.DataFrame()
-
- # Create monthly macroeconomic dataframe
- all_dfs_list_M = [Consumer_Confidence_Index_df_M,
- Consumer_Price_Index_Cost_Of_Living_df_M,
- Consumer_Price_Index_Inflation_df_M,
- GDP_Index_Smoothed_df_M,
- Harmonised_Unemployment_Index_df_M,
- Hourly_Earnings_Index_Manufacturing_df_M,
- Short_Term_Interest_Rate_df_M,
- Industrial_Product_Growth_on_Previous_Period_df_M,
- Industrial_Production_Index_df_M]
-
- # Check if any dataframes are empty and if there are remove them
- all_dfs_list_M = [df for df in all_dfs_list_M if not df.empty]
- cif_Macroeconomic_df_M = pd.concat(all_dfs_list_M, axis=1)
-
- # Create quarterly macroeconomic dataframe
- all_dfs_list_Q = [Consumer_Confidence_Index_df_Q,
- Consumer_Price_Index_Cost_Of_Living_df_Q,
- Consumer_Price_Index_Inflation_df_Q,
- GDP_Index_Smoothed_df_Q,
- Harmonised_Unemployment_Index_df_Q,
- Hourly_Earnings_Index_Manufacturing_df_Q,
- Short_Term_Interest_Rate_df_Q,
- Industrial_Product_Growth_on_Previous_Period_df_Q,
- Industrial_Production_Index_df_Q]
-
- # Check if any dataframes are empty and if there are remove them
- all_dfs_list_Q = [df for df in all_dfs_list_Q if not df.empty]
- if all_dfs_list_Q != []:
- macroeconomic_monthly_df_Q = pd.concat(all_dfs_list_Q, axis=1)
- else:
- macroeconomic_monthly_df_Q = pd.DataFrame()
-
- # For USD GBP Exchange Rate
- # If it's the UK add this series else don't
- if countries_list[index] == 'GBR':
- USD_GBP_Exchange_Rate_df = pd.read_csv(
- 'https://stats.oecd.org/SDMX-JSON/data/MEI_FIN/CCUS.' + countries_list[index] + '.M/OECD?contentType=csv')
- USD_GBP_Exchange_Rate_df.head()
- USD_GBP_Exchange_Rate_df_pivot = pd.pivot_table(USD_GBP_Exchange_Rate_df, values='Value', index='TIME',
- columns='Subject')
- USD_GBP_Exchange_Rate_df_pivot_final = USD_GBP_Exchange_Rate_df_pivot.loc["2015-01":]
- USD_GBP_Exchange_Rate_df_pivot_final.rename(
- columns={'Currency exchange rates, monthly average': 'usd_gbp_exchange_rate'}, inplace=True)
-
- # Create final monthly dataframe
- macroeconomic_monthly_df_M = pd.concat([cif_Macroeconomic_df_M, USD_GBP_Exchange_Rate_df_pivot_final], axis=1)
- else:
- # Create final monthly dataframe
- macroeconomic_monthly_df_M = cif_Macroeconomic_df_M
-
- # Create the final W/C Sunday dataframe
- # For monthly data
- macroeconomic_monthly_df_M['Date'] = macroeconomic_monthly_df_M.index
- df_M = macroeconomic_monthly_df_M.set_index(pd.to_datetime(macroeconomic_monthly_df_M['Date'])).drop(columns='Date')
- df_M.fillna(method="ffill", inplace=True)
- df_M.reset_index(inplace=True)
-
- daily_records = []
- # Iterate over each row in the DataFrame
- for _, row in df_M.iterrows():
- # Calculate the number of days in the month
- num_days = calendar.monthrange(row["Date"].year, row["Date"].month)[1]
- # Create a new record for each day of the month
- for day in range(1, num_days + 1):
- daily_row = row.copy()
- daily_row["Date"] = row["Date"].replace(day=day)
- daily_records.append(daily_row)
-
- # Convert the list of daily records into a DataFrame
- daily_df = pd.DataFrame(daily_records)
-
- # Extend dataframe to include the current data if needed
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
- q = pd.Series(datelist, name="Date")
- s = pd.DataFrame(extended_data, columns=list(df_M.columns[1:]))
- extended_daily_df = pd.concat([q, s], axis=1)
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
-
- # Create a week commencing column
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- extended_daily_df.drop("Date", axis=1, inplace=True)
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
-
- # Take a weekly average
- macroeconomic_weekly_df_M = extended_daily_df.groupby('Date').mean()
-
- # For quarterly data
- # If there are quarterly datasets
- if all_dfs_list_Q != []:
- macroeconomic_monthly_df_Q['Date'] = macroeconomic_monthly_df_Q.index
- df_Q = macroeconomic_monthly_df_Q.set_index(pd.to_datetime(macroeconomic_monthly_df_Q['Date'])).drop(
- columns='Date')
- df_Q.fillna(method="ffill", inplace=True)
- df_Q.reset_index(inplace=True)
-
- daily_records = []
- for _, row in df_Q.iterrows():
- year = row["Date"].year
- month = row["Date"].month
- day = row["Date"].day
- last_date = get_last_day_of_the_quarter(datetime(year, month, day).date())
- all_days = pd.date_range(row["Date"], last_date, freq="D")
-
- # Create a new record for each day of the quarter
- for day in all_days:
- daily_row = row.copy()
- daily_row["Date"] = row["Date"].replace(day=day.day, month=day.month)
- daily_records.append(daily_row)
-
- # Convert the list of daily records into a DataFrame
- daily_df = pd.DataFrame(daily_records)
-
- # Extend dataframe to include data up to today
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
- q = pd.Series(datelist, name="Date")
- s = pd.DataFrame(extended_data, columns=list(df_Q.columns[1:]))
- extended_daily_df = pd.concat([q, s], axis=1)
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
-
- # Create a week commencing column
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
- extended_daily_df.drop("Date", axis=1, inplace=True)
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
-
- # Take a weekly average
- macroeconomic_weekly_df_Q = extended_daily_df.groupby('Date').mean()
-
- # Merge the two datasets together
- if all_dfs_list_Q != []:
- macroeconomic_weekly_df = macroeconomic_weekly_df_M.merge(macroeconomic_weekly_df_Q, left_index=True,
- right_index=True)
- # If there are no quarterly datasets
+ Returns:
+ pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
+ commencing dates, and other columns contain the aggregated time series values.
+ """
+
+ def parse_quarter(date_str):
+ """Parses a string in 'YYYY-Q#' format into a datetime object."""
+ year, quarter = date_str.split('-')
+ quarter_number = int(quarter[1])
+ month = (quarter_number - 1) * 3 + 1
+ return pd.Timestamp(f"{year}-{month:02d}-01")
+
+ # Generate a daily date range from start_date to today
+ date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
+
+ url_details = [
+ ["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
+ ["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
+ ["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
+ ["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
+ ["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
+ ["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
+ ["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
+ ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
+ ["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
+ ["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
+ ]
+
+ # Create empty final dataframe
+ oecd_df_final = pd.DataFrame()
+
+ daily_df = pd.DataFrame({'OBS': date_range})
+ value_columns = []
+
+ # Iterate for each variable of interest
+ for series_details in url_details:
+ series = series_details[0]
+ dataset_id = series_details[1]
+ filter = series_details[2]
+ col_name = series_details[3]
+
+ # check if request was successful and determine the most granular data available
+ for freq in ['M', 'Q', 'A']:
+
+ if series in ["UNE_LF_M", "EAR"]:
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
+ elif series in ["B1GQ"]:
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
+ else:
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
+
+ # Make the request to the OECD API for data
+ data_response = requests.get(data_url)
+
+ # Check if the request was successful
+ if data_response.status_code != 200:
+ print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
+ url_test = False
+ continue
+ else:
+ url_test = True
+ break
+
+ # get data for the next variable if url doesn't exist
+ if url_test == False:
+ continue
+
+ root = ET.fromstring(data_response.content)
+
+ # Define namespaces if necessary (the namespace is included in the tags)
+ namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
+
+ # Lists to store the data
+ dates = []
+ values = []
+
+ # Iterate over all <Obs> elements and extract date and value
+ for obs in root.findall('.//generic:Obs', namespaces):
+
+ # Extracting the time period (date)
+ time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
+
+ # Extracting the observation value
+ value = obs.find('.//generic:ObsValue', namespaces).get('value')
+
+ # Storing the data
+ if time_period and value:
+ dates.append(time_period)
+ values.append(float(value)) # Convert value to float
+
+ # Add variable names that were found to a list
+ value_columns.append(col_name)
+
+ # Creating a DataFrame
+ data = pd.DataFrame({'OBS': dates, col_name: values})
+
+ # Convert date strings into datetime format
+ if freq == 'Q':
+ data['OBS'] = data['OBS'].apply(parse_quarter)
  else:
- macroeconomic_weekly_df = macroeconomic_weekly_df_M
+ # Parse monthly date strings in 'YYYY-MM' format
+ data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
+
+ # Sort data by chronological order
+ data.sort_values(by='OBS', inplace=True)
+
+ # Merge the data based on the observation date
+ daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
 
- # Change datetime format
- macroeconomic_weekly_df.index = macroeconomic_weekly_df.index.strftime('%d/%m/%Y')
 
- macroeconomic_weekly_df.reset_index()
- macroeconomic_weekly_df.reset_index(drop=False, inplace=True)
- macroeconomic_weekly_df.rename(columns={'Date': 'OBS'}, inplace=True)
+ # Ensure columns are numeric
+ for col in value_columns:
+ if col in daily_df.columns:
+ daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
+ else:
+ print(f"Column {col} not found in daily_df")
+
+ # Aggregate results by week
+ country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
+ date_column="OBS",
+ group_columns=[],
+ sum_columns=value_columns,
+ wc=week_commencing,
+ aggregation="average")
+
+ oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
 
- return macroeconomic_weekly_df
+ return oecd_df_final
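Note: the heart of the new pull_oecd is pd.merge_asof with direction='backward': every day in the daily calendar picks up the most recent monthly or quarterly observation at or before that date, which amounts to a forward fill of the lower-frequency series onto a daily grid (parse_quarter puts quarterly points on quarter starts, e.g. '2020-Q3' becomes 2020-07-01). A self-contained sketch of that step with toy values rather than real OECD data:

import pandas as pd

daily = pd.DataFrame({'OBS': pd.date_range('2020-01-01', '2020-03-31', freq='D')})
obs = pd.DataFrame({
    'OBS': pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']),
    'macro_cpi_total': [1.8, 1.7, 1.5],
}).sort_values('OBS')

# Each daily row takes the last observation at or before its date
daily = pd.merge_asof(daily, obs, on='OBS', direction='backward')
print(daily.iloc[[0, 31, 60]])  # 1.8 (1 Jan), 1.7 (1 Feb), 1.5 (1 Mar)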
 
  def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
  """
@@ -2708,4 +2493,37 @@ class datapull:
 
  final_weather = ims_proc.rename_cols(merged_df, 'seas_')
 
- return final_weather
+ return final_weather
+
+ def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
+ """
+ Get covid pandemic data for the country of interest.
+
+ Args:
+ folder_path (str): A string containing the local location of the OneDrive folder.
+ Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
+ The file location within the MasterDrive of the worldwide covid data is:
+ MasterDrive/Central Database/Covid/oxford-government-response.csv
+ country (str): A string containing the country of interest (e.g. "GB", "FR")
+ week_commencing (str): The starting day of the week for aggregation.
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
+
+ Returns:
+ pd.DataFrame: A DataFrame containing weekly aggregated COVID-19 government-response data for the country of interest.
+ The 'OBS' column contains the week commencing dates.
+ """
+
+ df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
+
+ country_df = df[df['location_key']==country]
+ country_df.rename(columns={'date': 'OBS'}, inplace=True)
+ country_df.drop('location_key', axis=1, inplace=True)
+
+ agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
+
+ covid_df = ims_proc.rename_cols(agg_df, 'covid_')
+
+ covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
+
+ return covid_df
+
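Note: pull_covid_data calls rename and drop with inplace=True on a filtered slice of the CSV, which pandas reports as a SettingWithCopyWarning. A sketch of the same filtering step on an explicit copy, with an invented three-row frame standing in for the Oxford CSV:

import pandas as pd

df = pd.DataFrame({
    'location_key': ['GB', 'GB', 'FR'],
    'date': ['2020-03-02', '2020-03-03', '2020-03-02'],
    'stringency_index': [11.1, 16.7, 5.6],
})

# .copy() detaches the slice so the subsequent edits are warning-free
country_df = df[df['location_key'] == 'GB'].copy()
country_df = country_df.rename(columns={'date': 'OBS'}).drop(columns='location_key')
print(country_df)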
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: imsciences
- Version: 0.6.1.1
+ Version: 0.6.1.3
  Summary: IMS Data Processing Package
  Author: IMS
  Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
  return f.read()
  return ''
 
- VERSION = '0.6.1.1'
+ VERSION = '0.6.1.3'
  DESCRIPTION = 'IMS Data Processing Package'
  LONG_DESCRIPTION = read_md('README.md') # Reading from README.md
 
File without changes
File without changes