imsciences 0.6.1.2__tar.gz → 0.6.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.6.1.2
3
+ Version: 0.6.1.4
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -19,6 +19,7 @@ import requests
19
19
  from geopy.geocoders import Nominatim
20
20
  import subprocess
21
21
  import json
22
+ import xml.etree.ElementTree as ET
22
23
 
23
24
  class dataprocessing:
24
25
 
@@ -1335,9 +1336,9 @@ class datapull:
1335
1336
  print(" - Example: pull_ons_data([{'series_id': 'LMSBSA', 'dataset_id': 'LMS'}], 'mon')")
1336
1337
 
1337
1338
  print("\n4. pull_oecd")
1338
- print(" - Description: Fetch macroeconomic data from OECD and other sources for a specified country.")
1339
- print(" - Usage: pull_macro(country='GBR', week_commencing='mon')")
1340
- print(" - Example: pull_macro('GBR', 'mon')")
1339
+ print(" - Description: Fetch macroeconomic data from OECD for a specified country.")
1340
+ print(" - Usage: pull_oecd(country='GBR', week_commencing='mon', start_date='1950-01-01')")
1341
+ print(" - Example: pull_oecd('GBR', 'mon', '1950-01-01')")
1341
1342
 
1342
1343
  print("\n5. get_google_mobility_data")
1343
1344
  print(" - Description: Fetch Google Mobility data for the specified country.")
@@ -1353,6 +1354,11 @@ class datapull:
1353
1354
  print(" - Description: Fetch and process historical weather data for the specified country.")
1354
1355
  print(" - Usage: pull_weather(week_commencing, country)")
1355
1356
  print(" - Example: pull_weather('mon', 'GBR')")
1357
+
1358
+ print("\n8. pull_covid_data")
1359
+ print(" - Description: Get covid pandemic data for the country of interest.")
1360
+ print(" - Usage: pull_covid_data(folder_path, country, week_commencing)")
1361
+ print(" - Example: pull_covid_data('C:/Users/--username--/OneDrive/', 'GB', 'mon')")
1356
1362
 
1357
1363
  ############################################################### MACRO ##########################################################################
1358
1364
 
@@ -1579,370 +1585,148 @@ class datapull:
1579
1585
 
1580
1586
  return ons_df_final
1581
1587
 
1582
- def pull_macro(self, country: str = "GBR", week_commencing: str = "mon"):
1583
- # Change country input to list
1584
- print("")
1585
- countries_list = [country]
1588
+ def pull_oecd(self, country: str = "GBR", week_commencing: str = "mon", start_date: str = "1950-01-01") -> pd.DataFrame:
1589
+ """
1590
+ Fetch and process time series data from the OECD API.
1586
1591
 
1587
- # Check if the data wants to be inputted at any other week commencing date
1588
- day_dict = {"mon": 0, "tue": 1, "wed": 2, "thur": 3, "fri": 4, "sat": 5, "sun": 6}
1592
+ Args:
1593
+ country (str): A string containing a 3-letter code of the country of interest (E.g: "GBR", "FRA", "USA", "DEU")
1594
+ week_commencing (str): The starting day of the week for aggregation.
1595
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
1596
+ start_date (str): Dataset start date in the format "YYYY-MM-DD"
1589
1597
 
1590
- # Two useful functions for quarterly data
1591
- # Define a function to get quarterly data
1592
- def get_quarter(p_date: datetime.date) -> int:
1593
- return (p_date.month - 1) // 3 + 1
1594
-
1595
- # Define a function to get the last day of the quarter
1596
- def get_last_day_of_the_quarter(p_date: datetime.date):
1597
- quarter = get_quarter(p_date)
1598
- return datetime(p_date.year + 3 * quarter // 12, 3 * quarter % 12 + 1, 1) + pd.Timedelta(days=-1)
1599
-
1600
- # For the monthly data
1601
- data_M, subjects_M, measures_M = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
1602
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
1603
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
1604
- 'PRINTO01'],
1605
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
1606
- frequency='M', startDate='2015-01')
1607
- data_M = data_M.stack(level=[0, -1, -2]).reset_index()
1608
-
1609
- data_Q, subjects_Q, measures_Q = cif.createDataFrameFromOECD(countries=countries_list, dsname='MEI',
1610
- subject=['LCEAMN01', 'LCEAPR', 'CSCICP03', 'CPALTT01',
1611
- 'LRHUTTTT', 'LORSGPRT', 'IR3TIB01',
1612
- 'PRINTO01'],
1613
- measure=['IXOBSA', 'IXNSA', 'IXNB', 'STSA', 'ST', 'GPSA', 'GY'],
1614
- frequency='Q', startDate='2015-01')
1615
-
1616
- data_Q = data_Q.stack(level=[0, -1, -2]).reset_index()
1617
-
1618
- # Create a data frame dictionary to store your monthly data frames
1619
- DataFrameDict_M = {elem: pd.DataFrame() for elem in countries_list}
1620
- for key in DataFrameDict_M.keys():
1621
- DataFrameDict_M[key] = data_M[:][data_M.country == key]
1622
-
1623
- # Create a data frame dictionary to store your quarterly data frames
1624
- DataFrameDict_Q = {elem: pd.DataFrame() for elem in countries_list}
1625
- for key in DataFrameDict_Q.keys():
1626
- DataFrameDict_Q[key] = data_Q[:][data_Q.country == key]
1627
-
1628
- # Create a monthly list of the dataframes to iterate through
1629
- countries_df_list_M = []
1630
- for i in countries_list:
1631
- df = pd.DataFrame(DataFrameDict_M[i])
1632
- df.rename(columns={0: 'Values'}, inplace=True)
1633
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
1634
- countries_df_list_M.append(df)
1635
-
1636
- # Create a quarterly list of the dataframes to iterate through
1637
- countries_df_list_Q = []
1638
- for i in countries_list:
1639
- df = pd.DataFrame(DataFrameDict_Q[i])
1640
- df.rename(columns={0: 'Values'}, inplace=True)
1641
- df = pd.pivot_table(data=df, index='time', values='Values', columns=['subject', 'measure'])
1642
- countries_df_list_Q.append(df)
1643
-
1644
- combined_countries_df_list = list(zip(countries_df_list_M, countries_df_list_Q))
1645
-
1646
- # Loop through and create dataframes for every country
1647
- for index, data in enumerate(combined_countries_df_list):
1648
- # Find country being extracted
1649
- country = countries_list[index]
1650
- print(country)
1651
-
1652
- # For consumer confidence
1653
- # For countries with no data
1654
- if country in ['CAN', 'IND', 'NOR']:
1655
- Consumer_Confidence_Index_df_M = pd.DataFrame()
1656
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
1657
- # For countries with quarterly data
1658
- elif country in []:
1659
- Consumer_Confidence_Index_df_Q = data[1]['CSCICP03']['IXNSA']
1660
- Consumer_Confidence_Index_df_Q.rename('consumer_confidence_index', inplace=True)
1661
- Consumer_Confidence_Index_df_M = pd.DataFrame()
1662
- # For countries with monthly data
1663
- else:
1664
- Consumer_Confidence_Index_df_M = data[0]['CSCICP03']['IXNSA']
1665
- Consumer_Confidence_Index_df_M.rename('consumer_confidence_index', inplace=True)
1666
- Consumer_Confidence_Index_df_Q = pd.DataFrame()
1667
-
1668
- # For consumer prices for COST OF LIVING
1669
- # For countries with no data
1670
- if country in []:
1671
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
1672
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
1673
- # For countries with quarterly data
1674
- elif country in ['AUS', 'NZL']:
1675
- Consumer_Price_Index_Cost_Of_Living_df_Q = data[1]['CPALTT01']['IXNB']
1676
- Consumer_Price_Index_Cost_Of_Living_df_Q.rename('consumer_price_index_cost_of_living', inplace=True)
1677
- Consumer_Price_Index_Cost_Of_Living_df_M = pd.DataFrame()
1678
- # For countries with monthly data
1679
- else:
1680
- Consumer_Price_Index_Cost_Of_Living_df_M = data[0]['CPALTT01']['IXNB']
1681
- Consumer_Price_Index_Cost_Of_Living_df_M.rename('consumer_price_index_cost_of_living', inplace=True)
1682
- Consumer_Price_Index_Cost_Of_Living_df_Q = pd.DataFrame()
1683
-
1684
- # For consumer prices FOR INFLATION
1685
- # For countries with no data
1686
- if country in []:
1687
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
1688
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
1689
- # For countries with quarterly data
1690
- elif country in ['AUS', 'NZL']:
1691
- Consumer_Price_Index_Inflation_df_Q = data[1]['CPALTT01']['GY']
1692
- Consumer_Price_Index_Inflation_df_Q.rename('consumer_price_index_inflation', inplace=True)
1693
- Consumer_Price_Index_Inflation_df_M = pd.DataFrame()
1694
- # For countries with monthly data
1695
- else:
1696
- Consumer_Price_Index_Inflation_df_M = data[0]['CPALTT01']['GY']
1697
- Consumer_Price_Index_Inflation_df_M.rename('consumer_price_index_inflation', inplace=True)
1698
- Consumer_Price_Index_Inflation_df_Q = pd.DataFrame()
1699
-
1700
- # For GDP Index Smoothed
1701
- # For countries with no data
1702
- if country in ['NLD', 'CHE', 'NZL', 'SWE', 'NOR']:
1703
- GDP_Index_Smoothed_df_M = pd.DataFrame()
1704
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
1705
- # For countries with quarterly data
1706
- elif country in []:
1707
- GDP_Index_Smoothed_df_Q = data[1]['LORSGPRT']['STSA']
1708
- GDP_Index_Smoothed_df_Q.rename('gdp_index_smoothed', inplace=True)
1709
- GDP_Index_Smoothed_df_M = pd.DataFrame()
1710
- # For countries with monthly data
1711
- else:
1712
- GDP_Index_Smoothed_df_M = data[0]['LORSGPRT']['STSA']
1713
- GDP_Index_Smoothed_df_M.rename('gdp_index_smoothed', inplace=True)
1714
- GDP_Index_Smoothed_df_Q = pd.DataFrame()
1715
-
1716
- # For Harmonised Unemployment Index
1717
- # For countries with no data
1718
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
1719
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
1720
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
1721
- # For countries with quarterly data
1722
- elif country in ['NZL']:
1723
- Harmonised_Unemployment_Index_df_Q = data[1]['LRHUTTTT']['STSA']
1724
- Harmonised_Unemployment_Index_df_Q.rename('harmonised_unemployment_index', inplace=True)
1725
- Harmonised_Unemployment_Index_df_M = pd.DataFrame()
1726
- # For countries with monthly data
1727
- else:
1728
- Harmonised_Unemployment_Index_df_M = data[0]['LRHUTTTT']['STSA']
1729
- Harmonised_Unemployment_Index_df_M.rename('harmonised_unemployment_index', inplace=True)
1730
- Harmonised_Unemployment_Index_df_Q = pd.DataFrame()
1731
-
1732
- # For hourly earnings index manufacturing
1733
- # For countries with no data
1734
- if country in ['IND', 'CHE', 'ZAF', 'CHN']:
1735
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
1736
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
1737
- # For countries with quarterly data
1738
- elif country in ['FRA', 'DEU', 'ESP', 'AUS', 'NZL', 'KOR', 'NOR']:
1739
- Hourly_Earnings_Index_Manufacturing_df_Q = data[1]['LCEAMN01']['IXOBSA']
1740
- Hourly_Earnings_Index_Manufacturing_df_Q.rename('hourly_earnings_index_manufacturing', inplace=True)
1741
- Hourly_Earnings_Index_Manufacturing_df_M = pd.DataFrame()
1742
- # For countries with monthly data
1743
- else:
1744
- Hourly_Earnings_Index_Manufacturing_df_M = data[0]['LCEAMN01']['IXOBSA']
1745
- Hourly_Earnings_Index_Manufacturing_df_M.rename('hourly_earnings_index_manufacturing', inplace=True)
1746
- Hourly_Earnings_Index_Manufacturing_df_Q = pd.DataFrame()
1747
-
1748
- # For Short Term Interest Rate
1749
- # For countries with no data
1750
- if country in []:
1751
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
1752
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
1753
- # For countries with quarterly data
1754
- elif country in []:
1755
- Short_Term_Interest_Rate_df_Q = data[1]['IR3TIB01']['ST']
1756
- Short_Term_Interest_Rate_df_Q.rename('short_term_interest_rate', inplace=True)
1757
- Short_Term_Interest_Rate_df_M = pd.DataFrame()
1758
- # For countries with monthly data
1759
- else:
1760
- Short_Term_Interest_Rate_df_M = data[0]['IR3TIB01']['ST']
1761
- Short_Term_Interest_Rate_df_M.rename('short_term_interest_rate', inplace=True)
1762
- Short_Term_Interest_Rate_df_Q = pd.DataFrame()
1763
-
1764
- # For Industrial Product Growth on Previous Period
1765
- # For countries with no data
1766
- if country in ['ZAF', 'CHN']:
1767
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
1768
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
1769
- # For countries with quarterly data
1770
- elif country in ['AUS', 'NZL']:
1771
- Industrial_Product_Growth_on_Previous_Period_df_Q = data[1]['PRINTO01']['GPSA']
1772
- Industrial_Product_Growth_on_Previous_Period_df_Q.rename('industrial_product_growth_on_previous_period', inplace=True)
1773
- Industrial_Product_Growth_on_Previous_Period_df_M = pd.DataFrame()
1774
- # For countries with monthly data
1775
- else:
1776
- Industrial_Product_Growth_on_Previous_Period_df_M = data[0]['PRINTO01']['GPSA']
1777
- Industrial_Product_Growth_on_Previous_Period_df_M.rename('industrial_product_growth_on_previous_period', inplace=True)
1778
- Industrial_Product_Growth_on_Previous_Period_df_Q = pd.DataFrame()
1779
-
1780
- # For Industrial Production Index
1781
- # For countries with no data
1782
- if country in ['ZAF', 'CHN']:
1783
- Industrial_Production_Index_df_M = pd.DataFrame()
1784
- Industrial_Production_Index_df_Q = pd.DataFrame()
1785
- # For countries with quarterly data
1786
- elif country in ['AUS', 'NZL']:
1787
- Industrial_Production_Index_df_Q = data[1]['PRINTO01']['IXOBSA']
1788
- Industrial_Production_Index_df_Q.rename('industrial_production_index', inplace=True)
1789
- Industrial_Production_Index_df_M = pd.DataFrame()
1790
- # For countries with monthly data
1791
- else:
1792
- Industrial_Production_Index_df_M = data[0]['PRINTO01']['IXOBSA']
1793
- Industrial_Production_Index_df_M.rename('industrial_production_index', inplace=True)
1794
- Industrial_Production_Index_df_Q = pd.DataFrame()
1795
-
1796
- # Create monthly macroeconomic dataframe
1797
- all_dfs_list_M = [Consumer_Confidence_Index_df_M,
1798
- Consumer_Price_Index_Cost_Of_Living_df_M,
1799
- Consumer_Price_Index_Inflation_df_M,
1800
- GDP_Index_Smoothed_df_M,
1801
- Harmonised_Unemployment_Index_df_M,
1802
- Hourly_Earnings_Index_Manufacturing_df_M,
1803
- Short_Term_Interest_Rate_df_M,
1804
- Industrial_Product_Growth_on_Previous_Period_df_M,
1805
- Industrial_Production_Index_df_M]
1806
-
1807
- # Check if any dataframes are empty and if there are remove them
1808
- all_dfs_list_M = [df for df in all_dfs_list_M if not df.empty]
1809
- cif_Macroeconomic_df_M = pd.concat(all_dfs_list_M, axis=1)
1810
-
1811
- # Create quarterly macroeconomic dataframe
1812
- all_dfs_list_Q = [Consumer_Confidence_Index_df_Q,
1813
- Consumer_Price_Index_Cost_Of_Living_df_Q,
1814
- Consumer_Price_Index_Inflation_df_Q,
1815
- GDP_Index_Smoothed_df_Q,
1816
- Harmonised_Unemployment_Index_df_Q,
1817
- Hourly_Earnings_Index_Manufacturing_df_Q,
1818
- Short_Term_Interest_Rate_df_Q,
1819
- Industrial_Product_Growth_on_Previous_Period_df_Q,
1820
- Industrial_Production_Index_df_Q]
1821
-
1822
- # Check if any dataframes are empty and if there are remove them
1823
- all_dfs_list_Q = [df for df in all_dfs_list_Q if not df.empty]
1824
- if all_dfs_list_Q != []:
1825
- macroeconomic_monthly_df_Q = pd.concat(all_dfs_list_Q, axis=1)
1826
- else:
1827
- macroeconomic_monthly_df_Q = pd.DataFrame()
1828
-
1829
- # For USD GBP Exchange Rate
1830
- # If it's the UK add this series else don't
1831
- if countries_list[index] == 'GBR':
1832
- USD_GBP_Exchange_Rate_df = pd.read_csv(
1833
- 'https://stats.oecd.org/SDMX-JSON/data/MEI_FIN/CCUS.' + countries_list[index] + '.M/OECD?contentType=csv')
1834
- USD_GBP_Exchange_Rate_df.head()
1835
- USD_GBP_Exchange_Rate_df_pivot = pd.pivot_table(USD_GBP_Exchange_Rate_df, values='Value', index='TIME',
1836
- columns='Subject')
1837
- USD_GBP_Exchange_Rate_df_pivot_final = USD_GBP_Exchange_Rate_df_pivot.loc["2015-01":]
1838
- USD_GBP_Exchange_Rate_df_pivot_final.rename(
1839
- columns={'Currency exchange rates, monthly average': 'usd_gbp_exchange_rate'}, inplace=True)
1840
-
1841
- # Create final monthly dataframe
1842
- macroeconomic_monthly_df_M = pd.concat([cif_Macroeconomic_df_M, USD_GBP_Exchange_Rate_df_pivot_final], axis=1)
1843
- else:
1844
- # Create final monthly dataframe
1845
- macroeconomic_monthly_df_M = cif_Macroeconomic_df_M
1846
-
1847
- # Create the final W/C Sunday dataframe
1848
- # For monthly data
1849
- macroeconomic_monthly_df_M['Date'] = macroeconomic_monthly_df_M.index
1850
- df_M = macroeconomic_monthly_df_M.set_index(pd.to_datetime(macroeconomic_monthly_df_M['Date'])).drop(columns='Date')
1851
- df_M.fillna(method="ffill", inplace=True)
1852
- df_M.reset_index(inplace=True)
1853
-
1854
- daily_records = []
1855
- # Iterate over each row in the DataFrame
1856
- for _, row in df_M.iterrows():
1857
- # Calculate the number of days in the month
1858
- num_days = calendar.monthrange(row["Date"].year, row["Date"].month)[1]
1859
- # Create a new record for each day of the month
1860
- for day in range(1, num_days + 1):
1861
- daily_row = row.copy()
1862
- daily_row["Date"] = row["Date"].replace(day=day)
1863
- daily_records.append(daily_row)
1864
-
1865
- # Convert the list of daily records into a DataFrame
1866
- daily_df = pd.DataFrame(daily_records)
1867
-
1868
- # Extend dataframe to include the current data if needed
1869
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
1870
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
1871
- q = pd.Series(datelist, name="Date")
1872
- s = pd.DataFrame(extended_data, columns=list(df_M.columns[1:]))
1873
- extended_daily_df = pd.concat([q, s], axis=1)
1874
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
1875
-
1876
- # Create a week commencing column
1877
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
1878
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
1879
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1880
- extended_daily_df.drop("Date", axis=1, inplace=True)
1881
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
1882
-
1883
- # Take a weekly average
1884
- macroeconomic_weekly_df_M = extended_daily_df.groupby('Date').mean()
1885
-
1886
- # For quarterly data
1887
- # If there are quarterly datasets
1888
- if all_dfs_list_Q != []:
1889
- macroeconomic_monthly_df_Q['Date'] = macroeconomic_monthly_df_Q.index
1890
- df_Q = macroeconomic_monthly_df_Q.set_index(pd.to_datetime(macroeconomic_monthly_df_Q['Date'])).drop(
1891
- columns='Date')
1892
- df_Q.fillna(method="ffill", inplace=True)
1893
- df_Q.reset_index(inplace=True)
1894
-
1895
- daily_records = []
1896
- for _, row in df_Q.iterrows():
1897
- year = row["Date"].year
1898
- month = row["Date"].month
1899
- day = row["Date"].day
1900
- last_date = get_last_day_of_the_quarter(datetime(year, month, day).date())
1901
- all_days = pd.date_range(row["Date"], last_date, freq="D")
1902
-
1903
- # Create a new record for each day of the quarter
1904
- for day in all_days:
1905
- daily_row = row.copy()
1906
- daily_row["Date"] = row["Date"].replace(day=day.day, month=day.month)
1907
- daily_records.append(daily_row)
1908
-
1909
- # Convert the list of daily records into a DataFrame
1910
- daily_df = pd.DataFrame(daily_records)
1911
-
1912
- # Extend dataframe to include data up to today
1913
- datelist = pd.date_range(daily_df["Date"].iloc[-1] + pd.Timedelta(days=1), datetime.today()).tolist()
1914
- extended_data = np.repeat([list(daily_df.iloc[-1, 1:].values)], len(datelist), axis=0)
1915
- q = pd.Series(datelist, name="Date")
1916
- s = pd.DataFrame(extended_data, columns=list(df_Q.columns[1:]))
1917
- extended_daily_df = pd.concat([q, s], axis=1)
1918
- extended_daily_df = pd.concat([daily_df, extended_daily_df], ignore_index=False)
1919
-
1920
- # Create a week commencing column
1921
- extended_daily_df["Date"] = pd.to_datetime(extended_daily_df["Date"], format='%d %b %Y')
1922
- extended_daily_df['week_start'] = extended_daily_df["Date"].apply(
1923
- lambda x: x - pd.Timedelta(days=(x.weekday() - day_dict[week_commencing]) % 7))
1924
- extended_daily_df.drop("Date", axis=1, inplace=True)
1925
- extended_daily_df.rename(columns={'week_start': "Date"}, inplace=True)
1926
-
1927
- # Take a weekly average
1928
- macroeconomic_weekly_df_Q = extended_daily_df.groupby('Date').mean()
1929
-
1930
- # Merge the two datasets together
1931
- if all_dfs_list_Q != []:
1932
- macroeconomic_weekly_df = macroeconomic_weekly_df_M.merge(macroeconomic_weekly_df_Q, left_index=True,
1933
- right_index=True)
1934
- # If there are no quarterly datasets
1598
+ Returns:
1599
+ pd.DataFrame: A DataFrame with weekly aggregated OECD data. The 'OBS' column contains the week
1600
+ commencing dates, and other columns contain the aggregated time series values.
1601
+ """
1602
+
1603
+ def parse_quarter(date_str):
1604
+ """Parses a string in 'YYYY-Q#' format into a datetime object."""
1605
+ year, quarter = date_str.split('-')
1606
+ quarter_number = int(quarter[1])
1607
+ month = (quarter_number - 1) * 3 + 1
1608
+ return pd.Timestamp(f"{year}-{month:02d}-01")
1609
+
1610
+ # Generate a date range from 1950-01-01 to today
1611
+ date_range = pd.date_range(start=start_date, end=datetime.today(), freq='D')
1612
+
1613
+ url_details = [
1614
+ ["BCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_business_confidence_index"],
1615
+ ["CCICP", "SDD.STES,DSD_STES@DF_CLI,", ".....", "macro_consumer_confidence_index"],
1616
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA._T.N.GY", "macro_cpi_total"],
1617
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP041T043.N.GY", "macro_cpi_housing"],
1618
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP01.N.GY", "macro_cpi_food"],
1619
+ ["N.CPI", "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,", "PA.CP045_0722.N.GY", "macro_cpi_energy"],
1620
+ ["UNE_LF_M", "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,", "._Z.Y._T.Y_GE15.", "macro_unemployment_rate"],
1621
+ ["EAR", "SDD.TPS,DSD_EAR@DF_HOU_EAR,", ".Y..S1D", "macro_private_hourly_earnings"],
1622
+ ["RHP", "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0", "", "macro_real_house_prices"],
1623
+ ["PRVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX.C..", "macro_manufacturing_production_volume"],
1624
+ ["TOVM", "SDD.STES,DSD_KEI@DF_KEI,4.0", "IX...", "macro_retail_trade_volume"],
1625
+ ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
1626
+ ["IRLT", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_long_term_interest_rate"],
1627
+ ["B1GQ", "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1", "._Z....GY.T0102", "macro_gdp_growth_yoy"]
1628
+ ]
1629
+
1630
+ # Create empty final dataframe
1631
+ oecd_df_final = pd.DataFrame()
1632
+
1633
+ daily_df = pd.DataFrame({'OBS': date_range})
1634
+ value_columns = []
1635
+
1636
+ # Iterate for each variable of interest
1637
+ for series_details in url_details:
1638
+ series = series_details[0]
1639
+ dataset_id = series_details[1]
1640
+ filter = series_details[2]
1641
+ col_name = series_details[3]
1642
+
1643
+ # check if request was successful and determine the most granular data available
1644
+ for freq in ['M', 'Q', 'A']:
1645
+
1646
+ if series in ["UNE_LF_M", "EAR"]:
1647
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
1648
+ elif series in ["B1GQ"]:
1649
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
1650
+ else:
1651
+ data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
1652
+
1653
+ # Make the request to the OECD API for data
1654
+ data_response = requests.get(data_url)
1655
+
1656
+ # Check if the request was successful
1657
+ if data_response.status_code != 200:
1658
+ print(f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}")
1659
+ url_test = False
1660
+ continue
1661
+ else:
1662
+ url_test = True
1663
+ break
1664
+
1665
+ # get data for the next variable if url doesn't exist
1666
+ if url_test == False:
1667
+ continue
1668
+
1669
+ root = ET.fromstring(data_response.content)
1670
+
1671
+ # Define namespaces if necessary (the namespace is included in the tags)
1672
+ namespaces = {'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic'}
1673
+
1674
+ # Lists to store the data
1675
+ dates = []
1676
+ values = []
1677
+
1678
+ # Iterate over all <Obs> elements and extract date and value
1679
+ for obs in root.findall('.//generic:Obs', namespaces):
1680
+
1681
+ # Extracting the time period (date)
1682
+ time_period = obs.find('.//generic:ObsDimension', namespaces).get('value')
1683
+
1684
+ # Extracting the observation value
1685
+ value = obs.find('.//generic:ObsValue', namespaces).get('value')
1686
+
1687
+ # Storing the data
1688
+ if time_period and value:
1689
+ dates.append(time_period)
1690
+ values.append(float(value)) # Convert value to float
1691
+
1692
+ # Add variable names that were found to a list
1693
+ value_columns.append(col_name)
1694
+
1695
+ # Creating a DataFrame
1696
+ data = pd.DataFrame({'OBS': dates, col_name: values})
1697
+
1698
+ # Convert date strings into datetime format
1699
+ if freq == 'Q':
1700
+ data['OBS'] = data['OBS'].apply(parse_quarter)
1935
1701
  else:
1936
- macroeconomic_weekly_df = macroeconomic_weekly_df_M
1702
+ # Display the DataFrame
1703
+ data['OBS'] = data['OBS'].apply(lambda x: datetime.strptime(x, '%Y-%m'))
1704
+
1705
+ # Sort data by chronological order
1706
+ data.sort_values(by='OBS', inplace=True)
1707
+
1708
+ # Merge the data based on the observation date
1709
+ daily_df = pd.merge_asof(daily_df, data[['OBS', col_name]], on='OBS', direction='backward')
1937
1710
 
1938
- # Change datetime format
1939
- macroeconomic_weekly_df.index = macroeconomic_weekly_df.index.strftime('%d/%m/%Y')
1940
1711
 
1941
- macroeconomic_weekly_df.reset_index()
1942
- macroeconomic_weekly_df.reset_index(drop=False, inplace=True)
1943
- macroeconomic_weekly_df.rename(columns={'Date': 'OBS'}, inplace=True)
1712
+ # Ensure columns are numeric
1713
+ for col in value_columns:
1714
+ if col in daily_df.columns:
1715
+ daily_df[col] = pd.to_numeric(daily_df[col], errors='coerce').fillna(0)
1716
+ else:
1717
+ print(f"Column {col} not found in daily_df")
1718
+
1719
+ # Aggregate results by week
1720
+ country_df = ims_proc.aggregate_daily_to_wc_wide(df=daily_df,
1721
+ date_column="OBS",
1722
+ group_columns=[],
1723
+ sum_columns=value_columns,
1724
+ wc=week_commencing,
1725
+ aggregation="average")
1726
+
1727
+ oecd_df_final = pd.concat([oecd_df_final, country_df], axis=0, ignore_index=True)
1944
1728
 
1945
- return macroeconomic_weekly_df
1729
+ return oecd_df_final
1946
1730
 
1947
1731
  def get_google_mobility_data(self, country: str, wc: str) -> pd.DataFrame:
1948
1732
  """
@@ -2709,4 +2493,37 @@ class datapull:
2709
2493
 
2710
2494
  final_weather = ims_proc.rename_cols(merged_df, 'seas_')
2711
2495
 
2712
- return final_weather
2496
+ return final_weather
2497
+
2498
+ def pull_covid_data(self, folder_path: str, country: str = "GB", week_commencing: str = "mon") -> pd.DataFrame:
2499
+ """
2500
+ Get covid pandemic data for the country of interest.
2501
+
2502
+ Args:
2503
+ folder_path (str): A string containing the local location of the OneDrive folder.
2504
+ Example: "C:/Users/-- username --/OneDrive - im-sciences.com"
2505
+ The file location within the MasterDrive of the worldwide covid data is:
2506
+ MasterDrive/Central Database/Covid/oxford-government-response.csv
2507
+ country (str): A string containing the country of interest (E.g: "GB", "FR")
2508
+ week_commencing (str): The starting day of the week for aggregation.
2509
+ Options are "mon", "tue", "wed", "thur", "fri", "sat", "sun".
2510
+
2511
+ Returns:
2512
+ pd.DataFrame: A DataFrame containing weekly aggregated COVID government response data for the country of interest.
2513
+ The 'OBS' column contains the week commencing dates.
2514
+ """
2515
+
2516
+ df = pd.read_csv(f'{folder_path}/MasterDrive/Central Database/Covid/oxford-government-response.csv')
2517
+
2518
+ country_df = df[df['location_key']==country]
2519
+ country_df.rename(columns={'date': 'OBS'}, inplace=True)
2520
+ country_df.drop('location_key', axis=1, inplace=True)
2521
+
2522
+ agg_df = ims_proc.aggregate_daily_to_wc_wide(country_df, 'OBS', [], country_df.columns.to_list(), week_commencing, 'average')
2523
+
2524
+ covid_df = ims_proc.rename_cols(agg_df, 'covid_')
2525
+
2526
+ covid_df['OBS'] = covid_df['OBS'].apply(lambda x: x[0].date())
2527
+
2528
+ return covid_df
2529
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: imsciences
3
- Version: 0.6.1.2
3
+ Version: 0.6.1.4
4
4
  Summary: IMS Data Processing Package
5
5
  Author: IMS
6
6
  Author-email: cam@im-sciences.com
@@ -8,7 +8,7 @@ def read_md(file_name):
8
8
  return f.read()
9
9
  return ''
10
10
 
11
- VERSION = '0.6.1.2'
11
+ VERSION = '0.6.1.4'
12
12
  DESCRIPTION = 'IMS Data Processing Package'
13
13
  LONG_DESCRIPTION = read_md('README.md') # Reading from README.md
14
14
 
File without changes
File without changes