imsciences 1.0.2__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/pull.py CHANGED
@@ -1,4 +1,5 @@
  import importlib
+ import os
  import re
  import time
  import urllib.request
@@ -19,7 +20,6 @@ from imsciences.mmm import dataprocessing
 
  ims_proc = dataprocessing()
 
-
  class datapull:
      def help(self):
          print("This is the help section. The functions in the package are as follows:")
@@ -281,10 +281,16 @@ class datapull:
          start_date: str = "2020-01-01",
      ) -> pd.DataFrame:
          """
-         Fetch and process time series data from the OECD API.
+         Load and process time series data from the cached OECD data file.
+
+         This method loads pre-fetched OECD data from either:
+         1. Shared network path (if accessible)
+         2. Local cache directory (fallback)
+
+         If the cache doesn't exist anywhere, it automatically runs the OECDDataPuller to generate it.
 
          Args:
-             country (list): A string containing a 3-letter code the of country of interest (E.g: "GBR", "FRA", "USA", "DEU")
+             country (str): A string containing the 3-letter code of the country of interest (e.g. "GBR", "FRA", "USA", "DEU")
              week_commencing (str): The starting day of the week for aggregation.
                  Options are "mon", "tue", "wed", "thu", "fri", "sat", "sun".
              start_date (str): Dataset start date in the format "YYYY-MM-DD"
@@ -294,196 +300,59 @@ class datapull:
                  commencing dates, and other columns contain the aggregated time series values.
 
          """
-
-         def parse_quarter(date_str):
-             """Parses a string in 'YYYY-Q#' format into a datetime object."""
-             year, quarter = date_str.split("-")
-             quarter_number = int(quarter[1])
-             month = (quarter_number - 1) * 3 + 1
-             return pd.Timestamp(f"{year}-{month:02d}-01")
-
-         # Generate a date range from 1950-01-01 to today
-         date_range = pd.date_range(start=start_date, end=datetime.today(), freq="D")
-
-         url_details = [
-             [
-                 "BCICP",
-                 "SDD.STES,DSD_STES@DF_CLI,",
-                 ".....",
-                 "macro_business_confidence_index",
-             ],
-             [
-                 "CCICP",
-                 "SDD.STES,DSD_STES@DF_CLI,",
-                 ".....",
-                 "macro_consumer_confidence_index",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA._T.N.GY",
-                 "macro_cpi_total",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP041T043.N.GY",
-                 "macro_cpi_housing",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP01.N.GY",
-                 "macro_cpi_food",
-             ],
-             [
-                 "N.CPI",
-                 "SDD.TPS,DSD_PRICES@DF_PRICES_ALL,",
-                 "PA.CP045_0722.N.GY",
-                 "macro_cpi_energy",
-             ],
-             [
-                 "UNE_LF_M",
-                 "SDD.TPS,DSD_LFS@DF_IALFS_UNE_M,",
-                 "._Z.Y._T.Y_GE15.",
-                 "macro_unemployment_rate",
-             ],
-             [
-                 "EAR",
-                 "SDD.TPS,DSD_EAR@DF_HOU_EAR,",
-                 ".Y..S1D",
-                 "macro_private_hourly_earnings",
-             ],
-             [
-                 "RHP",
-                 "ECO.MPD,DSD_AN_HOUSE_PRICES@DF_HOUSE_PRICES,1.0",
-                 "",
-                 "macro_real_house_prices",
-             ],
-             [
-                 "PRVM",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "IX.C..",
-                 "macro_manufacturing_production_volume",
-             ],
-             [
-                 "TOVM",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "IX...",
-                 "macro_retail_trade_volume",
-             ],
-             ["IRSTCI", "SDD.STES,DSD_KEI@DF_KEI,4.0", "PA...", "macro_interbank_rate"],
-             [
-                 "IRLT",
-                 "SDD.STES,DSD_KEI@DF_KEI,4.0",
-                 "PA...",
-                 "macro_long_term_interest_rate",
-             ],
-             [
-                 "B1GQ",
-                 "SDD.NAD,DSD_NAMAIN1@DF_QNA,1.1",
-                 "._Z....GY.T0102",
-                 "macro_gdp_growth_yoy",
-             ],
-         ]
-
-         # Create empty final dataframe
-         oecd_df_final = pd.DataFrame()
-
-         daily_df = pd.DataFrame({"OBS": date_range})
-         value_columns = []
-
-         # Iterate for each variable of interest
-         for series_details in url_details:
-             series = series_details[0]
-             dataset_id = series_details[1]
-             filter = series_details[2]
-             col_name = series_details[3]
-
-             # check if request was successful and determine the most granular data available
-             for freq in ["M", "Q", "A"]:
-                 if series in ["UNE_LF_M", "EAR"]:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{series}.{filter}.{freq}?startPeriod=1950-01"
-                 elif series in ["B1GQ"]:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{freq}..{country}...{series}.{filter}?startPeriod=1950-01"
-                 else:
-                     data_url = f"https://sdmx.oecd.org/public/rest/data/OECD.{dataset_id}/{country}.{freq}.{series}.{filter}?startPeriod=1950-01"
-
-                 # Make the request to the OECD API for data
-                 data_response = requests.get(data_url)
-
-                 # Check if the request was successful
-                 if data_response.status_code != 200:
-                     print(
-                         f"Failed to fetch data for series {series} with frequency '{freq}' for {country}: {data_response.status_code} {data_response.text}",
-                     )
-                     url_test = False
-                     continue
-                 url_test = True
-                 break
-
-             # get data for the next variable if url doesn't exist
-             if url_test is False:
-                 continue
-
-             root = ET.fromstring(data_response.content)
-
-             # Define namespaces if necessary (the namespace is included in the tags)
-             namespaces = {
-                 "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
-             }
-
-             # Lists to store the data
-             dates = []
-             values = []
-
-             # Iterate over all <Obs> elements and extract date and value
-             for obs in root.findall(".//generic:Obs", namespaces):
-                 # Extracting the time period (date)
-                 time_period = obs.find(".//generic:ObsDimension", namespaces).get(
-                     "value",
-                 )
-
-                 # Extracting the observation value
-                 value = obs.find(".//generic:ObsValue", namespaces).get("value")
-
-                 # Storing the data
-                 if time_period and value:
-                     dates.append(time_period)
-                     values.append(float(value))  # Convert value to float
-
-             # Add variable names that were found to a list
-             value_columns.append(col_name)
-
-             # Creating a DataFrame
-             data = pd.DataFrame({"OBS": dates, col_name: values})
-
-             # Convert date strings into datetime format
-             if freq == "Q":
-                 data["OBS"] = data["OBS"].apply(parse_quarter)
+         from pathlib import Path
+
+         # Try shared network path first, then fall back to local
+         user_home = os.path.expanduser("~")
+         shared_path = Path(user_home) / "im-sciences.com" / "FileShare - MasterDrive" / "Central Database" / "Pull All" / "OECD Database"
+         local_path = Path("oecd_data")
+         shared_data_file = shared_path / f"oecd_data_{country}.csv"
+         local_data_file = local_path / f"oecd_data_{country}.csv"
+
+         data_file = None
+         data_location = None
+
+         # Check shared path first
+         if shared_data_file.exists():
+             data_file = shared_data_file
+             data_location = "shared network"
+         # Fall back to local path
+         elif local_data_file.exists():
+             data_file = local_data_file
+             data_location = "local"
+
+         # If no cache found anywhere, run the puller
+         if data_file is None:
+             print(f"OECD data cache not found. Running OECDDataPuller to fetch data...")
+             from .oecd_pull import OECDDataPuller
+             puller = OECDDataPuller(
+                 country=country,
+                 start_date=start_date,
+                 output_dir=None  # Let puller decide between shared/local
+             )
+             puller.run_until_complete(max_iterations=1)  # Run one complete cycle
+             print(f"OECD data fetched and cached successfully.\n")
+
+             # Determine where it was saved
+             if shared_data_file.exists():
+                 data_file = shared_data_file
+                 data_location = "shared network"
              else:
-                 # Display the DataFrame
-                 data["OBS"] = data["OBS"].apply(lambda x: datetime.strptime(x, "%Y-%m"))
+                 data_file = local_data_file
+                 data_location = "local"
 
-             # Sort data by chronological order
-             data.sort_values(by="OBS", inplace=True)
+         print(f"Loading OECD data from {data_location}: {data_file}")
+         daily_df = pd.read_csv(data_file)
+         daily_df['OBS'] = pd.to_datetime(daily_df['OBS'])
 
-             # Merge the data based on the observation date
-             daily_df = pd.merge_asof(
-                 daily_df,
-                 data[["OBS", col_name]],
-                 on="OBS",
-                 direction="backward",
-             )
+         # Get list of value columns (exclude OBS)
+         value_columns = [col for col in daily_df.columns if col != "OBS"]
 
          # Ensure columns are numeric
          for col in value_columns:
-             if col in daily_df.columns:
-                 daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
-             else:
-                 print(f"Column {col} not found in daily_df")
+             daily_df[col] = pd.to_numeric(daily_df[col], errors="coerce").fillna(0)
 
-         # Aggregate results by week
+         # Aggregate to weekly
          country_df = ims_proc.aggregate_daily_to_wc_wide(
              df=daily_df,
              date_column="OBS",
@@ -493,11 +362,7 @@ class datapull:
              aggregation="average",
          )
 
-         oecd_df_final = pd.concat(
-             [oecd_df_final, country_df],
-             axis=0,
-             ignore_index=True,
-         )
+         return country_df
 
          return oecd_df_final
 
@@ -909,6 +774,12 @@ class datapull:
              "NG": "Nigeria",
              "ST": "SaoTomeAndPrincipe",
          }
+         # Month order list
+         month_order = [
+             "january", "february", "march", "april",
+             "may", "june", "july", "august",
+             "september", "october", "november", "december",
+         ]
 
          # ---------------------------------------------------------------------
          # 1. Create daily date range from start_date to today
@@ -1124,6 +995,9 @@ class datapull:
              holiday_date = row["Date"]
              # Create column name without modifying original holiday names
              holiday_name = row["Holiday"].lower().replace(" ", "_")
+
+             # Remove all non-alphanumeric characters (except underscores) to keep commas and apostrophes out of column names
+             holiday_name = re.sub(r"[^\w]", "", holiday_name)
 
              # Remove "_shift" or "_substitute" if they appear as standalone suffixes
              if holiday_name.endswith("_shift"):
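The added re.sub call strips punctuation that survives the lowercase/underscore step, so holiday titles with full stops or apostrophes cannot leak stray characters into column names. A quick illustration with an invented holiday title:

```python
import re

# "St. Patrick's Day" (illustrative) -> lowercase + underscores -> strip punctuation
holiday_name = "St. Patrick's Day".lower().replace(" ", "_")   # "st._patrick's_day"
holiday_name = re.sub(r"[^\w]", "", holiday_name)              # drops "." and "'"
print(holiday_name)  # st_patricks_day
```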
@@ -1349,10 +1223,15 @@ class datapull:
          df_weekly_iso_week_year["Year"] = df_weekly_iso_week_year["Year"].astype(int)
 
          # --- Monthly dummies (spread evenly across week) ---
-         df_daily["Month"] = df_daily["Date"].dt.month_name().str.lower()
+
+         df_daily["Month"] = pd.Categorical(
+             df_daily["Date"].dt.month_name().str.lower(),
+             categories=month_order,
+             ordered=True,
+         )
          df_monthly_dummies_daily = pd.get_dummies(
              df_daily[["week_start", "Month"]],  # Only need these columns
-             prefix="seas_month",
+             prefix="seas",
              columns=["Month"],
              dtype=float,  # Use float for division
          )
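Building the Month column as an ordered Categorical over month_order is what keeps the seasonal dummies in calendar order: pd.get_dummies emits one column per category, in category order, instead of sorting the observed month names alphabetically. A minimal standalone sketch:

```python
import pandas as pd

month_order = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
months = pd.Categorical(
    ["march", "january", "december"], categories=month_order, ordered=True
)
# One dummy column per category, in calendar order, even for unseen months.
print(pd.get_dummies(months, prefix="seas").columns.tolist())
# ['seas_january', 'seas_february', ..., 'seas_december']
```

This presumably is also why the later hunk drops the sorted() call around the seas_ columns: the order coming out of get_dummies is already the intended calendar order.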
@@ -1435,7 +1314,7 @@ class datapull:
          # Reorder columns - OBS first, then Constant, Trend, then seasonal features
          cols_order = (
              ["OBS", "Constant", "Trend"]
-             + sorted([col for col in df_combined.columns if col.startswith("seas_")])
+             + [col for col in df_combined.columns if col.startswith("seas_")]
              + sorted([col for col in df_combined.columns if col.startswith("dum_")])
          )  # If individual week dummies were enabled
 
@@ -1444,7 +1323,6 @@ class datapull:
          df_combined = df_combined[final_cols]
 
          return df_combined
-
      def pull_weather(self, week_commencing, start_date, country_codes) -> pd.DataFrame:
          """
          Pull weather data for a given week-commencing day and one or more country codes.
@@ -2397,13 +2275,13 @@ class datapull:
              cdid_list (list, optional): A list of additional CDIDs to fetch (e.g., ['JP9Z', 'UKPOP']). Defaults to None.
              week_start_day (str, optional): The day the week starts on ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'). Defaults to 'mon'.
              sector (str or list, optional): The sector(s) for which the standard CDIDs are fetched
-             (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
+                 (e.g., 'fast_food', ['fast_food', 'retail']). Defaults to None (only default CDIDs).
 
          Returns
          -------
          pd.DataFrame: A DataFrame with weekly frequency, containing an 'OBS' column (week commencing date)
-         and all series as renamed columns (e.g., 'macro_retail_sales_uk').
-         Returns an empty DataFrame if no data is fetched or processed.
+             and all series as renamed columns (e.g., 'macro_retail_sales_uk').
+             Returns an empty DataFrame if no data is fetched or processed.
 
          """
          # Define CDIDs for sectors and defaults
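The next hunk swaps set() for dict.fromkeys() when assembling the CDID list. Both de-duplicate, but dict.fromkeys() keeps first-seen order, so the CDIDs are requested in a stable order from run to run. A short sketch (JP9Z and UKPOP come from the docstring above; "ABCD" is a placeholder):

```python
cdids = ["JP9Z", "UKPOP", "JP9Z", "ABCD"]
print(list(dict.fromkeys(cdids)))  # ['JP9Z', 'UKPOP', 'ABCD'] - duplicates dropped, order kept
print(list(set(cdids)))            # same items, arbitrary order
```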
@@ -2436,16 +2314,11 @@ class datapull:
                  sector_cdids_map.get(sec, []),
              )  # Use extend to add items from the list
 
-         standard_cdids = list(
-             set(default_cdids + sector_specific_cdids),
-         )  # Combine default and selected sector CDIDs, ensure uniqueness
-
          # Combine standard CDIDs and any additional user-provided CDIDs
+         standard_cdids = list(dict.fromkeys(default_cdids + sector_specific_cdids))
          if cdid_list is None:
              cdid_list = []
-         final_cdid_list = list(
-             set(standard_cdids + cdid_list),
-         )  # Ensure uniqueness in the final list
+         final_cdid_list = list(dict.fromkeys(standard_cdids + cdid_list))
 
          base_search_url = (
              "https://api.beta.ons.gov.uk/v1/search?content_type=timeseries&cdids="
@@ -2670,26 +2543,59 @@ class datapull:
          )
 
          def clean_column_name(name):
-             # Remove content within parentheses (e.g., CPI INDEX 00: ALL ITEMS 2015=100)
+             # Remove content within parentheses
              name = re.sub(r"\(.*?\)", "", name)
-             # Take only the part before the first colon if present
-             name = re.split(r":", name)[0]
-             # Remove digits
-             # name = re.sub(r"\d+", "", name)  # Reconsider removing all digits, might be needed for some series
-             # Remove specific words like 'annual', 'rate' case-insensitively
-             name = re.sub(r"\b(annual|rate)\b", "", name, flags=re.IGNORECASE)
+
+             # Special handling for ANY CPI items (not just CPI INDEX)
+             if "CPI" in name.upper():
+                 # Extract the description part after the colon for CPI items
+                 if ":" in name:
+                     parts = name.split(":")
+                     if len(parts) >= 2:
+                         # Take the description part (usually the second part)
+                         description = parts[1].strip()
+                         # Remove any remaining colons and everything after
+                         description = description.split(":")[0].strip()
+                         name = f"CPI {description}"
+
+                 # Remove numbers and dots for ALL CPI items (like 00, 06.2.2, 12.5.3/5)
+                 name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)
+
+             else:
+                 # For non-CPI items, take only the part before the first colon
+                 name = re.split(r":", name)[0]
+                 # Remove all digits for non-CPI items too
+                 name = re.sub(r"\d+", "", name)
+
+             # Remove year references like "2015=100"
+             name = re.sub(r"\d{4}=\d+", "", name)
+
+             # Remove specific words case-insensitively
+             name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
+
+             # Replace "%" with the word "percent"
+             name = re.sub(r"%", "percent", name)
+
              # Remove non-alphanumeric characters (except underscore and space)
              name = re.sub(r"[^\w\s]", "", name)
+
              # Replace spaces with underscores
-             name = name.strip()  # Remove leading/trailing whitespace
-             name = name.replace(" ", "_")
+             name = name.strip().replace(" ", "_")
+
              # Replace multiple underscores with a single one
              name = re.sub(r"_+", "_", name)
-             # Remove trailing underscores
-             name = name.rstrip("_")
-             # Add prefix and suffix
+
+             # Remove leading/trailing underscores
+             name = name.strip("_")
+
+             # Truncate very long names (optional)
+             if len(name) > 50:
+                 words = name.split("_")
+                 # Keep first few meaningful words
+                 name = "_".join(words[:4])
+
              return f"macro_{name.lower()}_uk"
-
+
          # Apply cleaning function to relevant columns
          weekly_df.columns = [
              clean_column_name(col) if col != "week_commencing" else col
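To make the new cleaning rules concrete, here is a condensed, illustrative trace of the CPI branch on an invented ONS-style title; real titles come back from the ONS API, so exact outputs may differ:

```python
import re

name = "CPI INDEX 06.2.2: Out-patient services 2015=100"  # invented example title
name = re.sub(r"\(.*?\)", "", name)                        # no parentheses here
if "CPI" in name.upper() and ":" in name:
    name = f"CPI {name.split(':')[1].strip()}"             # keep the description part
    name = re.sub(r"\d+\.?\d*/?\.?\d*", "", name)          # strip codes like 06.2.2, 2015, 100
name = re.sub(r"\b(annual|rate|index|seasonally|adjusted|sa|cvm)\b", "", name, flags=re.IGNORECASE)
name = re.sub(r"[^\w\s]", "", name)                        # drop "=", "-" and similar
name = re.sub(r"_+", "_", name.strip().replace(" ", "_")).strip("_")
print(f"macro_{name.lower()}_uk")                          # macro_cpi_outpatient_services_uk
```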
@@ -2704,6 +2610,19 @@ class datapull:
              # Consider if 0 is the appropriate fill value for your use case
              # weekly_df = weekly_df.fillna(0)
 
+             # Get only the data columns (excluding OBS)
+             data_columns = [col for col in weekly_df.columns if col != "OBS"]
+
+             new_columns = ["OBS"]
+             for i, col in enumerate(data_columns):
+                 if i < len(final_cdid_list):
+                     new_columns.append(f"{col}_{final_cdid_list[i]}")
+                 else:
+                     new_columns.append(col)  # Keep original if no matching CDID
+
+             # Apply the new column names to the DataFrame
+             weekly_df.columns = new_columns
+
              return weekly_df
          print("No data successfully fetched or processed.")
          return pd.DataFrame()
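The last added block tags each cleaned column with a CDID by position, which relies on weekly_df's data columns arriving in the same order as final_cdid_list. A self-contained sketch with invented column names and the two CDIDs reused from the docstring example:

```python
import pandas as pd

weekly_df = pd.DataFrame({
    "OBS": pd.to_datetime(["2024-01-01", "2024-01-08"]),
    "macro_series_one_uk": [100.2, 100.4],   # invented cleaned names
    "macro_series_two_uk": [67.0, 67.1],
})
final_cdid_list = ["JP9Z", "UKPOP"]          # illustrative, order-matched CDIDs

data_columns = [col for col in weekly_df.columns if col != "OBS"]
weekly_df.columns = ["OBS"] + [
    f"{col}_{final_cdid_list[i]}" if i < len(final_cdid_list) else col
    for i, col in enumerate(data_columns)
]
print(weekly_df.columns.tolist())
# ['OBS', 'macro_series_one_uk_JP9Z', 'macro_series_two_uk_UKPOP']
```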