pydeflate-2.1.3-py3-none-any.whl → pydeflate-2.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydeflate/sources/dac.py CHANGED
@@ -1,71 +1,58 @@
+from __future__ import annotations
+
 from pathlib import Path
 
 import pandas as pd
 from oda_reader import download_dac1
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 
 
-def _find_dac_files_in_path(path: Path) -> list:
-    """Find all DAC parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for DAC parquet files.
-
-    Returns:
-        list: List of DAC parquet files found in the directory.
-    """
-    return list(path.glob("dac_*.parquet"))
-
-
 def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert DAC values (in million) to units.
-
-    Args:
-        df (pd.DataFrame): Dataframe with raw observation values.
+    """Scale reported DAC values (supplied in millions) into base units."""
 
-    Returns:
-        pd.DataFrame: Dataframe with scaled observation values.
-    """
     df = df.copy()
     df["value"] = df["value"] * df["unit_multiplier"]
     return df
 
 
 def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+    """Retain rows matching the official DAC definition across regime changes."""
+
     query = (
         "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
         "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
     )
-
     return df.query(query)
 
 
 def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-    columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+    """Select the key columns used downstream in pydeflate."""
 
-    return df.filter(columns)
+    return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])
 
 
 def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+    """Pivot amount-type codes into separate columns (A/N/D)."""
+
     df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
     return df.pivot(
-        index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+        index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
         columns="amounttype_code",
         values="value",
     ).reset_index()
 
 
 def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-    # The values for certain providers should be 1
+    """Derive exchange rates, forcing DAC aggregates to unity."""
+
     df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
         lambda d: d.donor_code >= 20000, "A"
     ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate the published DAC price deflator from amounts A/D."""
+
     df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
     return df
 
 
 def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+    """Back out a GDP-style deflator using the exchange deflator."""
 
+    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
     return df
 
 
 def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-    return df.rename(
-        columns={
-            "donor_code": "entity_code",
-            "donor_name": "entity",
-        }
-    )
+    """Align donor metadata with pydeflate naming conventions."""
 
+    return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})
 
-def download_dac():
-    # Use oda_reader to get the data
+
+def _download_dac(output_path: Path) -> None:
+    """Download and cache the DAC statistics parquet file."""
+
+    logger.info("Downloading DAC statistics from ODA reader...")
     df = download_dac1(
         filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
     )
-
-    # Clean the data
     df = (
         df.pipe(_to_units)
         .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved DAC dataset to %s", output_path)
 
-    # Get today's date to use as a file suffix
-    suffix = today()
 
-    # Save the data
-    df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+_DAC_ENTRY = CacheEntry(
+    key="dac_stats",
+    filename="dac.parquet",
+    fetcher=_download_dac,
+    ttl_days=30,
+)
 
 
 def read_dac(update: bool = False) -> pd.DataFrame:
-    """Read the latest WEO data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_dac_files_in_path,
-        download_func=download_dac,
-        data_name="DAC",
-        update=update,
-    )
+    path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    df = read_dac(update=True)
+if __name__ == "__main__":  # pragma: no cover
+    read_dac(update=True)
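Note: the pydeflate.cache module that the rewritten sources import is not included in this diff. Judging only from the call sites — CacheEntry(key=..., filename=..., fetcher=..., ttl_days=...) and cache_manager().ensure(entry, refresh=update) returning a Path — a minimal sketch of the implied contract could look like the following. Every name and the cache-directory choice below are illustrative assumptions, not the package's actual implementation:

    # Hypothetical sketch of the cache contract implied by the call sites;
    # the real pydeflate.cache module may differ.
    import time
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Callable

    @dataclass(frozen=True)
    class CacheEntry:
        key: str                         # stable identifier for the dataset
        filename: str                    # file name inside the cache directory
        fetcher: Callable[[Path], None]  # writes the dataset to the given path
        ttl_days: int = 30               # consider the file stale after this many days

    class _CacheManager:
        def __init__(self, directory: Path):
            self.directory = directory

        def ensure(self, entry: CacheEntry, refresh: bool = False) -> Path:
            """Return a fresh cached file, invoking the fetcher when needed."""
            path = self.directory / entry.filename
            stale = (
                refresh
                or not path.exists()
                or time.time() - path.stat().st_mtime > entry.ttl_days * 86_400
            )
            if stale:
                entry.fetcher(path)  # e.g. _download_dac(path)
            return path

    def cache_manager() -> _CacheManager:
        # The real package presumably resolves a per-user cache directory.
        return _CacheManager(Path.home() / ".pydeflate")

Under this reading, read_dac(update=True) simply forces the fetcher to run again; otherwise the single parquet file is reused until its TTL lapses, replacing the old date-suffixed dac_*.parquet scheme.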
pydeflate/sources/imf.py CHANGED
@@ -1,15 +1,16 @@
+from __future__ import annotations
+
 from pathlib import Path
 
 import pandas as pd
 from imf_reader import weo
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 
@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-    """Pivot the concept code column to get a wide format for the data.
+    """Pivot the concept dimension so each indicator becomes a column
 
     Args:
         df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame: Dataframe with concept code pivoted to columns.
     """
     return df.pivot(
-        index=[c for c in df.columns if c not in ["concept_code", "value"]],
+        index=[c for c in df.columns if c not in {"concept_code", "value"}],
         columns="concept_code",
         values="value",
     ).reset_index()
@@ -171,15 +172,13 @@ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
     df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
         df.entity_code == 998, "year"
     ].map(eur)
-
     return df
 
 
-def download_weo() -> None:
-    """Download the WEO data, process it, and save it to a parquet file."""
-    logger.info("Downloading the latest WEO data...")
+def _download_weo(output_path: Path) -> None:
+    """Fetch, transform, and store the latest WEO dataset in Parquet format."""
 
-    # Fetch and process the data through a pipeline of transformations
+    logger.info("Downloading the latest IMF WEO dataset...")
     df = (
         weo.fetch_data()
         .pipe(_filter_indicators)
@@ -195,38 +194,23 @@ def download_weo() -> None:
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-    # Get today's date to use as a file suffix
-    suffix = today()
-
-    # Save the processed dataframe to parquet format
-    df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-    logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved WEO data to %s", output_path)
 
 
-def _find_weo_files_in_path(path: Path) -> list:
-    """Find all WEO parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WEO parquet files.
-
-    Returns:
-        list: List of WEO parquet files found in the directory.
-    """
-    return list(path.glob("weo_*.parquet"))
+_IMF_CACHE_ENTRY = CacheEntry(
+    key="imf_weo",
+    filename="imf_weo.parquet",
+    fetcher=_download_weo,
+    ttl_days=60,
+)
 
 
 def read_weo(update: bool = False) -> pd.DataFrame:
-    """Read the latest WEO data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_weo_files_in_path,
-        download_func=download_weo,
-        data_name="WEO",
-        update=update,
-    )
+    path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    # Download the WEO data
-    dfi = read_weo(update=True)
+if __name__ == "__main__":  # pragma: no cover
+    read_weo(update=True)
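The behavioural constant here is the long-to-wide pivot: _pivot_concept_code still indexes on every column except concept_code and value (now expressed with a set literal, which is equivalent for membership tests). A toy frame makes the transformation concrete; the concept codes and values below are made up for illustration:

    # Illustrative only: what _pivot_concept_code does to a long-format frame.
    import pandas as pd

    long = pd.DataFrame(
        {
            "year": [2020, 2020, 2021, 2021],
            "entity_code": [111, 111, 111, 111],
            "concept_code": ["NGDP_D", "PPPEX", "NGDP_D", "PPPEX"],
            "value": [98.5, 0.71, 101.2, 0.72],
        }
    )
    wide = long.pivot(
        index=[c for c in long.columns if c not in {"concept_code", "value"}],
        columns="concept_code",
        values="value",
    ).reset_index()
    # wide has one row per (year, entity_code), with NGDP_D and PPPEX as columns.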
pydeflate/sources/world_bank.py CHANGED
@@ -1,15 +1,17 @@
+from __future__ import annotations
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+from typing import Callable
 
 import pandas as pd
 import wbgapi as wb
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    enforce_pyarrow_types,
-    today,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 from pydeflate.utils import emu
@@ -56,8 +58,8 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
             labels=True,
         )
         .reset_index()
-        .sort_values(by=["economy", "Time"])  # Sort for easier reading
-        .drop(columns=["Time"])  # Remove unnecessary column
+        .sort_values(by=["economy", "Time"])
+        .drop(columns=["Time"])
         .rename(
             columns={
                 "economy": "entity_code",
@@ -66,7 +68,7 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
                 series: value_name or series,
             }
         )
-        .reset_index(drop=True)  # Drop the old index after reset
+        .reset_index(drop=True)
     )
 
 
@@ -119,22 +121,17 @@ def _parallel_download_indicators(indicators: dict) -> list[pd.DataFrame]:
 
     # Use ThreadPoolExecutor to fetch indicators in parallel
     with ThreadPoolExecutor() as executor:
-        # Submit all tasks to the executor (downloading indicators in parallel)
         future_to_series = {
             executor.submit(get_wb_indicator, series, value_name): series
             for series, value_name in indicators.items()
         }
-
-        # Collect the results as they complete
         for future in as_completed(future_to_series):
             series = future_to_series[future]
             try:
                 df_ = future.result().set_index(["year", "entity_code", "entity"])
                 dfs.append(df_)
-            except Exception as exc:
-                # Log or handle any errors that occur during the download
-                logger.warning(f"Error downloading series {series}: {exc}")
-
+            except Exception as exc:  # pragma: no cover - defensive logging
+                logger.warning("Error downloading series %s: %s", series, exc)
     return dfs
 
 
@@ -151,140 +148,70 @@ def _add_ppp_ppp_exchange(df: pd.DataFrame) -> pd.DataFrame:
     """
     ppp = df.loc[lambda d: d["entity_code"] == "USA"].copy()
     ppp[["entity_code", "entity", "pydeflate_iso3"]] = "PPP"
+    return pd.concat([df, ppp], ignore_index=True)
 
-    df = pd.concat([df, ppp], ignore_index=True)
 
-    return df
-
-
-def _download_wb(
-    indicators: dict, prefix: str = "wb", add_ppp_exchange: bool = False
+def _download_wb_dataset(
+    indicators: dict, output_path: Path, add_ppp_exchange: bool = False
 ) -> None:
-    """Download multiple World Bank indicators in parallel and save as a parquet file.
-
-    This function fetches all indicators defined in _INDICATORS in parallel, concatenates
-    them into a single DataFrame, and saves the result as a parquet file using today's date as a suffix.
-    """
-    logger.info("Downloading the latest World Bank data...")
-
-    indicators_data = _parallel_download_indicators(indicators=indicators)
+    """Download and materialise a World Bank dataset to ``output_path``."""
 
-    # Concatenate all DataFrames horizontally (by columns)
+    logger.info("Downloading World Bank indicators for %s", output_path.name)
+    indicators_data = _parallel_download_indicators(indicators)
     df = pd.concat(indicators_data, axis=1).reset_index()
-
-    # cleaning
     df = (
         df.pipe(_eur_series_fix)
         .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
         .assign(pydeflate_iso3=lambda d: d.entity_code)
         .sort_values(by=["year", "entity_code"])
     )
-
     if add_ppp_exchange:
         df = df.pipe(_add_ppp_ppp_exchange)
-
     df = (
         df.pipe(prefix_pydeflate_to_columns)
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-    # Get today's date to use as a file suffix
-    suffix = today()
-
-    # Save the DataFrame as a parquet file
-    output_path = PYDEFLATE_PATHS.data / f"{prefix}_{suffix}.parquet"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
     df.to_parquet(output_path)
+    logger.info("Saved World Bank data to %s", output_path)
 
-    logger.info(f"Saved World Bank data to {prefix}_{suffix}.parquet")
-
-
-def download_wb() -> None:
-    """Download the latest World Bank data."""
-    _download_wb(indicators=_INDICATORS, prefix="wb")
-
-
-def download_wb_lcu_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_LCU_PPP, prefix="wb_lcu_ppp", add_ppp_exchange=True
-    )
-
-
-def download_wb_usd_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_USD_PPP, prefix="wb_usd_ppp", add_ppp_exchange=True
-    )
 
+def _entry(
+    key: str, filename: str, fetcher: Callable[[Path], None], ttl_days: int = 30
+) -> CacheEntry:
+    return CacheEntry(key=key, filename=filename, fetcher=fetcher, ttl_days=ttl_days)
 
-def _find_wb_files_in_path(path: Path) -> list:
-    """Find all WB parquet files in the specified directory.
 
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_*.parquet"))
-
-
-def _find_wb_lcu_ppp_files_in_path(path: Path) -> list:
-    """Find all WB PPP parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_lcu_ppp_*.parquet"))
-
-
-def _find_wb_usd_ppp_files_in_path(path: Path) -> list:
-    """Find all WB PPP parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_usd_ppp_*.parquet"))
+_WB_ENTRY = _entry(
+    "world_bank", "wb.parquet", lambda p: _download_wb_dataset(_INDICATORS, p)
+)
+_WB_LCU_PPP_ENTRY = _entry(
+    "world_bank_lcu_ppp",
+    "wb_lcu_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_LCU_PPP, p, add_ppp_exchange=True),
+)
+_WB_USD_PPP_ENTRY = _entry(
+    "world_bank_usd_ppp",
+    "wb_usd_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_USD_PPP, p, add_ppp_exchange=True),
+)
 
 
 def read_wb(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_files_in_path,
-        download_func=download_wb,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
 def read_wb_lcu_ppp(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_lcu_ppp_files_in_path,
-        download_func=download_wb_lcu_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_LCU_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
 def read_wb_usd_ppp(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_usd_ppp_files_in_path,
-        download_func=download_wb_usd_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_USD_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    df_wb = read_wb(False)
-    df_usd = read_wb_usd_ppp(False)
-    df_lcu = read_wb_lcu_ppp(False)
+if __name__ == "__main__":  # pragma: no cover
+    read_wb(update=True)
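The parallel fetch in _parallel_download_indicators is unchanged apart from logging style: futures are mapped back to their series code so failures can be attributed to a specific indicator. The pattern is shown self-contained below, with a stub in place of the wbgapi call; the indicator codes are real World Bank series IDs, but whether pydeflate uses these exact ones is not visible in this diff:

    # Demo of the futures pattern used in _parallel_download_indicators,
    # with a stub fetcher so it runs offline.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    import pandas as pd

    def fetch_stub(series: str, value_name: str) -> pd.DataFrame:
        # Stands in for get_wb_indicator; the real code queries the API.
        return pd.DataFrame(
            {"year": [2020], "entity_code": ["USA"],
             "entity": ["United States"], value_name: [1.0]}
        )

    indicators = {"NY.GDP.DEFL.ZS": "NGDP_D", "PA.NUS.FCRF": "EXCHANGE"}
    dfs = []
    with ThreadPoolExecutor() as executor:
        future_to_series = {
            executor.submit(fetch_stub, series, name): series
            for series, name in indicators.items()
        }
        for future in as_completed(future_to_series):
            series = future_to_series[future]
            try:
                dfs.append(
                    future.result().set_index(["year", "entity_code", "entity"])
                )
            except Exception as exc:
                print(f"Error downloading series {series}: {exc}")

    combined = pd.concat(dfs, axis=1).reset_index()  # one column per indicator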
pydeflate/utils.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import re
 
 import numpy as np
 import pandas as pd
@@ -22,18 +23,25 @@ def emu() -> list:
 
 
 def clean_number(number):
-    """Clean a number and return as float"""
-    import re
+    """Clean a number-like value and return it as a float.
+
+    Preserves leading signs and scientific notation while stripping
+    formatting artifacts such as commas or surrounding text.
+    """
 
     if not isinstance(number, str):
         number = str(number)
 
-    number = re.sub(r"[^\d.]", "", number)
+    normalized = number.replace(",", "").strip()
+    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", normalized)
 
-    if number == "":
+    if not match:
         return np.nan
 
-    return float(number)
+    try:
+        return float(match.group())
+    except ValueError:
+        return np.nan
 
 
 def create_pydeflate_year(
@@ -65,9 +73,7 @@ def _use_implied_dac_rates(
     data.loc[
         lambda d: ~d[f"temp_{entity_column}"].isin(pydeflate_data[ix[-1]].unique()),
         f"temp_{entity_column}",
-    ] = (
-        20001 if source_codes else "DAC"
-    )
+    ] = 20001 if source_codes else "DAC"
 
     # Log the fact that implied rates are being used
     flag_missing_pydeflate_data(
@@ -90,7 +96,6 @@ def merge_user_and_pydeflate_data(
     source_codes: bool = True,
     dac: bool = False,
 ) -> pd.DataFrame:
-
     data[f"temp_{entity_column}"] = data[entity_column]
 
     if dac:
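The clean_number rewrite fixes a genuine bug rather than just tidying: the old re.sub(r"[^\d.]", "", number) stripped minus signs and exponent markers, so negatives came back positive and scientific notation was silently mangled. A quick check of the new behaviour, reproducing the function from the diff (docstring omitted):

    # Behaviour of the new clean_number on inputs the old regex mangled.
    import re

    import numpy as np

    def clean_number(number):
        if not isinstance(number, str):
            number = str(number)
        normalized = number.replace(",", "").strip()
        match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", normalized)
        if not match:
            return np.nan
        try:
            return float(match.group())
        except ValueError:
            return np.nan

    assert clean_number("1,234.5") == 1234.5
    assert clean_number("-3,200") == -3200.0   # old version returned 3200.0
    assert clean_number("1.2e-3") == 0.0012    # old regex collapsed this to "1.23"
    assert np.isnan(clean_number("n/a"))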