pydeflate 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydeflate/sources/common.py CHANGED
@@ -1,41 +1,19 @@
- from datetime import datetime
- from pathlib import Path
+ from __future__ import annotations
+
  from typing import Any, Literal

  import pandas as pd
  from hdx.location.country import Country

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.pydeflate_config import logger

  AvailableDeflators = Literal["NGDP_D", "NGDP_DL", "CPI", "PCPI", "PCPIE"]


- def check_file_age(file: Path) -> int:
-     """Check the age of a WEO file in days.
-
-     Args:
-         file (Path): The WEO parquet file to check.
-
-     Returns:
-         int: The number of days since the file was created.
-     """
-     current_date = datetime.today()
-     # Extract date from the filename (format: weo_YYYY-MM-DD.parquet)
-     file_date = datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")
-
-     # Return the difference in days between today and the file's date
-     return (current_date - file_date).days
-
-
  def enforce_pyarrow_types(df: pd.DataFrame) -> pd.DataFrame:
-     """Ensures that a DataFrame uses pyarrow dtypes."""
-     return df.convert_dtypes(dtype_backend="pyarrow")
-
-
- def today() -> str:
-     from datetime import datetime
+     """Ensure that a DataFrame uses pyarrow-backed dtypes."""

-     return datetime.today().strftime("%Y-%m-%d")
+     return df.convert_dtypes(dtype_backend="pyarrow")


  def _match_regex_to_iso3(
@@ -52,20 +30,17 @@ def _match_regex_to_iso3(
      if additional_mapping is None:
          additional_mapping = {}

-     # Create a Country object
      country = Country()
-
-     # Match the regex strings to ISO3 country codes
-     matches = {}
+     matches: dict[str, str | None] = {}

      for match in to_match:
          try:
              match_ = country.get_iso3_country_code_fuzzy(match)[0]
-         except:
+         except Exception: # pragma: no cover - defensive logging
              match_ = None
          matches[match] = match_
          if match_ is None and match not in additional_mapping:
-             logger.debug(f"No ISO3 match found for {match}")
+             logger.debug("No ISO3 match found for %s", match)

      return matches | additional_mapping

@@ -76,7 +51,7 @@ def convert_id(
      to_type: str = "ISO3",
      not_found: Any = None,
      *,
-     additional_mapping: dict = None,
+     additional_mapping: dict | None = None,
  ) -> pd.Series:
      """Takes a Pandas' series with country IDs and converts them into the desired type.

@@ -93,7 +68,6 @@ def convert_id(
          the same datatype as the target type.
      """

-     # if from and to are the same, return without changing anything
      if from_type == to_type:
          return series

@@ -107,7 +81,6 @@ def convert_id(
      mapping = mapping_functions[from_type](
          to_match=s_unique, additional_mapping=additional_mapping
      )
-
      return series.map(mapping).fillna(series if not_found is None else not_found)


@@ -141,7 +114,6 @@ def add_pydeflate_iso3(
              "Sub-Sahara Africa": "SSA",
          },
      )
-
      return df


@@ -160,7 +132,6 @@ def prefix_pydeflate_to_columns(
      df.columns = [
          f"{prefix}{col}" if not col.startswith(prefix) else col for col in df.columns
      ]
-
      return df


@@ -187,7 +158,7 @@ def compute_exchange_deflator(
      base_year_measure: str | None = None,
      exchange: str = "EXCHANGE",
      year: str = "year",
-     grouper: list[str] = None,
+     grouper: list[str] | None = None,
  ) -> pd.DataFrame:
      """Compute the exchange rate deflator for each group of entities.

@@ -205,87 +176,68 @@ def compute_exchange_deflator(
          pd.DataFrame: DataFrame with an additional column for the exchange rate deflator.
      """

-     def _add_deflator(
+     def _compute_deflator_for_group(
          group: pd.DataFrame,
-         measure: str | None = "NGDPD_D",
-         exchange: str = "EXCHANGE",
-         year: str = "year",
+         measure: str | None,
+         exchange_col: str,
+         year_col: str,
+         deflator_col: str,
      ) -> pd.DataFrame:
-
-         # if needed, clean exchange name
-         if exchange.endswith("_to") or exchange.endswith("_from"):
-             exchange_name = exchange.rsplit("_", 1)[0]
-         else:
-             exchange_name = exchange
-
-         # Identify the base year for the deflator
+         """Compute deflator for a single group and add it as a column."""
+         # Identify base year
          if measure is not None:
-             base_year = identify_base_year(group, measure=measure, year=year)
+             base_year = identify_base_year(group, measure=measure, year=year_col)
          else:
-             base_year = group.dropna(subset=exchange)[year].max()
+             valid_rows = group.dropna(subset=[exchange_col])
+             base_year = valid_rows[year_col].max() if not valid_rows.empty else None

-         # If no base year is found, return the group unchanged
+         # If no base year found, return group without deflator column
          if base_year is None or pd.isna(base_year):
              return group

          # Extract the exchange rate value for the base year
-         base_value = group.loc[group[year] == base_year, exchange].values
+         base_value_rows = group.loc[group[year_col] == base_year, exchange_col]

-         # If base value is found and valid, calculate the deflator
-         if base_value.size > 0 and pd.notna(base_value[0]):
-             group[f"{exchange_name}_D"] = round(
-                 100 * group[exchange] / base_value[0], 6
-             )
+         # If no valid base value, return group without deflator column
+         if base_value_rows.empty or pd.isna(base_value_rows.iloc[0]):
+             return group
+
+         # Calculate and add deflator column
+         base_value = base_value_rows.iloc[0]
+         group = group.copy()
+         group[deflator_col] = round(100 * group[exchange_col] / base_value, 6)

          return group

      if grouper is None:
          grouper = ["entity", "entity_code"]

-     # Apply the deflator computation for each group of 'entity' and 'entity_code'
-     return df.groupby(grouper, group_keys=False).apply(
-         _add_deflator, measure=base_year_measure, exchange=exchange, year=year
-     )
-
-
- def read_data(
-     file_finder_func: callable,
-     download_func: callable,
-     data_name: str,
-     update: bool = False,
- ) -> pd.DataFrame:
-     """Generic function to read data from parquet files or download fresh data.
-
-     Args:
-         file_finder_func (function): Function to find existing data files in the path.
-         download_func (function): Function to download fresh data if no files are
-             found or an update is needed.
-         data_name (str): Name of the dataset for logging purposes (e.g., "WEO", "DAC").
-         update (bool): If True, forces downloading of new data even if files exist.
-
-     Returns:
-         pd.DataFrame: The latest available data.
-     """
-     # Find existing files using the provided file finder function
-     files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If no files are found or update is requested, download new data
-     if len(files) == 0 or update:
-         download_func()
-         files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If files are found, sort them by age and load the most recent one
-     if len(files) > 0:
-         files = sorted(files, key=check_file_age)
-         latest_file = files[0]
-
-         # Check if the latest file is older than 120 days and log a warning
-         if check_file_age(latest_file) > 120:
-             logger.warn(
-                 f"The latest {data_name} data is more than 120 days old.\n"
-                 f"Consider updating by setting update=True in the function call."
-             )
-
-         # Read and return the latest parquet file as a DataFrame
-         logger.info(f"Reading {data_name} data from {latest_file}")
-         return pd.read_parquet(latest_file)
+     # Determine the exchange column name for the deflator
+     if exchange.endswith("_to") or exchange.endswith("_from"):
+         exchange_name = exchange.rsplit("_", 1)[0]
+     else:
+         exchange_name = exchange
+
+     deflator_col = f"{exchange_name}_D"
+
+     # Process each group and concatenate results
+     # This approach avoids the FutureWarning from groupby().apply() operating on grouping columns
+     processed_groups = []
+     for name, group in df.groupby(grouper, sort=False):
+         processed_group = _compute_deflator_for_group(
+             group=group,
+             measure=base_year_measure,
+             exchange_col=exchange,
+             year_col=year,
+             deflator_col=deflator_col,
+         )
+         processed_groups.append(processed_group)
+
+     # Concatenate all processed groups and restore original row order
+     result = pd.concat(processed_groups, ignore_index=False)
+
+     # Sort by index to restore original row order
+     # (groupby may have changed the order when grouping rows together)
+     result = result.sort_index()
+
+     return result
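
Not part of the diff: a minimal usage sketch of the refactored compute_exchange_deflator, using made-up data and assuming the DataFrame is still passed as the first argument. With base_year_measure left as None, each group's latest year with a non-null exchange rate becomes the base (deflator = 100), and the new implementation restores the original row order via the index.

import pandas as pd

from pydeflate.sources.common import compute_exchange_deflator

# Illustrative data only (hypothetical values, not from the package).
df = pd.DataFrame(
    {
        "entity": ["France", "France", "Japan", "Japan"],
        "entity_code": [4, 4, 701, 701],
        "year": [2022, 2023, 2022, 2023],
        "EXCHANGE": [0.95, 0.92, 131.5, 140.5],
    }
)

# Adds an "EXCHANGE_D" column; 2023 becomes the base year (100) within each group.
deflated = compute_exchange_deflator(df, exchange="EXCHANGE", year="year")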
pydeflate/sources/dac.py CHANGED
@@ -1,71 +1,58 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from oda_reader import download_dac1

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )


- def _find_dac_files_in_path(path: Path) -> list:
-     """Find all DAC parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for DAC parquet files.
-
-     Returns:
-         list: List of DAC parquet files found in the directory.
-     """
-     return list(path.glob("dac_*.parquet"))
-
-
  def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-     """Convert DAC values (in million) to units.
-
-     Args:
-         df (pd.DataFrame): Dataframe with raw observation values.
+     """Scale reported DAC values (supplied in millions) into base units."""

-     Returns:
-         pd.DataFrame: Dataframe with scaled observation values.
-     """
      df = df.copy()
      df["value"] = df["value"] * df["unit_multiplier"]
      return df


  def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+     """Retain rows matching the official DAC definition across regime changes."""
+
      query = (
          "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
          "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
      )
-
      return df.query(query)


  def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-     columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+     """Select the key columns used downstream in pydeflate."""

-     return df.filter(columns)
+     return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])


  def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+     """Pivot amount-type codes into separate columns (A/N/D)."""
+
      df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
      return df.pivot(
-         index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+         index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
          columns="amounttype_code",
          values="value",
      ).reset_index()


  def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-     # The values for certain providers should be 1
+     """Derive exchange rates, forcing DAC aggregates to unity."""
+
      df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
          lambda d: d.donor_code >= 20000, "A"
      ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:


  def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+     """Calculate the published DAC price deflator from amounts A/D."""
+
      df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
      return df


  def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+     """Back out a GDP-style deflator using the exchange deflator."""

+     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
      return df


  def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-     return df.rename(
-         columns={
-             "donor_code": "entity_code",
-             "donor_name": "entity",
-         }
-     )
+     """Align donor metadata with pydeflate naming conventions."""

+     return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})

- def download_dac():
-     # Use oda_reader to get the data
+
+ def _download_dac(output_path: Path) -> None:
+     """Download and cache the DAC statistics parquet file."""
+
+     logger.info("Downloading DAC statistics from ODA reader...")
      df = download_dac1(
          filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
      )
-
-     # Clean the data
      df = (
          df.pipe(_to_units)
          .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved DAC dataset to %s", output_path)

-     # Get today's date to use as a file suffix
-     suffix = today()

-     # Save the data
-     df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+ _DAC_ENTRY = CacheEntry(
+     key="dac_stats",
+     filename="dac.parquet",
+     fetcher=_download_dac,
+     ttl_days=30,
+ )


  def read_dac(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_dac_files_in_path,
-         download_func=download_dac,
-         data_name="DAC",
-         update=update,
-     )
+     path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     df = read_dac(update=True)
+ if __name__ == "__main__": # pragma: no cover
+     read_dac(update=True)
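
Not part of the diff: a usage sketch of the new cache-backed DAC reader. The public shape of read_dac is unchanged; the data now lives in a single dac.parquet managed by the cache entry above (30-day TTL), and update=True forces a refresh.

from pydeflate.sources.dac import read_dac

dac = read_dac()                    # served from the cached dac.parquet while still fresh
dac_latest = read_dac(update=True)  # forces a re-download through the cache manager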
pydeflate/sources/imf.py CHANGED
@@ -1,15 +1,16 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from imf_reader import weo

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )

@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:


  def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-     """Pivot the concept code column to get a wide format for the data.
+     """Pivot the concept dimension so each indicator becomes a column

      Args:
          df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
          pd.DataFrame: Dataframe with concept code pivoted to columns.
      """
      return df.pivot(
-         index=[c for c in df.columns if c not in ["concept_code", "value"]],
+         index=[c for c in df.columns if c not in {"concept_code", "value"}],
          columns="concept_code",
          values="value",
      ).reset_index()
@@ -171,15 +172,13 @@ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
      df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
          df.entity_code == 998, "year"
      ].map(eur)
-
      return df


- def download_weo() -> None:
-     """Download the WEO data, process it, and save it to a parquet file."""
-     logger.info("Downloading the latest WEO data...")
+ def _download_weo(output_path: Path) -> None:
+     """Fetch, transform, and store the latest WEO dataset in Parquet format."""

-     # Fetch and process the data through a pipeline of transformations
+     logger.info("Downloading the latest IMF WEO dataset...")
      df = (
          weo.fetch_data()
          .pipe(_filter_indicators)
@@ -195,38 +194,23 @@ def download_weo() -> None:
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
-
-     # Get today's date to use as a file suffix
-     suffix = today()
-
-     # Save the processed dataframe to parquet format
-     df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-     logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved WEO data to %s", output_path)


- def _find_weo_files_in_path(path: Path) -> list:
-     """Find all WEO parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for WEO parquet files.
-
-     Returns:
-         list: List of WEO parquet files found in the directory.
-     """
-     return list(path.glob("weo_*.parquet"))
+ _IMF_CACHE_ENTRY = CacheEntry(
+     key="imf_weo",
+     filename="imf_weo.parquet",
+     fetcher=_download_weo,
+     ttl_days=60,
+ )


  def read_weo(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_weo_files_in_path,
-         download_func=download_weo,
-         data_name="WEO",
-         update=update,
-     )
+     path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     # Download the WEO data
-     dfi = read_weo(update=True)
+ if __name__ == "__main__": # pragma: no cover
+     read_weo(update=True)
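
Not part of the diff: the shared pattern both sources now follow. Only the CacheEntry fields and the cache_manager().ensure() call visible above are assumed here; the internals of pydeflate.cache are not shown in this diff, and the example source below is hypothetical.

from pathlib import Path

import pandas as pd

from pydeflate.cache import CacheEntry, cache_manager


def _download_example(output_path: Path) -> None:
    # Hypothetical fetcher: write the downloaded dataset to the path the cache assigns.
    pd.DataFrame({"year": [2023], "EXCHANGE": [1.0]}).to_parquet(output_path)


# Same four fields as the DAC and WEO entries above; names here are illustrative.
_EXAMPLE_ENTRY = CacheEntry(
    key="example_source",
    filename="example.parquet",
    fetcher=_download_example,
    ttl_days=30,
)


def read_example(update: bool = False) -> pd.DataFrame:
    path = cache_manager().ensure(_EXAMPLE_ENTRY, refresh=update)
    return pd.read_parquet(path)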