pydeflate 2.1.3__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,79 @@
+from __future__ import annotations
+
 import logging
+import os
+from dataclasses import dataclass
 from pathlib import Path
 
 
-class PYDEFLATE_PATHS:
-    """Class to store the paths to the data and output folders."""
+from platformdirs import user_cache_dir
+
+
+DATA_DIR_ENV = "PYDEFLATE_DATA_DIR"
+
+_PACKAGE_ROOT = Path(__file__).resolve().parent.parent
+_SETTINGS_DIR = _PACKAGE_ROOT / "pydeflate" / "settings"
+_TEST_DATA_DIR = _PACKAGE_ROOT / "tests" / "test_files"
+
+_DATA_DIR_OVERRIDE: Path | None = None
+
+
+def _ensure_dir(path: Path) -> Path:
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def _default_data_dir() -> Path:
+    env_value = os.environ.get(DATA_DIR_ENV)
+    if env_value:
+        return _ensure_dir(Path(env_value).expanduser().resolve())
+    return _ensure_dir(Path(user_cache_dir("pydeflate", "pydeflate")))
+
+
+def get_data_dir() -> Path:
+    """Return the directory where pydeflate caches data files."""
+
+    if _DATA_DIR_OVERRIDE is not None:
+        return _ensure_dir(_DATA_DIR_OVERRIDE)
+    return _default_data_dir()
+
 
-    package = Path(__file__).resolve().parent.parent
-    data = package / "pydeflate" / ".pydeflate_data"
-    settings = package / "pydeflate" / "settings"
-    test_data = package / "tests" / "test_files"
+def set_data_dir(path: str | Path) -> Path:
+    """Override the pydeflate data directory for the current process."""
+
+    global _DATA_DIR_OVERRIDE
+    resolved = _ensure_dir(Path(path).expanduser().resolve())
+    _DATA_DIR_OVERRIDE = resolved
+    return resolved
+
+
+def reset_data_dir() -> None:
+    """Reset any process-level overrides and fall back to defaults."""
+
+    global _DATA_DIR_OVERRIDE
+    _DATA_DIR_OVERRIDE = None
+
+
+@dataclass(frozen=True)
+class _Paths:
+    package: Path
+    settings: Path
+    test_data: Path
+
+    @property
+    def data(self) -> Path:
+        return get_data_dir()
+
+    @data.setter  # type: ignore[override]
+    def data(self, value: Path | str) -> None:  # pragma: no cover - simple proxy
+        set_data_dir(value)
+
+
+PYDEFLATE_PATHS = _Paths(
+    package=_PACKAGE_ROOT,
+    settings=_SETTINGS_DIR,
+    test_data=_TEST_DATA_DIR,
+)
 
 
 def setup_logger(name) -> logging.Logger:
@@ -41,3 +106,9 @@ def setup_logger(name) -> logging.Logger:
 
 
 logger = setup_logger("pydeflate")
+
+
+def set_pydeflate_path(path: str | Path) -> Path:
+    """Set the path to the data folder (public API)."""
+
+    return set_data_dir(path)
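
Note: the two hunks above rewrite pydeflate's path configuration (the module imported later in this diff as pydeflate.pydeflate_config, which defines logger). The hard-coded `.pydeflate_data` folder inside the package is replaced by a cache directory resolved from a process-level override if one is set, otherwise the PYDEFLATE_DATA_DIR environment variable, otherwise a platformdirs user cache, and PYDEFLATE_PATHS becomes a frozen dataclass whose `data` attribute proxies that lookup. A minimal usage sketch, assuming these names are imported from pydeflate.pydeflate_config (an inference from the later `from pydeflate.pydeflate_config import logger` line):

# Sketch only: the module path is inferred, not stated in this diff.
from pydeflate.pydeflate_config import (
    PYDEFLATE_PATHS,
    get_data_dir,
    reset_data_dir,
    set_pydeflate_path,
)

print(get_data_dir())        # override > $PYDEFLATE_DATA_DIR > platformdirs cache
print(PYDEFLATE_PATHS.data)  # same value, via the dataclass property

set_pydeflate_path("~/pydeflate-cache")  # public API: sets the process-level override
print(get_data_dir())        # now the resolved ~/pydeflate-cache directory

reset_data_dir()             # drop the override, back to env var / platformdirs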
pydeflate/schemas.py ADDED
@@ -0,0 +1,297 @@
+"""Pandera schemas for data validation.
+
+This module defines validation schemas for all DataFrame structures used
+in pydeflate. This ensures data integrity from external sources and
+catches API changes early.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+try:
+    # New import (pandera >= 0.20)
+    import pandera.pandas as pa
+    from pandera.pandas import Check, Column, DataFrameSchema
+except ImportError:
+    # Fallback to old import for older pandera versions
+    import pandera as pa
+    from pandera import Check, Column, DataFrameSchema
+
+# Column definitions for reuse
+YEAR_COLUMN = Column(
+    int,
+    checks=[
+        Check.ge(1960),  # No data before 1960
+        Check.le(2100),  # No projections beyond 2100
+    ],
+    nullable=False,
+    description="Year as integer",
+)
+
+ENTITY_CODE_COLUMN = Column(
+    str,
+    checks=[Check(lambda s: s.str.len() <= 10)],
+    nullable=False,
+    description="Entity code from source (varies by source)",
+)
+
+ISO3_COLUMN = Column(
+    str,
+    checks=[
+        Check(lambda s: (s.str.len() == 3) | s.isna()),
+    ],
+    nullable=True,
+    description="ISO3 country code",
+)
+
+EXCHANGE_RATE_COLUMN = Column(
+    float,
+    checks=[
+        Check.gt(0),  # Exchange rates must be positive
+    ],
+    nullable=True,
+    description="Exchange rate (LCU per USD)",
+)
+
+DEFLATOR_COLUMN = Column(
+    float,
+    checks=[
+        Check.gt(0),  # Deflators must be positive
+    ],
+    nullable=True,
+    description="Price deflator index",
+)
+
+
+class SourceDataSchema(pa.DataFrameModel):
+    """Base schema for all data sources.
+
+    All sources must have these minimum columns after processing.
+    """
+
+    pydeflate_year: int = pa.Field(ge=1960, le=2100)
+    pydeflate_entity_code: str = pa.Field(str_length={"max_value": 10})
+    pydeflate_iso3: str | None = pa.Field(nullable=True)
+
+    class Config:
+        """Schema configuration."""
+
+        strict = False  # Allow additional columns
+        coerce = True  # Attempt type coercion
+
+
+class ExchangeDataSchema(SourceDataSchema):
+    """Schema for exchange rate data."""
+
+    pydeflate_EXCHANGE = EXCHANGE_RATE_COLUMN
+    pydeflate_EXCHANGE_D = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="Exchange rate deflator (rebased)",
+    )
+
+    class Config:
+        """Schema configuration."""
+
+        strict = False
+        coerce = True
+
+
+class IMFDataSchema(SourceDataSchema):
+    """Schema for IMF WEO data.
+
+    IMF provides GDP deflators, CPI, and exchange rates.
+    """
+
+    pydeflate_NGDP_D = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="GDP deflator",
+    )
+    pydeflate_PCPI = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="CPI (period average)",
+    )
+    pydeflate_PCPIE = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="CPI (end of period)",
+    )
+    pydeflate_EXCHANGE = EXCHANGE_RATE_COLUMN
+
+    class Config:
+        """Schema configuration."""
+
+        strict = False
+        coerce = True
+
+
+class WorldBankDataSchema(SourceDataSchema):
+    """Schema for World Bank data."""
+
+    pydeflate_NGDP_D = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="GDP deflator",
+    )
+    pydeflate_NGDP_DL = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="GDP deflator (linked)",
+    )
+    pydeflate_CPI = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="Consumer Price Index",
+    )
+    pydeflate_EXCHANGE = EXCHANGE_RATE_COLUMN
+
+    class Config:
+        """Schema configuration."""
+
+        strict = False
+        coerce = True
+
+
+class DACDataSchema(SourceDataSchema):
+    """Schema for OECD DAC data."""
+
+    pydeflate_DAC_DEFLATOR = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="DAC deflator",
+    )
+    pydeflate_NGDP_D = Column(
+        float,
+        checks=[Check.gt(0)],
+        nullable=True,
+        description="GDP deflator (computed)",
+    )
+    pydeflate_EXCHANGE = EXCHANGE_RATE_COLUMN
+
+    class Config:
+        """Schema configuration."""
+
+        strict = False
+        coerce = True
+
+
+class UserInputSchema:
+    """Validation for user-provided DataFrames.
+
+    This is not a Pandera schema but provides methods to validate
+    user input with custom column names.
+    """
+
+    @staticmethod
+    def validate(
+        df,
+        id_column: str,
+        year_column: str,
+        value_column: str,
+    ) -> None:
+        """Validate user DataFrame has required columns and types.
+
+        Args:
+            df: User's DataFrame
+            id_column: Name of column with entity identifiers
+            year_column: Name of column with year data
+            value_column: Name of column with numeric values
+
+        Raises:
+            ConfigurationError: If required columns are missing
+            SchemaValidationError: If column types are invalid
+        """
+        from pydeflate.exceptions import ConfigurationError, SchemaValidationError
+
+        # Check required columns exist
+        missing_cols = []
+        for col_name, col in [
+            ("id_column", id_column),
+            ("year_column", year_column),
+            ("value_column", value_column),
+        ]:
+            if col not in df.columns:
+                missing_cols.append(f"{col_name}='{col}'")
+
+        if missing_cols:
+            raise ConfigurationError(
+                f"Required columns missing from DataFrame: {', '.join(missing_cols)}"
+            )
+
+        # Validate value column is numeric
+        if not pd.api.types.is_numeric_dtype(df[value_column]):
+            raise SchemaValidationError(
+                f"Column '{value_column}' must be numeric, got {df[value_column].dtype}"
+            )
+
+        # Validate year column can be converted to datetime
+        try:
+            pd.to_datetime(df[year_column], errors="coerce")
+        except Exception as e:
+            raise SchemaValidationError(
+                f"Column '{year_column}' cannot be interpreted as dates: {e}"
+            )
+
+
+# Registry of schemas by source name
+SCHEMA_REGISTRY: dict[str, type[DataFrameSchema]] = {
+    "IMF": IMFDataSchema,
+    "World Bank": WorldBankDataSchema,
+    "DAC": DACDataSchema,
+}
+
+
+def get_schema_for_source(source_name: str) -> type[DataFrameSchema] | None:
+    """Get the appropriate schema for a data source.
+
+    Args:
+        source_name: Name of the source (e.g., 'IMF', 'World Bank')
+
+    Returns:
+        Schema class for the source, or None if not found
+    """
+    return SCHEMA_REGISTRY.get(source_name)
+
+
+def validate_source_data(df, source_name: str) -> None:
+    """Validate that source data matches expected schema.
+
+    Args:
+        df: DataFrame to validate
+        source_name: Name of the source
+
+    Raises:
+        SchemaValidationError: If data doesn't match schema
+    """
+    from pydeflate.exceptions import SchemaValidationError
+
+    schema_class = get_schema_for_source(source_name)
+    if schema_class is None:
+        # No schema defined for this source, skip validation
+        return
+
+    try:
+        # Instantiate the schema and validate
+        schema = schema_class()
+        schema.validate(df, lazy=True)
+    except pa.errors.SchemaErrors as e:
+        # Collect all validation errors
+        error_messages = []
+        for error in e.failure_cases.itertuples():
+            error_messages.append(f" - {error.check}: {error.failure_case}")
+
+        raise SchemaValidationError(
+            f"Data validation failed for {source_name}:\n" + "\n".join(error_messages),
+            source=source_name,
+        ) from e
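
Note: the new pydeflate/schemas.py centralises validation. Source frames are checked lazily against the pandera schema registered for their source name (and silently skipped for unknown sources), while user-supplied frames are checked column by column with caller-provided names. A short sketch of both entry points; the DataFrame below is illustrative, and ConfigurationError / SchemaValidationError come from pydeflate.exceptions as referenced in the code above:

import pandas as pd

from pydeflate.schemas import (
    UserInputSchema,
    get_schema_for_source,
    validate_source_data,
)

# Registry lookup: known sources map to a schema class, unknown ones to None,
# in which case validate_source_data simply returns without validating.
assert get_schema_for_source("IMF") is not None
validate_source_data(pd.DataFrame(), "some unregistered source")  # no-op

# User input is validated with the caller's own column names.
user_df = pd.DataFrame({"iso_code": ["FRA"], "year": [2021], "value": [123.4]})
UserInputSchema.validate(
    user_df, id_column="iso_code", year_column="year", value_column="value"
)  # passes: columns exist, value is numeric, year parses as a date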
@@ -1,41 +1,19 @@
-from datetime import datetime
-from pathlib import Path
+from __future__ import annotations
+
 from typing import Any, Literal
 
 import pandas as pd
 from hdx.location.country import Country
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.pydeflate_config import logger
 
 AvailableDeflators = Literal["NGDP_D", "NGDP_DL", "CPI", "PCPI", "PCPIE"]
 
 
-def check_file_age(file: Path) -> int:
-    """Check the age of a WEO file in days.
-
-    Args:
-        file (Path): The WEO parquet file to check.
-
-    Returns:
-        int: The number of days since the file was created.
-    """
-    current_date = datetime.today()
-    # Extract date from the filename (format: weo_YYYY-MM-DD.parquet)
-    file_date = datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")
-
-    # Return the difference in days between today and the file's date
-    return (current_date - file_date).days
-
-
 def enforce_pyarrow_types(df: pd.DataFrame) -> pd.DataFrame:
-    """Ensures that a DataFrame uses pyarrow dtypes."""
-    return df.convert_dtypes(dtype_backend="pyarrow")
-
-
-def today() -> str:
-    from datetime import datetime
+    """Ensure that a DataFrame uses pyarrow-backed dtypes."""
 
-    return datetime.today().strftime("%Y-%m-%d")
+    return df.convert_dtypes(dtype_backend="pyarrow")
 
 
 def _match_regex_to_iso3(
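
Note: the hunk above trims this utilities module down to enforce_pyarrow_types (a thin wrapper over pandas' convert_dtypes with the pyarrow backend), dropping the old file-age and date helpers, and the hunk below hardens _match_regex_to_iso3. A rough sketch of the two underlying calls, for illustration only:

import pandas as pd
from hdx.location.country import Country

# enforce_pyarrow_types is equivalent to:
df = pd.DataFrame({"year": [2020, 2021], "value": [1.5, 2.0]})
print(df.convert_dtypes(dtype_backend="pyarrow").dtypes)  # int64[pyarrow], double[pyarrow]

# _match_regex_to_iso3 keeps only the first element of this tuple (the ISO3
# code, or None when nothing matches); the second element flags an exact match.
iso3, exact = Country().get_iso3_country_code_fuzzy("Ivory Coast")
print(iso3, exact)  # expected: CIV False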
@@ -52,20 +30,17 @@ def _match_regex_to_iso3(
     if additional_mapping is None:
         additional_mapping = {}
 
-    # Create a Country object
     country = Country()
-
-    # Match the regex strings to ISO3 country codes
-    matches = {}
+    matches: dict[str, str | None] = {}
 
     for match in to_match:
         try:
             match_ = country.get_iso3_country_code_fuzzy(match)[0]
-        except:
+        except Exception:  # pragma: no cover - defensive logging
             match_ = None
         matches[match] = match_
         if match_ is None and match not in additional_mapping:
-            logger.debug(f"No ISO3 match found for {match}")
+            logger.debug("No ISO3 match found for %s", match)
 
     return matches | additional_mapping
 
@@ -76,7 +51,7 @@ def convert_id(
     to_type: str = "ISO3",
     not_found: Any = None,
     *,
-    additional_mapping: dict = None,
+    additional_mapping: dict | None = None,
 ) -> pd.Series:
     """Takes a Pandas' series with country IDs and converts them into the desired type.
 
@@ -93,7 +68,6 @@
         the same datatype as the target type.
     """
 
-    # if from and to are the same, return without changing anything
     if from_type == to_type:
        return series
 
@@ -107,7 +81,6 @@
     mapping = mapping_functions[from_type](
         to_match=s_unique, additional_mapping=additional_mapping
     )
-
     return series.map(mapping).fillna(series if not_found is None else not_found)
 
 
@@ -141,7 +114,6 @@ def add_pydeflate_iso3(
             "Sub-Sahara Africa": "SSA",
         },
     )
-
     return df
 
 
@@ -160,7 +132,6 @@ def prefix_pydeflate_to_columns(
     df.columns = [
         f"{prefix}{col}" if not col.startswith(prefix) else col for col in df.columns
     ]
-
     return df
 
 
@@ -187,7 +158,7 @@ def compute_exchange_deflator(
     base_year_measure: str | None = None,
     exchange: str = "EXCHANGE",
     year: str = "year",
-    grouper: list[str] = None,
+    grouper: list[str] | None = None,
 ) -> pd.DataFrame:
     """Compute the exchange rate deflator for each group of entities.
 
@@ -205,87 +176,68 @@
         pd.DataFrame: DataFrame with an additional column for the exchange rate deflator.
     """
 
-    def _add_deflator(
+    def _compute_deflator_for_group(
         group: pd.DataFrame,
-        measure: str | None = "NGDPD_D",
-        exchange: str = "EXCHANGE",
-        year: str = "year",
+        measure: str | None,
+        exchange_col: str,
+        year_col: str,
+        deflator_col: str,
     ) -> pd.DataFrame:
-
-        # if needed, clean exchange name
-        if exchange.endswith("_to") or exchange.endswith("_from"):
-            exchange_name = exchange.rsplit("_", 1)[0]
-        else:
-            exchange_name = exchange
-
-        # Identify the base year for the deflator
+        """Compute deflator for a single group and add it as a column."""
+        # Identify base year
         if measure is not None:
-            base_year = identify_base_year(group, measure=measure, year=year)
+            base_year = identify_base_year(group, measure=measure, year=year_col)
         else:
-            base_year = group.dropna(subset=exchange)[year].max()
+            valid_rows = group.dropna(subset=[exchange_col])
+            base_year = valid_rows[year_col].max() if not valid_rows.empty else None
 
-        # If no base year is found, return the group unchanged
+        # If no base year found, return group without deflator column
         if base_year is None or pd.isna(base_year):
             return group
 
         # Extract the exchange rate value for the base year
-        base_value = group.loc[group[year] == base_year, exchange].values
+        base_value_rows = group.loc[group[year_col] == base_year, exchange_col]
 
-        # If base value is found and valid, calculate the deflator
-        if base_value.size > 0 and pd.notna(base_value[0]):
-            group[f"{exchange_name}_D"] = round(
-                100 * group[exchange] / base_value[0], 6
-            )
+        # If no valid base value, return group without deflator column
+        if base_value_rows.empty or pd.isna(base_value_rows.iloc[0]):
+            return group
+
+        # Calculate and add deflator column
+        base_value = base_value_rows.iloc[0]
+        group = group.copy()
+        group[deflator_col] = round(100 * group[exchange_col] / base_value, 6)
 
         return group
 
     if grouper is None:
         grouper = ["entity", "entity_code"]
 
-    # Apply the deflator computation for each group of 'entity' and 'entity_code'
-    return df.groupby(grouper, group_keys=False).apply(
-        _add_deflator, measure=base_year_measure, exchange=exchange, year=year
-    )
-
-
-def read_data(
-    file_finder_func: callable,
-    download_func: callable,
-    data_name: str,
-    update: bool = False,
-) -> pd.DataFrame:
-    """Generic function to read data from parquet files or download fresh data.
-
-    Args:
-        file_finder_func (function): Function to find existing data files in the path.
-        download_func (function): Function to download fresh data if no files are
-            found or an update is needed.
-        data_name (str): Name of the dataset for logging purposes (e.g., "WEO", "DAC").
-        update (bool): If True, forces downloading of new data even if files exist.
-
-    Returns:
-        pd.DataFrame: The latest available data.
-    """
-    # Find existing files using the provided file finder function
-    files = file_finder_func(PYDEFLATE_PATHS.data)
-
-    # If no files are found or update is requested, download new data
-    if len(files) == 0 or update:
-        download_func()
-        files = file_finder_func(PYDEFLATE_PATHS.data)
-
-    # If files are found, sort them by age and load the most recent one
-    if len(files) > 0:
-        files = sorted(files, key=check_file_age)
-        latest_file = files[0]
-
-        # Check if the latest file is older than 120 days and log a warning
-        if check_file_age(latest_file) > 120:
-            logger.warn(
-                f"The latest {data_name} data is more than 120 days old.\n"
-                f"Consider updating by setting update=True in the function call."
-            )
-
-    # Read and return the latest parquet file as a DataFrame
-    logger.info(f"Reading {data_name} data from {latest_file}")
-    return pd.read_parquet(latest_file)
+    # Determine the exchange column name for the deflator
+    if exchange.endswith("_to") or exchange.endswith("_from"):
+        exchange_name = exchange.rsplit("_", 1)[0]
+    else:
+        exchange_name = exchange
+
+    deflator_col = f"{exchange_name}_D"
+
+    # Process each group and concatenate results
+    # This approach avoids the FutureWarning from groupby().apply() operating on grouping columns
+    processed_groups = []
+    for name, group in df.groupby(grouper, sort=False):
+        processed_group = _compute_deflator_for_group(
+            group=group,
+            measure=base_year_measure,
+            exchange_col=exchange,
+            year_col=year,
+            deflator_col=deflator_col,
+        )
+        processed_groups.append(processed_group)
+
+    # Concatenate all processed groups and restore original row order
+    result = pd.concat(processed_groups, ignore_index=False)
+
+    # Sort by index to restore original row order
+    # (groupby may have changed the order when grouping rows together)
+    result = result.sort_index()
+
+    return result
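
Note: the rewritten compute_exchange_deflator keeps the same contract — it adds an `<exchange>_D` column rebased to 100 in the base year — but processes each entity group explicitly and concatenates, avoiding the groupby().apply() FutureWarning. A rough usage sketch; the import path is an assumption, since this diff does not show the module's filename:

import pandas as pd

from pydeflate.utils import compute_exchange_deflator  # assumed module path

df = pd.DataFrame(
    {
        "entity": ["France"] * 3,
        "entity_code": ["FRA"] * 3,
        "year": [2020, 2021, 2022],
        "EXCHANGE": [0.88, 0.85, 0.95],
    }
)

# With base_year_measure=None the base year is the latest year with a valid
# exchange rate (2022), so EXCHANGE_D is 100.0 in 2022 and ~92.63 / ~89.47 before.
result = compute_exchange_deflator(df)
print(result[["year", "EXCHANGE_D"]])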