oda-reader 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 ONE Campaign
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,29 @@
1
+ Metadata-Version: 2.1
2
+ Name: oda_reader
3
+ Version: 0.0.9
4
+ Summary: A simple package to import ODA data using the OECD Data API
5
+ License: MIT
6
+ Author: Jorge Rivera
7
+ Requires-Python: >=3.10,<4.0
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
14
+ Requires-Dist: pyarrow (>=16.0.0)
15
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
16
+ Description-Content-Type: text/markdown
17
+
18
+ # oda_reader
19
+ Tools to import data from the OECD DAC.
20
+
21
+ This is a very simple package to make working with the Explorer API
22
+ easier.
23
+
24
+ This package is under active development.
25
+
26
+
27
+ It includes a basic implementation of an API call for DAC1. It also includes
28
+ tools to translate the API response into the old .Stat schema.
29
+
@@ -0,0 +1,11 @@
1
+ # oda_reader
2
+ Tools to import data from the OECD DAC.
3
+
4
+ This is a very simple package to make working with the Explorer API
5
+ easier.
6
+
7
+ This package is under active development.
8
+
9
+
10
+ It includes a basic implementation of an API call for DAC1. It also includes
11
+ tools to translate the API response into the old .Stat schema.
@@ -0,0 +1 @@
1
+ __version__ = "0.0.9"
@@ -0,0 +1,94 @@
1
+ import logging
2
+ from io import StringIO
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ import requests
7
+
8
# Configure logging once at import time so library messages are visible.
# NOTE(review): calling basicConfig at import time mutates the root logger of
# any application that imports this package; the usual library convention is
# to leave configuration to the application — confirm before changing.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Shared package-wide logger.
# NOTE(review): the logger name "oda_importer" does not match the distributed
# package name "oda_reader" — presumably historical; confirm before renaming.
logger = logging.getLogger("oda_importer")
11
+
12
+
13
class ImporterPaths:
    """Class to store the paths to the data and output folders."""

    # Project root: two directory levels above this file.
    project = Path(__file__).resolve().parent.parent
    # NOTE(review): points at an "oda_importer" folder although the package is
    # distributed as "oda_reader" — confirm the folder name is intentional.
    scripts = project / "oda_importer"
    # Location of the schema translation files bundled with the package.
    schemas = scripts / "schemas"
19
+
20
+
21
def text_to_stringIO(response: "requests.models.Response") -> StringIO:
    """Wrap the decoded text of an API response in an in-memory buffer.

    The original docstring and comment described a bytes/BytesIO conversion,
    but the code reads ``response.text`` (already-decoded ``str``) and returns
    a ``StringIO``; the documentation now matches the behavior.

    Args:
        response (requests.models.Response): The response object from the API.
            Only its ``.text`` attribute is accessed.

    Returns:
        StringIO: The decoded response body as a file-like object, suitable
        for passing to ``pd.read_csv``.
    """
    # StringIO (not BytesIO): response.text is already-decoded string data.
    return StringIO(response.text)
33
+
34
+
35
def get_data_from_api(
    url: str, compressed: bool = True, timeout: float | None = None
) -> "requests.models.Response":
    """Fetch data from an API endpoint and return the raw response object.

    Args:
        url (str): The URL of the API endpoint.
        compressed (bool): Whether the data is fetched compressed. Strongly recommended.
        timeout (float | None): Seconds to wait for the server before giving
            up. Defaults to None (wait indefinitely) to preserve the historical
            behavior; passing a finite value is recommended so a stalled
            server cannot hang the caller forever.

    Returns:
        requests.models.Response: The response object from the API.

    Raises:
        ConnectionError: If the API reports that no records match the query.
        requests.HTTPError: For any other unsuccessful HTTP status.
    """
    # Ask the server for a gzip-compressed body when requested; requests
    # transparently decompresses it when the body is read.
    headers = {"Accept-Encoding": "gzip"} if compressed else {}

    # Fetch the data with headers
    logger.info(f"Fetching data from {url}")
    response = requests.get(url, headers=headers, timeout=timeout)

    # The API signals "no data" with a 404 and a sentinel body rather than an
    # empty result set; surface that as a dedicated, catchable error.
    if (response.status_code == 404) and (response.text == "NoRecordsFound"):
        raise ConnectionError("No data found for the selected parameters.")

    # Any other error status becomes an HTTPError.
    response.raise_for_status()

    return response
63
+
64
+
65
def api_response_to_df(
    url: str, read_csv_options: dict = None, compressed: bool = True
) -> pd.DataFrame:
    """Fetch a CSV file from an API endpoint and parse it into a DataFrame.

    Args:
        url (str): The URL of the API endpoint.
        read_csv_options (dict): Options to pass to `pd.read_csv`.
        compressed (bool): Whether the data is fetched compressed. Strongly recommended.

    Returns:
        pd.DataFrame: The data as a DataFrame.
    """
    # Normalise the options without mutating a shared default.
    options = read_csv_options if read_csv_options is not None else {}

    # Without compression, let pandas fetch and parse the URL directly.
    if not compressed:
        return pd.read_csv(url, **options)

    # Otherwise request the data with gzip headers, then parse the decoded
    # text body through an in-memory buffer.
    response = get_data_from_api(url=url, compressed=compressed)
    return pd.read_csv(text_to_stringIO(response), **options)
@@ -0,0 +1,45 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import logger
4
+ from oda_reader.download_tools import download
5
+
6
+ DATAFLOW_ID: str = "DSD_DAC1@DF_DAC1"
7
+
8
+
9
def download_dac1(
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """Download DAC1 data from the API.

    Args:
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The DAC1 data.
    """
    # Let the user know a potentially slow network call is starting.
    logger.info("Downloading DAC1 data. This may take a while...")

    # Delegate to the shared download pipeline, pinned to the DAC1 dataflow.
    return download(
        version="dac1",
        dataflow_id=DATAFLOW_ID,
        start_year=start_year,
        end_year=end_year,
        filters=filters,
        pre_process=pre_process,
        dotstat_codes=dotstat_codes,
    )
@@ -0,0 +1,45 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import logger
4
+ from oda_reader.download_tools import download
5
+
6
+ DATAFLOW_ID: str = "DSD_DAC2@DF_DAC2A"
7
+
8
+
9
def download_dac2a(
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """
    Download the DAC2A data from the API.

    Args:
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The DAC2A data.

    """

    # Inform download is about to start
    logger.info("Downloading DAC2A data. This may take a while...")

    # Bug fix: this previously passed version="dac1", which made the shared
    # pipeline apply DAC1 filter building and DAC1 .stat code conversion to
    # DAC2A data. The correct version key for this dataflow is "dac2a".
    df = download(
        version="dac2a",
        dataflow_id=DATAFLOW_ID,
        start_year=start_year,
        end_year=end_year,
        filters=filters,
        pre_process=pre_process,
        dotstat_codes=dotstat_codes,
    )

    return df
@@ -0,0 +1,89 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import api_response_to_df, logger
4
+ from oda_reader.query_builder import QueryBuilder
5
+ from oda_reader.schemas.dac1_translation import convert_dac1_to_dotstat_codes
6
+ from oda_reader.schemas.dac2_translation import convert_dac2a_to_dotstat_codes
7
+ from oda_reader.schemas.schema_tools import (
8
+ read_schema_translation,
9
+ get_dtypes,
10
+ preprocess,
11
+ )
12
+
13
+
14
def download(
    version: str,
    dataflow_id: str,
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """
    Download data from the API.

    Args:
        version (str): The version of the data to download ("dac1" or "dac2a").
        dataflow_id (str): The dataflow id of the data to download.
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The downloaded data.

    Raises:
        ValueError: If version is not "dac1"/"dac2a", or if dotstat_codes is
            requested without pre_process.
    """
    # Fail fast on incompatible flags: previously this was only detected
    # *after* the (potentially slow) download had already completed.
    if dotstat_codes and not pre_process:
        raise ValueError("Cannot convert to dotstat codes without preprocessing.")

    # Load the translation schema from .stat to the new explorer
    schema_translation = read_schema_translation(version=version)

    # Get a data types dictionary
    data_types = get_dtypes(schema=schema_translation)

    # Set read csv options
    df_options = {
        "na_values": ("_Z", "nan"),
        "keep_default_na": True,
        "dtype": data_types,
    }

    # Instantiate the query builder
    qb = QueryBuilder(dataflow_id=dataflow_id)

    # Select the right filter builder and dotstat code converter per version.
    if version == "dac1":
        filter_builder = qb.build_dac1_filter
        convert_func = convert_dac1_to_dotstat_codes
    elif version == "dac2a":
        # NOTE(review): QueryBuilder does not appear to define
        # build_dac2a_filter; passing filters with version="dac2a" would
        # raise AttributeError — confirm/implement before relying on it.
        filter_builder = qb.build_dac2a_filter
        convert_func = convert_dac2a_to_dotstat_codes
    else:
        raise ValueError("Version must be either 'dac1' or 'dac2a'.")

    # Optionally set filters
    if filters:
        filter_str = filter_builder(**filters)
        qb.set_filter(filter_str)

    # Build the query URL
    url = qb.set_time_period(start=start_year, end=end_year).build_query()

    # Fetch and parse the data
    df = api_response_to_df(url=url, read_csv_options=df_options)

    # Optionally preprocess (and convert codes) to match the .stat schema.
    if pre_process:
        df = preprocess(df=df, schema_translation=schema_translation)
        if dotstat_codes:
            df = convert_func(df)

    logger.info("Data downloaded correctly.")

    return df
@@ -0,0 +1,189 @@
1
+ """ A module for constructing SDMX API queries for the OECD data. """
2
+ from oda_reader.common import logger
3
+
4
+ V1_BASE_URL: str = "https://sdmx.oecd.org/public/rest/data/"
5
+ V2_BASE_URL: str = "https://sdmx.oecd.org/public/rest/v2/data/dataflow/"
6
+ AGENCY_ID: str = "OECD.DCD.FSD"
7
+ SHAPE: str = "dimensionAtObservation=AllDimensions"
8
+ FORMAT: str = "csvfilewithlabels"
9
+
10
+
11
+ class QueryBuilder:
12
+ """
13
+ A builder class for constructing SDMX API queries for the OECD data.
14
+
15
+ Attributes:
16
+ agency_id (str): The agency ID used in the query.
17
+ base_url (str): The base URL for the query, dynamically determined by the API version.
18
+ params (dict): A dictionary of query parameters, initialized with default format.
19
+ api_version (int): The version of the API to use.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ dataflow_id: str,
25
+ dataflow_version: str = None,
26
+ api_version: int = 1,
27
+ ) -> None:
28
+ """
29
+ Initialize the QueryBuilder with specific settings for the API and data flow.
30
+
31
+ Args:
32
+ dataflow_id (str): The identifier for the dataflow.
33
+ dataflow_version (str): The version of the dataflow
34
+ api_version (int): The version of the API to use, default is 2.
35
+ """
36
+
37
+ # If dataflow_version is not provided, use the latest version
38
+ dataflow_version = "+" if api_version == 2 and not dataflow_version else ""
39
+
40
+ # Set the base URL and separator based on the API version
41
+ base_url = V2_BASE_URL if api_version == 2 else V1_BASE_URL
42
+ self._separator = "/" if api_version == 2 else ","
43
+
44
+ # Set the agency ID
45
+ self.agency_id = AGENCY_ID
46
+
47
+ # Set the dimensions filter to all
48
+ self.filter = "*" if api_version == 2 else "all"
49
+
50
+ # Construct the base URL
51
+ self.base_url = (
52
+ f"{base_url}{self.agency_id}"
53
+ f"{self._separator}{dataflow_id}"
54
+ f"{self._separator}{dataflow_version}/"
55
+ )
56
+
57
+ # Initialize the query parameters with the default format
58
+ self.params = {"format": FORMAT}
59
+
60
+ # Store the API version
61
+ self.api_version = api_version
62
+
63
+ def _to_filter_str(self, param: str | list[str] | None) -> str:
64
+ """Convert a string parameter to a list, if it is not already a list.
65
+
66
+ Args:
67
+ param (str | list[str] | None): The parameter to convert.
68
+ api_version (int): The version of the API to use.
69
+
70
+ Returns:
71
+ list[str]: The parameter as a list.
72
+ """
73
+
74
+ if param is None:
75
+ return "*" if self.api_version == 2 else ""
76
+ if isinstance(param, str):
77
+ param = [param]
78
+
79
+ if (self.api_version == 2) & (len(param) > 1):
80
+ logger.info(
81
+ f"API version 2 does not support filtering on multiple values:"
82
+ f"\n{(', '.join(param))} \n"
83
+ "Returning all values."
84
+ )
85
+ return "*"
86
+
87
+ return "+".join(param)
88
+
89
+ def set_time_period(
90
+ self, start: int | str | None, end: int | str | None
91
+ ) -> "QueryBuilder":
92
+ """Set the time period for the query. The time period is inclusive.
93
+
94
+ Args:
95
+ start (int | str): The start year or date.
96
+ end (int | str): The end year or date.
97
+
98
+ Returns:
99
+ Self: Returns self to allow for method chaining.
100
+ """
101
+ if self.api_version == 2:
102
+ if start and end:
103
+ self.params["c[TIME_PERIOD]"] = f"ge:{start}+le:{end}"
104
+ return self
105
+ if start:
106
+ self.params["c[TIME_PERIOD]"] = f"ge:{start}"
107
+ if end:
108
+ self.params["c[TIME_PERIOD]"] = f"ge:1950+le:{end}"
109
+
110
+ else:
111
+ if start:
112
+ self.params["startPeriod"] = start
113
+ if end:
114
+ self.params["endPeriod"] = end
115
+
116
+ return self
117
+
118
+ def build_dac1_filter(
119
+ self,
120
+ donor: str | list[str] | None = None,
121
+ measure: str | list[str] | None = None,
122
+ flow_type: str | list[str] | None = None,
123
+ unit_measure: str | list[str] | None = None,
124
+ price_base: str | list[str] | None = None,
125
+ ) -> str:
126
+ # if any of the parameters are None, set them to the default value
127
+ donor = self._to_filter_str(donor)
128
+ measure = self._to_filter_str(measure)
129
+ untied = self._to_filter_str(None)
130
+ flow_type = self._to_filter_str(flow_type)
131
+ unit_measure = self._to_filter_str(unit_measure)
132
+ price_base = self._to_filter_str(price_base)
133
+ period = self._to_filter_str(None)
134
+
135
+ return ".".join(
136
+ [donor, measure, untied, flow_type, unit_measure, price_base, period]
137
+ )
138
+
139
+ def set_filter(self, filter_string: str) -> "QueryBuilder":
140
+ """Set the dimensions parameter for the query.
141
+
142
+ Args:
143
+ filter_string (str): The filter string for the query.
144
+
145
+ Returns:
146
+ Self: Returns self to allow for method chaining.
147
+ """
148
+
149
+ self.filter = filter_string
150
+ return self
151
+
152
+ def set_last_n_observations(self, n: int) -> "QueryBuilder":
153
+ """Set the number of most recent observations to return.
154
+
155
+ Args:
156
+ n (int): The number of most recent observations to return.
157
+
158
+ Returns:
159
+ Self: Returns self to allow for method chaining.
160
+ """
161
+ self.params["lastNObservations"] = n
162
+ return self
163
+
164
+ def set_format(self, file_format) -> "QueryBuilder":
165
+ """Set the format of the output file.
166
+
167
+ Args:
168
+ file_format (str): The file format for the output.
169
+
170
+ Returns:
171
+ Self: Returns self to allow for method chaining.
172
+ """
173
+ self.params["format"] = file_format
174
+ return self
175
+
176
+ def build_query(self) -> str:
177
+ """Construct and return the full query URL.
178
+
179
+ Returns:
180
+ str: The fully constructed URL.
181
+ """
182
+ # Create list to contain query parts
183
+ query_parts = [self.base_url + self.filter + "?"]
184
+
185
+ # Add each parameter to the query
186
+ query_parts.extend(f"{key}={value}&" for key, value in self.params.items())
187
+
188
+ # Return the full query URL, removing the trailing "&"
189
+ return "".join(query_parts).rstrip("&")
File without changes
@@ -0,0 +1,8 @@
1
+ {
2
+ "20000": "DAC_EC",
3
+ "10280": "F5_X",
4
+ "6790": "S7_X",
5
+ "10330": "O7_X",
6
+ "8600": "O8_X",
7
+ "10350": "O9_X"
8
+ }
@@ -0,0 +1,4 @@
1
+ {
2
+ "XDC": "N",
3
+ "PT_B5G": "PT_B5G"
4
+ }