oda-reader 0.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 ONE Campaign
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,29 @@
1
+ Metadata-Version: 2.1
2
+ Name: oda_reader
3
+ Version: 0.0.9
4
+ Summary: A simple package to import ODA data using the OECD Data API
5
+ License: MIT
6
+ Author: Jorge Rivera
7
+ Requires-Python: >=3.10,<4.0
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
14
+ Requires-Dist: pyarrow (>=16.0.0)
15
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
16
+ Description-Content-Type: text/markdown
17
+
18
+ # oda_reader
19
+ Tools to import data from the OECD DAC.
20
+
21
+ This is a very simple package to make working with the Explorer API
22
+ easier.
23
+
24
+ This package is under active development.
25
+
26
+
27
+ It includes a basic implementation of an API call for DAC1. It also includes
28
+ tools to translate the API response into the old .Stat schema.
29
+
@@ -0,0 +1,11 @@
1
+ # oda_reader
2
+ Tools to import data from the OECD DAC.
3
+
4
+ This is a very simple package to make working with the Explorer API
5
+ easier.
6
+
7
+ This package is under active development.
8
+
9
+
10
+ It includes a basic implementation of an API call for DAC1. It also includes
11
+ tools to translate the API response into the old .Stat schema.
@@ -0,0 +1 @@
1
+ __version__ = "0.0.9"
@@ -0,0 +1,94 @@
1
+ import logging
2
+ from io import StringIO
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ import requests
7
+
8
# Configure logging once at import time so library messages are visible.
# NOTE(review): calling basicConfig at import time mutates the root logger of
# any application that imports this package; the usual library convention is
# to leave configuration to the application — confirm before changing.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Shared package-wide logger.
# NOTE(review): the logger name "oda_importer" does not match the distributed
# package name "oda_reader" — presumably historical; confirm before renaming.
logger = logging.getLogger("oda_importer")
11
+
12
+
13
class ImporterPaths:
    """Class to store the paths to the data and output folders."""

    # Project root: two directory levels above this file.
    project = Path(__file__).resolve().parent.parent
    # NOTE(review): points at an "oda_importer" folder although the package is
    # distributed as "oda_reader" — confirm the folder name is intentional.
    scripts = project / "oda_importer"
    # Location of the schema translation files bundled with the package.
    schemas = scripts / "schemas"
19
+
20
+
21
def text_to_stringIO(response: "requests.models.Response") -> StringIO:
    """Wrap the decoded text of an API response in an in-memory buffer.

    The original docstring and comment described a bytes/BytesIO conversion,
    but the code reads ``response.text`` (already-decoded ``str``) and returns
    a ``StringIO``; the documentation now matches the behavior.

    Args:
        response (requests.models.Response): The response object from the API.
            Only its ``.text`` attribute is accessed.

    Returns:
        StringIO: The decoded response body as a file-like object, suitable
        for passing to ``pd.read_csv``.
    """
    # StringIO (not BytesIO): response.text is already-decoded string data.
    return StringIO(response.text)
33
+
34
+
35
def get_data_from_api(
    url: str, compressed: bool = True, timeout: float | None = None
) -> "requests.models.Response":
    """Fetch data from an API endpoint and return the raw response object.

    Args:
        url (str): The URL of the API endpoint.
        compressed (bool): Whether the data is fetched compressed. Strongly recommended.
        timeout (float | None): Seconds to wait for the server before giving
            up. Defaults to None (wait indefinitely) to preserve the historical
            behavior; passing a finite value is recommended so a stalled
            server cannot hang the caller forever.

    Returns:
        requests.models.Response: The response object from the API.

    Raises:
        ConnectionError: If the API reports that no records match the query.
        requests.HTTPError: For any other unsuccessful HTTP status.
    """
    # Ask the server for a gzip-compressed body when requested; requests
    # transparently decompresses it when the body is read.
    headers = {"Accept-Encoding": "gzip"} if compressed else {}

    # Fetch the data with headers
    logger.info(f"Fetching data from {url}")
    response = requests.get(url, headers=headers, timeout=timeout)

    # The API signals "no data" with a 404 and a sentinel body rather than an
    # empty result set; surface that as a dedicated, catchable error.
    if (response.status_code == 404) and (response.text == "NoRecordsFound"):
        raise ConnectionError("No data found for the selected parameters.")

    # Any other error status becomes an HTTPError.
    response.raise_for_status()

    return response
63
+
64
+
65
def api_response_to_df(
    url: str, read_csv_options: dict = None, compressed: bool = True
) -> pd.DataFrame:
    """Fetch a CSV file from an API endpoint and parse it into a DataFrame.

    Args:
        url (str): The URL of the API endpoint.
        read_csv_options (dict): Options to pass to `pd.read_csv`.
        compressed (bool): Whether the data is fetched compressed. Strongly recommended.

    Returns:
        pd.DataFrame: The data as a DataFrame.
    """
    # Normalise the options without mutating a shared default.
    options = read_csv_options if read_csv_options is not None else {}

    # Without compression, let pandas fetch and parse the URL directly.
    if not compressed:
        return pd.read_csv(url, **options)

    # Otherwise request the data with gzip headers, then parse the decoded
    # text body through an in-memory buffer.
    response = get_data_from_api(url=url, compressed=compressed)
    return pd.read_csv(text_to_stringIO(response), **options)
@@ -0,0 +1,45 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import logger
4
+ from oda_reader.download_tools import download
5
+
6
+ DATAFLOW_ID: str = "DSD_DAC1@DF_DAC1"
7
+
8
+
9
def download_dac1(
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """Download DAC1 data from the API.

    Args:
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The DAC1 data.
    """
    # Let the user know a potentially slow network call is starting.
    logger.info("Downloading DAC1 data. This may take a while...")

    # Delegate to the shared download pipeline, pinned to the DAC1 dataflow.
    return download(
        version="dac1",
        dataflow_id=DATAFLOW_ID,
        start_year=start_year,
        end_year=end_year,
        filters=filters,
        pre_process=pre_process,
        dotstat_codes=dotstat_codes,
    )
@@ -0,0 +1,45 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import logger
4
+ from oda_reader.download_tools import download
5
+
6
+ DATAFLOW_ID: str = "DSD_DAC2@DF_DAC2A"
7
+
8
+
9
def download_dac2a(
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """
    Download the DAC2A data from the API.

    Args:
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The DAC2A data.

    """

    # Inform download is about to start
    logger.info("Downloading DAC2A data. This may take a while...")

    # Bug fix: this previously passed version="dac1", which made the shared
    # pipeline apply DAC1 filter building and DAC1 .stat code conversion to
    # DAC2A data. The correct version key for this dataflow is "dac2a".
    df = download(
        version="dac2a",
        dataflow_id=DATAFLOW_ID,
        start_year=start_year,
        end_year=end_year,
        filters=filters,
        pre_process=pre_process,
        dotstat_codes=dotstat_codes,
    )

    return df
@@ -0,0 +1,89 @@
1
+ import pandas as pd
2
+
3
+ from oda_reader.common import api_response_to_df, logger
4
+ from oda_reader.query_builder import QueryBuilder
5
+ from oda_reader.schemas.dac1_translation import convert_dac1_to_dotstat_codes
6
+ from oda_reader.schemas.dac2_translation import convert_dac2a_to_dotstat_codes
7
+ from oda_reader.schemas.schema_tools import (
8
+ read_schema_translation,
9
+ get_dtypes,
10
+ preprocess,
11
+ )
12
+
13
+
14
def download(
    version: str,
    dataflow_id: str,
    start_year: int | None = None,
    end_year: int | None = None,
    filters: dict | None = None,
    pre_process: bool = True,
    dotstat_codes: bool = True,
) -> pd.DataFrame:
    """
    Download data from the API.

    Args:
        version (str): The version of the data to download ("dac1" or "dac2a").
        dataflow_id (str): The dataflow id of the data to download.
        start_year (int): The start year of the data to download. Optional
        end_year (int): The end year of the data to download. Optional
        filters (dict): Optional filters to pass to the download.
        pre_process (bool): Whether to preprocess the data. Defaults to True.
            Preprocessing makes it comply with the .stat schema.
        dotstat_codes (bool): Whether to convert the donor codes to the .stat schema.

    Returns:
        pd.DataFrame: The downloaded data.

    Raises:
        ValueError: If version is not "dac1"/"dac2a", or if dotstat_codes is
            requested without pre_process.
    """
    # Fail fast on incompatible flags: previously this was only detected
    # *after* the (potentially slow) download had already completed.
    if dotstat_codes and not pre_process:
        raise ValueError("Cannot convert to dotstat codes without preprocessing.")

    # Load the translation schema from .stat to the new explorer
    schema_translation = read_schema_translation(version=version)

    # Get a data types dictionary
    data_types = get_dtypes(schema=schema_translation)

    # Set read csv options
    df_options = {
        "na_values": ("_Z", "nan"),
        "keep_default_na": True,
        "dtype": data_types,
    }

    # Instantiate the query builder
    qb = QueryBuilder(dataflow_id=dataflow_id)

    # Select the right filter builder and dotstat code converter per version.
    if version == "dac1":
        filter_builder = qb.build_dac1_filter
        convert_func = convert_dac1_to_dotstat_codes
    elif version == "dac2a":
        # NOTE(review): QueryBuilder does not appear to define
        # build_dac2a_filter; passing filters with version="dac2a" would
        # raise AttributeError — confirm/implement before relying on it.
        filter_builder = qb.build_dac2a_filter
        convert_func = convert_dac2a_to_dotstat_codes
    else:
        raise ValueError("Version must be either 'dac1' or 'dac2a'.")

    # Optionally set filters
    if filters:
        filter_str = filter_builder(**filters)
        qb.set_filter(filter_str)

    # Build the query URL
    url = qb.set_time_period(start=start_year, end=end_year).build_query()

    # Fetch and parse the data
    df = api_response_to_df(url=url, read_csv_options=df_options)

    # Optionally preprocess (and convert codes) to match the .stat schema.
    if pre_process:
        df = preprocess(df=df, schema_translation=schema_translation)
        if dotstat_codes:
            df = convert_func(df)

    logger.info("Data downloaded correctly.")

    return df
@@ -0,0 +1,189 @@
1
+ """ A module for constructing SDMX API queries for the OECD data. """
2
+ from oda_reader.common import logger
3
+
4
+ V1_BASE_URL: str = "https://sdmx.oecd.org/public/rest/data/"
5
+ V2_BASE_URL: str = "https://sdmx.oecd.org/public/rest/v2/data/dataflow/"
6
+ AGENCY_ID: str = "OECD.DCD.FSD"
7
+ SHAPE: str = "dimensionAtObservation=AllDimensions"
8
+ FORMAT: str = "csvfilewithlabels"
9
+
10
+
11
+ class QueryBuilder:
12
+ """
13
+ A builder class for constructing SDMX API queries for the OECD data.
14
+
15
+ Attributes:
16
+ agency_id (str): The agency ID used in the query.
17
+ base_url (str): The base URL for the query, dynamically determined by the API version.
18
+ params (dict): A dictionary of query parameters, initialized with default format.
19
+ api_version (int): The version of the API to use.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ dataflow_id: str,
25
+ dataflow_version: str = None,
26
+ api_version: int = 1,
27
+ ) -> None:
28
+ """
29
+ Initialize the QueryBuilder with specific settings for the API and data flow.
30
+
31
+ Args:
32
+ dataflow_id (str): The identifier for the dataflow.
33
+ dataflow_version (str): The version of the dataflow
34
+ api_version (int): The version of the API to use, default is 2.
35
+ """
36
+
37
+ # If dataflow_version is not provided, use the latest version
38
+ dataflow_version = "+" if api_version == 2 and not dataflow_version else ""
39
+
40
+ # Set the base URL and separator based on the API version
41
+ base_url = V2_BASE_URL if api_version == 2 else V1_BASE_URL
42
+ self._separator = "/" if api_version == 2 else ","
43
+
44
+ # Set the agency ID
45
+ self.agency_id = AGENCY_ID
46
+
47
+ # Set the dimensions filter to all
48
+ self.filter = "*" if api_version == 2 else "all"
49
+
50
+ # Construct the base URL
51
+ self.base_url = (
52
+ f"{base_url}{self.agency_id}"
53
+ f"{self._separator}{dataflow_id}"
54
+ f"{self._separator}{dataflow_version}/"
55
+ )
56
+
57
+ # Initialize the query parameters with the default format
58
+ self.params = {"format": FORMAT}
59
+
60
+ # Store the API version
61
+ self.api_version = api_version
62
+
63
+ def _to_filter_str(self, param: str | list[str] | None) -> str:
64
+ """Convert a string parameter to a list, if it is not already a list.
65
+
66
+ Args:
67
+ param (str | list[str] | None): The parameter to convert.
68
+ api_version (int): The version of the API to use.
69
+
70
+ Returns:
71
+ list[str]: The parameter as a list.
72
+ """
73
+
74
+ if param is None:
75
+ return "*" if self.api_version == 2 else ""
76
+ if isinstance(param, str):
77
+ param = [param]
78
+
79
+ if (self.api_version == 2) & (len(param) > 1):
80
+ logger.info(
81
+ f"API version 2 does not support filtering on multiple values:"
82
+ f"\n{(', '.join(param))} \n"
83
+ "Returning all values."
84
+ )
85
+ return "*"
86
+
87
+ return "+".join(param)
88
+
89
+ def set_time_period(
90
+ self, start: int | str | None, end: int | str | None
91
+ ) -> "QueryBuilder":
92
+ """Set the time period for the query. The time period is inclusive.
93
+
94
+ Args:
95
+ start (int | str): The start year or date.
96
+ end (int | str): The end year or date.
97
+
98
+ Returns:
99
+ Self: Returns self to allow for method chaining.
100
+ """
101
+ if self.api_version == 2:
102
+ if start and end:
103
+ self.params["c[TIME_PERIOD]"] = f"ge:{start}+le:{end}"
104
+ return self
105
+ if start:
106
+ self.params["c[TIME_PERIOD]"] = f"ge:{start}"
107
+ if end:
108
+ self.params["c[TIME_PERIOD]"] = f"ge:1950+le:{end}"
109
+
110
+ else:
111
+ if start:
112
+ self.params["startPeriod"] = start
113
+ if end:
114
+ self.params["endPeriod"] = end
115
+
116
+ return self
117
+
118
+ def build_dac1_filter(
119
+ self,
120
+ donor: str | list[str] | None = None,
121
+ measure: str | list[str] | None = None,
122
+ flow_type: str | list[str] | None = None,
123
+ unit_measure: str | list[str] | None = None,
124
+ price_base: str | list[str] | None = None,
125
+ ) -> str:
126
+ # if any of the parameters are None, set them to the default value
127
+ donor = self._to_filter_str(donor)
128
+ measure = self._to_filter_str(measure)
129
+ untied = self._to_filter_str(None)
130
+ flow_type = self._to_filter_str(flow_type)
131
+ unit_measure = self._to_filter_str(unit_measure)
132
+ price_base = self._to_filter_str(price_base)
133
+ period = self._to_filter_str(None)
134
+
135
+ return ".".join(
136
+ [donor, measure, untied, flow_type, unit_measure, price_base, period]
137
+ )
138
+
139
+ def set_filter(self, filter_string: str) -> "QueryBuilder":
140
+ """Set the dimensions parameter for the query.
141
+
142
+ Args:
143
+ filter_string (str): The filter string for the query.
144
+
145
+ Returns:
146
+ Self: Returns self to allow for method chaining.
147
+ """
148
+
149
+ self.filter = filter_string
150
+ return self
151
+
152
+ def set_last_n_observations(self, n: int) -> "QueryBuilder":
153
+ """Set the number of most recent observations to return.
154
+
155
+ Args:
156
+ n (int): The number of most recent observations to return.
157
+
158
+ Returns:
159
+ Self: Returns self to allow for method chaining.
160
+ """
161
+ self.params["lastNObservations"] = n
162
+ return self
163
+
164
+ def set_format(self, file_format) -> "QueryBuilder":
165
+ """Set the format of the output file.
166
+
167
+ Args:
168
+ file_format (str): The file format for the output.
169
+
170
+ Returns:
171
+ Self: Returns self to allow for method chaining.
172
+ """
173
+ self.params["format"] = file_format
174
+ return self
175
+
176
+ def build_query(self) -> str:
177
+ """Construct and return the full query URL.
178
+
179
+ Returns:
180
+ str: The fully constructed URL.
181
+ """
182
+ # Create list to contain query parts
183
+ query_parts = [self.base_url + self.filter + "?"]
184
+
185
+ # Add each parameter to the query
186
+ query_parts.extend(f"{key}={value}&" for key, value in self.params.items())
187
+
188
+ # Return the full query URL, removing the trailing "&"
189
+ return "".join(query_parts).rstrip("&")
File without changes
@@ -0,0 +1,8 @@
1
+ {
2
+ "20000": "DAC_EC",
3
+ "10280": "F5_X",
4
+ "6790": "S7_X",
5
+ "10330": "O7_X",
6
+ "8600": "O8_X",
7
+ "10350": "O9_X"
8
+ }
@@ -0,0 +1,4 @@
1
+ {
2
+ "XDC": "N",
3
+ "PT_B5G": "PT_B5G"
4
+ }