pydeflate 1.4.2__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydeflate/__init__.py +25 -17
- pydeflate/core/api.py +404 -0
- pydeflate/core/deflator.py +171 -0
- pydeflate/core/exchange.py +237 -0
- pydeflate/core/source.py +54 -0
- pydeflate/deflate/deflators.py +228 -0
- pydeflate/deflate/legacy_deflate.py +109 -0
- pydeflate/exchange/__init__.py +0 -0
- pydeflate/exchange/exchangers.py +147 -0
- pydeflate/pydeflate_config.py +25 -16
- pydeflate/sources/__init__.py +0 -0
- pydeflate/sources/common.py +278 -0
- pydeflate/sources/dac.py +137 -0
- pydeflate/sources/imf.py +203 -0
- pydeflate/sources/world_bank.py +186 -0
- pydeflate/utils.py +55 -22
- {pydeflate-1.4.2.dist-info → pydeflate-2.0.0.dist-info}/LICENSE +1 -1
- pydeflate-2.0.0.dist-info/METADATA +287 -0
- pydeflate-2.0.0.dist-info/RECORD +25 -0
- {pydeflate-1.4.2.dist-info → pydeflate-2.0.0.dist-info}/WHEEL +1 -1
- pydeflate/deflate/deflate.py +0 -324
- pydeflate/deflate/deflator.py +0 -78
- pydeflate/get_data/deflate_data.py +0 -70
- pydeflate/get_data/exchange_data.py +0 -371
- pydeflate/get_data/imf_data.py +0 -76
- pydeflate/get_data/oecd_data.py +0 -146
- pydeflate/get_data/wb_data.py +0 -75
- pydeflate/tools/__init__.py +0 -2
- pydeflate/tools/exchange.py +0 -171
- pydeflate/tools/update_data.py +0 -69
- pydeflate-1.4.2.dist-info/METADATA +0 -305
- pydeflate-1.4.2.dist-info/RECORD +0 -22
- /pydeflate/{get_data → core}/__init__.py +0 -0
pydeflate/exchange/exchangers.py
ADDED

@@ -0,0 +1,147 @@
+from functools import wraps
+
+import pandas as pd
+
+from pydeflate.core.api import BaseExchange
+from pydeflate.core.source import DAC, WorldBank, IMF
+
+
+def _generate_docstring(source_name: str) -> str:
+    """Generate docstring for each decorated exchange function."""
+    return (
+        f"Exchange a DataFrame using the {source_name} rates source.\n\n"
+        f"This function applies exchange rates to a DataFrame using the {source_name} rates.\n\n"
+        "Args:\n"
+        "    data (pd.DataFrame): The input DataFrame containing data to convert.\n"
+        "    source_currency (str, optional): The source currency code. Defaults to 'USA'.\n"
+        "    target_currency (str, optional): The target currency code. Defaults to 'USA'.\n"
+        "    id_column (str, optional): Column with entity identifiers. Defaults to 'iso_code'.\n"
+        "    year_column (str, optional): Column with year information. Defaults to 'year'.\n"
+        "    use_source_codes (bool, optional): Use source-specific entity codes. Defaults to False.\n"
+        "    value_column (str, optional): Column with values to convert. Defaults to 'value'.\n"
+        "    target_value_column (str, optional): Column to store converted values. Defaults to 'value'.\n"
+        "    reversed_ (bool, optional): Perform the reverse of the exchange conversion. Defaults to False.\n"
+        "    year_format (str | None, optional): Format of the year in `year_column`. Defaults to None.\n"
+        "    update_rates (bool, optional): Update the exchange rate data. Defaults to False.\n\n"
+        "Returns:\n"
+        "    pd.DataFrame: DataFrame with converted values in the `target_value_column`.\n"
+    )
+
+
+def _exchange(exchange_source_cls):
+    """Decorator to create exchange wrappers with a specific source."""
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(
+            data: pd.DataFrame,
+            *,
+            source_currency: str = "USA",
+            target_currency: str = "USA",
+            id_column: str = "iso_code",
+            year_column: str = "year",
+            use_source_codes: bool = False,
+            value_column: str = "value",
+            target_value_column: str = "value",
+            reversed_: bool = False,
+            year_format: str | None = None,
+            update_rates: bool = False,
+        ):
+            # Validate input parameters
+            if not isinstance(data, pd.DataFrame):
+                raise ValueError("The 'data' parameter must be a pandas DataFrame.")
+
+            if id_column not in data.columns:
+                raise ValueError(
+                    f"The id_column '{id_column}' is not in the DataFrame."
+                )
+            if year_column not in data.columns:
+                raise ValueError(
+                    f"The year_column '{year_column}' is not in the DataFrame."
+                )
+            if value_column not in data.columns:
+                raise ValueError(
+                    f"The value_column '{value_column}' is not in the DataFrame."
+                )
+
+            # Copy the data to avoid modifying the original
+            to_exchange = data.copy()
+
+            # Initialize the exchange rates source
+            source = exchange_source_cls(update=update_rates)
+
+            # Create an exchange object
+            exchange = BaseExchange(
+                exchange_source=source,
+                source_currency=source_currency,
+                target_currency=target_currency,
+                use_source_codes=use_source_codes,
+            )
+
+            # Exchange the data
+            return exchange.exchange(
+                data=to_exchange,
+                entity_column=id_column,
+                year_column=year_column,
+                value_column=value_column,
+                target_value_column=target_value_column,
+                year_format=year_format,
+                reversed_=reversed_,
+            )
+
+        # Attach the source-specific docstring to the wrapper
+        wrapper.__doc__ = _generate_docstring(exchange_source_cls.__name__)
+        return wrapper
+
+    return decorator
+
+
+@_exchange(DAC)
+def oecd_dac_exchange(
+    data: pd.DataFrame,
+    *,
+    source_currency: str = "USA",
+    target_currency: str = "USA",
+    id_column: str = "iso_code",
+    year_column: str = "year",
+    use_source_codes: bool = False,
+    value_column: str = "value",
+    target_value_column: str = "value",
+    reversed_: bool = False,
+    year_format: str | None = None,
+    update_rates: bool = False,
+) -> pd.DataFrame: ...
+
+
+@_exchange(WorldBank)
+def wb_exchange(
+    data: pd.DataFrame,
+    *,
+    source_currency: str = "USA",
+    target_currency: str = "USA",
+    id_column: str = "iso_code",
+    year_column: str = "year",
+    use_source_codes: bool = False,
+    value_column: str = "value",
+    target_value_column: str = "value",
+    reversed_: bool = False,
+    year_format: str | None = None,
+    update_rates: bool = False,
+) -> pd.DataFrame: ...
+
+
+@_exchange(IMF)
+def imf_exchange(
+    data: pd.DataFrame,
+    *,
+    source_currency: str = "USA",
+    target_currency: str = "USA",
+    id_column: str = "iso_code",
+    year_column: str = "year",
+    use_source_codes: bool = False,
+    value_column: str = "value",
+    target_value_column: str = "value",
+    reversed_: bool = False,
+    year_format: str | None = None,
+    update_rates: bool = False,
+) -> pd.DataFrame: ...
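
The decorated stubs above form the new public API for currency conversion. A minimal usage sketch, based only on the signatures in this diff: the DataFrame is invented, and the assumption that ISO3 entity codes (like the 'USA' defaults) also identify target currencies is not confirmed by the diff itself.

import pandas as pd

from pydeflate.exchange.exchangers import imf_exchange

# Illustrative data, expressed in current US dollars
data = pd.DataFrame(
    {"iso_code": ["CAN", "CAN"], "year": [2020, 2021], "value": [100.0, 110.0]}
)

# Convert USD values to Canadian dollars using IMF exchange rates,
# writing results to a new column instead of overwriting 'value'
converted = imf_exchange(
    data,
    source_currency="USA",
    target_currency="CAN",  # assumed: entity codes identify currencies
    target_value_column="value_can",
)
print(converted)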
pydeflate/pydeflate_config.py
CHANGED

@@ -1,5 +1,5 @@
-from pathlib import Path
 import logging
+from pathlib import Path
 
 
 class PYDEFLATE_PATHS:
@@ -11,24 +11,33 @@ class PYDEFLATE_PATHS:
     test_data = package / "tests" / "test_files"
 
 
-
-
+def setup_logger(name) -> logging.Logger:
+    """Set up the logger.
+
+    Args:
+        name (str): The name of the logger.
+
+    Returns:
+        logging.Logger: The logger.
+
+    """
+    logger_ = logging.getLogger(name)
+    logger_.setLevel(logging.INFO)
 
-#
-
+    # Only add handlers if the logger has none to avoid duplication
+    if not logger_.hasHandlers():
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)
 
-
-
-
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s:\n %(message)s"
+        )
+        console_handler.setFormatter(formatter)
 
-
-
+        logger_.addHandler(console_handler)
+        logger_.propagate = False
 
-
-shell_formatter = logging.Formatter(fmt_shell)
+    return logger_
 
-# Add formatters to handlers
-shell_handler.setFormatter(shell_formatter)
 
-
-logger.addHandler(shell_handler)
+logger = setup_logger("pydeflate")
pydeflate/sources/__init__.py
File without changes

pydeflate/sources/common.py
ADDED

@@ -0,0 +1,278 @@
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Literal
+
+import pandas as pd
+from hdx.location.country import Country
+
+from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+
+AvailableDeflators = Literal["NGDP_D", "NGDP_DL", "CPI", "PCPI", "PCPIE"]
+
+
+def check_file_age(file: Path) -> int:
+    """Check the age of a cached data file in days.
+
+    Args:
+        file (Path): The parquet file to check.
+
+    Returns:
+        int: The number of days since the file was created.
+    """
+    current_date = datetime.today()
+    # Extract date from the filename (format: <name>_YYYY-MM-DD.parquet)
+    file_date = datetime.strptime(file.stem.split("_")[1], "%Y-%m-%d")
+
+    # Return the difference in days between today and the file's date
+    return (current_date - file_date).days
+
+
+def enforce_pyarrow_types(df: pd.DataFrame) -> pd.DataFrame:
+    """Ensure that a DataFrame uses pyarrow dtypes."""
+    return df.convert_dtypes(dtype_backend="pyarrow")
+
+
+def today() -> str:
+    from datetime import datetime
+
+    return datetime.today().strftime("%Y-%m-%d")
+
+
+def _match_regex_to_iso3(
+    to_match: list[str], additional_mapping: dict | None
+) -> dict[str, str]:
+    """Match a list of regex strings to ISO3 country codes.
+
+    Args:
+        to_match (list): A list of regex strings to match.
+        additional_mapping (dict | None): Optional extra string-to-ISO3 mappings.
+
+    Returns:
+        dict: A dictionary with the regex strings as keys and the ISO3 codes as values.
+    """
+    if additional_mapping is None:
+        additional_mapping = {}
+
+    # Create a Country object
+    country = Country()
+
+    # Match the regex strings to ISO3 country codes
+    matches = {}
+
+    for match in to_match:
+        match_ = country.get_iso3_country_code_fuzzy(match)[0]
+        matches[match] = match_
+        if match_ is None and match not in additional_mapping:
+            logger.debug(f"No ISO3 match found for {match}")
+
+    return matches | additional_mapping
+
+
+def convert_id(
+    series: pd.Series,
+    from_type: str = "regex",
+    to_type: str = "ISO3",
+    not_found: Any = None,
+    *,
+    additional_mapping: dict = None,
+) -> pd.Series:
+    """Take a pandas Series of country IDs and convert them to the desired type.
+
+    Args:
+        series: the pandas Series to convert.
+        from_type: the classification type according to which the series is encoded.
+            For example: ISO3, ISO2, regex, DACCode.
+        to_type: the target classification type. Same options as from_type.
+        not_found: what to do if the value is not found. Can pass a string or None.
+            If None, the original value is passed through.
+        additional_mapping: Optionally, a dictionary with additional mappings can be used.
+            The keys are the values to be converted and the values are the converted values.
+            The keys follow the same datatype as the original values. The values must follow
+            the same datatype as the target type.
+    """
+
+    # If from and to are the same, return without changing anything
+    if from_type == to_type:
+        return series
+
+    mapping_functions = {"regex": _match_regex_to_iso3}
+
+    # Get the unique values for mapping. This is done in order to significantly improve
+    # the performance of the fuzzy matching with very long datasets.
+    s_unique = series.unique()
+
+    # Create a correspondence dictionary
+    mapping = mapping_functions[from_type](
+        to_match=s_unique, additional_mapping=additional_mapping
+    )
+
+    return series.map(mapping).fillna(series if not_found is None else not_found)
+
+
+def add_pydeflate_iso3(
+    df: pd.DataFrame, column: str, from_type: str = "regex", fillna: Any = pd.NA
+) -> pd.DataFrame:
+    """Add a column with ISO3 country codes to a dataframe.
+
+    Args:
+        df (pd.DataFrame): The dataframe to add the column to.
+        column (str): The column containing the country codes.
+        from_type (str): The classification type of the country codes.
+        fillna (Any): The value to use when the country code is not found.
+
+    Returns:
+        pd.DataFrame: The dataframe with the added ISO3 column.
+    """
+    # Convert the country codes to ISO3
+    df["pydeflate_iso3"] = convert_id(
+        df[column].fillna(""),
+        from_type=from_type,
+        to_type="ISO3",
+        not_found=fillna,
+        additional_mapping={
+            "World": "WLD",
+            "EU Institutions": "EUI",
+            "DAC countries": "DAC",
+            "Kosovo": "XXK",
+            "G7": "G7C",
+            "Sub-Sahara Africa": "SSA",
+        },
+    )
+
+    return df
+
+
+def prefix_pydeflate_to_columns(
+    df: pd.DataFrame, prefix: str = "pydeflate_"
+) -> pd.DataFrame:
+    """Add a prefix to all columns in a DataFrame.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to add the prefix to.
+        prefix (str): The prefix to add to the column names.
+
+    Returns:
+        pd.DataFrame: The DataFrame with the prefixed column names.
+    """
+    df.columns = [
+        f"{prefix}{col}" if not col.startswith(prefix) else col for col in df.columns
+    ]
+
+    return df
+
+
+def identify_base_year(df: pd.DataFrame, measure: str, year: str = "year") -> int:
+    """Identify the base year for a given measure where the value is equal to 100.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing the deflator data with 'year' and the given measure.
+        measure (str): The column name for the measure to find the base year for.
+        year (str): The column name for the year.
+
+    Returns:
+        int: The base year, or None if no base year is found.
+    """
+    # Find the year where the deflator measure is exactly 100 (or very close)
+    base_year = df.loc[df[measure].round(2) == 100, year]
+
+    # Return the year if found, otherwise return None
+    return base_year.iloc[0] if not base_year.empty else None
+
+
+def compute_exchange_deflator(
+    df: pd.DataFrame,
+    base_year_measure: str | None = None,
+    exchange: str = "EXCHANGE",
+    year: str = "year",
+    grouper: list[str] = None,
+) -> pd.DataFrame:
+    """Compute the exchange rate deflator for each group of entities.
+
+    This function calculates a deflator for the exchange rate by identifying a base year
+    where the base_year_measure is 100, then normalizing exchange values to that base year.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame containing columns 'year' and 'EXCHANGE'.
+        base_year_measure (str): The column name for the measure to find the base year for.
+        exchange (str): The column name for the exchange rate.
+        year (str): The column name for the year.
+        grouper (list): List of columns to group by before applying the deflator.
+
+    Returns:
+        pd.DataFrame: DataFrame with an additional column for the exchange rate deflator.
+    """
+
+    def _add_deflator(
+        group: pd.DataFrame,
+        measure: str | None = "NGDPD_D",
+        exchange: str = "EXCHANGE",
+        year: str = "year",
+    ) -> pd.DataFrame:
+        # Identify the base year for the deflator
+        if measure is not None:
+            base_year = identify_base_year(group, measure=measure, year=year)
+        else:
+            base_year = group.dropna(subset=exchange)[year].max()
+
+        # If no base year is found, return the group unchanged
+        if base_year is None or pd.isna(base_year):
+            return group
+
+        # Extract the exchange rate value for the base year
+        base_value = group.loc[group[year] == base_year, exchange].values
+
+        # If base value is found and valid, calculate the deflator
+        if base_value.size > 0 and pd.notna(base_value[0]):
+            group[f"{exchange}_D"] = round(100 * group[exchange] / base_value[0], 6)
+
+        return group
+
+    if grouper is None:
+        grouper = ["entity", "entity_code"]
+
+    # Apply the deflator computation for each group of 'entity' and 'entity_code'
+    return df.groupby(grouper, group_keys=False).apply(
+        _add_deflator, measure=base_year_measure, exchange=exchange, year=year
+    )
+
+
+def read_data(
+    file_finder_func: callable,
+    download_func: callable,
+    data_name: str,
+    update: bool = False,
+) -> pd.DataFrame:
+    """Generic function to read data from parquet files or download fresh data.
+
+    Args:
+        file_finder_func (function): Function to find existing data files in the path.
+        download_func (function): Function to download fresh data if no files are
+            found or an update is needed.
+        data_name (str): Name of the dataset for logging purposes (e.g., "WEO", "DAC").
+        update (bool): If True, forces downloading of new data even if files exist.
+
+    Returns:
+        pd.DataFrame: The latest available data.
+    """
+    # Find existing files using the provided file finder function
+    files = file_finder_func(PYDEFLATE_PATHS.data)
+
+    # If no files are found or update is requested, download new data
+    if len(files) == 0 or update:
+        download_func()
+        files = file_finder_func(PYDEFLATE_PATHS.data)
+
+    # If files are found, sort them by age and load the most recent one
+    if len(files) > 0:
+        files = sorted(files, key=check_file_age)
+        latest_file = files[0]
+
+        # Check if the latest file is older than 120 days and log a warning
+        if check_file_age(latest_file) > 120:
+            logger.warning(
+                f"The latest {data_name} data is more than 120 days old.\n"
+                f"Consider updating by setting update=True in the function call."
+            )
+
+        # Read and return the latest parquet file as a DataFrame
+        logger.info(f"Reading {data_name} data from {latest_file}")
+        return pd.read_parquet(latest_file)
pydeflate/sources/dac.py
ADDED

@@ -0,0 +1,137 @@
+from pathlib import Path
+
+import pandas as pd
+from oda_reader import download_dac1
+
+from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+from pydeflate.sources.common import (
+    today,
+    add_pydeflate_iso3,
+    enforce_pyarrow_types,
+    compute_exchange_deflator,
+    read_data,
+    prefix_pydeflate_to_columns,
+)
+
+
+def _find_dac_files_in_path(path: Path) -> list:
+    """Find all DAC parquet files in the specified directory.
+
+    Args:
+        path (Path): The directory path to search for DAC parquet files.
+
+    Returns:
+        list: List of DAC parquet files found in the directory.
+    """
+    return list(path.glob("dac_*.parquet"))
+
+
+def _to_units(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert DAC values (in millions) to units.
+
+    Args:
+        df (pd.DataFrame): Dataframe with raw observation values.
+
+    Returns:
+        pd.DataFrame: Dataframe with scaled observation values.
+    """
+    df = df.copy()
+    df["value"] = df["value"] * df["unit_multiplier"]
+    return df
+
+
+def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+    query = (
+        "(aidtype_code == 1010 & flows_code == 1140 & year < 2018) | "
+        "(aidtype_code == 11010 & flows_code == 1160 & year >= 2018)"
+    )
+
+    return df.query(query)
+
+
+def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
+    columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+
+    return df.filter(columns)
+
+
+def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
+    return df.pivot(
+        index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+        columns="amounttype_code",
+        values="value",
+    ).reset_index()
+
+
+def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
+    # The values for certain providers should be 1
+    df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
+        lambda d: d.donor_code >= 20000, "A"
+    ]
+    df["EXCHANGE"] = round(df["N"] / df["A"], 6).fillna(1)
+    return df
+
+
+def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+    df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
+    return df
+
+
+def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
+    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+
+    return df
+
+
+def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
+    return df.rename(
+        columns={
+            "donor_code": "entity_code",
+            "donor_name": "entity",
+        }
+    )
+
+
+def download_dac():
+    # Use oda_reader to get the data
+    df = download_dac1(
+        filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
+    )
+
+    # Clean the data
+    df = (
+        df.pipe(_to_units)
+        .pipe(_keep_official_definition_only)
+        .pipe(_pivot_amount_type)
+        .pipe(_compute_exchange)
+        .pipe(_compute_dac_deflator)
+        .pipe(_keep_useful_columns)
+        .pipe(add_pydeflate_iso3, column="donor_name", from_type="regex")
+        .pipe(_rename_columns)
+        .pipe(compute_exchange_deflator, base_year_measure="DAC_DEFLATOR")
+        .pipe(_compute_dac_gdp_deflator)
+        .pipe(prefix_pydeflate_to_columns)
+        .pipe(enforce_pyarrow_types)
+        .reset_index(drop=True)
+    )
+
+    # Get today's date to use as a file suffix
+    suffix = today()
+
+    # Save the data
+    df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+
+
+def read_dac(update: bool = False) -> pd.DataFrame:
+    """Read the latest DAC data from parquet files or download fresh data."""
+    return read_data(
+        file_finder_func=_find_dac_files_in_path,
+        download_func=download_dac,
+        data_name="DAC",
+        update=update,
+    )
+
+
+if __name__ == "__main__":
+    df = read_dac(update=True)