pydeflate 2.1.3__py3-none-any.whl → 2.3.0__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- pydeflate/__init__.py +92 -20
- pydeflate/cache.py +139 -0
- pydeflate/constants.py +121 -0
- pydeflate/context.py +211 -0
- pydeflate/core/api.py +33 -11
- pydeflate/core/source.py +92 -11
- pydeflate/deflate/deflators.py +1 -1
- pydeflate/deflate/get_deflators.py +233 -0
- pydeflate/deflate/legacy_deflate.py +1 -1
- pydeflate/exceptions.py +166 -0
- pydeflate/exchange/exchangers.py +1 -1
- pydeflate/exchange/get_rates.py +207 -0
- pydeflate/plugins.py +289 -0
- pydeflate/protocols.py +168 -0
- pydeflate/pydeflate_config.py +77 -6
- pydeflate/schemas.py +297 -0
- pydeflate/sources/common.py +59 -107
- pydeflate/sources/dac.py +39 -52
- pydeflate/sources/imf.py +23 -39
- pydeflate/sources/world_bank.py +44 -117
- pydeflate/utils.py +14 -9
- {pydeflate-2.1.3.dist-info → pydeflate-2.3.0.dist-info}/METADATA +251 -18
- pydeflate-2.3.0.dist-info/RECORD +34 -0
- pydeflate-2.3.0.dist-info/WHEEL +4 -0
- {pydeflate-2.1.3.dist-info → pydeflate-2.3.0.dist-info/licenses}/LICENSE +1 -1
- pydeflate-2.1.3.dist-info/RECORD +0 -25
- pydeflate-2.1.3.dist-info/WHEEL +0 -4
pydeflate/sources/dac.py
CHANGED

@@ -1,71 +1,58 @@
+from __future__ import annotations
+
 from pathlib import Path

 import pandas as pd
 from oda_reader import download_dac1

-from pydeflate.
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )


-def _find_dac_files_in_path(path: Path) -> list:
-    """Find all DAC parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for DAC parquet files.
-
-    Returns:
-        list: List of DAC parquet files found in the directory.
-    """
-    return list(path.glob("dac_*.parquet"))
-
-
 def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-    """
-
-    Args:
-        df (pd.DataFrame): Dataframe with raw observation values.
+    """Scale reported DAC values (supplied in millions) into base units."""

-    Returns:
-        pd.DataFrame: Dataframe with scaled observation values.
-    """
     df = df.copy()
     df["value"] = df["value"] * df["unit_multiplier"]
     return df


 def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+    """Retain rows matching the official DAC definition across regime changes."""
+
     query = (
         "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
         "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
     )
-
     return df.query(query)


 def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-    columns
+    """Select the key columns used downstream in pydeflate."""

-    return df.filter(
+    return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])


 def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+    """Pivot amount-type codes into separate columns (A/N/D)."""
+
     df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
     return df.pivot(
-        index=[c for c in df.columns if c not in
+        index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
         columns="amounttype_code",
         values="value",
     ).reset_index()


 def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-
+    """Derive exchange rates, forcing DAC aggregates to unity."""
+
     df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
         lambda d: d.donor_code >= 20000, "A"
     ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:


 def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate the published DAC price deflator from amounts A/D."""
+
     df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
     return df


 def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-
+    """Back out a GDP-style deflator using the exchange deflator."""

+    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
     return df


 def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-
-        columns={
-            "donor_code": "entity_code",
-            "donor_name": "entity",
-        }
-    )
+    """Align donor metadata with pydeflate naming conventions."""

+    return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})

-
-
+
+def _download_dac(output_path: Path) -> None:
+    """Download and cache the DAC statistics parquet file."""
+
+    logger.info("Downloading DAC statistics from ODA reader...")
     df = download_dac1(
         filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
     )
-
-    # Clean the data
     df = (
         df.pipe(_to_units)
         .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved DAC dataset to %s", output_path)

-    # Get today's date to use as a file suffix
-    suffix = today()

-
-
+_DAC_ENTRY = CacheEntry(
+    key="dac_stats",
+    filename="dac.parquet",
+    fetcher=_download_dac,
+    ttl_days=30,
+)


 def read_dac(update: bool = False) -> pd.DataFrame:
-
-    return
-        file_finder_func=_find_dac_files_in_path,
-        download_func=download_dac,
-        data_name="DAC",
-        update=update,
-    )
+    path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+    return pd.read_parquet(path)


-if __name__ == "__main__":
-
+if __name__ == "__main__":  # pragma: no cover
+    read_dac(update=True)
pydeflate/sources/imf.py
CHANGED

@@ -1,15 +1,16 @@
+from __future__ import annotations
+
 from pathlib import Path

 import pandas as pd
 from imf_reader import weo

-from pydeflate.
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )

@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:


 def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-    """Pivot the concept
+    """Pivot the concept dimension so each indicator becomes a column

     Args:
         df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame: Dataframe with concept code pivoted to columns.
     """
     return df.pivot(
-        index=[c for c in df.columns if c not in
+        index=[c for c in df.columns if c not in {"concept_code", "value"}],
         columns="concept_code",
         values="value",
     ).reset_index()
@@ -171,15 +172,13 @@ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
     df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
         df.entity_code == 998, "year"
     ].map(eur)
-
     return df


-def
-    """
-    logger.info("Downloading the latest WEO data...")
+def _download_weo(output_path: Path) -> None:
+    """Fetch, transform, and store the latest WEO dataset in Parquet format."""

-
+    logger.info("Downloading the latest IMF WEO dataset...")
     df = (
         weo.fetch_data()
         .pipe(_filter_indicators)
@@ -195,38 +194,23 @@ def download_weo() -> None:
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-
-
-
-    # Save the processed dataframe to parquet format
-    df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-    logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved WEO data to %s", output_path)


-
-    ""
-
-
-
-
-    Returns:
-        list: List of WEO parquet files found in the directory.
-    """
-    return list(path.glob("weo_*.parquet"))
+_IMF_CACHE_ENTRY = CacheEntry(
+    key="imf_weo",
+    filename="imf_weo.parquet",
+    fetcher=_download_weo,
+    ttl_days=60,
+)


 def read_weo(update: bool = False) -> pd.DataFrame:
-
-    return
-        file_finder_func=_find_weo_files_in_path,
-        download_func=download_weo,
-        data_name="WEO",
-        update=update,
-    )
+    path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+    return pd.read_parquet(path)


-if __name__ == "__main__":
-
-    dfi = read_weo(update=True)
+if __name__ == "__main__":  # pragma: no cover
+    read_weo(update=True)
pydeflate/sources/world_bank.py
CHANGED

@@ -1,15 +1,17 @@
+from __future__ import annotations
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+from typing import Callable

 import pandas as pd
 import wbgapi as wb

-from pydeflate.
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    enforce_pyarrow_types,
-    today,
     compute_exchange_deflator,
-
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 from pydeflate.utils import emu
@@ -56,8 +58,8 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
             labels=True,
         )
         .reset_index()
-        .sort_values(by=["economy", "Time"])
-        .drop(columns=["Time"])
+        .sort_values(by=["economy", "Time"])
+        .drop(columns=["Time"])
         .rename(
             columns={
                 "economy": "entity_code",
@@ -66,7 +68,7 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
                 series: value_name or series,
             }
         )
-        .reset_index(drop=True)
+        .reset_index(drop=True)
     )


@@ -119,22 +121,17 @@ def _parallel_download_indicators(indicators: dict) -> list[pd.DataFrame]:

     # Use ThreadPoolExecutor to fetch indicators in parallel
     with ThreadPoolExecutor() as executor:
-        # Submit all tasks to the executor (downloading indicators in parallel)
         future_to_series = {
             executor.submit(get_wb_indicator, series, value_name): series
             for series, value_name in indicators.items()
         }
-
-        # Collect the results as they complete
         for future in as_completed(future_to_series):
             series = future_to_series[future]
             try:
                 df_ = future.result().set_index(["year", "entity_code", "entity"])
                 dfs.append(df_)
-            except Exception as exc:
-
-                logger.warning(f"Error downloading series {series}: {exc}")
-
+            except Exception as exc:  # pragma: no cover - defensive logging
+                logger.warning("Error downloading series %s: %s", series, exc)
     return dfs


@@ -151,140 +148,70 @@ def _add_ppp_ppp_exchange(df: pd.DataFrame) -> pd.DataFrame:
     """
     ppp = df.loc[lambda d: d["entity_code"] == "USA"].copy()
     ppp[["entity_code", "entity", "pydeflate_iso3"]] = "PPP"
+    return pd.concat([df, ppp], ignore_index=True)

-    df = pd.concat([df, ppp], ignore_index=True)

-
-
-
-def _download_wb(
-    indicators: dict, prefix: str = "wb", add_ppp_exchange: bool = False
+def _download_wb_dataset(
+    indicators: dict, output_path: Path, add_ppp_exchange: bool = False
 ) -> None:
-    """Download
-
-    This function fetches all indicators defined in _INDICATORS in parallel, concatenates
-    them into a single DataFrame, and saves the result as a parquet file using today's date as a suffix.
-    """
-    logger.info("Downloading the latest World Bank data...")
-
-    indicators_data = _parallel_download_indicators(indicators=indicators)
+    """Download and materialise a World Bank dataset to ``output_path``."""

-
+    logger.info("Downloading World Bank indicators for %s", output_path.name)
+    indicators_data = _parallel_download_indicators(indicators)
     df = pd.concat(indicators_data, axis=1).reset_index()
-
-    # cleaning
     df = (
         df.pipe(_eur_series_fix)
         .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
         .assign(pydeflate_iso3=lambda d: d.entity_code)
         .sort_values(by=["year", "entity_code"])
     )
-
     if add_ppp_exchange:
         df = df.pipe(_add_ppp_ppp_exchange)
-
     df = (
         df.pipe(prefix_pydeflate_to_columns)
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-    # Get today's date to use as a file suffix
-    suffix = today()
-
-    # Save the DataFrame as a parquet file
-    output_path = PYDEFLATE_PATHS.data / f"{prefix}_{suffix}.parquet"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
     df.to_parquet(output_path)
+    logger.info("Saved World Bank data to %s", output_path)

-    logger.info(f"Saved World Bank data to {prefix}_{suffix}.parquet")
-
-
-def download_wb() -> None:
-    """Download the latest World Bank data."""
-    _download_wb(indicators=_INDICATORS, prefix="wb")
-
-
-def download_wb_lcu_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_LCU_PPP, prefix="wb_lcu_ppp", add_ppp_exchange=True
-    )
-
-
-def download_wb_usd_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_USD_PPP, prefix="wb_usd_ppp", add_ppp_exchange=True
-    )

+def _entry(
+    key: str, filename: str, fetcher: Callable[[Path], None], ttl_days: int = 30
+) -> CacheEntry:
+    return CacheEntry(key=key, filename=filename, fetcher=fetcher, ttl_days=ttl_days)

-def _find_wb_files_in_path(path: Path) -> list:
-    """Find all WB parquet files in the specified directory.

-
-
-
-
-
-    ""
-
-
-
-
-    ""
-
-
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_lcu_ppp_*.parquet"))
-
-
-def _find_wb_usd_ppp_files_in_path(path: Path) -> list:
-    """Find all WB PPP parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_usd_ppp_*.parquet"))
+_WB_ENTRY = _entry(
+    "world_bank", "wb.parquet", lambda p: _download_wb_dataset(_INDICATORS, p)
+)
+_WB_LCU_PPP_ENTRY = _entry(
+    "world_bank_lcu_ppp",
+    "wb_lcu_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_LCU_PPP, p, add_ppp_exchange=True),
+)
+_WB_USD_PPP_ENTRY = _entry(
+    "world_bank_usd_ppp",
+    "wb_usd_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_USD_PPP, p, add_ppp_exchange=True),
+)


 def read_wb(update: bool = False) -> pd.DataFrame:
-
-    return
-        file_finder_func=_find_wb_files_in_path,
-        download_func=download_wb,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_ENTRY, refresh=update)
+    return pd.read_parquet(path)


 def read_wb_lcu_ppp(update: bool = False) -> pd.DataFrame:
-
-    return
-        file_finder_func=_find_wb_lcu_ppp_files_in_path,
-        download_func=download_wb_lcu_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_LCU_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)


 def read_wb_usd_ppp(update: bool = False) -> pd.DataFrame:
-
-    return
-        file_finder_func=_find_wb_usd_ppp_files_in_path,
-        download_func=download_wb_usd_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_USD_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)


-if __name__ == "__main__":
-
-    df_usd = read_wb_usd_ppp(False)
-    df_lcu = read_wb_lcu_ppp(False)
+if __name__ == "__main__":  # pragma: no cover
+    read_wb(update=True)
pydeflate/utils.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import re

 import numpy as np
 import pandas as pd
@@ -22,18 +23,25 @@ def emu() -> list:


 def clean_number(number):
-    """Clean a number and return as float
-
+    """Clean a number-like value and return it as a float.
+
+    Preserves leading signs and scientific notation while stripping
+    formatting artifacts such as commas or surrounding text.
+    """

     if not isinstance(number, str):
         number = str(number)

-
+    normalized = number.replace(",", "").strip()
+    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", normalized)

-    if
+    if not match:
         return np.nan

-
+    try:
+        return float(match.group())
+    except ValueError:
+        return np.nan


 def create_pydeflate_year(
@@ -65,9 +73,7 @@ def _use_implied_dac_rates(
     data.loc[
         lambda d: ~d[f"temp_{entity_column}"].isin(pydeflate_data[ix[-1]].unique()),
         f"temp_{entity_column}",
-    ] = (
-        20001 if source_codes else "DAC"
-    )
+    ] = 20001 if source_codes else "DAC"

     # Log the fact that implied rates are being used
     flag_missing_pydeflate_data(
@@ -90,7 +96,6 @@ def merge_user_and_pydeflate_data(
     source_codes: bool = True,
     dac: bool = False,
 ) -> pd.DataFrame:
-
     data[f"temp_{entity_column}"] = data[entity_column]

     if dac: