pydeflate-2.1.3-py3-none-any.whl → pydeflate-2.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydeflate/sources/dac.py CHANGED
@@ -1,71 +1,58 @@
+from __future__ import annotations
+
 from pathlib import Path
 
 import pandas as pd
 from oda_reader import download_dac1
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 
 
-def _find_dac_files_in_path(path: Path) -> list:
-    """Find all DAC parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for DAC parquet files.
-
-    Returns:
-        list: List of DAC parquet files found in the directory.
-    """
-    return list(path.glob("dac_*.parquet"))
-
-
 def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert DAC values (in million) to units.
-
-    Args:
-        df (pd.DataFrame): Dataframe with raw observation values.
+    """Scale reported DAC values (supplied in millions) into base units."""
 
-    Returns:
-        pd.DataFrame: Dataframe with scaled observation values.
-    """
     df = df.copy()
     df["value"] = df["value"] * df["unit_multiplier"]
     return df
 
 
 def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+    """Retain rows matching the official DAC definition across regime changes."""
+
     query = (
         "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
         "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
     )
-
     return df.query(query)
 
 
 def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-    columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+    """Select the key columns used downstream in pydeflate."""
 
-    return df.filter(columns)
+    return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])
 
 
 def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+    """Pivot amount-type codes into separate columns (A/N/D)."""
+
     df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
     return df.pivot(
-        index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+        index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
         columns="amounttype_code",
         values="value",
     ).reset_index()
 
 
 def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-    # The values for certain providers should be 1
+    """Derive exchange rates, forcing DAC aggregates to unity."""
+
     df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
         lambda d: d.donor_code >= 20000, "A"
     ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate the published DAC price deflator from amounts A/D."""
+
     df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
     return df
 
 
 def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+    """Back out a GDP-style deflator using the exchange deflator."""
 
+    df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
     return df
 
 
 def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-    return df.rename(
-        columns={
-            "donor_code": "entity_code",
-            "donor_name": "entity",
-        }
-    )
+    """Align donor metadata with pydeflate naming conventions."""
 
+    return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})
 
-def download_dac():
-    # Use oda_reader to get the data
+
+def _download_dac(output_path: Path) -> None:
+    """Download and cache the DAC statistics parquet file."""
+
+    logger.info("Downloading DAC statistics from ODA reader...")
     df = download_dac1(
         filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
     )
-
-    # Clean the data
     df = (
         df.pipe(_to_units)
         .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved DAC dataset to %s", output_path)
 
-    # Get today's date to use as a file suffix
-    suffix = today()
 
-    # Save the data
-    df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+_DAC_ENTRY = CacheEntry(
+    key="dac_stats",
+    filename="dac.parquet",
+    fetcher=_download_dac,
+    ttl_days=30,
+)
 
 
 def read_dac(update: bool = False) -> pd.DataFrame:
-    """Read the latest WEO data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_dac_files_in_path,
-        download_func=download_dac,
-        data_name="DAC",
-        update=update,
-    )
+    path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    df = read_dac(update=True)
+if __name__ == "__main__":  # pragma: no cover
+    read_dac(update=True)
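Note: the pydeflate.cache module that the rewritten sources import is not included in this diff. Judging only from the call sites — CacheEntry(key=..., filename=..., fetcher=..., ttl_days=...) and cache_manager().ensure(entry, refresh=update) returning a Path — a minimal sketch of the implied contract could look like the following. Every name and the cache-directory choice below are illustrative assumptions, not the package's actual implementation:

    # Hypothetical sketch of the cache contract implied by the call sites;
    # the real pydeflate.cache module may differ.
    import time
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Callable

    @dataclass(frozen=True)
    class CacheEntry:
        key: str                         # stable identifier for the dataset
        filename: str                    # file name inside the cache directory
        fetcher: Callable[[Path], None]  # writes the dataset to the given path
        ttl_days: int = 30               # consider the file stale after this many days

    class _CacheManager:
        def __init__(self, directory: Path):
            self.directory = directory

        def ensure(self, entry: CacheEntry, refresh: bool = False) -> Path:
            """Return a fresh cached file, invoking the fetcher when needed."""
            path = self.directory / entry.filename
            stale = (
                refresh
                or not path.exists()
                or time.time() - path.stat().st_mtime > entry.ttl_days * 86_400
            )
            if stale:
                entry.fetcher(path)  # e.g. _download_dac(path)
            return path

    def cache_manager() -> _CacheManager:
        # The real package presumably resolves a per-user cache directory.
        return _CacheManager(Path.home() / ".pydeflate")

Under this reading, read_dac(update=True) simply forces the fetcher to run again; otherwise the single parquet file is reused until its TTL lapses, replacing the old date-suffixed dac_*.parquet scheme.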
pydeflate/sources/imf.py CHANGED
@@ -1,15 +1,16 @@
+from __future__ import annotations
+
 from pathlib import Path
 
 import pandas as pd
 from imf_reader import weo
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    today,
     add_pydeflate_iso3,
-    enforce_pyarrow_types,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 
@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-    """Pivot the concept code column to get a wide format for the data.
+    """Pivot the concept dimension so each indicator becomes a column
 
     Args:
         df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame: Dataframe with concept code pivoted to columns.
     """
     return df.pivot(
-        index=[c for c in df.columns if c not in ["concept_code", "value"]],
+        index=[c for c in df.columns if c not in {"concept_code", "value"}],
         columns="concept_code",
         values="value",
     ).reset_index()
@@ -171,15 +172,13 @@ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
     df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
         df.entity_code == 998, "year"
     ].map(eur)
-
     return df
 
 
-def download_weo() -> None:
-    """Download the WEO data, process it, and save it to a parquet file."""
-    logger.info("Downloading the latest WEO data...")
+def _download_weo(output_path: Path) -> None:
+    """Fetch, transform, and store the latest WEO dataset in Parquet format."""
 
-    # Fetch and process the data through a pipeline of transformations
+    logger.info("Downloading the latest IMF WEO dataset...")
     df = (
         weo.fetch_data()
         .pipe(_filter_indicators)
@@ -195,38 +194,23 @@ def download_weo() -> None:
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-    # Get today's date to use as a file suffix
-    suffix = today()
-
-    # Save the processed dataframe to parquet format
-    df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-    logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.to_parquet(output_path)
+    logger.info("Saved WEO data to %s", output_path)
 
 
-def _find_weo_files_in_path(path: Path) -> list:
-    """Find all WEO parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WEO parquet files.
-
-    Returns:
-        list: List of WEO parquet files found in the directory.
-    """
-    return list(path.glob("weo_*.parquet"))
+_IMF_CACHE_ENTRY = CacheEntry(
+    key="imf_weo",
+    filename="imf_weo.parquet",
+    fetcher=_download_weo,
+    ttl_days=60,
+)
 
 
 def read_weo(update: bool = False) -> pd.DataFrame:
-    """Read the latest WEO data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_weo_files_in_path,
-        download_func=download_weo,
-        data_name="WEO",
-        update=update,
-    )
+    path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    # Download the WEO data
-    dfi = read_weo(update=True)
+if __name__ == "__main__":  # pragma: no cover
+    read_weo(update=True)
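The behavioural constant here is the long-to-wide pivot: _pivot_concept_code still indexes on every column except concept_code and value (now expressed with a set literal, which is equivalent for membership tests). A toy frame makes the transformation concrete; the concept codes and values below are made up for illustration:

    # Illustrative only: what _pivot_concept_code does to a long-format frame.
    import pandas as pd

    long = pd.DataFrame(
        {
            "year": [2020, 2020, 2021, 2021],
            "entity_code": [111, 111, 111, 111],
            "concept_code": ["NGDP_D", "PPPEX", "NGDP_D", "PPPEX"],
            "value": [98.5, 0.71, 101.2, 0.72],
        }
    )
    wide = long.pivot(
        index=[c for c in long.columns if c not in {"concept_code", "value"}],
        columns="concept_code",
        values="value",
    ).reset_index()
    # wide has one row per (year, entity_code), with NGDP_D and PPPEX as columns.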
pydeflate/sources/world_bank.py CHANGED
@@ -1,15 +1,17 @@
+from __future__ import annotations
+
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
+from typing import Callable
 
 import pandas as pd
 import wbgapi as wb
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.cache import CacheEntry, cache_manager
+from pydeflate.pydeflate_config import logger
 from pydeflate.sources.common import (
-    enforce_pyarrow_types,
-    today,
     compute_exchange_deflator,
-    read_data,
+    enforce_pyarrow_types,
     prefix_pydeflate_to_columns,
 )
 from pydeflate.utils import emu
@@ -56,8 +58,8 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
             labels=True,
         )
         .reset_index()
-        .sort_values(by=["economy", "Time"])  # Sort for easier reading
-        .drop(columns=["Time"])  # Remove unnecessary column
+        .sort_values(by=["economy", "Time"])
+        .drop(columns=["Time"])
         .rename(
             columns={
                 "economy": "entity_code",
@@ -66,7 +68,7 @@ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame
                 series: value_name or series,
             }
         )
-        .reset_index(drop=True)  # Drop the old index after reset
+        .reset_index(drop=True)
     )
 
 
@@ -119,22 +121,17 @@ def _parallel_download_indicators(indicators: dict) -> list[pd.DataFrame]:
 
     # Use ThreadPoolExecutor to fetch indicators in parallel
     with ThreadPoolExecutor() as executor:
-        # Submit all tasks to the executor (downloading indicators in parallel)
         future_to_series = {
             executor.submit(get_wb_indicator, series, value_name): series
             for series, value_name in indicators.items()
         }
-
-        # Collect the results as they complete
         for future in as_completed(future_to_series):
             series = future_to_series[future]
             try:
                 df_ = future.result().set_index(["year", "entity_code", "entity"])
                 dfs.append(df_)
-            except Exception as exc:
-                # Log or handle any errors that occur during the download
-                logger.warning(f"Error downloading series {series}: {exc}")
-
+            except Exception as exc:  # pragma: no cover - defensive logging
+                logger.warning("Error downloading series %s: %s", series, exc)
     return dfs
 
 
@@ -151,140 +148,70 @@ def _add_ppp_ppp_exchange(df: pd.DataFrame) -> pd.DataFrame:
     """
     ppp = df.loc[lambda d: d["entity_code"] == "USA"].copy()
     ppp[["entity_code", "entity", "pydeflate_iso3"]] = "PPP"
+    return pd.concat([df, ppp], ignore_index=True)
 
-    df = pd.concat([df, ppp], ignore_index=True)
 
-    return df
-
-
-def _download_wb(
-    indicators: dict, prefix: str = "wb", add_ppp_exchange: bool = False
+def _download_wb_dataset(
+    indicators: dict, output_path: Path, add_ppp_exchange: bool = False
 ) -> None:
-    """Download multiple World Bank indicators in parallel and save as a parquet file.
-
-    This function fetches all indicators defined in _INDICATORS in parallel, concatenates
-    them into a single DataFrame, and saves the result as a parquet file using today's date as a suffix.
-    """
-    logger.info("Downloading the latest World Bank data...")
-
-    indicators_data = _parallel_download_indicators(indicators=indicators)
+    """Download and materialise a World Bank dataset to ``output_path``."""
 
-    # Concatenate all DataFrames horizontally (by columns)
+    logger.info("Downloading World Bank indicators for %s", output_path.name)
+    indicators_data = _parallel_download_indicators(indicators)
     df = pd.concat(indicators_data, axis=1).reset_index()
-
-    # cleaning
     df = (
         df.pipe(_eur_series_fix)
         .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
         .assign(pydeflate_iso3=lambda d: d.entity_code)
         .sort_values(by=["year", "entity_code"])
     )
-
     if add_ppp_exchange:
         df = df.pipe(_add_ppp_ppp_exchange)
-
     df = (
         df.pipe(prefix_pydeflate_to_columns)
         .pipe(enforce_pyarrow_types)
         .reset_index(drop=True)
     )
-
-    # Get today's date to use as a file suffix
-    suffix = today()
-
-    # Save the DataFrame as a parquet file
-    output_path = PYDEFLATE_PATHS.data / f"{prefix}_{suffix}.parquet"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
     df.to_parquet(output_path)
+    logger.info("Saved World Bank data to %s", output_path)
 
-    logger.info(f"Saved World Bank data to {prefix}_{suffix}.parquet")
-
-
-def download_wb() -> None:
-    """Download the latest World Bank data."""
-    _download_wb(indicators=_INDICATORS, prefix="wb")
-
-
-def download_wb_lcu_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_LCU_PPP, prefix="wb_lcu_ppp", add_ppp_exchange=True
-    )
-
-
-def download_wb_usd_ppp() -> None:
-    """Download the latest World Bank data (PPP)."""
-    _download_wb(
-        indicators=_INDICATORS_USD_PPP, prefix="wb_usd_ppp", add_ppp_exchange=True
-    )
 
+def _entry(
+    key: str, filename: str, fetcher: Callable[[Path], None], ttl_days: int = 30
+) -> CacheEntry:
+    return CacheEntry(key=key, filename=filename, fetcher=fetcher, ttl_days=ttl_days)
 
-def _find_wb_files_in_path(path: Path) -> list:
-    """Find all WB parquet files in the specified directory.
 
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_*.parquet"))
-
-
-def _find_wb_lcu_ppp_files_in_path(path: Path) -> list:
-    """Find all WB PPP parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_lcu_ppp_*.parquet"))
-
-
-def _find_wb_usd_ppp_files_in_path(path: Path) -> list:
-    """Find all WB PPP parquet files in the specified directory.
-
-    Args:
-        path (Path): The directory path to search for WB parquet files.
-
-    Returns:
-        list: List of WB parquet files found in the directory.
-    """
-    return list(path.glob(f"wb_usd_ppp_*.parquet"))
+_WB_ENTRY = _entry(
+    "world_bank", "wb.parquet", lambda p: _download_wb_dataset(_INDICATORS, p)
+)
+_WB_LCU_PPP_ENTRY = _entry(
+    "world_bank_lcu_ppp",
+    "wb_lcu_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_LCU_PPP, p, add_ppp_exchange=True),
+)
+_WB_USD_PPP_ENTRY = _entry(
+    "world_bank_usd_ppp",
+    "wb_usd_ppp.parquet",
+    lambda p: _download_wb_dataset(_INDICATORS_USD_PPP, p, add_ppp_exchange=True),
+)
 
 
 def read_wb(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_files_in_path,
-        download_func=download_wb,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
 def read_wb_lcu_ppp(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_lcu_ppp_files_in_path,
-        download_func=download_wb_lcu_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_LCU_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
 def read_wb_usd_ppp(update: bool = False) -> pd.DataFrame:
-    """Read the latest World Bank data from parquet files or download fresh data."""
-    return read_data(
-        file_finder_func=_find_wb_usd_ppp_files_in_path,
-        download_func=download_wb_usd_ppp,
-        data_name="World Bank",
-        update=update,
-    )
+    path = cache_manager().ensure(_WB_USD_PPP_ENTRY, refresh=update)
+    return pd.read_parquet(path)
 
 
-if __name__ == "__main__":
-    df_wb = read_wb(False)
-    df_usd = read_wb_usd_ppp(False)
-    df_lcu = read_wb_lcu_ppp(False)
+if __name__ == "__main__":  # pragma: no cover
+    read_wb(update=True)
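The parallel fetch in _parallel_download_indicators is unchanged apart from logging style: futures are mapped back to their series code so failures can be attributed to a specific indicator. The pattern is shown self-contained below, with a stub in place of the wbgapi call; the indicator codes are real World Bank series IDs, but whether pydeflate uses these exact ones is not visible in this diff:

    # Demo of the futures pattern used in _parallel_download_indicators,
    # with a stub fetcher so it runs offline.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    import pandas as pd

    def fetch_stub(series: str, value_name: str) -> pd.DataFrame:
        # Stands in for get_wb_indicator; the real code queries the API.
        return pd.DataFrame(
            {"year": [2020], "entity_code": ["USA"],
             "entity": ["United States"], value_name: [1.0]}
        )

    indicators = {"NY.GDP.DEFL.ZS": "NGDP_D", "PA.NUS.FCRF": "EXCHANGE"}
    dfs = []
    with ThreadPoolExecutor() as executor:
        future_to_series = {
            executor.submit(fetch_stub, series, name): series
            for series, name in indicators.items()
        }
        for future in as_completed(future_to_series):
            series = future_to_series[future]
            try:
                dfs.append(
                    future.result().set_index(["year", "entity_code", "entity"])
                )
            except Exception as exc:
                print(f"Error downloading series {series}: {exc}")

    combined = pd.concat(dfs, axis=1).reset_index()  # one column per indicator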
pydeflate/utils.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import re
 
 import numpy as np
 import pandas as pd
@@ -22,18 +23,25 @@ def emu() -> list:
 
 
 def clean_number(number):
-    """Clean a number and return as float"""
-    import re
+    """Clean a number-like value and return it as a float.
+
+    Preserves leading signs and scientific notation while stripping
+    formatting artifacts such as commas or surrounding text.
+    """
 
     if not isinstance(number, str):
         number = str(number)
 
-    number = re.sub(r"[^\d.]", "", number)
+    normalized = number.replace(",", "").strip()
+    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", normalized)
 
-    if number == "":
+    if not match:
         return np.nan
 
-    return float(number)
+    try:
+        return float(match.group())
+    except ValueError:
+        return np.nan
 
 
 def create_pydeflate_year(
@@ -65,9 +73,7 @@ def _use_implied_dac_rates(
     data.loc[
         lambda d: ~d[f"temp_{entity_column}"].isin(pydeflate_data[ix[-1]].unique()),
         f"temp_{entity_column}",
-    ] = (
-        20001 if source_codes else "DAC"
-    )
+    ] = 20001 if source_codes else "DAC"
 
     # Log the fact that implied rates are being used
     flag_missing_pydeflate_data(
@@ -90,7 +96,6 @@ def merge_user_and_pydeflate_data(
     source_codes: bool = True,
     dac: bool = False,
 ) -> pd.DataFrame:
-
     data[f"temp_{entity_column}"] = data[entity_column]
 
     if dac:
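The clean_number rewrite fixes a genuine bug rather than just tidying: the old re.sub(r"[^\d.]", "", number) stripped minus signs and exponent markers, so negatives came back positive and scientific notation was silently mangled. A quick check of the new behaviour, reproducing the function from the diff (docstring omitted):

    # Behaviour of the new clean_number on inputs the old regex mangled.
    import re

    import numpy as np

    def clean_number(number):
        if not isinstance(number, str):
            number = str(number)
        normalized = number.replace(",", "").strip()
        match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", normalized)
        if not match:
            return np.nan
        try:
            return float(match.group())
        except ValueError:
            return np.nan

    assert clean_number("1,234.5") == 1234.5
    assert clean_number("-3,200") == -3200.0   # old version returned 3200.0
    assert clean_number("1.2e-3") == 0.0012    # old regex collapsed this to "1.23"
    assert np.isnan(clean_number("n/a"))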