pydeflate 2.1.3__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydeflate/sources/common.py CHANGED
@@ -1,41 +1,19 @@
- from datetime import datetime
- from pathlib import Path
+ from __future__ import annotations
+
  from typing import Any, Literal

  import pandas as pd
  from hdx.location.country import Country

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.pydeflate_config import logger

  AvailableDeflators = Literal["NGDP_D", "NGDP_DL", "CPI", "PCPI", "PCPIE"]


- def check_file_age(file: Path) -> int:
-     """Check the age of a WEO file in days.
-
-     Args:
-         file (Path): The WEO parquet file to check.
-
-     Returns:
-         int: The number of days since the file was created.
-     """
-     current_date = datetime.today()
-     # Extract date from the filename (format: weo_YYYY-MM-DD.parquet)
-     file_date = datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")
-
-     # Return the difference in days between today and the file's date
-     return (current_date - file_date).days
-
-
  def enforce_pyarrow_types(df: pd.DataFrame) -> pd.DataFrame:
-     """Ensures that a DataFrame uses pyarrow dtypes."""
-     return df.convert_dtypes(dtype_backend="pyarrow")
-
-
- def today() -> str:
-     from datetime import datetime
+     """Ensure that a DataFrame uses pyarrow-backed dtypes."""

-     return datetime.today().strftime("%Y-%m-%d")
+     return df.convert_dtypes(dtype_backend="pyarrow")


  def _match_regex_to_iso3(
@@ -52,20 +30,17 @@ def _match_regex_to_iso3(
      if additional_mapping is None:
          additional_mapping = {}

-     # Create a Country object
      country = Country()
-
-     # Match the regex strings to ISO3 country codes
-     matches = {}
+     matches: dict[str, str | None] = {}

      for match in to_match:
          try:
              match_ = country.get_iso3_country_code_fuzzy(match)[0]
-         except:
+         except Exception: # pragma: no cover - defensive logging
              match_ = None
          matches[match] = match_
          if match_ is None and match not in additional_mapping:
-             logger.debug(f"No ISO3 match found for {match}")
+             logger.debug("No ISO3 match found for %s", match)

      return matches | additional_mapping

@@ -76,7 +51,7 @@ def convert_id(
      to_type: str = "ISO3",
      not_found: Any = None,
      *,
-     additional_mapping: dict = None,
+     additional_mapping: dict | None = None,
  ) -> pd.Series:
      """Takes a Pandas' series with country IDs and converts them into the desired type.

@@ -93,7 +68,6 @@ def convert_id(
          the same datatype as the target type.
      """

-     # if from and to are the same, return without changing anything
      if from_type == to_type:
          return series

@@ -107,7 +81,6 @@ def convert_id(
      mapping = mapping_functions[from_type](
          to_match=s_unique, additional_mapping=additional_mapping
      )
-
      return series.map(mapping).fillna(series if not_found is None else not_found)


@@ -141,7 +114,6 @@ def add_pydeflate_iso3(
              "Sub-Sahara Africa": "SSA",
          },
      )
-
      return df


@@ -160,7 +132,6 @@ def prefix_pydeflate_to_columns(
      df.columns = [
          f"{prefix}{col}" if not col.startswith(prefix) else col for col in df.columns
      ]
-
      return df


@@ -187,7 +158,7 @@ def compute_exchange_deflator(
      base_year_measure: str | None = None,
      exchange: str = "EXCHANGE",
      year: str = "year",
-     grouper: list[str] = None,
+     grouper: list[str] | None = None,
  ) -> pd.DataFrame:
      """Compute the exchange rate deflator for each group of entities.

@@ -205,87 +176,68 @@ def compute_exchange_deflator(
          pd.DataFrame: DataFrame with an additional column for the exchange rate deflator.
      """

-     def _add_deflator(
+     def _compute_deflator_for_group(
          group: pd.DataFrame,
-         measure: str | None = "NGDPD_D",
-         exchange: str = "EXCHANGE",
-         year: str = "year",
+         measure: str | None,
+         exchange_col: str,
+         year_col: str,
+         deflator_col: str,
      ) -> pd.DataFrame:
-
-         # if needed, clean exchange name
-         if exchange.endswith("_to") or exchange.endswith("_from"):
-             exchange_name = exchange.rsplit("_", 1)[0]
-         else:
-             exchange_name = exchange
-
-         # Identify the base year for the deflator
+         """Compute deflator for a single group and add it as a column."""
+         # Identify base year
          if measure is not None:
-             base_year = identify_base_year(group, measure=measure, year=year)
+             base_year = identify_base_year(group, measure=measure, year=year_col)
          else:
-             base_year = group.dropna(subset=exchange)[year].max()
+             valid_rows = group.dropna(subset=[exchange_col])
+             base_year = valid_rows[year_col].max() if not valid_rows.empty else None

-         # If no base year is found, return the group unchanged
+         # If no base year found, return group without deflator column
          if base_year is None or pd.isna(base_year):
              return group

          # Extract the exchange rate value for the base year
-         base_value = group.loc[group[year] == base_year, exchange].values
+         base_value_rows = group.loc[group[year_col] == base_year, exchange_col]

-         # If base value is found and valid, calculate the deflator
-         if base_value.size > 0 and pd.notna(base_value[0]):
-             group[f"{exchange_name}_D"] = round(
-                 100 * group[exchange] / base_value[0], 6
-             )
+         # If no valid base value, return group without deflator column
+         if base_value_rows.empty or pd.isna(base_value_rows.iloc[0]):
+             return group
+
+         # Calculate and add deflator column
+         base_value = base_value_rows.iloc[0]
+         group = group.copy()
+         group[deflator_col] = round(100 * group[exchange_col] / base_value, 6)

          return group

      if grouper is None:
          grouper = ["entity", "entity_code"]

-     # Apply the deflator computation for each group of 'entity' and 'entity_code'
-     return df.groupby(grouper, group_keys=False).apply(
-         _add_deflator, measure=base_year_measure, exchange=exchange, year=year
-     )
-
-
- def read_data(
-     file_finder_func: callable,
-     download_func: callable,
-     data_name: str,
-     update: bool = False,
- ) -> pd.DataFrame:
-     """Generic function to read data from parquet files or download fresh data.
-
-     Args:
-         file_finder_func (function): Function to find existing data files in the path.
-         download_func (function): Function to download fresh data if no files are
-             found or an update is needed.
-         data_name (str): Name of the dataset for logging purposes (e.g., "WEO", "DAC").
-         update (bool): If True, forces downloading of new data even if files exist.
-
-     Returns:
-         pd.DataFrame: The latest available data.
-     """
-     # Find existing files using the provided file finder function
-     files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If no files are found or update is requested, download new data
-     if len(files) == 0 or update:
-         download_func()
-         files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If files are found, sort them by age and load the most recent one
-     if len(files) > 0:
-         files = sorted(files, key=check_file_age)
-         latest_file = files[0]
-
-         # Check if the latest file is older than 120 days and log a warning
-         if check_file_age(latest_file) > 120:
-             logger.warn(
-                 f"The latest {data_name} data is more than 120 days old.\n"
-                 f"Consider updating by setting update=True in the function call."
-             )
-
-         # Read and return the latest parquet file as a DataFrame
-         logger.info(f"Reading {data_name} data from {latest_file}")
-         return pd.read_parquet(latest_file)
+     # Determine the exchange column name for the deflator
+     if exchange.endswith("_to") or exchange.endswith("_from"):
+         exchange_name = exchange.rsplit("_", 1)[0]
+     else:
+         exchange_name = exchange
+
+     deflator_col = f"{exchange_name}_D"
+
+     # Process each group and concatenate results
+     # This approach avoids the FutureWarning from groupby().apply() operating on grouping columns
+     processed_groups = []
+     for name, group in df.groupby(grouper, sort=False):
+         processed_group = _compute_deflator_for_group(
+             group=group,
+             measure=base_year_measure,
+             exchange_col=exchange,
+             year_col=year,
+             deflator_col=deflator_col,
+         )
+         processed_groups.append(processed_group)
+
+     # Concatenate all processed groups and restore original row order
+     result = pd.concat(processed_groups, ignore_index=False)
+
+     # Sort by index to restore original row order
+     # (groupby may have changed the order when grouping rows together)
+     result = result.sort_index()
+
+     return result
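
Not part of the diff: a minimal usage sketch of the refactored compute_exchange_deflator, using made-up data and assuming the DataFrame is still passed as the first argument. With base_year_measure left as None, each group's latest year with a non-null exchange rate becomes the base (deflator = 100), and the new implementation restores the original row order via the index.

import pandas as pd

from pydeflate.sources.common import compute_exchange_deflator

# Illustrative data only (hypothetical values, not from the package).
df = pd.DataFrame(
    {
        "entity": ["France", "France", "Japan", "Japan"],
        "entity_code": [4, 4, 701, 701],
        "year": [2022, 2023, 2022, 2023],
        "EXCHANGE": [0.95, 0.92, 131.5, 140.5],
    }
)

# Adds an "EXCHANGE_D" column; 2023 becomes the base year (100) within each group.
deflated = compute_exchange_deflator(df, exchange="EXCHANGE", year="year")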
pydeflate/sources/dac.py CHANGED
@@ -1,71 +1,58 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from oda_reader import download_dac1

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )


- def _find_dac_files_in_path(path: Path) -> list:
-     """Find all DAC parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for DAC parquet files.
-
-     Returns:
-         list: List of DAC parquet files found in the directory.
-     """
-     return list(path.glob("dac_*.parquet"))
-
-
  def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-     """Convert DAC values (in million) to units.
-
-     Args:
-         df (pd.DataFrame): Dataframe with raw observation values.
+     """Scale reported DAC values (supplied in millions) into base units."""

-     Returns:
-         pd.DataFrame: Dataframe with scaled observation values.
-     """
      df = df.copy()
      df["value"] = df["value"] * df["unit_multiplier"]
      return df


  def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+     """Retain rows matching the official DAC definition across regime changes."""
+
      query = (
          "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
          "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
      )
-
      return df.query(query)


  def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-     columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+     """Select the key columns used downstream in pydeflate."""

-     return df.filter(columns)
+     return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])


  def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+     """Pivot amount-type codes into separate columns (A/N/D)."""
+
      df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
      return df.pivot(
-         index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+         index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
          columns="amounttype_code",
          values="value",
      ).reset_index()


  def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-     # The values for certain providers should be 1
+     """Derive exchange rates, forcing DAC aggregates to unity."""
+
      df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
          lambda d: d.donor_code >= 20000, "A"
      ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:


  def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+     """Calculate the published DAC price deflator from amounts A/D."""
+
      df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
      return df


  def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+     """Back out a GDP-style deflator using the exchange deflator."""

+     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
      return df


  def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-     return df.rename(
-         columns={
-             "donor_code": "entity_code",
-             "donor_name": "entity",
-         }
-     )
+     """Align donor metadata with pydeflate naming conventions."""

+     return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})

- def download_dac():
-     # Use oda_reader to get the data
+
+ def _download_dac(output_path: Path) -> None:
+     """Download and cache the DAC statistics parquet file."""
+
+     logger.info("Downloading DAC statistics from ODA reader...")
      df = download_dac1(
          filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
      )
-
-     # Clean the data
      df = (
          df.pipe(_to_units)
          .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved DAC dataset to %s", output_path)

-     # Get today's date to use as a file suffix
-     suffix = today()

-     # Save the data
-     df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+ _DAC_ENTRY = CacheEntry(
+     key="dac_stats",
+     filename="dac.parquet",
+     fetcher=_download_dac,
+     ttl_days=30,
+ )


  def read_dac(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_dac_files_in_path,
-         download_func=download_dac,
-         data_name="DAC",
-         update=update,
-     )
+     path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     df = read_dac(update=True)
+ if __name__ == "__main__": # pragma: no cover
+     read_dac(update=True)
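
Not part of the diff: a usage sketch of the new cache-backed DAC reader. The public shape of read_dac is unchanged; the data now lives in a single dac.parquet managed by the cache entry above (30-day TTL), and update=True forces a refresh.

from pydeflate.sources.dac import read_dac

dac = read_dac()                    # served from the cached dac.parquet while still fresh
dac_latest = read_dac(update=True)  # forces a re-download through the cache manager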
pydeflate/sources/imf.py CHANGED
@@ -1,15 +1,16 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from imf_reader import weo

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )

@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:


  def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-     """Pivot the concept code column to get a wide format for the data.
+     """Pivot the concept dimension so each indicator becomes a column

      Args:
          df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
          pd.DataFrame: Dataframe with concept code pivoted to columns.
      """
      return df.pivot(
-         index=[c for c in df.columns if c not in ["concept_code", "value"]],
+         index=[c for c in df.columns if c not in {"concept_code", "value"}],
          columns="concept_code",
          values="value",
      ).reset_index()
@@ -171,15 +172,13 @@ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
      df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
          df.entity_code == 998, "year"
      ].map(eur)
-
      return df


- def download_weo() -> None:
-     """Download the WEO data, process it, and save it to a parquet file."""
-     logger.info("Downloading the latest WEO data...")
+ def _download_weo(output_path: Path) -> None:
+     """Fetch, transform, and store the latest WEO dataset in Parquet format."""

-     # Fetch and process the data through a pipeline of transformations
+     logger.info("Downloading the latest IMF WEO dataset...")
      df = (
          weo.fetch_data()
          .pipe(_filter_indicators)
@@ -195,38 +194,23 @@ def download_weo() -> None:
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
-
-     # Get today's date to use as a file suffix
-     suffix = today()
-
-     # Save the processed dataframe to parquet format
-     df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-     logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved WEO data to %s", output_path)


- def _find_weo_files_in_path(path: Path) -> list:
-     """Find all WEO parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for WEO parquet files.
-
-     Returns:
-         list: List of WEO parquet files found in the directory.
-     """
-     return list(path.glob("weo_*.parquet"))
+ _IMF_CACHE_ENTRY = CacheEntry(
+     key="imf_weo",
+     filename="imf_weo.parquet",
+     fetcher=_download_weo,
+     ttl_days=60,
+ )


  def read_weo(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_weo_files_in_path,
-         download_func=download_weo,
-         data_name="WEO",
-         update=update,
-     )
+     path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     # Download the WEO data
-     dfi = read_weo(update=True)
+ if __name__ == "__main__": # pragma: no cover
+     read_weo(update=True)
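
Not part of the diff: the shared pattern both sources now follow. Only the CacheEntry fields and the cache_manager().ensure() call visible above are assumed here; the internals of pydeflate.cache are not shown in this diff, and the example source below is hypothetical.

from pathlib import Path

import pandas as pd

from pydeflate.cache import CacheEntry, cache_manager


def _download_example(output_path: Path) -> None:
    # Hypothetical fetcher: write the downloaded dataset to the path the cache assigns.
    pd.DataFrame({"year": [2023], "EXCHANGE": [1.0]}).to_parquet(output_path)


# Same four fields as the DAC and WEO entries above; names here are illustrative.
_EXAMPLE_ENTRY = CacheEntry(
    key="example_source",
    filename="example.parquet",
    fetcher=_download_example,
    ttl_days=30,
)


def read_example(update: bool = False) -> pd.DataFrame:
    path = cache_manager().ensure(_EXAMPLE_ENTRY, refresh=update)
    return pd.read_parquet(path)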