pydeflate-2.1.2-py3-none-any.whl → pydeflate-2.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pydeflate/sources/common.py CHANGED
@@ -1,41 +1,19 @@
- from datetime import datetime
- from pathlib import Path
+ from __future__ import annotations
+
  from typing import Any, Literal

  import pandas as pd
  from hdx.location.country import Country

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.pydeflate_config import logger

  AvailableDeflators = Literal["NGDP_D", "NGDP_DL", "CPI", "PCPI", "PCPIE"]


- def check_file_age(file: Path) -> int:
-     """Check the age of a WEO file in days.
-
-     Args:
-         file (Path): The WEO parquet file to check.
-
-     Returns:
-         int: The number of days since the file was created.
-     """
-     current_date = datetime.today()
-     # Extract date from the filename (format: weo_YYYY-MM-DD.parquet)
-     file_date = datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")
-
-     # Return the difference in days between today and the file's date
-     return (current_date - file_date).days
-
-
  def enforce_pyarrow_types(df: pd.DataFrame) -> pd.DataFrame:
-     """Ensures that a DataFrame uses pyarrow dtypes."""
-     return df.convert_dtypes(dtype_backend="pyarrow")
-
-
- def today() -> str:
-     from datetime import datetime
+     """Ensure that a DataFrame uses pyarrow-backed dtypes."""

-     return datetime.today().strftime("%Y-%m-%d")
+     return df.convert_dtypes(dtype_backend="pyarrow")


  def _match_regex_to_iso3(
@@ -52,20 +30,17 @@ def _match_regex_to_iso3(
      if additional_mapping is None:
          additional_mapping = {}

-     # Create a Country object
      country = Country()
-
-     # Match the regex strings to ISO3 country codes
-     matches = {}
+     matches: dict[str, str | None] = {}

      for match in to_match:
          try:
              match_ = country.get_iso3_country_code_fuzzy(match)[0]
-         except:
+         except Exception:  # pragma: no cover - defensive logging
              match_ = None
          matches[match] = match_
          if match_ is None and match not in additional_mapping:
-             logger.debug(f"No ISO3 match found for {match}")
+             logger.debug("No ISO3 match found for %s", match)

      return matches | additional_mapping

@@ -76,7 +51,7 @@ def convert_id(
      to_type: str = "ISO3",
      not_found: Any = None,
      *,
-     additional_mapping: dict = None,
+     additional_mapping: dict | None = None,
  ) -> pd.Series:
      """Takes a Pandas' series with country IDs and converts them into the desired type.

@@ -93,7 +68,6 @@ def convert_id(
          the same datatype as the target type.
      """

-     # if from and to are the same, return without changing anything
      if from_type == to_type:
          return series

@@ -107,7 +81,6 @@ def convert_id(
      mapping = mapping_functions[from_type](
          to_match=s_unique, additional_mapping=additional_mapping
      )
-
      return series.map(mapping).fillna(series if not_found is None else not_found)


@@ -133,6 +106,7 @@ def add_pydeflate_iso3(
          not_found=fillna,
          additional_mapping={
              "World": "WLD",
+             "European Union": "EUR",
              "EU Institutions": "EUI",
              "DAC countries": "DAC",
              "Kosovo": "XXK",
@@ -140,7 +114,6 @@ def add_pydeflate_iso3(
              "Sub-Sahara Africa": "SSA",
          },
      )
-
      return df


@@ -159,7 +132,6 @@ def prefix_pydeflate_to_columns(
      df.columns = [
          f"{prefix}{col}" if not col.startswith(prefix) else col for col in df.columns
      ]
-
      return df


@@ -186,7 +158,7 @@ def compute_exchange_deflator(
      base_year_measure: str | None = None,
      exchange: str = "EXCHANGE",
      year: str = "year",
-     grouper: list[str] = None,
+     grouper: list[str] | None = None,
  ) -> pd.DataFrame:
      """Compute the exchange rate deflator for each group of entities.

@@ -204,87 +176,68 @@ def compute_exchange_deflator(
          pd.DataFrame: DataFrame with an additional column for the exchange rate deflator.
      """

-     def _add_deflator(
+     def _compute_deflator_for_group(
          group: pd.DataFrame,
-         measure: str | None = "NGDPD_D",
-         exchange: str = "EXCHANGE",
-         year: str = "year",
+         measure: str | None,
+         exchange_col: str,
+         year_col: str,
+         deflator_col: str,
      ) -> pd.DataFrame:
-
-         # if needed, clean exchange name
-         if exchange.endswith("_to") or exchange.endswith("_from"):
-             exchange_name = exchange.rsplit("_", 1)[0]
-         else:
-             exchange_name = exchange
-
-         # Identify the base year for the deflator
+         """Compute deflator for a single group and add it as a column."""
+         # Identify base year
          if measure is not None:
-             base_year = identify_base_year(group, measure=measure, year=year)
+             base_year = identify_base_year(group, measure=measure, year=year_col)
          else:
-             base_year = group.dropna(subset=exchange)[year].max()
+             valid_rows = group.dropna(subset=[exchange_col])
+             base_year = valid_rows[year_col].max() if not valid_rows.empty else None

-         # If no base year is found, return the group unchanged
+         # If no base year found, return group without deflator column
          if base_year is None or pd.isna(base_year):
              return group

          # Extract the exchange rate value for the base year
-         base_value = group.loc[group[year] == base_year, exchange].values
+         base_value_rows = group.loc[group[year_col] == base_year, exchange_col]

-         # If base value is found and valid, calculate the deflator
-         if base_value.size > 0 and pd.notna(base_value[0]):
-             group[f"{exchange_name}_D"] = round(
-                 100 * group[exchange] / base_value[0], 6
-             )
+         # If no valid base value, return group without deflator column
+         if base_value_rows.empty or pd.isna(base_value_rows.iloc[0]):
+             return group
+
+         # Calculate and add deflator column
+         base_value = base_value_rows.iloc[0]
+         group = group.copy()
+         group[deflator_col] = round(100 * group[exchange_col] / base_value, 6)

          return group

      if grouper is None:
          grouper = ["entity", "entity_code"]

-     # Apply the deflator computation for each group of 'entity' and 'entity_code'
-     return df.groupby(grouper, group_keys=False).apply(
-         _add_deflator, measure=base_year_measure, exchange=exchange, year=year
-     )
-
-
- def read_data(
-     file_finder_func: callable,
-     download_func: callable,
-     data_name: str,
-     update: bool = False,
- ) -> pd.DataFrame:
-     """Generic function to read data from parquet files or download fresh data.
-
-     Args:
-         file_finder_func (function): Function to find existing data files in the path.
-         download_func (function): Function to download fresh data if no files are
-             found or an update is needed.
-         data_name (str): Name of the dataset for logging purposes (e.g., "WEO", "DAC").
-         update (bool): If True, forces downloading of new data even if files exist.
-
-     Returns:
-         pd.DataFrame: The latest available data.
-     """
-     # Find existing files using the provided file finder function
-     files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If no files are found or update is requested, download new data
-     if len(files) == 0 or update:
-         download_func()
-         files = file_finder_func(PYDEFLATE_PATHS.data)
-
-     # If files are found, sort them by age and load the most recent one
-     if len(files) > 0:
-         files = sorted(files, key=check_file_age)
-         latest_file = files[0]
-
-         # Check if the latest file is older than 120 days and log a warning
-         if check_file_age(latest_file) > 120:
-             logger.warn(
-                 f"The latest {data_name} data is more than 120 days old.\n"
-                 f"Consider updating by setting update=True in the function call."
-             )
-
-     # Read and return the latest parquet file as a DataFrame
-     logger.info(f"Reading {data_name} data from {latest_file}")
-     return pd.read_parquet(latest_file)
+     # Determine the exchange column name for the deflator
+     if exchange.endswith("_to") or exchange.endswith("_from"):
+         exchange_name = exchange.rsplit("_", 1)[0]
+     else:
+         exchange_name = exchange
+
+     deflator_col = f"{exchange_name}_D"
+
+     # Process each group and concatenate results
+     # This approach avoids the FutureWarning from groupby().apply() operating on grouping columns
+     processed_groups = []
+     for name, group in df.groupby(grouper, sort=False):
+         processed_group = _compute_deflator_for_group(
+             group=group,
+             measure=base_year_measure,
+             exchange_col=exchange,
+             year_col=year,
+             deflator_col=deflator_col,
+         )
+         processed_groups.append(processed_group)
+
+     # Concatenate all processed groups and restore original row order
+     result = pd.concat(processed_groups, ignore_index=False)
+
+     # Sort by index to restore original row order
+     # (groupby may have changed the order when grouping rows together)
+     result = result.sort_index()
+
+     return result
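For context on the reworked compute_exchange_deflator: each entity group is rebased so the base-year exchange rate equals 100, and groups are now processed with an explicit loop plus pd.concat instead of groupby().apply(). A minimal sketch of the new behaviour, assuming pydeflate 2.2.0 is installed (the entity values below are made up):

    import pandas as pd
    from pydeflate.sources.common import compute_exchange_deflator

    toy = pd.DataFrame(
        {
            "entity": ["France"] * 3,
            "entity_code": [4] * 3,  # illustrative code, not authoritative
            "year": [2021, 2022, 2023],
            "EXCHANGE": [0.85, 0.95, 0.92],
        }
    )

    # With base_year_measure=None, the base year is the latest year with a
    # non-null EXCHANGE (2023 here), so EXCHANGE_D is 100 for 2023 and
    # 100 * EXCHANGE / 0.92 for the other rows.
    result = compute_exchange_deflator(toy)
    print(result[["year", "EXCHANGE_D"]])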
pydeflate/sources/dac.py CHANGED
@@ -1,71 +1,58 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from oda_reader import download_dac1

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )


- def _find_dac_files_in_path(path: Path) -> list:
-     """Find all DAC parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for DAC parquet files.
-
-     Returns:
-         list: List of DAC parquet files found in the directory.
-     """
-     return list(path.glob("dac_*.parquet"))
-
-
  def _to_units(df: pd.DataFrame) -> pd.DataFrame:
-     """Convert DAC values (in million) to units.
-
-     Args:
-         df (pd.DataFrame): Dataframe with raw observation values.
+     """Scale reported DAC values (supplied in millions) into base units."""

-     Returns:
-         pd.DataFrame: Dataframe with scaled observation values.
-     """
      df = df.copy()
      df["value"] = df["value"] * df["unit_multiplier"]
      return df


  def _keep_official_definition_only(df: pd.DataFrame) -> pd.DataFrame:
+     """Retain rows matching the official DAC definition across regime changes."""
+
      query = (
          "(aidtype_code == 1010 & flows_code == 1140 & year <2018 ) | "
          "(aidtype_code == 11010 & flows_code == 1160 & year >=2018)"
      )
-
      return df.query(query)


  def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
-     columns = ["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"]
+     """Select the key columns used downstream in pydeflate."""

-     return df.filter(columns)
+     return df.filter(["year", "donor_code", "donor_name", "EXCHANGE", "DAC_DEFLATOR"])


  def _pivot_amount_type(df: pd.DataFrame) -> pd.DataFrame:
+     """Pivot amount-type codes into separate columns (A/N/D)."""
+
      df = df.filter(["year", "donor_code", "donor_name", "amounttype_code", "value"])
      return df.pivot(
-         index=[c for c in df.columns if c not in ["amounttype_code", "value"]],
+         index=[c for c in df.columns if c not in {"amounttype_code", "value"}],
          columns="amounttype_code",
          values="value",
      ).reset_index()


  def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
-     # The values for certain providers should be 1
+     """Derive exchange rates, forcing DAC aggregates to unity."""
+
      df.loc[lambda d: d.donor_code >= 20000, "N"] = df.loc[
          lambda d: d.donor_code >= 20000, "A"
      ]
@@ -74,32 +61,32 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:


  def _compute_dac_deflator(df: pd.DataFrame) -> pd.DataFrame:
+     """Calculate the published DAC price deflator from amounts A/D."""
+
      df["DAC_DEFLATOR"] = round(100 * df["A"] / df["D"], 6)
      return df


  def _compute_dac_gdp_deflator(df: pd.DataFrame) -> pd.DataFrame:
-     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
+     """Back out a GDP-style deflator using the exchange deflator."""

+     df["NGDP_D"] = round(df["EXCHANGE_D"] / 100 * df["DAC_DEFLATOR"], 5)
      return df


  def _rename_columns(df: pd.DataFrame) -> pd.DataFrame:
-     return df.rename(
-         columns={
-             "donor_code": "entity_code",
-             "donor_name": "entity",
-         }
-     )
+     """Align donor metadata with pydeflate naming conventions."""

+     return df.rename(columns={"donor_code": "entity_code", "donor_name": "entity"})

- def download_dac():
-     # Use oda_reader to get the data
+
+ def _download_dac(output_path: Path) -> None:
+     """Download and cache the DAC statistics parquet file."""
+
+     logger.info("Downloading DAC statistics from ODA reader...")
      df = download_dac1(
          filters={"measure": ["1010", "11010"], "flow_type": ["1140", "1160"]}
      )
-
-     # Clean the data
      df = (
          df.pipe(_to_units)
          .pipe(_keep_official_definition_only)
@@ -115,23 +102,23 @@ def download_dac():
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved DAC dataset to %s", output_path)

-     # Get today's date to use as a file suffix
-     suffix = today()

-     # Save the data
-     df.to_parquet(PYDEFLATE_PATHS.data / f"dac_{suffix}.parquet")
+ _DAC_ENTRY = CacheEntry(
+     key="dac_stats",
+     filename="dac.parquet",
+     fetcher=_download_dac,
+     ttl_days=30,
+ )


  def read_dac(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_dac_files_in_path,
-         download_func=download_dac,
-         data_name="DAC",
-         update=update,
-     )
+     path = cache_manager().ensure(_DAC_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     df = read_dac(update=True)
+ if __name__ == "__main__":  # pragma: no cover
+     read_dac(update=True)
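Both sources now delegate freshness tracking to pydeflate.cache, whose implementation is not included in this diff. Inferring only from the call sites above (CacheEntry(key=..., filename=..., fetcher=..., ttl_days=...) and cache_manager().ensure(entry, refresh=...)), a plausible minimal shape could look like the following sketch; every internal detail here is hypothetical:

    import time
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Callable


    @dataclass(frozen=True)
    class CacheEntry:
        key: str                         # stable identifier for the dataset (hypothetical)
        filename: str                    # file name inside the cache directory
        fetcher: Callable[[Path], None]  # writes fresh data to the given path
        ttl_days: int                    # re-fetch after this many days


    class CacheManager:
        def __init__(self, root: Path) -> None:
            self.root = root

        def ensure(self, entry: CacheEntry, refresh: bool = False) -> Path:
            """Return the cached file, fetching it if missing, stale, or forced."""
            path = self.root / entry.filename
            stale = (
                not path.exists()
                or time.time() - path.stat().st_mtime > entry.ttl_days * 86_400
            )
            if refresh or stale:
                entry.fetcher(path)
            return path

Whatever the real internals, the visible contract is that ensure() returns a readable parquet path, which is why read_dac() collapses to two lines.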
pydeflate/sources/imf.py CHANGED
@@ -1,15 +1,16 @@
+ from __future__ import annotations
+
  from pathlib import Path

  import pandas as pd
  from imf_reader import weo

- from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+ from pydeflate.cache import CacheEntry, cache_manager
+ from pydeflate.pydeflate_config import logger
  from pydeflate.sources.common import (
-     today,
      add_pydeflate_iso3,
-     enforce_pyarrow_types,
      compute_exchange_deflator,
-     read_data,
+     enforce_pyarrow_types,
      prefix_pydeflate_to_columns,
  )

@@ -93,7 +94,7 @@ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:


  def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
-     """Pivot the concept code column to get a wide format for the data.
+     """Pivot the concept dimension so each indicator becomes a column

      Args:
          df (pd.DataFrame): Dataframe with concept code column.
@@ -102,7 +103,7 @@ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
          pd.DataFrame: Dataframe with concept code pivoted to columns.
      """
      return df.pivot(
-         index=[c for c in df.columns if c not in ["concept_code", "value"]],
+         index=[c for c in df.columns if c not in {"concept_code", "value"}],
          columns="concept_code",
          values="value",
      ).reset_index()
@@ -147,11 +148,37 @@ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
      return pd.concat([df, exchange], ignore_index=True)


- def download_weo() -> None:
-     """Download the WEO data, process it, and save it to a parquet file."""
-     logger.info("Downloading the latest WEO data...")
+ def _create_eur_series(df: pd.DataFrame) -> pd.DataFrame:
+     """Create a EUR series from the exchange rate data.
+
+     This function creates exchange rate data for EUR by using the exchange rate
+     from France starting from 1999.
+
+     Args:
+         df (pd.DataFrame): DataFrame containing exchange rates.
+
+     Returns:
+         pd.DataFrame: DataFrame with the EUR exchange rate.
+     """
+
+     # Get France's exchange rates by year
+     eur = (
+         df.loc[lambda d: d.pydeflate_iso3 == "FRA"]
+         .loc[lambda d: d.year >= 1999]
+         .set_index("year")["EXCHANGE"]
+     )
+
+     # Apply France's exchange rates to rows with entity_code == 998 and matching year
+     df.loc[df.entity_code == 998, "EXCHANGE"] = df.loc[
+         df.entity_code == 998, "year"
+     ].map(eur)
+     return df
+

-     # Fetch and process the data through a pipeline of transformations
+ def _download_weo(output_path: Path) -> None:
+     """Fetch, transform, and store the latest WEO dataset in Parquet format."""
+
+     logger.info("Downloading the latest IMF WEO dataset...")
      df = (
          weo.fetch_data()
          .pipe(_filter_indicators)
@@ -161,43 +188,29 @@ def download_weo() -> None:
          .pipe(_compute_exchange)
          .pipe(add_pydeflate_iso3, column="entity", from_type="regex")
          .pipe(_pivot_concept_code)
+         .pipe(_create_eur_series)
          .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
          .pipe(prefix_pydeflate_to_columns)
          .pipe(enforce_pyarrow_types)
          .reset_index(drop=True)
      )
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+     df.to_parquet(output_path)
+     logger.info("Saved WEO data to %s", output_path)

-     # Get today's date to use as a file suffix
-     suffix = today()
-
-     # Save the processed dataframe to parquet format
-     df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
-
-     logger.info(f"Saved WEO data to weo_{suffix}.parquet")

-
- def _find_weo_files_in_path(path: Path) -> list:
-     """Find all WEO parquet files in the specified directory.
-
-     Args:
-         path (Path): The directory path to search for WEO parquet files.
-
-     Returns:
-         list: List of WEO parquet files found in the directory.
-     """
-     return list(path.glob("weo_*.parquet"))
+ _IMF_CACHE_ENTRY = CacheEntry(
+     key="imf_weo",
+     filename="imf_weo.parquet",
+     fetcher=_download_weo,
+     ttl_days=60,
+ )


  def read_weo(update: bool = False) -> pd.DataFrame:
-     """Read the latest WEO data from parquet files or download fresh data."""
-     return read_data(
-         file_finder_func=_find_weo_files_in_path,
-         download_func=download_weo,
-         data_name="WEO",
-         update=update,
-     )
+     path = cache_manager().ensure(_IMF_CACHE_ENTRY, refresh=update)
+     return pd.read_parquet(path)


- if __name__ == "__main__":
-     # Download the WEO data
-     dfi = read_weo(update=True)
+ if __name__ == "__main__":  # pragma: no cover
+     read_weo(update=True)
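The caller-facing effect of the migration is the same two-line read path for both sources, with staleness handled by TTLs (30 days for DAC, 60 for WEO) rather than by dated weo_YYYY-MM-DD.parquet filenames. A usage sketch, assuming pydeflate 2.2.0 is installed:

    from pydeflate.sources.dac import read_dac
    from pydeflate.sources.imf import read_weo

    dac = read_dac()             # served from the cache while fresh
    weo = read_weo(update=True)  # update=True forces a re-download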