pydeflate 1.4.2__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+ from imf_reader import weo
5
+
6
+ from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
7
+ from pydeflate.sources.common import (
8
+ today,
9
+ add_pydeflate_iso3,
10
+ enforce_pyarrow_types,
11
+ compute_exchange_deflator,
12
+ read_data,
13
+ prefix_pydeflate_to_columns,
14
+ )
15
+
16
+ # List of WEO indicators of interest
17
+ WEO_INDICATORS: list[str] = [
18
+ "NGDP_D", # Gross domestic product, deflator
19
+ "PCPI", # Inflation, average consumer prices
20
+ "PCPIE", # Inflation, end of period consumer prices
21
+ "PPPEX", # Implied PPP conversion rate
22
+ "NGDPD", # Gross domestic product, current prices USD
23
+ "NGDP", # Gross domestic product, current prices
24
+ ]
25
+
26
+
27
+ def _filter_indicators(df: pd.DataFrame) -> pd.DataFrame:
28
+ """Filter the data to include only selected WEO indicators.
29
+
30
+ Args:
31
+ df (pd.DataFrame): The raw dataframe containing WEO data.
32
+
33
+ Returns:
34
+ pd.DataFrame: Filtered dataframe with only the relevant indicators.
35
+ """
36
+ return df.loc[df.CONCEPT_CODE.isin(WEO_INDICATORS)]
37
+
38
+
39
+ def _to_units(df: pd.DataFrame) -> pd.DataFrame:
40
+ """Convert OBS_VALUE using the SCALE_CODE multiplier to get proper units.
41
+
42
+ Args:
43
+ df (pd.DataFrame): Dataframe with raw observation values.
44
+
45
+ Returns:
46
+ pd.DataFrame: Dataframe with scaled observation values.
47
+ """
48
+ df = df.copy()
49
+ df["OBS_VALUE"] = df["OBS_VALUE"] * df["SCALE_CODE"]
50
+ return df
51
+
52
+
53
+ def _clean_columns(df: pd.DataFrame) -> pd.DataFrame:
54
+ """Rename columns to more readable and consistent names.
55
+
56
+ Args:
57
+ df (pd.DataFrame): Dataframe with original IMF column names.
58
+
59
+ Returns:
60
+ pd.DataFrame: Dataframe with cleaned and renamed columns.
61
+ """
62
+ df = df.rename(
63
+ columns={
64
+ "REF_AREA_CODE": "entity_code",
65
+ "REF_AREA_LABEL": "entity",
66
+ "LASTACTUALDATE": "estimates_start_after",
67
+ "TIME_PERIOD": "year",
68
+ "OBS_VALUE": "value",
69
+ }
70
+ )
71
+ # Standardize column names to snake_case
72
+ df.columns = df.columns.str.lower().str.replace(" ", "_")
73
+ return df
74
+
75
+
76
+ def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
77
+ """Retain only the columns that are useful for further analysis.
78
+
79
+ Args:
80
+ df (pd.DataFrame): Dataframe with cleaned columns.
81
+
82
+ Returns:
83
+ pd.DataFrame: Dataframe with only useful columns.
84
+ """
85
+ cols = [
86
+ "year",
87
+ "entity_code",
88
+ "entity",
89
+ "concept_code",
90
+ "value",
91
+ ]
92
+ return df[cols]
93
+
94
+
95
+ def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
96
+ """Pivot the concept code column to get a wide format for the data.
97
+
98
+ Args:
99
+ df (pd.DataFrame): Dataframe with concept code column.
100
+
101
+ Returns:
102
+ pd.DataFrame: Dataframe with concept code pivoted to columns.
103
+ """
104
+ return df.pivot(
105
+ index=[c for c in df.columns if c not in ["concept_code", "value"]],
106
+ columns="concept_code",
107
+ values="value",
108
+ ).reset_index()
109
+
110
+
111
+ def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
112
+ """Compute the exchange rate and append it to the original DataFrame.
113
+
114
+ This function calculates the exchange rate by dividing the 'NGDP'
115
+ (Gross domestic product in local currency) by 'NGDPD' (Gross domestic product in USD).
116
+ It then appends this computed exchange rate to the original DataFrame,
117
+ and the new exchange rate rows are labeled with the 'EXCHANGE'
118
+ concept code.
119
+
120
+ Args:
121
+ df (pd.DataFrame): Input DataFrame containing columns with
122
+ 'concept_code' as 'NGDPD' and 'NGDP'.
123
+
124
+ Returns:
125
+ pd.DataFrame: DataFrame with the computed exchange rate included,
126
+ along with the original data.
127
+ """
128
+ # Filter rows with 'NGDPD' (GDP in USD) and 'NGDP' (GDP in local currency)
129
+ exchange = df.loc[lambda d: d.concept_code.isin(["NGDPD", "NGDP"])]
130
+
131
+ # Pivot the data so 'NGDPD' and 'NGDP' become separate columns
132
+ exchange = exchange.pipe(_pivot_concept_code)
133
+
134
+ # Remove rows that correspond to 'NGDPD' and 'NGDP' from the original DataFrame
135
+ df = df.loc[lambda d: ~d.concept_code.isin(["NGDPD", "NGDP"])]
136
+
137
+ # Compute the exchange rate as NGDP (local currency) divided by NGDPD (USD)
138
+ exchange["value"] = round(exchange["NGDP"] / exchange["NGDPD"], 7)
139
+
140
+ # Label the exchange rate with a new concept code 'EXCHANGE'
141
+ exchange["concept_code"] = "EXCHANGE"
142
+
143
+ # Drop the original 'NGDPD' and 'NGDP' columns as they are no longer needed
144
+ exchange = exchange.drop(columns=["NGDPD", "NGDP"])
145
+
146
+ # Concatenate the original DataFrame with the new exchange rate data
147
+ return pd.concat([df, exchange], ignore_index=True)
148
+
149
+
150
+ def download_weo() -> None:
151
+ """Download the WEO data, process it, and save it to a parquet file."""
152
+ logger.info("Downloading the latest WEO data...")
153
+
154
+ # Fetch and process the data through a pipeline of transformations
155
+ df = (
156
+ weo.fetch_data()
157
+ .pipe(_filter_indicators)
158
+ .pipe(_to_units)
159
+ .pipe(_clean_columns)
160
+ .pipe(_keep_useful_columns)
161
+ .pipe(_compute_exchange)
162
+ .pipe(add_pydeflate_iso3, column="entity", from_type="regex")
163
+ .pipe(_pivot_concept_code)
164
+ .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
165
+ .pipe(prefix_pydeflate_to_columns)
166
+ .pipe(enforce_pyarrow_types)
167
+ .reset_index(drop=True)
168
+ )
169
+
170
+ # Get today's date to use as a file suffix
171
+ suffix = today()
172
+
173
+ # Save the processed dataframe to parquet format
174
+ df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
175
+
176
+ logger.info(f"Saved WEO data to weo_{suffix}.parquet")
177
+
178
+
179
+ def _find_weo_files_in_path(path: Path) -> list:
180
+ """Find all WEO parquet files in the specified directory.
181
+
182
+ Args:
183
+ path (Path): The directory path to search for WEO parquet files.
184
+
185
+ Returns:
186
+ list: List of WEO parquet files found in the directory.
187
+ """
188
+ return list(path.glob("weo_*.parquet"))
189
+
190
+
191
+ def read_weo(update: bool = False) -> pd.DataFrame:
192
+ """Read the latest WEO data from parquet files or download fresh data."""
193
+ return read_data(
194
+ file_finder_func=_find_weo_files_in_path,
195
+ download_func=download_weo,
196
+ data_name="WEO",
197
+ update=update,
198
+ )
199
+
200
+
201
+ if __name__ == "__main__":
202
+ # Download the WEO data
203
+ dfi = read_weo(update=True)
@@ -0,0 +1,186 @@
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ import wbgapi as wb
6
+
7
+ from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
8
+ from pydeflate.sources.common import (
9
+ enforce_pyarrow_types,
10
+ today,
11
+ compute_exchange_deflator,
12
+ read_data,
13
+ prefix_pydeflate_to_columns,
14
+ )
15
+ from pydeflate.utils import emu
16
+
17
+ _INDICATORS: dict = {
18
+ "NY.GDP.DEFL.ZS": "NGDP_D", # GDP Deflator (Index)
19
+ "NY.GDP.DEFL.ZS.AD": "NGDP_DL", # GDP Deflator linked series
20
+ "FP.CPI.TOTL": "CPI", # Consumer Price Index (CPI)
21
+ "PA.NUS.FCRF": "EXCHANGE", # Official Exchange Rate
22
+ }
23
+
24
+
25
+ def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame:
26
+ """Fetch a World Bank indicator and transform it into a cleaned DataFrame.
27
+
28
+ Args:
29
+ series (str): The World Bank indicator series code.
30
+ value_name (str | None): The column name to assign to the series values.
31
+ If None, the series code will be used as the column name.
32
+
33
+ Returns:
34
+ pd.DataFrame: DataFrame with entity code, entity name, year, and the indicator values.
35
+ """
36
+ # Fetch the indicator data from World Bank API, clean and structure it
37
+ return (
38
+ wb.data.DataFrame(
39
+ series=series,
40
+ db=2, # World Development Indicators database
41
+ skipBlanks=True,
42
+ columns="series",
43
+ numericTimeKeys=True,
44
+ labels=True,
45
+ )
46
+ .reset_index()
47
+ .sort_values(by=["economy", "Time"]) # Sort for easier reading
48
+ .drop(columns=["Time"]) # Remove unnecessary column
49
+ .rename(
50
+ columns={
51
+ "economy": "entity_code",
52
+ "Country": "entity",
53
+ "time": "year",
54
+ series: value_name or series,
55
+ }
56
+ )
57
+ .reset_index(drop=True) # Drop the old index after reset
58
+ )
59
+
60
+
61
+ def _eur_series_fix(df: pd.DataFrame) -> pd.DataFrame:
62
+ """
63
+ Fix the exchange rate for the Euro area countries. This is done by assigning the
64
+ exchange rate of the Euro to the countries in the Euro area. This is necessary
65
+ because the series for Euro area countries are missing EUR exchange rates.
66
+
67
+ Args:
68
+ df: pd.DataFrame: The DataFrame containing the World Bank data.
69
+
70
+ Returns:
71
+ pd.DataFrame: The DataFrame with the fixed exchange rates for the Euro area countries.
72
+
73
+ """
74
+ # Handle cases where EUR is represented differently in the World Bank data
75
+ df["entity_code"] = df["entity_code"].replace({"EMU": "EUR"})
76
+
77
+ # Find the "Euro" data. This is needed because some countries are missing
78
+ # exchange rates, but they are part of the Euro area.
79
+ eur = (
80
+ df.loc[lambda d: d["entity_code"] == "EUR"]
81
+ .dropna(subset=["EXCHANGE"])
82
+ .set_index("year")["EXCHANGE"]
83
+ .to_dict()
84
+ )
85
+
86
+ # Euro area countries without exchange rates
87
+ eur_mask = (df["entity_code"].isin(emu())) & (df["EXCHANGE"].isna())
88
+
89
+ # Assign the EURO exchange rate to euro area countries from the year the euro was adopted
90
+ df.loc[eur_mask, "EXCHANGE"] = df["year"].map(eur)
91
+
92
+ return df
93
+
94
+
95
+ def _parallel_download_indicators(indicators: dict) -> list[pd.DataFrame]:
96
+ """Download multiple World Bank indicators in parallel.
97
+
98
+ Args:
99
+ indicators (dict): A dictionary of World Bank indicators to download.
100
+
101
+ Returns:
102
+ list[pd.DataFrame]: A list of DataFrames containing the downloaded indicators.
103
+
104
+ """
105
+ # List to store the resulting dataframes
106
+ dfs = []
107
+
108
+ # Use ThreadPoolExecutor to fetch indicators in parallel
109
+ with ThreadPoolExecutor() as executor:
110
+ # Submit all tasks to the executor (downloading indicators in parallel)
111
+ future_to_series = {
112
+ executor.submit(get_wb_indicator, series, value_name): series
113
+ for series, value_name in indicators.items()
114
+ }
115
+
116
+ # Collect the results as they complete
117
+ for future in as_completed(future_to_series):
118
+ series = future_to_series[future]
119
+ try:
120
+ df_ = future.result().set_index(["year", "entity_code", "entity"])
121
+ dfs.append(df_)
122
+ except Exception as exc:
123
+ # Log or handle any errors that occur during the download
124
+ logger.warning(f"Error downloading series {series}: {exc}")
125
+
126
+ return dfs
127
+
128
+
129
+ def download_wb() -> None:
130
+ """Download multiple World Bank indicators in parallel and save as a parquet file.
131
+
132
+ This function fetches all indicators defined in _INDICATORS in parallel, concatenates
133
+ them into a single DataFrame, and saves the result as a parquet file using today's date as a suffix.
134
+ """
135
+ logger.info("Downloading the latest World Bank data...")
136
+
137
+ indicators_data = _parallel_download_indicators(indicators=_INDICATORS)
138
+
139
+ # Concatenate all DataFrames horizontally (by columns)
140
+ df = pd.concat(indicators_data, axis=1).reset_index()
141
+
142
+ # cleaning
143
+ df = (
144
+ df.pipe(_eur_series_fix)
145
+ .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
146
+ .assign(pydeflate_iso3=lambda d: d.entity_code)
147
+ .sort_values(by=["year", "entity_code"])
148
+ .pipe(prefix_pydeflate_to_columns)
149
+ .pipe(enforce_pyarrow_types)
150
+ .reset_index(drop=True)
151
+ )
152
+
153
+ # Get today's date to use as a file suffix
154
+ suffix = today()
155
+
156
+ # Save the DataFrame as a parquet file
157
+ output_path = PYDEFLATE_PATHS.data / f"wb_{suffix}.parquet"
158
+ df.to_parquet(output_path)
159
+
160
+ logger.info(f"Saved World Bank data to wb_{suffix}.parquet")
161
+
162
+
163
+ def _find_wb_files_in_path(path: Path) -> list:
164
+ """Find all WB parquet files in the specified directory.
165
+
166
+ Args:
167
+ path (Path): The directory path to search for WB parquet files.
168
+
169
+ Returns:
170
+ list: List of WB parquet files found in the directory.
171
+ """
172
+ return list(path.glob("wb_*.parquet"))
173
+
174
+
175
+ def read_wb(update: bool = False) -> pd.DataFrame:
176
+ """Read the latest World Bank data from parquet files or download fresh data."""
177
+ return read_data(
178
+ file_finder_func=_find_wb_files_in_path,
179
+ download_func=download_wb,
180
+ data_name="World Bank",
181
+ update=update,
182
+ )
183
+
184
+
185
+ if __name__ == "__main__":
186
+ df = read_wb(True)
pydeflate/utils.py CHANGED
@@ -1,10 +1,10 @@
1
1
  import json
2
2
 
3
- import country_converter as coco
4
3
  import numpy as np
5
4
  import pandas as pd
6
5
 
7
- from pydeflate.pydeflate_config import PYDEFLATE_PATHS
6
+ from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
7
+ from pydeflate.sources.common import enforce_pyarrow_types
8
8
 
9
9
 
10
10
  def oecd_codes() -> dict:
@@ -35,33 +35,66 @@ def clean_number(number):
35
35
 
36
36
  return float(number)
37
37
 
38
+ def create_pydeflate_year(
39
+ data: pd.DataFrame, year_column: str, year_format: str | None = None
40
+ ) -> pd.DataFrame:
41
+ if year_format is None:
42
+ year_format = "ISO8601"
38
43
 
39
- def check_year_as_number(df: pd.DataFrame, date_column: str) -> (pd.DataFrame, bool):
40
- """Check whether the date column contains an int instead of datetime.
41
- This changes the column to datetime and returns a flag"""
44
+ data = data.copy()
42
45
 
43
- if pd.api.types.is_numeric_dtype(df[date_column]):
44
- df[date_column] = pd.to_datetime(df[date_column], format="%Y")
45
- year_as_number = True
46
- else:
47
- year_as_number = False
46
+ data["pydeflate_year"] = pd.to_datetime(
47
+ data[year_column], format=year_format
48
+ ).dt.year
48
49
 
49
- return df, year_as_number
50
+ return data
50
51
 
51
52
 
52
- def to_iso3(
53
- df: pd.DataFrame,
54
- codes_col: str,
55
- target_col: str,
56
- src_classification: str | None = None,
57
- not_found: str | None = None,
53
+ def merge_user_and_pydeflate_data(
54
+ data: pd.DataFrame,
55
+ pydeflate_data: pd.DataFrame,
56
+ entity_column: str,
57
+ ix: list[str],
58
58
  ) -> pd.DataFrame:
59
- """Convert a column of country codes to iso3"""
59
+ return data.merge(
60
+ pydeflate_data,
61
+ how="outer",
62
+ left_on=["pydeflate_year", entity_column],
63
+ right_on=ix,
64
+ suffixes=("", "_pydeflate"),
65
+ indicator=True,
66
+ ).pipe(enforce_pyarrow_types)
67
+
68
+
69
+ def get_unmatched_pydeflate_data(
70
+ merged_data: pd.DataFrame,
71
+ ):
72
+ return merged_data.loc[merged_data["_merge"] == "left_only"].filter(
73
+ regex="^(?!pydeflate_)(?!.*_pydeflate$)"
74
+ )
75
+
76
+
77
+ def get_matched_pydeflate_data(
78
+ merged_data: pd.DataFrame,
79
+ ):
80
+ return (
81
+ merged_data.loc[merged_data["_merge"] != "right_only"]
82
+ .drop(columns="_merge")
83
+ .reset_index(drop=True)
84
+ )
85
+
60
86
 
61
- cc = coco.CountryConverter()
87
+ def flag_missing_pydeflate_data(unmatched_data: pd.DataFrame):
88
+ """Flag data which is present in the input data but missing in pydeflate's data."""
89
+ if unmatched_data.empty:
90
+ return
62
91
 
63
- df[target_col] = cc.pandas_convert(
64
- df[codes_col], src=src_classification, to="ISO3", not_found=not_found
92
+ missing = (
93
+ unmatched_data.drop_duplicates()
94
+ .dropna(axis=1)
95
+ .drop(columns="_merge")
96
+ .to_string(index=False)
65
97
  )
66
98
 
67
- return df
99
+ # log all missing data
100
+ logger.info(f"Missing exchange data for:\n {missing}")
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2022, Jorge Rivera
3
+ Copyright (c) 2021-2024, Jorge Rivera
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal