pydeflate-1.4.2-py3-none-any.whl → pydeflate-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydeflate/__init__.py +25 -17
- pydeflate/core/api.py +404 -0
- pydeflate/core/deflator.py +171 -0
- pydeflate/core/exchange.py +237 -0
- pydeflate/core/source.py +54 -0
- pydeflate/deflate/deflators.py +228 -0
- pydeflate/deflate/legacy_deflate.py +109 -0
- pydeflate/exchange/__init__.py +0 -0
- pydeflate/exchange/exchangers.py +147 -0
- pydeflate/pydeflate_config.py +25 -16
- pydeflate/sources/__init__.py +0 -0
- pydeflate/sources/common.py +278 -0
- pydeflate/sources/dac.py +137 -0
- pydeflate/sources/imf.py +203 -0
- pydeflate/sources/world_bank.py +186 -0
- pydeflate/utils.py +55 -22
- {pydeflate-1.4.2.dist-info → pydeflate-2.0.0.dist-info}/LICENSE +1 -1
- pydeflate-2.0.0.dist-info/METADATA +287 -0
- pydeflate-2.0.0.dist-info/RECORD +25 -0
- {pydeflate-1.4.2.dist-info → pydeflate-2.0.0.dist-info}/WHEEL +1 -1
- pydeflate/deflate/deflate.py +0 -324
- pydeflate/deflate/deflator.py +0 -78
- pydeflate/get_data/deflate_data.py +0 -70
- pydeflate/get_data/exchange_data.py +0 -371
- pydeflate/get_data/imf_data.py +0 -76
- pydeflate/get_data/oecd_data.py +0 -146
- pydeflate/get_data/wb_data.py +0 -75
- pydeflate/tools/__init__.py +0 -2
- pydeflate/tools/exchange.py +0 -171
- pydeflate/tools/update_data.py +0 -69
- pydeflate-1.4.2.dist-info/METADATA +0 -305
- pydeflate-1.4.2.dist-info/RECORD +0 -22
- /pydeflate/{get_data → core}/__init__.py +0 -0
pydeflate/sources/imf.py
ADDED
@@ -0,0 +1,203 @@
+from pathlib import Path
+
+import pandas as pd
+from imf_reader import weo
+
+from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.sources.common import (
+    today,
+    add_pydeflate_iso3,
+    enforce_pyarrow_types,
+    compute_exchange_deflator,
+    read_data,
+    prefix_pydeflate_to_columns,
+)
+
+# List of WEO indicators of interest
+WEO_INDICATORS: list[str] = [
+    "NGDP_D",  # Gross domestic product, deflator
+    "PCPI",  # Inflation, average consumer prices
+    "PCPIE",  # Inflation, end of period consumer prices
+    "PPPEX",  # Implied PPP conversion rate
+    "NGDPD",  # Gross domestic product, current prices USD
+    "NGDP",  # Gross domestic product, current prices
+]
+
+
+def _filter_indicators(df: pd.DataFrame) -> pd.DataFrame:
+    """Filter the data to include only selected WEO indicators.
+
+    Args:
+        df (pd.DataFrame): The raw dataframe containing WEO data.
+
+    Returns:
+        pd.DataFrame: Filtered dataframe with only the relevant indicators.
+    """
+    return df.loc[df.CONCEPT_CODE.isin(WEO_INDICATORS)]
+
+
+def _to_units(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert OBS_VALUE using the SCALE_CODE multiplier to get proper units.
+
+    Args:
+        df (pd.DataFrame): Dataframe with raw observation values.
+
+    Returns:
+        pd.DataFrame: Dataframe with scaled observation values.
+    """
+    df = df.copy()
+    df["OBS_VALUE"] = df["OBS_VALUE"] * df["SCALE_CODE"]
+    return df
+
+
+def _clean_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Rename columns to more readable and consistent names.
+
+    Args:
+        df (pd.DataFrame): Dataframe with original IMF column names.
+
+    Returns:
+        pd.DataFrame: Dataframe with cleaned and renamed columns.
+    """
+    df = df.rename(
+        columns={
+            "REF_AREA_CODE": "entity_code",
+            "REF_AREA_LABEL": "entity",
+            "LASTACTUALDATE": "estimates_start_after",
+            "TIME_PERIOD": "year",
+            "OBS_VALUE": "value",
+        }
+    )
+    # Standardize column names to snake_case
+    df.columns = df.columns.str.lower().str.replace(" ", "_")
+    return df
+
+
+def _keep_useful_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Retain only the columns that are useful for further analysis.
+
+    Args:
+        df (pd.DataFrame): Dataframe with cleaned columns.
+
+    Returns:
+        pd.DataFrame: Dataframe with only useful columns.
+    """
+    cols = [
+        "year",
+        "entity_code",
+        "entity",
+        "concept_code",
+        "value",
+    ]
+    return df[cols]
+
+
+def _pivot_concept_code(df: pd.DataFrame) -> pd.DataFrame:
+    """Pivot the concept code column to get a wide format for the data.
+
+    Args:
+        df (pd.DataFrame): Dataframe with concept code column.
+
+    Returns:
+        pd.DataFrame: Dataframe with concept code pivoted to columns.
+    """
+    return df.pivot(
+        index=[c for c in df.columns if c not in ["concept_code", "value"]],
+        columns="concept_code",
+        values="value",
+    ).reset_index()
+
+
+def _compute_exchange(df: pd.DataFrame) -> pd.DataFrame:
+    """Compute the exchange rate and append it to the original DataFrame.
+
+    This function calculates the exchange rate by dividing 'NGDP' (gross
+    domestic product in local currency) by 'NGDPD' (gross domestic product
+    in USD). It then appends this computed exchange rate to the original
+    DataFrame; the new exchange rate rows are labeled with the 'EXCHANGE'
+    concept code.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame containing rows with
+            'concept_code' equal to 'NGDPD' and 'NGDP'.
+
+    Returns:
+        pd.DataFrame: DataFrame with the computed exchange rate included,
+            along with the original data.
+    """
+    # Filter rows with 'NGDPD' (GDP in USD) and 'NGDP' (GDP in local currency)
+    exchange = df.loc[lambda d: d.concept_code.isin(["NGDPD", "NGDP"])]
+
+    # Pivot the data so 'NGDPD' and 'NGDP' become separate columns
+    exchange = exchange.pipe(_pivot_concept_code)
+
+    # Remove rows that correspond to 'NGDPD' and 'NGDP' from the original DataFrame
+    df = df.loc[lambda d: ~d.concept_code.isin(["NGDPD", "NGDP"])]
+
+    # Compute the exchange rate as NGDP (local currency) divided by NGDPD (USD)
+    exchange["value"] = round(exchange["NGDP"] / exchange["NGDPD"], 7)
+
+    # Label the exchange rate with a new concept code 'EXCHANGE'
+    exchange["concept_code"] = "EXCHANGE"
+
+    # Drop the original 'NGDPD' and 'NGDP' columns as they are no longer needed
+    exchange = exchange.drop(columns=["NGDPD", "NGDP"])
+
+    # Concatenate the original DataFrame with the new exchange rate data
+    return pd.concat([df, exchange], ignore_index=True)
+
+
+def download_weo() -> None:
+    """Download the WEO data, process it, and save it to a parquet file."""
+    logger.info("Downloading the latest WEO data...")
+
+    # Fetch and process the data through a pipeline of transformations
+    df = (
+        weo.fetch_data()
+        .pipe(_filter_indicators)
+        .pipe(_to_units)
+        .pipe(_clean_columns)
+        .pipe(_keep_useful_columns)
+        .pipe(_compute_exchange)
+        .pipe(add_pydeflate_iso3, column="entity", from_type="regex")
+        .pipe(_pivot_concept_code)
+        .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
+        .pipe(prefix_pydeflate_to_columns)
+        .pipe(enforce_pyarrow_types)
+        .reset_index(drop=True)
+    )
+
+    # Get today's date to use as a file suffix
+    suffix = today()
+
+    # Save the processed dataframe to parquet format
+    df.to_parquet(PYDEFLATE_PATHS.data / f"weo_{suffix}.parquet")
+
+    logger.info(f"Saved WEO data to weo_{suffix}.parquet")
+
+
+def _find_weo_files_in_path(path: Path) -> list:
+    """Find all WEO parquet files in the specified directory.
+
+    Args:
+        path (Path): The directory path to search for WEO parquet files.
+
+    Returns:
+        list: List of WEO parquet files found in the directory.
+    """
+    return list(path.glob("weo_*.parquet"))
+
+
+def read_weo(update: bool = False) -> pd.DataFrame:
+    """Read the latest WEO data from parquet files or download fresh data."""
+    return read_data(
+        file_finder_func=_find_weo_files_in_path,
+        download_func=download_weo,
+        data_name="WEO",
+        update=update,
+    )
+
+
+if __name__ == "__main__":
+    # Download the WEO data
+    dfi = read_weo(update=True)
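For orientation, the exchange rate derived in _compute_exchange above is just the ratio of two WEO series: NGDP (GDP in local currency units) divided by NGDPD (GDP in US dollars) yields an implied LCU-per-USD rate. A minimal sketch of that arithmetic with invented figures (the entity and values below are illustrative, not real WEO data):

import pandas as pd

# Toy long-format data for one entity-year (values are made up)
toy = pd.DataFrame(
    {
        "year": [2022, 2022],
        "entity": ["Freedonia", "Freedonia"],
        "concept_code": ["NGDP", "NGDPD"],
        "value": [1500.0, 100.0],  # GDP in LCU vs GDP in USD
    }
)

# Pivot so NGDP and NGDPD become columns, as _pivot_concept_code does
wide = toy.pivot(
    index=["year", "entity"], columns="concept_code", values="value"
).reset_index()

# Implied rate: 1500 / 100 = 15 LCU per USD
wide["EXCHANGE"] = round(wide["NGDP"] / wide["NGDPD"], 7)

In the real pipeline this runs before the final pivot, so EXCHANGE simply becomes one more concept column alongside the deflator series.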
pydeflate/sources/world_bank.py
ADDED
@@ -0,0 +1,186 @@
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import pandas as pd
+import wbgapi as wb
+
+from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.sources.common import (
+    enforce_pyarrow_types,
+    today,
+    compute_exchange_deflator,
+    read_data,
+    prefix_pydeflate_to_columns,
+)
+from pydeflate.utils import emu
+
+_INDICATORS: dict = {
+    "NY.GDP.DEFL.ZS": "NGDP_D",  # GDP Deflator (Index)
+    "NY.GDP.DEFL.ZS.AD": "NGDP_DL",  # GDP Deflator linked series
+    "FP.CPI.TOTL": "CPI",  # Consumer Price Index (CPI)
+    "PA.NUS.FCRF": "EXCHANGE",  # Official Exchange Rate
+}
+
+
+def get_wb_indicator(series: str, value_name: str | None = None) -> pd.DataFrame:
+    """Fetch a World Bank indicator and transform it into a cleaned DataFrame.
+
+    Args:
+        series (str): The World Bank indicator series code.
+        value_name (str | None): The column name to assign to the series values.
+            If None, the series code will be used as the column name.
+
+    Returns:
+        pd.DataFrame: DataFrame with entity code, entity name, year, and the indicator values.
+    """
+    # Fetch the indicator data from World Bank API, clean and structure it
+    return (
+        wb.data.DataFrame(
+            series=series,
+            db=2,  # World Development Indicators database
+            skipBlanks=True,
+            columns="series",
+            numericTimeKeys=True,
+            labels=True,
+        )
+        .reset_index()
+        .sort_values(by=["economy", "Time"])  # Sort for easier reading
+        .drop(columns=["Time"])  # Remove unnecessary column
+        .rename(
+            columns={
+                "economy": "entity_code",
+                "Country": "entity",
+                "time": "year",
+                series: value_name or series,
+            }
+        )
+        .reset_index(drop=True)  # Drop the old index after reset
+    )
+
+
+def _eur_series_fix(df: pd.DataFrame) -> pd.DataFrame:
+    """Fix the exchange rate for euro area countries.
+
+    This is done by assigning the exchange rate of the euro to the countries
+    in the euro area. It is necessary because the series for euro area
+    countries are missing EUR exchange rates.
+
+    Args:
+        df (pd.DataFrame): The DataFrame containing the World Bank data.
+
+    Returns:
+        pd.DataFrame: The DataFrame with the fixed exchange rates for the euro area countries.
+    """
+    # Handle cases where EUR is represented differently in the World Bank data
+    df["entity_code"] = df["entity_code"].replace({"EMU": "EUR"})
+
+    # Find the "Euro" data. This is done given that some countries are missing
+    # exchange rates, but they are part of the euro area.
+    eur = (
+        df.loc[lambda d: d["entity_code"] == "EUR"]
+        .dropna(subset=["EXCHANGE"])
+        .set_index("year")["EXCHANGE"]
+        .to_dict()
+    )
+
+    # Euro area countries without exchange rates
+    eur_mask = (df["entity_code"].isin(emu())) & (df["EXCHANGE"].isna())
+
+    # Assign the euro exchange rate to euro area countries from the year the euro was adopted
+    df.loc[eur_mask, "EXCHANGE"] = df["year"].map(eur)
+
+    return df
+
+
+def _parallel_download_indicators(indicators: dict) -> list[pd.DataFrame]:
+    """Download multiple World Bank indicators in parallel.
+
+    Args:
+        indicators (dict): A dictionary of World Bank indicators to download.
+
+    Returns:
+        list[pd.DataFrame]: A list of DataFrames containing the downloaded indicators.
+    """
+    # List to store the resulting dataframes
+    dfs = []
+
+    # Use ThreadPoolExecutor to fetch indicators in parallel
+    with ThreadPoolExecutor() as executor:
+        # Submit all tasks to the executor (downloading indicators in parallel)
+        future_to_series = {
+            executor.submit(get_wb_indicator, series, value_name): series
+            for series, value_name in indicators.items()
+        }
+
+        # Collect the results as they complete
+        for future in as_completed(future_to_series):
+            series = future_to_series[future]
+            try:
+                df_ = future.result().set_index(["year", "entity_code", "entity"])
+                dfs.append(df_)
+            except Exception as exc:
+                # Log or handle any errors that occur during the download
+                logger.warning(f"Error downloading series {series}: {exc}")
+
+    return dfs
+
+
+def download_wb() -> None:
+    """Download multiple World Bank indicators in parallel and save as a parquet file.
+
+    This function fetches all indicators defined in _INDICATORS in parallel, concatenates
+    them into a single DataFrame, and saves the result as a parquet file using today's date as a suffix.
+    """
+    logger.info("Downloading the latest World Bank data...")
+
+    indicators_data = _parallel_download_indicators(indicators=_INDICATORS)
+
+    # Concatenate all DataFrames horizontally (by columns)
+    df = pd.concat(indicators_data, axis=1).reset_index()
+
+    # Clean the combined data
+    df = (
+        df.pipe(_eur_series_fix)
+        .pipe(compute_exchange_deflator, base_year_measure="NGDP_D")
+        .assign(pydeflate_iso3=lambda d: d.entity_code)
+        .sort_values(by=["year", "entity_code"])
+        .pipe(prefix_pydeflate_to_columns)
+        .pipe(enforce_pyarrow_types)
+        .reset_index(drop=True)
+    )
+
+    # Get today's date to use as a file suffix
+    suffix = today()
+
+    # Save the DataFrame as a parquet file
+    output_path = PYDEFLATE_PATHS.data / f"wb_{suffix}.parquet"
+    df.to_parquet(output_path)
+
+    logger.info(f"Saved World Bank data to wb_{suffix}.parquet")
+
+
+def _find_wb_files_in_path(path: Path) -> list:
+    """Find all WB parquet files in the specified directory.
+
+    Args:
+        path (Path): The directory path to search for WB parquet files.
+
+    Returns:
+        list: List of WB parquet files found in the directory.
+    """
+    return list(path.glob("wb_*.parquet"))
+
+
+def read_wb(update: bool = False) -> pd.DataFrame:
+    """Read the latest World Bank data from parquet files or download fresh data."""
+    return read_data(
+        file_finder_func=_find_wb_files_in_path,
+        download_func=download_wb,
+        data_name="World Bank",
+        update=update,
+    )
+
+
+if __name__ == "__main__":
+    df = read_wb(True)
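The euro-area fix in _eur_series_fix comes down to one mapping: build a year-to-rate lookup from the EUR aggregate and write it into member-country rows where EXCHANGE is missing. A self-contained sketch of that pattern, with invented rates and a hard-coded stand-in for the emu() membership list:

import pandas as pd

# Invented data: the EUR aggregate has rates; a member (FRA) is missing them
df = pd.DataFrame(
    {
        "entity_code": ["EUR", "EUR", "FRA", "FRA"],
        "year": [2020, 2021, 2020, 2021],
        "EXCHANGE": [0.877, 0.845, None, None],
    }
)

# year -> euro rate lookup, as built in _eur_series_fix
eur = (
    df.loc[df["entity_code"] == "EUR"]
    .dropna(subset=["EXCHANGE"])
    .set_index("year")["EXCHANGE"]
    .to_dict()
)

# Member rows lacking a rate ({"FRA"} stands in for the emu() list)
mask = df["entity_code"].isin({"FRA"}) & df["EXCHANGE"].isna()

# Backfill from the euro series by year
df.loc[mask, "EXCHANGE"] = df["year"].map(eur)

The real function first normalizes the World Bank's "EMU" aggregate code to "EUR", then derives the mask from emu() membership.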
pydeflate/utils.py
CHANGED
@@ -1,10 +1,10 @@
 import json
 
-import country_converter as coco
 import numpy as np
 import pandas as pd
 
-from pydeflate.pydeflate_config import PYDEFLATE_PATHS
+from pydeflate.pydeflate_config import PYDEFLATE_PATHS, logger
+from pydeflate.sources.common import enforce_pyarrow_types
 
 
 def oecd_codes() -> dict:
@@ -35,33 +35,66 @@ def clean_number(number):
 
     return float(number)
 
+def create_pydeflate_year(
+    data: pd.DataFrame, year_column: str, year_format: str | None = None
+) -> pd.DataFrame:
+    if year_format is None:
+        year_format = "ISO8601"
 
-
-    """Check whether the date column contains an int instead of datetime.
-    This changes the column to datetime and returns a flag"""
+    data = data.copy()
 
-
-
-
-    else:
-        year_as_number = False
+    data["pydeflate_year"] = pd.to_datetime(
+        data[year_column], format=year_format
+    ).dt.year
 
-    return
+    return data
 
 
-def
-
-
-
-    not_found: str | None = None,
+def merge_user_and_pydeflate_data(
+    data: pd.DataFrame,
+    pydeflate_data: pd.DataFrame,
+    entity_column: str,
+    ix: list[str],
 ) -> pd.DataFrame:
-
+    return data.merge(
+        pydeflate_data,
+        how="outer",
+        left_on=["pydeflate_year", entity_column],
+        right_on=ix,
+        suffixes=("", "_pydeflate"),
+        indicator=True,
+    ).pipe(enforce_pyarrow_types)
+
+
+def get_unmatched_pydeflate_data(
+    merged_data: pd.DataFrame,
+):
+    return merged_data.loc[merged_data["_merge"] == "left_only"].filter(
+        regex="^(?!pydeflate_)(?!.*_pydeflate$)"
+    )
+
+
+def get_matched_pydeflate_data(
+    merged_data: pd.DataFrame,
+):
+    return (
+        merged_data.loc[merged_data["_merge"] != "right_only"]
+        .drop(columns="_merge")
+        .reset_index(drop=True)
+    )
+
 
-
+def flag_missing_pydeflate_data(unmatched_data: pd.DataFrame):
+    """Flag data which is present in the input data but missing in pydeflate's data."""
+    if unmatched_data.empty:
+        return
 
-
-
+    missing = (
+        unmatched_data.drop_duplicates()
+        .dropna(axis=1)
+        .drop(columns="_merge")
+        .to_string(index=False)
     )
 
-
+    # log all missing data
+    logger.info(f"Missing exchange data for:\n {missing}")