openforis-whisp 2.0.0a4__py3-none-any.whl → 2.0.0a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openforis_whisp/__init__.py +75 -75
- openforis_whisp/data_conversion.py +493 -371
- openforis_whisp/datasets.py +1384 -1381
- openforis_whisp/logger.py +75 -75
- openforis_whisp/parameters/__init__.py +15 -15
- openforis_whisp/parameters/config_runtime.py +44 -44
- openforis_whisp/parameters/lookup_context_and_metadata.csv +13 -13
- openforis_whisp/parameters/lookup_gee_datasets.csv +1 -1
- openforis_whisp/pd_schemas.py +77 -77
- openforis_whisp/reformat.py +495 -495
- openforis_whisp/risk.py +771 -777
- openforis_whisp/stats.py +1134 -953
- openforis_whisp/utils.py +154 -154
- {openforis_whisp-2.0.0a4.dist-info → openforis_whisp-2.0.0a6.dist-info}/LICENSE +21 -21
- {openforis_whisp-2.0.0a4.dist-info → openforis_whisp-2.0.0a6.dist-info}/METADATA +37 -46
- openforis_whisp-2.0.0a6.dist-info/RECORD +17 -0
- {openforis_whisp-2.0.0a4.dist-info → openforis_whisp-2.0.0a6.dist-info}/WHEEL +1 -1
- openforis_whisp-2.0.0a4.dist-info/RECORD +0 -17
openforis_whisp/reformat.py
CHANGED
@@ -1,495 +1,495 @@
# !pip install pandera[io] # special version used
import pandera as pa
import pandas as pd
import os
import logging
from pathlib import Path  # Add this import

from openforis_whisp.logger import StdoutLogger, FileLogger

from openforis_whisp.pd_schemas import data_lookup_type


from openforis_whisp.parameters.config_runtime import (
    DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
    DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
)

logger = StdoutLogger(__name__)


# Dictionary to cache schema and modification times for multiple files
cached_schema = None
cached_file_mtimes = {}


def validate_dataframe_using_lookups(
    df_stats: pd.DataFrame, file_paths: list = None, national_codes: list = None
) -> pd.DataFrame:
    """
    Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.
    Optionally filter columns by country code.

    Args:
        df_stats (pd.DataFrame): The DataFrame to validate.
        file_paths (list): List of paths to schema files.
        national_codes (list, optional): List of ISO2 country codes to include.

    Returns:
        pd.DataFrame: The validated DataFrame.
    """
    # Load the schema
    schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)

    # Validate the DataFrame
    validated_df = validate_dataframe(df_stats, schema)

    return validated_df
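
A minimal usage sketch of the entry point above (illustrative only; the column names, codes, and sample values are assumptions, not taken from the package):

    import pandas as pd
    from openforis_whisp.reformat import validate_dataframe_using_lookups

    # Hypothetical stats table; real whisp output carries many more columns.
    df_stats = pd.DataFrame({"plotId": [1, 2], "Area": [1.5, 2.0]})

    # Falls back to the bundled lookup CSVs; country-specific columns are
    # kept only for the listed ISO2 codes.
    validated = validate_dataframe_using_lookups(df_stats, national_codes=["CI", "BR"])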


def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
    """Load schema if files changed OR if national_codes changed"""

    if file_paths is None:
        file_paths = [
            DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
            DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
        ]

    # Include national_codes in cache key (including None case)
    cache_key_parts = []
    for file_path in file_paths:
        if Path(file_path).exists():
            mtime = Path(file_path).stat().st_mtime
            cache_key_parts.append(f"{file_path}:{mtime}")
        else:
            cache_key_parts.append(f"{file_path}:missing")

    # Always include national_codes in cache key (even if None)
    national_codes_key = (
        str(sorted(national_codes)) if national_codes else "no_countries"
    )
    cache_key_parts.append(f"national_codes:{national_codes_key}")

    current_cache_key = "|".join(cache_key_parts)

    # Check cache
    if (
        not hasattr(load_schema_if_any_file_changed, "_cached_schema")
        or not hasattr(load_schema_if_any_file_changed, "_last_cache_key")
        or load_schema_if_any_file_changed._last_cache_key != current_cache_key
    ):

        print(f"Creating schema for national_codes: {national_codes}")

        # Load and combine lookup files
        combined_lookup_df = append_csvs_to_dataframe(file_paths)

        # ALWAYS filter by national codes (even if None - this removes all country columns)
        filtered_lookup_df = filter_lookup_by_country_codes(
            lookup_df=combined_lookup_df,
            filter_col="ISO2_code",
            national_codes=national_codes,
        )

        # Create schema from filtered lookup
        schema = create_schema_from_dataframe(filtered_lookup_df)

        # Cache the results
        load_schema_if_any_file_changed._cached_schema = schema
        load_schema_if_any_file_changed._last_cache_key = current_cache_key

        return schema
    else:
        print(f"Using cached schema for national_codes: {national_codes}")
        return load_schema_if_any_file_changed._cached_schema
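
The cache lives in attributes on the function object itself, and the key folds in each lookup file's mtime plus the sorted country list, so touching a CSV or changing the requested countries both rebuild the schema. A standalone sketch of the same keying idea (the file name is hypothetical):

    from pathlib import Path

    def make_cache_key(file_paths, national_codes=None):
        parts = [
            f"{p}:{Path(p).stat().st_mtime}" if Path(p).exists() else f"{p}:missing"
            for p in file_paths
        ]
        parts.append(
            f"national_codes:{sorted(national_codes) if national_codes else 'no_countries'}"
        )
        return "|".join(parts)

    make_cache_key(["lookup_gee_datasets.csv"], ["ci", "br"])  # hypothetical path
    # e.g. "lookup_gee_datasets.csv:1715000000.0|national_codes:['br', 'ci']"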


def validate_dataframe(
    df_stats: pd.DataFrame, schema: pa.DataFrameSchema
) -> pd.DataFrame:
    """Validate the DataFrame against the given schema, reorder columns to match schema order, and list missing columns.

    Args:
        schema (pa.DataFrameSchema): The schema to validate against.
        df_stats (pd.DataFrame): The DataFrame to validate.

    Returns:
        pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
    """
    log_missing_columns(df_stats, schema)

    # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)

    # Try to automatically coerce the DataFrame to match the schema types
    try:
        validated_df = schema(df_stats)
    except pa.errors.SchemaError as e:
        print("Error during validation:", e)
        # Return None or raise the error if validation fails
        return None  # or raise e

    # Reorder the validated DataFrame to match the schema's column order
    validated_df = validated_df.reindex(schema.columns.keys(), axis=1)

    return validated_df
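
Because the schema is built with coerce=True and add_missing_columns=True (see create_schema_from_dataframe below), the call schema(df_stats) casts compatible values and inserts missing nullable columns instead of failing outright. A small self-contained illustration (the column name is hypothetical):

    import pandas as pd
    import pandera as pa

    schema = pa.DataFrameSchema(
        {"Area": pa.Column(pa.Float64, nullable=True)},
        coerce=True,
        add_missing_columns=True,
    )
    out = schema(pd.DataFrame({"Area": ["1.5", "2"]}))  # strings coerced to floats
    print(out.dtypes)  # Area    float64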


def append_csvs_to_dataframe(csv_paths):
    """
    Appends multiple CSV files into a single Pandas DataFrame.

    Args:
        - csv_paths (list of str): List of paths to CSV files to append.

    Returns:
        - pd.DataFrame: Combined DataFrame containing data from all provided CSV files.

    Raises:
        - ValueError: If any CSV file cannot be read.
    """

    combined_df = pd.DataFrame()  # Initialize an empty DataFrame

    for path in csv_paths:
        try:
            # Read the CSV file into a DataFrame
            df = pd.read_csv(path)
            # Append to the combined DataFrame
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            raise ValueError(f"Error reading {path}: {e}")

    return combined_df
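
Calling pd.concat inside the loop copies the accumulated frame on every iteration; for the two bundled lookup CSVs this is negligible, but an equivalent collect-then-concat pattern (a sketch, not the package's code) scales linearly with the number of files:

    def append_csvs_once(csv_paths):
        frames = []
        for path in csv_paths:
            try:
                frames.append(pd.read_csv(path))
            except Exception as e:
                raise ValueError(f"Error reading {path}: {e}")
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()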


def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
    """Create a Pandera schema from a DataFrame containing schema information."""

    if schema_df.empty:
        raise ValueError("The input DataFrame is empty.")

    required_columns = ["name", "col_type", "is_nullable", "is_required"]
    missing_columns = [col for col in required_columns if col not in schema_df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in schema DataFrame: {missing_columns}")

    # print("Schema DataFrame columns:", schema_df.columns)

    # Sort DataFrame by 'order' if it exists
    if "order" in schema_df.columns:
        schema_df = schema_df.sort_values(by="order")

    # Remove rows where 'exclude_from_output' equals 1, if that column exists
    if "exclude_from_output" in schema_df.columns:
        schema_df = schema_df[schema_df["exclude_from_output"] != 1]

    # Create a dictionary to hold the column schema
    schema_dict = {}
    for _, row in schema_df.iterrows():
        col_name = row["name"]
        col_type = row["col_type"]
        is_nullable = row["is_nullable"] in (1, "1", True, "True")
        is_required = row["is_required"] in (1, "1", True, "True")

        # print(
        #     f"Processing column: {col_name}, Type: {col_type}, Nullable: {is_nullable}, Required: {is_required}"
        # )

        # Map DataFrame types to Pandera types
        if col_type == "int64":
            schema_dict[col_name] = pa.Column(
                pa.Int64, nullable=is_nullable, required=is_required
            )
        elif col_type == "int":
            schema_dict[col_name] = pa.Column(
                pa.Int, nullable=is_nullable, required=is_required
            )
        elif col_type == "string":
            schema_dict[col_name] = pa.Column(
                pa.String, nullable=is_nullable, required=is_required
            )
        elif col_type == "float32":
            schema_dict[col_name] = pa.Column(
                pa.Float32, nullable=is_nullable, required=is_required
            )
        elif col_type == "float64":
            schema_dict[col_name] = pa.Column(
                pa.Float64, nullable=is_nullable, required=is_required
            )
        elif col_type == "bool":
            schema_dict[col_name] = pa.Column(
                pa.Bool, nullable=is_nullable, required=is_required
            )
        else:
            raise ValueError(f"Unsupported type: {col_type}")

    # Create and return the DataFrame schema with coercion enabled
    schema = pa.DataFrameSchema(
        schema_dict,
        strict=False,
        unique_column_names=True,
        add_missing_columns=True,
        coerce=True,
    )

    return schema
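
An illustrative round trip through the builder above (the rows are assumptions shaped like the bundled lookup CSVs, not copied from them):

    lookup = pd.DataFrame(
        {
            "name": ["plotId", "Area"],          # hypothetical column names
            "col_type": ["int64", "float64"],
            "is_nullable": [0, 1],
            "is_required": [1, 1],
            "order": [1, 2],
        }
    )
    schema = create_schema_from_dataframe(lookup)
    # The resulting schema coerces dtypes, tolerates extra columns (strict=False),
    # and adds any missing nullable columns (add_missing_columns=True).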


def setup_logger(name):
    # Create and configure logger
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(name)
    return logger


def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
    # Initialize the logger
    logger = setup_logger(__name__)

    # Extract the expected columns from the DataFrameSchema
    template_columns = template_schema.columns.keys()
    df_stats_columns = df_stats.columns

    # Find missing columns
    missing_in_template = [
        col for col in df_stats_columns if col not in template_columns
    ]
    missing_in_stats = [col for col in template_columns if col not in df_stats_columns]

    # Log results for missing columns in df_stats
    if missing_in_template:
        logger.warning(
            f"The following columns from the results dataframe did not match any columns in the schema: \n{', '.join(missing_in_template)}"
        )
    else:
        logger.info("All columns from dataframe found in the schema.")

    # Log results for missing columns in template_df
    if missing_in_stats:
        logger.warning(
            f"The following columns in the schema did not match any columns from the results dataframe: \n{', '.join(missing_in_stats)}"
        )
    else:
        logger.info("All columns from the schema found in the results dataframe.")


def setup_logger(name):
    """
    Set up a logger with a specific name to avoid duplicate logs.
    """
    logger = logging.getLogger(name)
    if not logger.hasHandlers():
        # Create handlers only if there are none
        stdout_handler = logging.StreamHandler()
        file_handler = logging.FileHandler("missing_columns.log")

        # Set levels
        stdout_handler.setLevel(logging.WARNING)
        file_handler.setLevel(logging.WARNING)

        # Create formatter and add it to the handlers
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        stdout_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        # Add handlers to the logger
        logger.addHandler(stdout_handler)
        logger.addHandler(file_handler)

    return logger
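
Note that setup_logger is defined twice in this module. Python rebinds the name in file order, so by the time log_missing_columns calls it, the name resolves to the second, handler-deduplicating version above; the first definition is effectively dead code. A minimal demonstration of the rebinding rule:

    def f():
        return "first"

    def f():
        return "second"

    print(f())  # prints "second": the later definition wins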


# def filter_lookup_by_country_codes(
#     lookup_df: pd.DataFrame, national_codes: list
# ) -> pd.DataFrame:
#     """
#     Filter lookup DataFrame to include only:
#     1. Global columns (prefixed with 'g_')
#     2. General columns (not country-specific)
#     3. Country-specific columns matching the provided ISO2 codes

#     Args:
#         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
#         national_codes (list): List of ISO2 country codes to include

#     Returns:
#         pd.DataFrame: Filtered lookup DataFrame
#     """
#     if not national_codes:
#         return lookup_df

#     # Normalize national_codes to lowercase for case-insensitive comparison
#     normalized_codes = [
#         code.lower() for code in national_codes if isinstance(code, str)
#     ]

#     # Keep track of rows to filter out
#     rows_to_remove = []

#     # Process each row in the lookup DataFrame
#     for idx, row in lookup_df.iterrows():
#         col_name = row["name"]

#         # Skip if not a column name entry
#         if pd.isna(col_name):
#             continue

#         # Always keep global columns (g_) and columns that aren't country-specific
#         if col_name.startswith("g_"):
#             continue

#         # Check if this is a country-specific column (nXX_)
#         is_country_column = False
#         matched_country = False

#         # Look for pattern nXX_ which would indicate a country-specific column
#         for i in range(len(col_name) - 3):
#             if (
#                 col_name[i : i + 1].lower() == "n"
#                 and len(col_name) > i + 3
#                 and col_name[i + 3 : i + 4] == "_"
#             ):
#                 country_code = col_name[i + 1 : i + 3].lower()
#                 is_country_column = True
#                 if country_code in normalized_codes:
#                     matched_country = True
#                     break

#         # If it's a country column but doesn't match our list, flag for removal
#         if is_country_column and not matched_country:
#             rows_to_remove.append(idx)

#     # Filter out rows for countries not in our list
#     if rows_to_remove:
#         return lookup_df.drop(rows_to_remove)

#     # return lookup_df
# def filter_lookup_by_country_codes(
#     lookup_df: pd.DataFrame, national_codes: list = None
# ) -> pd.DataFrame:
#     """
#     Filter lookup DataFrame to include only:
#     1. Global columns (prefixed with 'g_')
#     2. General columns (not country-specific)
#     3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)

#     If no national_codes are provided, ALL country-specific columns are filtered out.

#     Args:
#         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
#         national_codes (list, optional): List of ISO2 country codes to include.
#             If None, all country-specific columns are removed.

#     Returns:
#         pd.DataFrame: Filtered lookup DataFrame
#     """

#     # Normalize national_codes to lowercase for case-insensitive comparison
#     if national_codes:
#         normalized_codes = [
#             code.lower() for code in national_codes if isinstance(code, str)
#         ]
#     else:
#         normalized_codes = []

#     # Keep track of rows to remove
#     rows_to_remove = []

#     # Process each row in the lookup DataFrame
#     for idx, row in lookup_df.iterrows():
#         col_name = row["name"]

#         # Skip if not a column name entry
#         if pd.isna(col_name):
#             continue

#         # Always keep global columns (g_) and general columns
#         if col_name.startswith("g_"):
#             continue

#         # Check if this is a country-specific column (nXX_)
#         is_country_column = False
#         matched_country = False

#         # Look for pattern nXX_ which indicates a country-specific column
#         for i in range(len(col_name) - 3):
#             if (
#                 col_name[i : i + 1].lower() == "n"
#                 and len(col_name) > i + 3
#                 and col_name[i + 3 : i + 4] == "_"
#             ):
#                 country_code = col_name[i + 1 : i + 3].lower()
#                 is_country_column = True

#                 # Only match if we have national_codes AND this country is in the list
#                 if national_codes and country_code in normalized_codes:
#                     matched_country = True
#                     break

#         # Remove country-specific columns that don't match our criteria:
#         # - If no national_codes provided: remove ALL country columns
#         # - If national_codes provided: remove country columns NOT in the list
#         if is_country_column and not matched_country:
#             rows_to_remove.append(idx)

#     # Filter out flagged rows
#     if rows_to_remove:
#         print(f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria")
#         filtered_df = lookup_df.drop(rows_to_remove)

#     # Filter out flagged rows
#     if rows_to_remove:
#         # Create detailed debug info
#         removed_rows_info = []
#         for idx in rows_to_remove:
#             row_name = lookup_df.loc[idx, "name"]
#             removed_rows_info.append({
#                 'index': idx,
#                 'name': row_name
#             })

#         # Extract just the column names for easy viewing
#         removed_column_names = [info['name'] for info in removed_rows_info]

#         print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
#         print(f"Removed column names: {removed_column_names}")
#         return filtered_df

#     return lookup_df


def filter_lookup_by_country_codes(
    lookup_df: pd.DataFrame, filter_col, national_codes: list = None
):
    """Filter by actual ISO2 column values instead of column name patterns"""

    if not national_codes:
        # Remove all rows with country codes
        rows_with_country_codes = ~lookup_df[filter_col].isna()
        removed_names = lookup_df[rows_with_country_codes]["name"].tolist()
        logger.debug(
            f"No national codes provided - removing {len(removed_names)} rows with country codes"
        )
        logger.debug(f"Removed column names: {removed_names}")
        return lookup_df[lookup_df[filter_col].isna()]

    logger.debug(f"Filtering for national codes: {national_codes}")
    logger.debug(f"Total rows before filtering: {len(lookup_df)}")

    # Keep rows with no country code (global) OR matching country codes
    normalized_codes = [code.lower() for code in national_codes]

    mask = (
        lookup_df[filter_col].isna()  # Global datasets
        | lookup_df[filter_col].str.lower().isin(normalized_codes)  # Matching countries
    )

    logger.debug(
        f"Filtering lookup by country codes: {national_codes}, keeping {mask.sum()} rows"
    )

    return lookup_df[mask]
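
A behavior sketch for the live filter above (the row names and codes are illustrative, not taken from the lookup tables):

    lookup = pd.DataFrame(
        {
            "name": ["g_treecover", "ci_cocoa", "br_soy"],  # hypothetical rows
            "ISO2_code": [None, "CI", "BR"],
        }
    )
    filter_lookup_by_country_codes(lookup, "ISO2_code", ["ci"])["name"].tolist()
    # ['g_treecover', 'ci_cocoa']: rows with no code are kept, non-matching codes dropped
    filter_lookup_by_country_codes(lookup, "ISO2_code")["name"].tolist()
    # ['g_treecover']: with no national_codes, every country-specific row is removed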