openforis-whisp 2.0.0a6__py3-none-any.whl → 2.0.0b1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,495 +1,696 @@
- # !pip install pandera[io] # special version used
- import pandera as pa
- import pandas as pd
- import os
- import logging
- from pathlib import Path  # Add this import
-
- from openforis_whisp.logger import StdoutLogger, FileLogger
-
- from openforis_whisp.pd_schemas import data_lookup_type
-
-
- from openforis_whisp.parameters.config_runtime import (
-     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
-     DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
- )
-
- logger = StdoutLogger(__name__)
-
-
- # Dictionary to cache schema and modification times for multiple files
- cached_schema = None
- cached_file_mtimes = {}
-
-
- def validate_dataframe_using_lookups(
-     df_stats: pd.DataFrame, file_paths: list = None, national_codes: list = None
- ) -> pd.DataFrame:
-     """
-     Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.
-     Optionally filter columns by country code.
-
-     Args:
-         df_stats (pd.DataFrame): The DataFrame to validate.
-         file_paths (list): List of paths to schema files.
-         national_codes (list, optional): List of ISO2 country codes to include.
-
-     Returns:
-         pd.DataFrame: The validated DataFrame.
-     """
-     # Load the schema
-     schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
-
-     # Validate the DataFrame
-     validated_df = validate_dataframe(df_stats, schema)
-
-     return validated_df
-
-
- def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
-     """Load schema if files changed OR if national_codes changed"""
-
-     if file_paths is None:
-         file_paths = [
-             DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
-             DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
-         ]
-
-     # Include national_codes in cache key (including None case)
-     cache_key_parts = []
-     for file_path in file_paths:
-         if Path(file_path).exists():
-             mtime = Path(file_path).stat().st_mtime
-             cache_key_parts.append(f"{file_path}:{mtime}")
-         else:
-             cache_key_parts.append(f"{file_path}:missing")
-
-     # Always include national_codes in cache key (even if None)
-     national_codes_key = (
-         str(sorted(national_codes)) if national_codes else "no_countries"
-     )
-     cache_key_parts.append(f"national_codes:{national_codes_key}")
-
-     current_cache_key = "|".join(cache_key_parts)
-
-     # Check cache
-     if (
-         not hasattr(load_schema_if_any_file_changed, "_cached_schema")
-         or not hasattr(load_schema_if_any_file_changed, "_last_cache_key")
-         or load_schema_if_any_file_changed._last_cache_key != current_cache_key
-     ):
-
-         print(f"Creating schema for national_codes: {national_codes}")
-
-         # Load and combine lookup files
-         combined_lookup_df = append_csvs_to_dataframe(file_paths)
-
-         # ALWAYS filter by national codes (even if None - this removes all country columns)
-         filtered_lookup_df = filter_lookup_by_country_codes(
-             lookup_df=combined_lookup_df,
-             filter_col="ISO2_code",
-             national_codes=national_codes,
-         )
-
-         # Create schema from filtered lookup
-         schema = create_schema_from_dataframe(filtered_lookup_df)
-
-         # Cache the results
-         load_schema_if_any_file_changed._cached_schema = schema
-         load_schema_if_any_file_changed._last_cache_key = current_cache_key
-
-         return schema
-     else:
-         print(f"Using cached schema for national_codes: {national_codes}")
-         return load_schema_if_any_file_changed._cached_schema
-
-
- def validate_dataframe(
-     df_stats: pd.DataFrame, schema: pa.DataFrameSchema
- ) -> pd.DataFrame:
-     """Validate the DataFrame against the given schema, reorder columns to match schema order, and list missing columns.
-
-     Args:
-         schema (pa.DataFrameSchema): The schema to validate against.
-         df_stats (pd.DataFrame): The DataFrame to validate.
-         required_false (bool): If True, sets all columns in the schema as optional (required=False).
-
-     Returns:
-         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
-     """
-     log_missing_columns(df_stats, schema)
-
-     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
-
-     # Try to automatically coerce the DataFrame to match the schema types
-     try:
-         validated_df = schema(df_stats)
-     except pa.errors.SchemaError as e:
-         print("Error during validation:", e)
-         # Return None or raise the error if validation fails
-         return None  # or raise e
-
-     # Reorder the validated DataFrame to match the schema's column order
-     validated_df = validated_df.reindex(schema.columns.keys(), axis=1)
-
-     return validated_df
-
-
- def append_csvs_to_dataframe(csv_paths):
-     """
-     Appends multiple CSV files into a single Pandas DataFrame.
-
-     Args:
-     - csv_paths (list of str): List of paths to CSV files to append.
-
-     Returns:
-     - pd.DataFrame: Combined DataFrame containing data from all provided CSV files.
-
-     Raises:
-     - ValueError: If any CSV file cannot be read.
-     """
-
-     combined_df = pd.DataFrame()  # Initialize an empty DataFrame
-
-     for path in csv_paths:
-         try:
-             # Read the CSV file into a DataFrame
-             df = pd.read_csv(path)
-             # Append to the combined DataFrame
-             combined_df = pd.concat([combined_df, df], ignore_index=True)
-         except Exception as e:
-             raise ValueError(f"Error reading {path}: {e}")
-
-     return combined_df
-
-
- def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
-     """Create a Pandera schema from a DataFrame containing schema information."""
-
-     if schema_df.empty:
-         raise ValueError("The input DataFrame is empty.")
-
-     required_columns = ["name", "col_type", "is_nullable", "is_required"]
-     missing_columns = [col for col in required_columns if col not in schema_df.columns]
-     if missing_columns:
-         raise ValueError(f"Missing columns in schema DataFrame: {missing_columns}")
-
-     # print("Schema DataFrame columns:", schema_df.columns)
-
-     # Sort DataFrame by 'order' if it exists
-     if "order" in schema_df.columns:
-         schema_df = schema_df.sort_values(by="order")
-
-     # Remove rows where 'exclude_from_output' equals 1, if that column exists
-     if "exclude_from_output" in schema_df.columns:
-         schema_df = schema_df[schema_df["exclude_from_output"] != 1]
-
-     # Create a dictionary to hold the column schema
-     schema_dict = {}
-     for _, row in schema_df.iterrows():
-         col_name = row["name"]
-         col_type = row["col_type"]
-         is_nullable = row["is_nullable"] in (1, "1", True, "True")
-         is_required = row["is_required"] in (1, "1", True, "True")
-
-         # print(
-         #     f"Processing column: {col_name}, Type: {col_type}, Nullable: {is_nullable}, Required: {is_required}"
-         # )
-
-         # Map DataFrame types to Pandera types
-         if col_type == "int64":
-             schema_dict[col_name] = pa.Column(
-                 pa.Int64, nullable=is_nullable, required=is_required
-             )
-         elif col_type == "int":
-             schema_dict[col_name] = pa.Column(
-                 pa.Int, nullable=is_nullable, required=is_required
-             )
-         elif col_type == "string":
-             schema_dict[col_name] = pa.Column(
-                 pa.String, nullable=is_nullable, required=is_required
-             )
-         elif col_type == "float32":
-             schema_dict[col_name] = pa.Column(
-                 pa.Float32, nullable=is_nullable, required=is_required
-             )
-         elif col_type == "float64":
-             schema_dict[col_name] = pa.Column(
-                 pa.Float64, nullable=is_nullable, required=is_required
-             )
-         elif col_type == "bool":
-             schema_dict[col_name] = pa.Column(
-                 pa.Bool, nullable=is_nullable, required=is_required
-             )
-         else:
-             raise ValueError(f"Unsupported type: {col_type}")
-
-     # Create and return the DataFrame schema with coercion enabled
-     schema = pa.DataFrameSchema(
-         schema_dict,
-         strict=False,
-         unique_column_names=True,
-         add_missing_columns=True,
-         coerce=True,
-     )
-
-     return schema
-
-
- def setup_logger(name):
-     # Create and configure logger
-     logging.basicConfig(level=logging.INFO)
-     logger = logging.getLogger(name)
-     return logger
-
-
- def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
-     # Initialize the logger
-     logger = setup_logger(__name__)
-
-     # Extract the expected columns from the DataFrameSchema
-     template_columns = template_schema.columns.keys()
-     df_stats_columns = df_stats.columns
-
-     # Find missing columns
-     missing_in_template = [
-         col for col in df_stats_columns if col not in template_columns
-     ]
-     missing_in_stats = [col for col in template_columns if col not in df_stats_columns]
-
-     # Log results for missing columns in df_stats
-     if missing_in_template:
-         logger.warning(
-             f"The following columns from the results dataframe did not match any columns in the schema: \n{', '.join(missing_in_template)}"
-         )
-     else:
-         logger.info("All columns from dataframe found in the schema.")
-
-     # Log results for missing columns in template_df
-     if missing_in_stats:
-         logger.warning(
-             f"The following columns in the schema did not match any columns from the results dataframe: \n{', '.join(missing_in_stats)}"
-         )
-     else:
-         logger.info("All columns from the schema found in the results dataframe.")
-
-
- def setup_logger(name):
-     """
-     Set up a logger with a specific name to avoid duplicate logs.
-     """
-     logger = logging.getLogger(name)
-     if not logger.hasHandlers():
-         # Create handlers only if there are none
-         stdout_handler = logging.StreamHandler()
-         file_handler = logging.FileHandler("missing_columns.log")
-
-         # Set levels
-         stdout_handler.setLevel(logging.WARNING)
-         file_handler.setLevel(logging.WARNING)
-
-         # Create formatter and add it to the handlers
-         formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
-         stdout_handler.setFormatter(formatter)
-         file_handler.setFormatter(formatter)
-
-         # Add handlers to the logger
-         logger.addHandler(stdout_handler)
-         logger.addHandler(file_handler)
-
-     return logger
-
-
- # def filter_lookup_by_country_codes(
- #     lookup_df: pd.DataFrame, national_codes: list
- # ) -> pd.DataFrame:
- #     """
- #     Filter lookup DataFrame to include only:
- #     1. Global columns (prefixed with 'g_')
- #     2. General columns (not country-specific)
- #     3. Country-specific columns matching the provided ISO2 codes
-
- #     Args:
- #         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
- #         national_codes (list): List of ISO2 country codes to include
-
- #     Returns:
- #         pd.DataFrame: Filtered lookup DataFrame
- #     """
- #     if not national_codes:
- #         return lookup_df
-
- #     # Normalize national_codes to lowercase for case-insensitive comparison
- #     normalized_codes = [
- #         code.lower() for code in national_codes if isinstance(code, str)
- #     ]
-
- #     # Keep track of rows to filter out
- #     rows_to_remove = []
-
- #     # Process each row in the lookup DataFrame
- #     for idx, row in lookup_df.iterrows():
- #         col_name = row["name"]
-
- #         # Skip if not a column name entry
- #         if pd.isna(col_name):
- #             continue
-
- #         # Always keep global columns (g_) and columns that aren't country-specific
- #         if col_name.startswith("g_"):
- #             continue
-
- #         # Check if this is a country-specific column (nXX_)
- #         is_country_column = False
- #         matched_country = False
-
- #         # Look for pattern nXX_ which would indicate a country-specific column
- #         for i in range(len(col_name) - 3):
- #             if (
- #                 col_name[i : i + 1].lower() == "n"
- #                 and len(col_name) > i + 3
- #                 and col_name[i + 3 : i + 4] == "_"
- #             ):
- #                 country_code = col_name[i + 1 : i + 3].lower()
- #                 is_country_column = True
- #                 if country_code in normalized_codes:
- #                     matched_country = True
- #                 break
-
- #         # If it's a country column but doesn't match our list, flag for removal
- #         if is_country_column and not matched_country:
- #             rows_to_remove.append(idx)
-
- #     # Filter out rows for countries not in our list
- #     if rows_to_remove:
- #         return lookup_df.drop(rows_to_remove)
-
- #     return lookup_df
- # def filter_lookup_by_country_codes(
- #     lookup_df: pd.DataFrame, national_codes: list = None
- # ) -> pd.DataFrame:
- #     """
- #     Filter lookup DataFrame to include only:
- #     1. Global columns (prefixed with 'g_')
- #     2. General columns (not country-specific)
- #     3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
-
- #     If no national_codes are provided, ALL country-specific columns are filtered out.
-
- #     Args:
- #         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
- #         national_codes (list, optional): List of ISO2 country codes to include.
- #             If None, all country-specific columns are removed.
-
- #     Returns:
- #         pd.DataFrame: Filtered lookup DataFrame
- #     """
-
- #     # Normalize national_codes to lowercase for case-insensitive comparison
- #     if national_codes:
- #         normalized_codes = [
- #             code.lower() for code in national_codes if isinstance(code, str)
- #         ]
- #     else:
- #         normalized_codes = []
-
- #     # Keep track of rows to remove
- #     rows_to_remove = []
-
- #     # Process each row in the lookup DataFrame
- #     for idx, row in lookup_df.iterrows():
- #         col_name = row["name"]
-
- #         # Skip if not a column name entry
- #         if pd.isna(col_name):
- #             continue
-
- #         # Always keep global columns (g_) and general columns
- #         if col_name.startswith("g_"):
- #             continue
-
- #         # Check if this is a country-specific column (nXX_)
- #         is_country_column = False
- #         matched_country = False
-
- #         # Look for pattern nXX_ which indicates a country-specific column
- #         for i in range(len(col_name) - 3):
- #             if (
- #                 col_name[i : i + 1].lower() == "n"
- #                 and len(col_name) > i + 3
- #                 and col_name[i + 3 : i + 4] == "_"
- #             ):
- #                 country_code = col_name[i + 1 : i + 3].lower()
- #                 is_country_column = True
-
- #                 # Only match if we have national_codes AND this country is in the list
- #                 if national_codes and country_code in normalized_codes:
- #                     matched_country = True
- #                 break
-
- #         # Remove country-specific columns that don't match our criteria:
- #         # - If no national_codes provided: remove ALL country columns
- #         # - If national_codes provided: remove country columns NOT in the list
- #         if is_country_column and not matched_country:
- #             rows_to_remove.append(idx)
-
- #     # Filter out flagged rows
- #     if rows_to_remove:
- #         print(f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria")
- #         filtered_df = lookup_df.drop(rows_to_remove)
-
- #     # Filter out flagged rows
- #     if rows_to_remove:
- #         # Create detailed debug info
- #         removed_rows_info = []
- #         for idx in rows_to_remove:
- #             row_name = lookup_df.loc[idx, "name"]
- #             removed_rows_info.append({
- #                 'index': idx,
- #                 'name': row_name
- #             })
-
- #         # Extract just the column names for easy viewing
- #         removed_column_names = [info['name'] for info in removed_rows_info]
-
-
- #         print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
- #         print(f"Removed column names: {removed_column_names}")
- #         return filtered_df
-
- #     return lookup_df
-
-
- def filter_lookup_by_country_codes(
-     lookup_df: pd.DataFrame, filter_col, national_codes: list = None
- ):
-     """Filter by actual ISO2 column values instead of column name patterns"""
-
-     if not national_codes:
-         # Remove all rows with country codes
-         rows_with_country_codes = ~lookup_df[filter_col].isna()
-         removed_names = lookup_df[rows_with_country_codes]["name"].tolist()
-         logger.debug(
-             f"No national codes provided - removing {len(removed_names)} rows with country codes"
-         )
-         logger.debug(f"Removed column names: {removed_names}")
-         return lookup_df[lookup_df[filter_col].isna()]
-
-     logger.debug(f"Filtering for national codes: {national_codes}")
-     logger.debug(f"Total rows before filtering: {len(lookup_df)}")
-
-     # Keep rows with no country code (global) OR matching country codes
-     normalized_codes = [code.lower() for code in national_codes]
-
-     mask = lookup_df[filter_col].isna() | lookup_df[  # Global datasets
-         filter_col
-     ].str.lower().isin(
-         normalized_codes
-     )  # Matching countries
-
-     logger.debug(
-         f"Filtering lookup by country codes: {national_codes}, keeping {mask.sum()} rows"
-     )
-
-     return lookup_df[mask]
+ # !pip install pandera[io] # special version used
+ import pandera as pa
+ import pandas as pd
+ import os
+ import logging
+ from pathlib import Path
+
+ from openforis_whisp.logger import StdoutLogger, FileLogger
+
+ from openforis_whisp.pd_schemas import data_lookup_type
+
+
+ from openforis_whisp.parameters.config_runtime import (
+     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
+     DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
+ )
+
+ logger = StdoutLogger(__name__)
+
+
+ # Dictionary to cache schema and modification times for multiple files
+ cached_schema = None
+ cached_file_mtimes = {}
+
+
+ def validate_dataframe_using_lookups(
+     df_stats: pd.DataFrame, file_paths: list = None, national_codes: list = None
+ ) -> pd.DataFrame:
+     """
+     Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.
+     Optionally filter columns by country code.
+
+     Args:
+         df_stats (pd.DataFrame): The DataFrame to validate.
+         file_paths (list): List of paths to schema files.
+         national_codes (list, optional): List of ISO2 country codes to include.
+
+     Returns:
+         pd.DataFrame: The validated DataFrame.
+     """
+
+     # Load the schema
+     schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
+
+     # Validate the DataFrame
+     validated_df = validate_dataframe(df_stats, schema)
+
+     return validated_df
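For orientation, a minimal usage sketch of the function above (editorial example, not part of the package; the column names are hypothetical, and it assumes the packaged default lookup tables are present):

df = pd.DataFrame({"plot_id": [1, 2], "Area": [0.5, 1.2]})  # hypothetical columns
validated = validate_dataframe_using_lookups(df, national_codes=["CI"])
# columns are coerced and ordered per the lookup-derived schema; mismatches are logged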
+
+
+ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
+     """Load schema if files changed OR if national_codes changed"""
+
+     if file_paths is None:
+         file_paths = [
+             DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
+             DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
+         ]
+
+     # Include national_codes in cache key (including None case)
+     cache_key_parts = []
+     for file_path in file_paths:
+         if Path(file_path).exists():
+             mtime = Path(file_path).stat().st_mtime
+             cache_key_parts.append(f"{file_path}:{mtime}")
+         else:
+             cache_key_parts.append(f"{file_path}:missing")
+
+     # Always include national_codes in cache key (even if None)
+     national_codes_key = (
+         str(sorted(national_codes)) if national_codes else "no_countries"
+     )
+     cache_key_parts.append(f"national_codes:{national_codes_key}")
+
+     current_cache_key = "|".join(cache_key_parts)
+
+     # Check cache
+     if (
+         not hasattr(load_schema_if_any_file_changed, "_cached_schema")
+         or not hasattr(load_schema_if_any_file_changed, "_last_cache_key")
+         or load_schema_if_any_file_changed._last_cache_key != current_cache_key
+     ):
+
+         print(f"Creating schema for national_codes: {national_codes}")
+
+         # Load and combine lookup files
+         combined_lookup_df = append_csvs_to_dataframe(file_paths)
+
+         # ALWAYS filter by national codes (even if None - this removes all country columns)
+         filtered_lookup_df = filter_lookup_by_country_codes(
+             lookup_df=combined_lookup_df,
+             filter_col="ISO2_code",
+             national_codes=national_codes,
+         )
+
+         # Create schema from filtered lookup
+         schema = create_schema_from_dataframe(filtered_lookup_df)
+
+         # Cache the results
+         load_schema_if_any_file_changed._cached_schema = schema
+         load_schema_if_any_file_changed._last_cache_key = current_cache_key
+
+         return schema
+     else:
+         print(f"Using cached schema for national_codes: {national_codes}")
+         return load_schema_if_any_file_changed._cached_schema
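The function-attribute cache above keys on each lookup file's mtime plus the sorted country list, so repeated calls with the same inputs reuse one schema object. A small illustration (sketch; assumes the default lookup files exist, print output paraphrased):

schema1 = load_schema_if_any_file_changed(national_codes=["BR"])  # "Creating schema ..."
schema2 = load_schema_if_any_file_changed(national_codes=["BR"])  # "Using cached schema ..."
assert schema1 is schema2
schema3 = load_schema_if_any_file_changed(national_codes=["BR", "CO"])  # new cache key -> rebuilt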
+
+
+ def validate_dataframe(
+     df_stats: pd.DataFrame, schema: pa.DataFrameSchema
+ ) -> pd.DataFrame:
+     """Validate the DataFrame against the given schema, reorder columns to match schema order, and list missing columns.
+
+     Args:
+         df_stats (pd.DataFrame): The DataFrame to validate.
+         schema (pa.DataFrameSchema): The schema to validate against.
+
+     Returns:
+         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
+     """
+     log_missing_columns(df_stats, schema)
+
+     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
+
+     # Try to automatically coerce the DataFrame to match the schema types
+     try:
+         validated_df = schema(df_stats)
+     except pa.errors.SchemaError as e:
+         print("Error during validation:", e)
+         # Return None or raise the error if validation fails
+         return None  # or raise e
+
+     # Reorder the validated DataFrame to match the schema's column order
+     validated_df = validated_df.reindex(schema.columns.keys(), axis=1)
+
+     return validated_df
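A self-contained sketch of the validate-then-reorder behaviour, using a hand-built schema rather than the lookup-derived one:

toy_schema = pa.DataFrameSchema(
    {
        "b": pa.Column(pa.Float64, nullable=True),
        "a": pa.Column(pa.Int64),
    },
    coerce=True,
    add_missing_columns=True,
)
df = pd.DataFrame({"a": ["1", "2"]})      # wrong dtype, column "b" absent
out = validate_dataframe(df, toy_schema)  # coerces "a" to Int64, adds "b" as NA, orders columns b, a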
+
+
+ def append_csvs_to_dataframe(csv_paths):
+     """
+     Appends multiple CSV files into a single Pandas DataFrame.
+
+     Args:
+     - csv_paths (list of str): List of paths to CSV files to append.
+
+     Returns:
+     - pd.DataFrame: Combined DataFrame containing data from all provided CSV files.
+
+     Raises:
+     - ValueError: If any CSV file cannot be read.
+     """
+
+     combined_df = pd.DataFrame()  # Initialize an empty DataFrame
+
+     for path in csv_paths:
+         try:
+             # Read the CSV file into a DataFrame
+             df = pd.read_csv(path)
+             # Append to the combined DataFrame
+             combined_df = pd.concat([combined_df, df], ignore_index=True)
+         except Exception as e:
+             raise ValueError(f"Error reading {path}: {e}")
+
+     return combined_df
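For example, two small lookup-style CSVs written to a temporary directory round-trip like this (sketch; file names are arbitrary):

import tempfile

tmp = tempfile.mkdtemp()
p1, p2 = os.path.join(tmp, "a.csv"), os.path.join(tmp, "b.csv")
pd.DataFrame({"name": ["col1"], "col_type": ["int64"]}).to_csv(p1, index=False)
pd.DataFrame({"name": ["col2"], "col_type": ["string"]}).to_csv(p2, index=False)
combined = append_csvs_to_dataframe([p1, p2])  # two rows; NaN fills any non-shared headers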
+
+
+ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
+     """Create a Pandera schema from a DataFrame containing schema information."""
+
+     if schema_df.empty:
+         raise ValueError("The input DataFrame is empty.")
+
+     required_columns = ["name", "col_type", "is_nullable", "is_required"]
+     missing_columns = [col for col in required_columns if col not in schema_df.columns]
+     if missing_columns:
+         raise ValueError(f"Missing columns in schema DataFrame: {missing_columns}")
+
+     # print("Schema DataFrame columns:", schema_df.columns)
+
+     # Sort DataFrame by 'order' if it exists
+     if "order" in schema_df.columns:
+         schema_df = schema_df.sort_values(by="order")
+
+     # Remove rows where 'exclude_from_output' equals 1, if that column exists
+     if "exclude_from_output" in schema_df.columns:
+         schema_df = schema_df[schema_df["exclude_from_output"] != 1]
+
+     # Create a dictionary to hold the column schema
+     schema_dict = {}
+     for _, row in schema_df.iterrows():
+         col_name = row["name"]
+         col_type = row["col_type"]
+         is_nullable = row["is_nullable"] in (1, "1", True, "True")
+         is_required = row["is_required"] in (1, "1", True, "True")
+
+         # print(
+         #     f"Processing column: {col_name}, Type: {col_type}, Nullable: {is_nullable}, Required: {is_required}"
+         # )
+
+         # Map DataFrame types to Pandera types
+         if col_type == "int64":
+             schema_dict[col_name] = pa.Column(
+                 pa.Int64, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "int":
+             schema_dict[col_name] = pa.Column(
+                 pa.Int, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "string":
+             schema_dict[col_name] = pa.Column(
+                 pa.String, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "float32":
+             schema_dict[col_name] = pa.Column(
+                 pa.Float32, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "float64":
+             schema_dict[col_name] = pa.Column(
+                 pa.Float64, nullable=is_nullable, required=is_required
+             )
+         elif col_type == "bool":
+             schema_dict[col_name] = pa.Column(
+                 pa.Bool, nullable=is_nullable, required=is_required
+             )
+         else:
+             raise ValueError(f"Unsupported type: {col_type}")
+
+     # Create and return the DataFrame schema with coercion enabled
+     schema = pa.DataFrameSchema(
+         schema_dict,
+         strict=False,
+         unique_column_names=True,
+         add_missing_columns=True,
+         coerce=True,
+     )
+
+     return schema
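A minimal lookup row set and the schema it yields (sketch; only the four required lookup columns are shown, values per the 1/0 truthiness convention above):

lookup = pd.DataFrame(
    {
        "name": ["plot_id", "area_ha"],
        "col_type": ["int64", "float64"],
        "is_nullable": [0, 1],
        "is_required": [1, 0],
    }
)
schema = create_schema_from_dataframe(lookup)
# schema.columns["area_ha"].nullable is True; validation coerces dtypes on use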
+
+
+ # def setup_logger(name):
+ #     # Create and configure logger
+ #     logging.basicConfig(level=logging.INFO)
+ #     logger = logging.getLogger(name)
+ #     return logger
+
+
+ def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+     # Initialize the logger
+     logger = setup_logger(__name__)
+
+     # Extract the expected columns from the DataFrameSchema
+     template_columns = template_schema.columns.keys()
+     df_stats_columns = df_stats.columns
+
+     # Find missing columns
+     missing_in_template = [
+         col for col in df_stats_columns if col not in template_columns
+     ]
+     missing_in_stats = [col for col in template_columns if col not in df_stats_columns]
+
+     # Log results for missing columns in df_stats
+     if missing_in_template:
+         logger.warning(
+             f"The following columns from the results dataframe did not match any columns in the schema: \n{', '.join(missing_in_template)}"
+         )
+     else:
+         logger.info("All columns from dataframe found in the schema.")
+
+     # Log results for missing columns in template_df
+     if missing_in_stats:
+         logger.warning(
+             f"The following columns in the schema did not match any columns from the results dataframe: \n{', '.join(missing_in_stats)}"
+         )
+     else:
+         logger.info("All columns from the schema found in the results dataframe.")
+
+
+ def setup_logger(name):
+     """
+     Set up a logger with a specific name to avoid duplicate logs.
+     """
+     logger = logging.getLogger(name)
+     if not logger.hasHandlers():
+         # Create handlers only if there are none
+         stdout_handler = logging.StreamHandler()
+         file_handler = logging.FileHandler("missing_columns.log")
+
+         # Set levels
+         stdout_handler.setLevel(logging.WARNING)
+         file_handler.setLevel(logging.WARNING)
+
+         # Create formatter and add it to the handlers
+         formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+         stdout_handler.setFormatter(formatter)
+         file_handler.setFormatter(formatter)
+
+         # Add handlers to the logger
+         logger.addHandler(stdout_handler)
+         logger.addHandler(file_handler)
+
+     return logger
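Because logging.getLogger returns one shared object per name and hasHandlers() also sees ancestor handlers, the guard above keeps repeated calls from stacking handlers. A quick sketch:

log_a = setup_logger("whisp.validation")
log_b = setup_logger("whisp.validation")  # same Logger object; handlers added only once
assert log_a is log_b and len(log_a.handlers) == 2  # assuming no pre-existing root/ancestor handlers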
+
+
+ # NOTE: this definition and the next are superseded by the final
+ # filter_lookup_by_country_codes definition further below.
+ def filter_lookup_by_country_codes(
+     lookup_df: pd.DataFrame, national_codes: list
+ ) -> pd.DataFrame:
+     """
+     Filter lookup DataFrame to include only:
+     1. Global columns (prefixed with 'g_')
+     2. General columns (not country-specific)
+     3. Country-specific columns matching the provided ISO2 codes
+
+     Args:
+         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+         national_codes (list): List of ISO2 country codes to include
+
+     Returns:
+         pd.DataFrame: Filtered lookup DataFrame
+     """
+     if not national_codes:
+         return lookup_df
+
+     # Normalize national_codes to lowercase for case-insensitive comparison
+     normalized_codes = [
+         code.lower() for code in national_codes if isinstance(code, str)
+     ]
+
+     # Keep track of rows to filter out
+     rows_to_remove = []
+
+     # Process each row in the lookup DataFrame
+     for idx, row in lookup_df.iterrows():
+         col_name = row["name"]
+
+         # Skip if not a column name entry
+         if pd.isna(col_name):
+             continue
+
+         # Always keep global columns (g_) and columns that aren't country-specific
+         if col_name.startswith("g_"):
+             continue
+
+         # Check if this is a country-specific column (nXX_)
+         is_country_column = False
+         matched_country = False
+
+         # Look for pattern nXX_ which would indicate a country-specific column
+         for i in range(len(col_name) - 3):
+             if (
+                 col_name[i : i + 1].lower() == "n"
+                 and len(col_name) > i + 3
+                 and col_name[i + 3 : i + 4] == "_"
+             ):
+                 country_code = col_name[i + 1 : i + 3].lower()
+                 is_country_column = True
+                 if country_code in normalized_codes:
+                     matched_country = True
+                 break
+
+         # If it's a country column but doesn't match our list, flag for removal
+         if is_country_column and not matched_country:
+             rows_to_remove.append(idx)
+
+     # Filter out rows for countries not in our list
+     if rows_to_remove:
+         return lookup_df.drop(rows_to_remove)
+
+     # No rows flagged: return the lookup unchanged
+     return lookup_df
+
+
+ def filter_lookup_by_country_codes(
+     lookup_df: pd.DataFrame, national_codes: list = None
+ ) -> pd.DataFrame:
+     """
+     Filter lookup DataFrame to include only:
+     1. Global columns (prefixed with 'g_')
+     2. General columns (not country-specific)
+     3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
+
+     If no national_codes are provided, ALL country-specific columns are filtered out.
+
+     Args:
+         lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
+         national_codes (list, optional): List of ISO2 country codes to include.
+             If None, all country-specific columns are removed.
+
+     Returns:
+         pd.DataFrame: Filtered lookup DataFrame
+     """
+
+     # Normalize national_codes to lowercase for case-insensitive comparison
+     if national_codes:
+         normalized_codes = [
+             code.lower() for code in national_codes if isinstance(code, str)
+         ]
+     else:
+         normalized_codes = []
+
+     # Keep track of rows to remove
+     rows_to_remove = []
+
+     # Process each row in the lookup DataFrame
+     for idx, row in lookup_df.iterrows():
+         col_name = row["name"]
+
+         # Skip if not a column name entry
+         if pd.isna(col_name):
+             continue
+
+         # Always keep global columns (g_) and general columns
+         if col_name.startswith("g_"):
+             continue
+
+         # Check if this is a country-specific column (nXX_)
+         is_country_column = False
+         matched_country = False
+
+         # Look for pattern nXX_ which indicates a country-specific column
+         for i in range(len(col_name) - 3):
+             if (
+                 col_name[i : i + 1].lower() == "n"
+                 and len(col_name) > i + 3
+                 and col_name[i + 3 : i + 4] == "_"
+             ):
+                 country_code = col_name[i + 1 : i + 3].lower()
+                 is_country_column = True
+
+                 # Only match if we have national_codes AND this country is in the list
+                 if national_codes and country_code in normalized_codes:
+                     matched_country = True
+                 break
+
+         # Remove country-specific columns that don't match our criteria:
+         # - If no national_codes provided: remove ALL country columns
+         # - If national_codes provided: remove country columns NOT in the list
+         if is_country_column and not matched_country:
+             rows_to_remove.append(idx)
+
+     # Filter out flagged rows
+     if rows_to_remove:
+         print(
+             f"Filtering out {len(rows_to_remove)} country-specific row(s) not matching criteria"
+         )
+         filtered_df = lookup_df.drop(rows_to_remove)
+         return filtered_df
+
+     # Variant with detailed debug info, kept for reference:
+     # if rows_to_remove:
+     #     # Create detailed debug info
+     #     removed_rows_info = []
+     #     for idx in rows_to_remove:
+     #         row_name = lookup_df.loc[idx, "name"]
+     #         removed_rows_info.append({
+     #             'index': idx,
+     #             'name': row_name
+     #         })
+
+     #     # Extract just the column names for easy viewing
+     #     removed_column_names = [info['name'] for info in removed_rows_info]
+
+     #     print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
+     #     print(f"Removed column names: {removed_column_names}")
+     #     return filtered_df
+
+     return lookup_df
+
+
+ def filter_lookup_by_country_codes(
+     lookup_df: pd.DataFrame, filter_col, national_codes: list = None
+ ):
+     """Filter by actual ISO2 column values instead of column name patterns"""
+
+     if not national_codes:
+         # Remove all rows with country codes
+         rows_with_country_codes = ~lookup_df[filter_col].isna()
+         removed_names = lookup_df[rows_with_country_codes]["name"].tolist()
+         logger.debug(
+             f"No national codes provided - removing {len(removed_names)} rows with country codes"
+         )
+         logger.debug(f"Removed column names: {removed_names}")
+         return lookup_df[lookup_df[filter_col].isna()]
+
+     logger.debug(f"Filtering for national codes: {national_codes}")
+     logger.debug(f"Total rows before filtering: {len(lookup_df)}")
+
+     # Keep rows with no country code (global) OR matching country codes
+     normalized_codes = [code.lower() for code in national_codes]
+
+     mask = lookup_df[filter_col].isna() | lookup_df[  # Global datasets
+         filter_col
+     ].str.lower().isin(
+         normalized_codes
+     )  # Matching countries
+
+     logger.debug(
+         f"Filtering lookup by country codes: {national_codes}, keeping {mask.sum()} rows"
+     )
+
+     return lookup_df[mask]
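A toy lookup table showing this value-based filter (sketch; the row names merely follow the file's own g_/nXX_ naming convention and are otherwise hypothetical):

lookup = pd.DataFrame(
    {
        "name": ["g_treecover", "n_ci_cocoa", "n_br_soy"],
        "ISO2_code": [None, "CI", "BR"],
    }
)
filter_lookup_by_country_codes(lookup, "ISO2_code")                         # keeps g_treecover only
filter_lookup_by_country_codes(lookup, "ISO2_code", national_codes=["ci"])  # keeps g_treecover + n_ci_cocoa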
+
+
+ def validate_dataframe_using_lookups_flexible(
+     df_stats: pd.DataFrame,
+     file_paths: list = None,
+     national_codes: list = None,
+     custom_bands=None,
+ ) -> pd.DataFrame:
+     """
+     Load schema and validate DataFrame while handling custom bands properly.
+
+     Parameters
+     ----------
+     df_stats : pd.DataFrame
+         DataFrame to validate
+     file_paths : list, optional
+         Schema file paths
+     national_codes : list, optional
+         Country codes for filtering
+     custom_bands : list or dict or None, optional
+         Custom band information:
+         - List: ['band1', 'band2'] - only preserves these specific bands
+         - Dict: {'band1': 'float64', 'band2': 'int64'} - validates these specific bands with types
+         - None: excludes ALL custom bands (strict mode)
+
+     Returns
+     -------
+     pd.DataFrame
+         Validated DataFrame with custom bands handled according to specification
+     """
+     # Load default schema
+     schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
+     schema_columns = list(schema.columns.keys())
+
+     # Identify extra columns
+     df_columns = df_stats.columns.tolist()
+     extra_columns = [col for col in df_columns if col not in schema_columns]
+     schema_only_columns = [col for col in df_columns if col in schema_columns]
+
+     if extra_columns:
+         logger.info(f"Found {len(extra_columns)} extra columns: {extra_columns}")
+
+         # Split DataFrame
+         df_schema_part = (
+             df_stats[schema_only_columns].copy()
+             if schema_only_columns
+             else pd.DataFrame()
+         )
+         df_extra_part = df_stats[extra_columns].copy()
+
+         # Validate schema columns if any exist
+         if not df_schema_part.empty:
+             try:
+                 validated_schema_part = validate_dataframe(df_schema_part, schema)
+             except Exception as e:
+                 logger.error(f"Schema validation failed: {e}")
+                 validated_schema_part = (
+                     df_schema_part  # Keep original if validation fails
+                 )
+             if validated_schema_part is None:
+                 # validate_dataframe returns None (rather than raising) on
+                 # schema errors, so guard against that case as well
+                 validated_schema_part = df_schema_part
+         else:
+             validated_schema_part = pd.DataFrame()
+
+         # Handle custom_bands=None
+         if custom_bands is None:
+             # STRICT MODE: Exclude all custom bands when None
+             logger.info("custom_bands=None: Excluding all custom bands (strict mode)")
+             # Return only the schema columns, no extra columns
+             return (
+                 validated_schema_part
+                 if not validated_schema_part.empty
+                 else pd.DataFrame()
+             )
+
+         # Otherwise, process custom bands as specified
+         df_extra_part = _process_custom_bands(df_extra_part, custom_bands)
+
+         # Combine results
+         if not validated_schema_part.empty and not df_extra_part.empty:
+             result = pd.concat([validated_schema_part, df_extra_part], axis=1)
+         elif not validated_schema_part.empty:
+             result = validated_schema_part
+         else:
+             result = df_extra_part
+
+         # Reorder: schema columns first, then extra columns
+         if not validated_schema_part.empty:
+             ordered_columns = [
+                 col for col in schema_columns if col in result.columns
+             ] + [col for col in df_extra_part.columns]
+             result = result[ordered_columns]
+
+         return result
+
+     else:
+         # No extra columns - use normal validation
+         return validate_dataframe(df_stats, schema)
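How the three custom_bands modes differ on a frame carrying one extra column (sketch; results_df stands in for a hypothetical Whisp output frame, and the default lookups are assumed available):

df = results_df.assign(my_band=[0.1, 0.2])  # hypothetical extra column
strict = validate_dataframe_using_lookups_flexible(df)                                       # my_band dropped
kept = validate_dataframe_using_lookups_flexible(df, custom_bands=["my_band"])               # my_band kept as-is
typed = validate_dataframe_using_lookups_flexible(df, custom_bands={"my_band": "float64"})   # kept and coerced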
+
+
+ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
+     """
+     Process custom bands according to user specifications.
+
+     Parameters
+     ----------
+     df_extra : pd.DataFrame
+         DataFrame with extra columns
+     custom_bands : list or dict
+         Custom band specifications
+
+     Returns
+     -------
+     pd.DataFrame
+         Processed DataFrame with custom bands
+     """
+     if isinstance(custom_bands, list):
+         # Just preserve specified columns as-is
+         custom_band_cols = [col for col in custom_bands if col in df_extra.columns]
+         if custom_band_cols:
+             logger.info(f"Preserving custom bands as-is: {custom_band_cols}")
+             return df_extra[custom_band_cols]
+         else:
+             logger.warning(
+                 f"None of the specified custom bands {custom_bands} found in DataFrame"
+             )
+             return df_extra
+
+     elif isinstance(custom_bands, dict):
+         # Apply type conversions
+         result_df = df_extra.copy()
+
+         for band_name, target_type in custom_bands.items():
+             if band_name in result_df.columns:
+                 try:
+                     if target_type == "float64":
+                         result_df[band_name] = pd.to_numeric(
+                             result_df[band_name], errors="coerce"
+                         ).astype("float64")
+                     elif target_type == "float32":
+                         result_df[band_name] = pd.to_numeric(
+                             result_df[band_name], errors="coerce"
+                         ).astype("float32")
+                     elif target_type == "int64":
+                         result_df[band_name] = pd.to_numeric(
+                             result_df[band_name], errors="coerce"
+                         ).astype("Int64")  # Nullable int
+                     elif target_type == "string":
+                         result_df[band_name] = result_df[band_name].astype("string")
+                     elif target_type == "bool":
+                         result_df[band_name] = result_df[band_name].astype("bool")
+
+                     logger.info(f"Converted {band_name} to {target_type}")
+
+                 except Exception as e:
+                     logger.warning(
+                         f"Failed to convert {band_name} to {target_type}: {e}"
+                     )
+             else:
+                 logger.warning(f"Custom band {band_name} not found in DataFrame")
+
+         return result_df
+
+     else:
+         # Unknown format, just return as-is
+         logger.warning(
+             f"Unknown custom_bands format: {type(custom_bands)}. Preserving all extra columns as-is."
+         )
+         return df_extra
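The dict form in isolation (sketch; column name hypothetical):

extra = pd.DataFrame({"my_band": ["1", "2", "x"]})
out = _process_custom_bands(extra, {"my_band": "int64"})
# pd.to_numeric(..., errors="coerce") turns "x" into NaN, so the column
# becomes nullable Int64 with a missing value in the last row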
+
+
+ # Re-definition of log_missing_columns (this later definition wins at import
+ # time): it logs through the module-level StdoutLogger defined at the top of
+ # this file instead of creating a second logger, avoiding duplicate output.
+ def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+     # Extract the expected columns from the DataFrameSchema
+     template_columns = list(template_schema.columns.keys())
+     df_stats_columns = df_stats.columns.tolist()
+
+     # Find missing and extra columns
+     missing_in_df = [col for col in template_columns if col not in df_stats_columns]
+     extra_in_df = [col for col in df_stats_columns if col not in template_columns]
+
+     # Log missing schema columns
+     if missing_in_df:
+         logger.warning(f"Missing expected schema columns: {missing_in_df}")
+     else:
+         logger.info("All expected schema columns found in DataFrame.")
+
+     # Log extra columns (will be preserved)
+     if extra_in_df:
+         logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
+     else:
+         logger.info("No extra columns found in DataFrame.")