openforis-whisp 1.0.0a1__py3-none-any.whl → 2.0.0a2__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import pandera as pa
3
3
  import pandas as pd
4
4
  import os
5
5
  import logging
6
-
6
+ from pathlib import Path # Add this import
7
7
 
8
8
  from openforis_whisp.logger import StdoutLogger, FileLogger
9
9
 
@@ -24,20 +24,22 @@ cached_file_mtimes = {}
24
24
 
25
25
 
26
26
  def validate_dataframe_using_lookups(
27
- df_stats: pd.DataFrame, file_paths: list = None
27
+ df_stats: pd.DataFrame, file_paths: list = None, national_codes: list = None
28
28
  ) -> pd.DataFrame:
29
29
  """
30
30
  Load the schema if any file in the list has changed and validate the DataFrame against the loaded schema.
31
+ Optionally filter columns by country code.
31
32
 
32
33
  Args:
33
34
  df_stats (pd.DataFrame): The DataFrame to validate.
34
35
  file_paths (list): List of paths to schema files.
36
+ national_codes (list, optional): List of ISO2 country codes to include.
35
37
 
36
38
  Returns:
37
39
  pd.DataFrame: The validated DataFrame.
38
40
  """
39
41
  # Load the schema
40
- schema = load_schema_if_any_file_changed(file_paths)
42
+ schema = load_schema_if_any_file_changed(file_paths, national_codes=national_codes)
41
43
 
42
44
  # Validate the DataFrame
43
45
  validated_df = validate_dataframe(df_stats, schema)
@@ -45,10 +47,8 @@ def validate_dataframe_using_lookups(
45
47
  return validated_df
46
48
 
47
49
 
48
- # NB uses default inputs. If you want to use custom inputs, you can pass them as arguments
49
- def load_schema_if_any_file_changed(file_paths):
50
- """Load schema only if any file in the list has changed."""
51
- global cached_schema, cached_file_mtimes
50
+ def load_schema_if_any_file_changed(file_paths=None, national_codes=None):
51
+ """Load schema if files changed OR if national_codes changed"""
52
52
 
53
53
  if file_paths is None:
54
54
  file_paths = [
@@ -56,43 +56,53 @@ def load_schema_if_any_file_changed(file_paths):
56
56
  DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
57
57
  ]
58
58
 
59
- # Flag to indicate if any file has changed
60
- schema_needs_update = False
61
-
62
- # Check each file's modification time
59
+ # Include national_codes in cache key (including None case)
60
+ cache_key_parts = []
63
61
  for file_path in file_paths:
64
- current_mtime = os.path.getmtime(file_path)
65
-
66
- # If the file is new or has been modified, mark schema for update
67
- if (
68
- file_path not in cached_file_mtimes
69
- or current_mtime != cached_file_mtimes[file_path]
70
- ):
71
- print(f"File {file_path} changed, updating schema...")
72
- schema_needs_update = True
73
- cached_file_mtimes[
74
- file_path
75
- ] = current_mtime # Update the modification time
76
-
77
- # If any file has changed, update the schema
78
- if schema_needs_update or cached_schema is None:
79
- print("Creating or updating schema based on changed files...")
80
- # You can combine the files as needed; here we assume one schema file
81
- # If you want to handle multiple schema files differently, adjust this
82
-
83
- # add checks on lookup inputs (i.e. a dataframe in type format: data_lookup_type)
84
- combined_lookup_df: data_lookup_type = append_csvs_to_dataframe(
85
- file_paths
86
- ) # concatonates input lookup files
87
-
88
- cached_schema = create_schema_from_dataframe(
89
- combined_lookup_df
90
- ) # create cached schema
62
+ if Path(file_path).exists():
63
+ mtime = Path(file_path).stat().st_mtime
64
+ cache_key_parts.append(f"{file_path}:{mtime}")
65
+ else:
66
+ cache_key_parts.append(f"{file_path}:missing")
91
67
 
92
- else:
93
- print("Using cached schema.")
68
+ # Always include national_codes in cache key (even if None)
69
+ national_codes_key = (
70
+ str(sorted(national_codes)) if national_codes else "no_countries"
71
+ )
72
+ cache_key_parts.append(f"national_codes:{national_codes_key}")
73
+
74
+ current_cache_key = "|".join(cache_key_parts)
75
+
76
+ # Check cache
77
+ if (
78
+ not hasattr(load_schema_if_any_file_changed, "_cached_schema")
79
+ or not hasattr(load_schema_if_any_file_changed, "_last_cache_key")
80
+ or load_schema_if_any_file_changed._last_cache_key != current_cache_key
81
+ ):
82
+
83
+ print(f"Creating schema for national_codes: {national_codes}")
84
+
85
+ # Load and combine lookup files
86
+ combined_lookup_df = append_csvs_to_dataframe(file_paths)
87
+
88
+ # ALWAYS filter by national codes (even if None - this removes all country columns)
89
+ filtered_lookup_df = filter_lookup_by_country_codes(
90
+ lookup_df=combined_lookup_df,
91
+ filter_col="ISO2_code",
92
+ national_codes=national_codes,
93
+ )
94
+
95
+ # Create schema from filtered lookup
96
+ schema = create_schema_from_dataframe(filtered_lookup_df)
94
97
 
95
- return cached_schema
98
+ # Cache the results
99
+ load_schema_if_any_file_changed._cached_schema = schema
100
+ load_schema_if_any_file_changed._last_cache_key = current_cache_key
101
+
102
+ return schema
103
+ else:
104
+ print(f"Using cached schema for national_codes: {national_codes}")
105
+ return load_schema_if_any_file_changed._cached_schema
96
106
 
97
107
 
98
108
  def validate_dataframe(
@@ -126,61 +136,6 @@ def validate_dataframe(
126
136
  return validated_df
127
137
 
128
138
 
129
- def load_schema_if_any_file_changed(file_paths):
130
- """Load schema only if any file in the list has changed."""
131
- global cached_schema, cached_file_mtimes
132
-
133
- if file_paths is None:
134
- file_paths = [
135
- DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
136
- DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
137
- ]
138
-
139
- # Flag to indicate if any file has changed
140
- schema_needs_update = False
141
-
142
- # Check each file's modification time
143
- for file_path in file_paths:
144
- current_mtime = os.path.getmtime(file_path)
145
-
146
- # If the file is new or has been modified, mark schema for update
147
- if (
148
- file_path not in cached_file_mtimes
149
- or current_mtime != cached_file_mtimes[file_path]
150
- ):
151
- print(f"File {file_path} changed, updating schema...")
152
- schema_needs_update = True
153
- cached_file_mtimes[
154
- file_path
155
- ] = current_mtime # Update the modification time
156
-
157
- # If any file has changed, update the schema
158
- if schema_needs_update or cached_schema is None:
159
- print("Creating or updating schema based on changed files...")
160
- # You can combine the files as needed; here we assume one schema file
161
- # If you want to handle multiple schema files differently, adjust this
162
-
163
- # add checks on lookup inputs (i.e. a dataframe in type format: data_lookup_type)
164
- combined_lookup_df: data_lookup_type = append_csvs_to_dataframe(
165
- file_paths
166
- ) # concatonates input lookup files
167
-
168
- cached_schema = create_schema_from_dataframe(
169
- combined_lookup_df
170
- ) # create cached schema
171
-
172
- else:
173
- print("Using cached schema.")
174
-
175
- return cached_schema
176
-
177
-
178
- # example code to convert schema to JSON format if want to export (note pandera[io] required)
179
- # cached_schema.to_yaml(output_file_path)
180
-
181
- # loaded_schema = io.from_yaml(output_file_path)
182
-
183
-
184
139
  def append_csvs_to_dataframe(csv_paths):
185
140
  """
186
141
  Appends multiple CSV files into a single Pandas DataFrame.
@@ -344,3 +299,197 @@ def setup_logger(name):
344
299
  logger.addHandler(file_handler)
345
300
 
346
301
  return logger
302
+
303
+
304
+ # def filter_lookup_by_country_codes(
305
+ # lookup_df: pd.DataFrame, national_codes: list
306
+ # ) -> pd.DataFrame:
307
+ # """
308
+ # Filter lookup DataFrame to include only:
309
+ # 1. Global columns (prefixed with 'g_')
310
+ # 2. General columns (not country-specific)
311
+ # 3. Country-specific columns matching the provided ISO2 codes
312
+
313
+ # Args:
314
+ # lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
315
+ # national_codes (list): List of ISO2 country codes to include
316
+
317
+ # Returns:
318
+ # pd.DataFrame: Filtered lookup DataFrame
319
+ # """
320
+ # if not national_codes:
321
+ # return lookup_df
322
+
323
+ # # Normalize national_codes to lowercase for case-insensitive comparison
324
+ # normalized_codes = [
325
+ # code.lower() for code in national_codes if isinstance(code, str)
326
+ # ]
327
+
328
+ # # Keep track of rows to filter out
329
+ # rows_to_remove = []
330
+
331
+ # # Process each row in the lookup DataFrame
332
+ # for idx, row in lookup_df.iterrows():
333
+ # col_name = row["name"]
334
+
335
+ # # Skip if not a column name entry
336
+ # if pd.isna(col_name):
337
+ # continue
338
+
339
+ # # Always keep global columns (g_) and columns that aren't country-specific
340
+ # if col_name.startswith("g_"):
341
+ # continue
342
+
343
+ # # Check if this is a country-specific column (nXX_)
344
+ # is_country_column = False
345
+ # matched_country = False
346
+
347
+ # # Look for pattern nXX_ which would indicate a country-specific column
348
+ # for i in range(len(col_name) - 3):
349
+ # if (
350
+ # col_name[i : i + 1].lower() == "n"
351
+ # and len(col_name) > i + 3
352
+ # and col_name[i + 3 : i + 4] == "_"
353
+ # ):
354
+ # country_code = col_name[i + 1 : i + 3].lower()
355
+ # is_country_column = True
356
+ # if country_code in normalized_codes:
357
+ # matched_country = True
358
+ # break
359
+
360
+ # # If it's a country column but doesn't match our list, flag for removal
361
+ # if is_country_column and not matched_country:
362
+ # rows_to_remove.append(idx)
363
+
364
+ # # Filter out rows for countries not in our list
365
+ # if rows_to_remove:
366
+ # return lookup_df.drop(rows_to_remove)
367
+
368
+ # # return lookup_df
369
+ # def filter_lookup_by_country_codes(
370
+ # lookup_df: pd.DataFrame, national_codes: list = None
371
+ # ) -> pd.DataFrame:
372
+ # """
373
+ # Filter lookup DataFrame to include only:
374
+ # 1. Global columns (prefixed with 'g_')
375
+ # 2. General columns (not country-specific)
376
+ # 3. Country-specific columns matching the provided ISO2 codes (if national_codes provided)
377
+
378
+ # If no national_codes are provided, ALL country-specific columns are filtered out.
379
+
380
+ # Args:
381
+ # lookup_df (pd.DataFrame): The lookup DataFrame used to create the schema
382
+ # national_codes (list, optional): List of ISO2 country codes to include.
383
+ # If None, all country-specific columns are removed.
384
+
385
+ # Returns:
386
+ # pd.DataFrame: Filtered lookup DataFrame
387
+ # """
388
+
389
+ # # Normalize national_codes to lowercase for case-insensitive comparison
390
+ # if national_codes:
391
+ # normalized_codes = [
392
+ # code.lower() for code in national_codes if isinstance(code, str)
393
+ # ]
394
+ # else:
395
+ # normalized_codes = []
396
+
397
+ # # Keep track of rows to remove
398
+ # rows_to_remove = []
399
+
400
+ # # Process each row in the lookup DataFrame
401
+ # for idx, row in lookup_df.iterrows():
402
+ # col_name = row["name"]
403
+
404
+ # # Skip if not a column name entry
405
+ # if pd.isna(col_name):
406
+ # continue
407
+
408
+ # # Always keep global columns (g_) and general columns
409
+ # if col_name.startswith("g_"):
410
+ # continue
411
+
412
+ # # Check if this is a country-specific column (nXX_)
413
+ # is_country_column = False
414
+ # matched_country = False
415
+
416
+ # # Look for pattern nXX_ which indicates a country-specific column
417
+ # for i in range(len(col_name) - 3):
418
+ # if (
419
+ # col_name[i : i + 1].lower() == "n"
420
+ # and len(col_name) > i + 3
421
+ # and col_name[i + 3 : i + 4] == "_"
422
+ # ):
423
+ # country_code = col_name[i + 1 : i + 3].lower()
424
+ # is_country_column = True
425
+
426
+ # # Only match if we have national_codes AND this country is in the list
427
+ # if national_codes and country_code in normalized_codes:
428
+ # matched_country = True
429
+ # break
430
+
431
+ # # Remove country-specific columns that don't match our criteria:
432
+ # # - If no national_codes provided: remove ALL country columns
433
+ # # - If national_codes provided: remove country columns NOT in the list
434
+ # if is_country_column and not matched_country:
435
+ # rows_to_remove.append(idx)
436
+
437
+ # # Filter out flagged rows
438
+ # if rows_to_remove:
439
+ # print(f"Filtering out {(rows_to_remove)} country-specific row(s) not matching criteria")
440
+ # filtered_df = lookup_df.drop(rows_to_remove)
441
+
442
+ # # Filter out flagged rows
443
+ # if rows_to_remove:
444
+ # # Create detailed debug info
445
+ # removed_rows_info = []
446
+ # for idx in rows_to_remove:
447
+ # row_name = lookup_df.loc[idx, "name"]
448
+ # removed_rows_info.append({
449
+ # 'index': idx,
450
+ # 'name': row_name
451
+ # })
452
+
453
+ # # Extract just the column names for easy viewing
454
+ # removed_column_names = [info['name'] for info in removed_rows_info]
455
+
456
+
457
+ # print(f"Filtered out {len(rows_to_remove)} country-specific row(s) not matching criteria")
458
+ # print(f"Removed column names: {removed_column_names}")
459
+ # return filtered_df
460
+
461
+ # return lookup_df
462
+
463
+
464
+ def filter_lookup_by_country_codes(
465
+ lookup_df: pd.DataFrame, filter_col, national_codes: list = None
466
+ ):
467
+ """Filter by actual ISO2 column values instead of column name patterns"""
468
+
469
+ if not national_codes:
470
+ # Remove all rows with country codes
471
+ rows_with_country_codes = ~lookup_df[filter_col].isna()
472
+ removed_names = lookup_df[rows_with_country_codes]["name"].tolist()
473
+ logger.debug(
474
+ f"No national codes provided - removing {len(removed_names)} rows with country codes"
475
+ )
476
+ logger.debug(f"Removed column names: {removed_names}")
477
+ return lookup_df[lookup_df[filter_col].isna()]
478
+
479
+ logger.debug(f"Filtering for national codes: {national_codes}")
480
+ logger.debug(f"Total rows before filtering: {len(lookup_df)}")
481
+
482
+ # Keep rows with no country code (global) OR matching country codes
483
+ normalized_codes = [code.lower() for code in national_codes]
484
+
485
+ mask = lookup_df[filter_col].isna() | lookup_df[ # Global datasets
486
+ filter_col
487
+ ].str.lower().isin(
488
+ normalized_codes
489
+ ) # Matching countries
490
+
491
+ logger.debug(
492
+ f"Filtering lookup by country codes: {national_codes}, keeping {mask.sum()} rows"
493
+ )
494
+
495
+ return lookup_df[mask]