pointblank 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants.py +258 -166
- pointblank/_constants_translations.py +378 -0
- pointblank/_interrogation.py +204 -0
- pointblank/validate.py +1263 -11
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/METADATA +1 -1
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/RECORD +10 -10
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/WHEEL +1 -1
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.18.0.dist-info → pointblank-0.19.0.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -4,6 +4,7 @@ import functools
|
|
|
4
4
|
from collections.abc import Callable
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from typing import TYPE_CHECKING, Any
|
|
7
|
+
from zoneinfo import ZoneInfo
|
|
7
8
|
|
|
8
9
|
import narwhals as nw
|
|
9
10
|
from narwhals.dependencies import (
|
|
@@ -2992,3 +2993,206 @@ def interrogate_prompt(
|
|
|
2992
2993
|
result_tbl["pb_is_good_"] = validation_results
|
|
2993
2994
|
|
|
2994
2995
|
return result_tbl
|
|
2996
|
+
|
|
2997
|
+
|
|
2998
|
+
def data_freshness(
    data_tbl: IntoFrame,
    column: str,
    max_age: Any,  # datetime.timedelta
    reference_time: Any | None,  # datetime.datetime | None
    timezone: str | None,
    allow_tz_mismatch: bool,
) -> dict:
    """
    Check if the most recent datetime value in a column is within the allowed max_age.

    Parameters
    ----------
    data_tbl
        The data table to check. LazyFrames are collected before the check.
    column
        The datetime column to check.
    max_age
        The maximum allowed age as a timedelta.
    reference_time
        The reference time to compare against (None = use current time).
    timezone
        The timezone to use for interpretation. Either an IANA name (looked up with
        `ZoneInfo`) or a UTC offset such as "-7", "+5", "-07:00" or "+05:30". A string that
        cannot be resolved is treated as UTC when building "now" or when a naive reference
        time has to be made aware; when naive data would have to be localized, the timezone
        is dropped from the reference time instead.
    allow_tz_mismatch
        Whether to suppress timezone mismatch warnings.

    Returns
    -------
    dict
        A dictionary containing:
        - 'passed': bool, whether the validation passed
        - 'max_datetime': the maximum datetime found in the column
        - 'reference_time': the reference time used (after any timezone adjustment)
        - 'age': the calculated age (timedelta)
        - 'max_age': the maximum allowed age
        - 'tz_warning': always None here (this function never assigns it)
        - 'column_empty': True when the column's maximum is None
        - 'tz_warning_key': translation key for a timezone mismatch warning (or None); only
          written when the data and the reference time differ in timezone awareness
        - 'error': the exception message; only present when an exception was caught
    """
    import datetime

    nw_frame = nw.from_native(data_tbl)

    # Handle LazyFrames by collecting them first
    if is_narwhals_lazyframe(nw_frame):
        nw_frame = nw_frame.collect()

    assert is_narwhals_dataframe(nw_frame)

    result = {
        "passed": False,
        "max_datetime": None,
        "reference_time": None,
        "age": None,
        "max_age": max_age,
        "tz_warning": None,
        "column_empty": False,
    }

    # Get the maximum datetime value from the column
    try:
        max_val_result = nw_frame.select(nw.col(column).max())
        max_datetime_raw = max_val_result.item()

        if max_datetime_raw is None:
            result["column_empty"] = True
            result["passed"] = False
            return result

        # Convert to Python datetime if needed
        if hasattr(max_datetime_raw, "to_pydatetime"):
            # Pandas Timestamp
            max_datetime = max_datetime_raw.to_pydatetime()
        elif hasattr(max_datetime_raw, "isoformat"):
            # Already a datetime-like object
            max_datetime = max_datetime_raw
        else:
            # Try to parse as string or handle other types
            max_datetime = datetime.datetime.fromisoformat(str(max_datetime_raw))

        result["max_datetime"] = max_datetime

    except Exception as e:
        result["error"] = str(e)
        result["passed"] = False
        return result

    # Resolve a timezone string (IANA name or offset like "-7" / "-07:00"); None if unusable
    def _resolve_tz(tz_str: str) -> datetime.tzinfo | None:
        import re

        # Strip once so the offset check and the IANA lookup see the same text
        tz_key = tz_str.strip()

        # Check for offset formats: "-7", "+5", "-07:00", "+05:30", etc.
        match = re.match(r"^([+-]?)(\d{1,2})(?::(\d{2}))?$", tz_key)

        if match:
            sign_str, hours_str, minutes_str = match.groups()
            total_minutes = int(hours_str) * 60 + (int(minutes_str) if minutes_str else 0)
            if sign_str == "-":
                total_minutes = -total_minutes

            try:
                return datetime.timezone(datetime.timedelta(minutes=total_minutes))
            except ValueError:
                # Offset of 24 hours or more (e.g. "99") is not a valid fixed timezone
                return None

        try:
            return ZoneInfo(tz_key)
        except (KeyError, ValueError, OSError):
            # Unknown name (ZoneInfoNotFoundError is a KeyError), malformed key
            # (ValueError), or a directory-like key on Python versions that surface OSError
            return None

    # Same as `_resolve_tz()` but with the UTC fallback used wherever a tz is mandatory
    def _resolve_tz_or_utc(tz_str: str | None) -> datetime.tzinfo:
        tz = _resolve_tz(tz_str) if tz_str else None
        return tz if tz is not None else datetime.timezone.utc

    # Handle timezone awareness/naivete
    max_dt_aware = _is_datetime_aware(max_datetime)

    # With no explicit reference time, build "now" to suit the data's timezone awareness
    ref_time = reference_time
    if ref_time is None:
        if timezone:
            # A user-specified timezone always yields an aware "now"
            ref_time = datetime.datetime.now(_resolve_tz_or_utc(timezone))
        elif max_dt_aware:
            # Default to UTC when data is aware but no timezone specified
            ref_time = datetime.datetime.now(datetime.timezone.utc)
        else:
            # No timezone specified and data is naive -> use naive local time
            ref_time = datetime.datetime.now()

    result["reference_time"] = ref_time
    ref_dt_aware = _is_datetime_aware(ref_time)

    # Track timezone warnings - use keys for translation lookup
    tz_warning_key = None

    # NOTE(review): the nesting level of the `tz_warning_key` assignment into `result` was
    # reconstructed; confirm it matches how callers read the key
    if max_dt_aware != ref_dt_aware:
        if not allow_tz_mismatch:
            if max_dt_aware and not ref_dt_aware:
                tz_warning_key = "data_freshness_tz_warning_aware_naive"
            else:
                tz_warning_key = "data_freshness_tz_warning_naive_aware"
        result["tz_warning_key"] = tz_warning_key

    # Make both comparable
    try:
        if max_dt_aware and not ref_dt_aware:
            # Add timezone to reference time (UTC when none is given or it is unusable)
            ref_time = ref_time.replace(tzinfo=_resolve_tz_or_utc(timezone))

        elif not max_dt_aware and ref_dt_aware:
            # Localize the max_datetime if we have a usable timezone
            data_tz = _resolve_tz(timezone) if timezone else None
            if data_tz is not None:
                max_datetime = max_datetime.replace(tzinfo=data_tz)
            else:
                # Remove timezone from reference for comparison
                ref_time = ref_time.replace(tzinfo=None)

        # Calculate the age
        age = ref_time - max_datetime
        result["age"] = age
        result["reference_time"] = ref_time

        # Check if within max_age
        result["passed"] = age <= max_age

    except Exception as e:
        result["error"] = str(e)
        result["passed"] = False

    return result
|
|
3190
|
+
|
|
3191
|
+
|
|
3192
|
+
def _is_datetime_aware(dt: Any) -> bool:
|
|
3193
|
+
"""Check if a datetime object is timezone-aware."""
|
|
3194
|
+
if dt is None:
|
|
3195
|
+
return False
|
|
3196
|
+
if hasattr(dt, "tzinfo"):
|
|
3197
|
+
return dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
|
|
3198
|
+
return False
|