daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ """Freshness validation for temporal data checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from datetime import datetime, timedelta, timezone
7
+ from typing import Any
8
+
9
+ from dateutil import parser as date_parser
10
+ from pydantic import Field
11
+
12
+ from daytashield.core.result import ValidationResult, ValidationStatus
13
+ from daytashield.validators.base import BaseValidator, ValidatorConfig
14
+
15
+
16
class FreshnessValidatorConfig(ValidatorConfig):
    """Settings controlling how timestamps are located and parsed."""

    # Keys looked up (in this order) when the validator was not given an
    # explicit ``timestamp_field``.
    timestamp_fields: list[str] = Field(
        description="Field names to check for timestamps",
        default_factory=lambda: ["timestamp", "created_at", "updated_at", "date", "datetime"],
    )

    # ``strptime`` patterns tried, in order, for string timestamps that the
    # dateutil parser could not handle.
    date_formats: list[str] = Field(
        description="Date formats to try when parsing",
        default_factory=lambda: [
            "%Y-%m-%d",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y/%m/%d",
            "%d/%m/%Y",
            "%m/%d/%Y",
        ],
    )

    # Fraction of ``max_age`` beyond which data that is not yet stale
    # triggers a "near stale" warning.
    warn_threshold_ratio: float = Field(
        default=0.8,
        description="Ratio of max_age at which to warn (0.8 = 80%)",
    )
38
+
39
+
40
class FreshnessValidator(BaseValidator):
    """Validates data freshness based on timestamps.

    Checks that data is not stale by examining timestamp fields and
    comparing them against configurable age thresholds.

    Example:
        >>> validator = FreshnessValidator(max_age="7d")
        >>> result = validator.validate(
        ...     {"content": "Hello", "timestamp": "2024-01-15"},
        ...     result
        ... )

    Supported time units:
        - s, sec, second, seconds
        - m, min, minute, minutes
        - h, hr, hour, hours
        - d, day, days
        - w, week, weeks
        - M, month, months (30 days)
        - y, year, years (365 days)
    """

    name = "freshness"

    # Regex for parsing duration strings like "7d", "2 weeks", "30 minutes".
    # The pattern itself is case-insensitive; the single-letter "M" (months)
    # vs "m" (minutes) distinction is made afterwards in _parse_duration by
    # inspecting the captured text.
    DURATION_PATTERN = re.compile(
        r"^\s*(\d+)\s*(s|sec|seconds?|m|min|minutes?|"
        r"h|hr|hours?|d|days?|w|weeks?|M|months?|"
        r"y|years?)\s*$",
        re.IGNORECASE,
    )

    # Mapping of unit aliases to timedelta kwargs ("months" and "years" are
    # not timedelta kwargs and are converted to days in _parse_duration).
    UNIT_MAPPING = {
        "s": "seconds",
        "sec": "seconds",
        "second": "seconds",
        "seconds": "seconds",
        "m": "minutes",
        "min": "minutes",
        "minute": "minutes",
        "minutes": "minutes",
        "h": "hours",
        "hr": "hours",
        "hour": "hours",
        "hours": "hours",
        "d": "days",
        "day": "days",
        "days": "days",
        "w": "weeks",
        "week": "weeks",
        "weeks": "weeks",
        "M": "months",  # Case-sensitive - uppercase M only
        "month": "months",
        "months": "months",
        "y": "years",
        "year": "years",
        "years": "years",
    }

    def __init__(
        self,
        max_age: str | timedelta,
        timestamp_field: str | None = None,
        config: FreshnessValidatorConfig | dict[str, Any] | None = None,
    ):
        """Initialize the freshness validator.

        Args:
            max_age: Maximum allowed age (e.g., "7d", "2 weeks", timedelta)
            timestamp_field: Specific field to check (overrides config)
            config: Validator configuration

        Raises:
            ValueError: If ``max_age`` is a string that cannot be parsed
            TypeError: If ``max_age`` is neither a string nor a timedelta
        """
        if config is None:
            super().__init__(FreshnessValidatorConfig())
        elif isinstance(config, dict):
            super().__init__(FreshnessValidatorConfig(**config))
        else:
            super().__init__(config)

        if isinstance(max_age, str):
            max_age = self._parse_duration(max_age)
        elif not isinstance(max_age, timedelta):
            # Fail at construction time instead of with an AttributeError on
            # ``.total_seconds()`` during the first validate() call.
            raise TypeError(
                f"max_age must be a duration string or timedelta, got {type(max_age).__name__}"
            )

        self.max_age = max_age
        self.timestamp_field = timestamp_field

    def _parse_duration(self, duration_str: str) -> timedelta:
        """Parse a duration string into a timedelta.

        Args:
            duration_str: Duration string like "7d", "2 weeks", "30m"

        Returns:
            timedelta representing the duration

        Raises:
            ValueError: If the duration string is invalid or too large to
                represent as a timedelta
        """
        match = self.DURATION_PATTERN.match(duration_str)
        if not match:
            raise ValueError(
                f"Invalid duration format: {duration_str}. "
                "Use formats like '7d', '2 weeks', '30 minutes'"
            )

        value = int(match.group(1))
        unit_raw = match.group(2)
        # Preserve uppercase M for months, lowercase everything else
        unit = unit_raw if unit_raw == "M" else unit_raw.lower()
        unit_name = self.UNIT_MAPPING.get(unit, unit)

        try:
            # Months and years are not timedelta units; approximate in days.
            if unit_name == "months":
                return timedelta(days=value * 30)
            if unit_name == "years":
                return timedelta(days=value * 365)
            return timedelta(**{unit_name: value})
        except OverflowError as exc:
            # timedelta raises OverflowError for magnitudes it cannot hold;
            # surface it as the documented ValueError.
            raise ValueError(f"Duration out of range: {duration_str}") from exc

    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Validate data freshness.

        Adds a WARNING message (and returns early) when ``data`` is not a
        dict, has no timestamp, or the timestamp cannot be parsed. Otherwise
        records age metadata on ``result`` and marks it FAILED when the data
        is older than ``max_age``, or WARNING when it is older than
        ``max_age * warn_threshold_ratio``.

        Args:
            data: The data to validate
            result: The ValidationResult to update

        Returns:
            Updated ValidationResult
        """
        if not isinstance(data, dict):
            result.add_message(
                code="FRESHNESS_NOT_DICT",
                message="Freshness validation requires dict data with timestamp fields",
                severity=ValidationStatus.WARNING,
                validator=self.name,
            )
            return result

        # Find timestamp field
        timestamp_value = self._find_timestamp(data)
        if timestamp_value is None:
            result.add_message(
                code="FRESHNESS_NO_TIMESTAMP",
                message="No timestamp field found in data",
                severity=ValidationStatus.WARNING,
                validator=self.name,
                details={"searched_fields": self._get_timestamp_fields()},
            )
            return result

        # Parse the timestamp
        parsed_time = self._parse_timestamp(timestamp_value)
        if parsed_time is None:
            result.add_message(
                code="FRESHNESS_PARSE_ERROR",
                message=f"Could not parse timestamp value: {timestamp_value}",
                severity=ValidationStatus.WARNING,
                validator=self.name,
            )
            return result

        # Calculate age; naive timestamps are interpreted as UTC so they can
        # be subtracted from the timezone-aware "now".
        now = datetime.now(timezone.utc)
        if parsed_time.tzinfo is None:
            parsed_time = parsed_time.replace(tzinfo=timezone.utc)

        age = now - parsed_time
        max_age_seconds = self.max_age.total_seconds()
        age_seconds = age.total_seconds()

        # Fall back to default settings if the base class stored some other
        # config type.
        config = self.config
        if not isinstance(config, FreshnessValidatorConfig):
            config = FreshnessValidatorConfig()

        result.metadata["data_age_seconds"] = age_seconds
        result.metadata["max_age_seconds"] = max_age_seconds
        result.metadata["timestamp_value"] = str(timestamp_value)

        if age_seconds > max_age_seconds:
            result.add_message(
                code="FRESHNESS_STALE",
                message=(
                    f"Data is stale: age is {self._format_duration(age)}, "
                    f"max allowed is {self._format_duration(self.max_age)}"
                ),
                severity=ValidationStatus.FAILED,
                validator=self.name,
                details={
                    "age_seconds": age_seconds,
                    "max_age_seconds": max_age_seconds,
                    "timestamp": str(parsed_time),
                },
            )
            result.status = ValidationStatus.FAILED
        elif age_seconds > max_age_seconds * config.warn_threshold_ratio:
            result.add_message(
                code="FRESHNESS_NEAR_STALE",
                message=f"Data is approaching staleness: age is {self._format_duration(age)} "
                f"({age_seconds / max_age_seconds * 100:.0f}% of max)",
                severity=ValidationStatus.WARNING,
                validator=self.name,
            )
            # Only escalate a passing result; never downgrade a failure.
            if result.status == ValidationStatus.PASSED:
                result.status = ValidationStatus.WARNING

        return result

    def _get_timestamp_fields(self) -> list[str]:
        """Get the list of timestamp fields to check.

        An explicit ``timestamp_field`` given at construction takes
        precedence over the configured ``timestamp_fields``.
        """
        if self.timestamp_field:
            return [self.timestamp_field]
        config = self.config
        if isinstance(config, FreshnessValidatorConfig):
            return config.timestamp_fields
        return FreshnessValidatorConfig().timestamp_fields

    def _find_timestamp(self, data: dict[str, Any]) -> Any:
        """Find a timestamp value in the data.

        Candidate fields are tried in order. A field whose value is ``None``
        is treated as absent so that a later candidate can still supply the
        timestamp.

        Args:
            data: Dict to search

        Returns:
            First non-None timestamp value, or None if no candidate has one
        """
        for field in self._get_timestamp_fields():
            # Direct key match (also covers keys that literally contain dots).
            value = data.get(field)
            if value is not None:
                return value
            # Check nested fields (e.g., "metadata.timestamp")
            if "." in field:
                value = self._get_nested_value(data, field)
                if value is not None:
                    return value

        return None

    def _get_nested_value(self, data: dict[str, Any], path: str) -> Any:
        """Get a nested value from a dict using dot notation.

        Returns None as soon as a path segment is missing or the current
        node is not a dict.
        """
        current: Any = data
        for part in path.split("."):
            if isinstance(current, dict) and part in current:
                current = current[part]
            else:
                return None
        return current

    def _parse_timestamp(self, value: Any) -> datetime | None:
        """Parse a timestamp value into a datetime.

        Accepts datetime objects (returned unchanged), numeric Unix
        timestamps (interpreted as UTC seconds) and strings.

        Args:
            value: The value to parse

        Returns:
            datetime or None if parsing fails
        """
        if isinstance(value, datetime):
            return value

        # bool is a subclass of int; True/False are not meaningful timestamps
        # and must not be read as 1 / 0 seconds after the epoch.
        if isinstance(value, bool):
            return None

        if isinstance(value, (int, float)):
            # Assume Unix timestamp. fromtimestamp raises OverflowError for
            # out-of-range or infinite values, ValueError for NaN or
            # out-of-range years, and OSError on platform failures.
            try:
                return datetime.fromtimestamp(value, tz=timezone.utc)
            except (ValueError, OverflowError, OSError):
                return None

        if isinstance(value, str):
            # Try dateutil parser (handles most formats). It can raise
            # OverflowError for dates beyond the platform's integer range.
            try:
                return date_parser.parse(value)
            except (ValueError, TypeError, OverflowError):
                pass

            # Try configured formats
            config = self.config
            if isinstance(config, FreshnessValidatorConfig):
                for fmt in config.date_formats:
                    try:
                        return datetime.strptime(value, fmt)
                    except ValueError:
                        continue

        return None

    def _format_duration(self, td: timedelta) -> str:
        """Format a timedelta as a human-readable string.

        Uses the largest whole unit among seconds, minutes, hours and days
        (e.g. "45s", "12m", "5h", "9d"); the remainder is truncated.
        """
        total_seconds = int(td.total_seconds())

        if total_seconds < 60:
            return f"{total_seconds}s"
        if total_seconds < 3600:
            return f"{total_seconds // 60}m"
        if total_seconds < 86400:
            return f"{total_seconds // 3600}h"
        return f"{total_seconds // 86400}d"

    def __repr__(self) -> str:
        return f"FreshnessValidator(max_age={self._format_duration(self.max_age)})"
@@ -0,0 +1,176 @@
1
+ """Schema validation using JSON Schema and Pydantic."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Type
6
+
7
+ import jsonschema
8
+ from pydantic import BaseModel, Field, ValidationError
9
+
10
+ from daytashield.core.result import ValidationResult, ValidationStatus
11
+ from daytashield.validators.base import BaseValidator, ValidatorConfig
12
+
13
+
14
class SchemaValidatorConfig(ValidatorConfig):
    """Options for schema validation.

    NOTE(review): none of these options is read by the SchemaValidator code
    visible in this file; confirm where (or whether) they are consumed.
    """

    # Declared intent per description: reject extra fields.
    strict: bool = Field(
        default=True,
        description="Strict mode fails on extra fields",
    )

    # Declared intent per description: try to convert values to the
    # expected type.
    coerce_types: bool = Field(
        default=False,
        description="Attempt to coerce types",
    )

    # Declared intent per description: tolerate null for optional fields.
    allow_none: bool = Field(
        default=False,
        description="Allow None/null values for optional fields",
    )
20
+
21
+
22
class SchemaValidator(BaseValidator):
    """Validates data against JSON Schema or Pydantic models.

    Supports two modes:
    1. JSON Schema validation (dict schema)
    2. Pydantic model validation (model class)

    If both a schema and a model are supplied, only the Pydantic model is
    used by ``validate``.

    Example with JSON Schema:
        >>> schema = {
        ...     "type": "object",
        ...     "required": ["id", "content"],
        ...     "properties": {
        ...         "id": {"type": "integer"},
        ...         "content": {"type": "string"},
        ...     }
        ... }
        >>> validator = SchemaValidator(schema=schema)
        >>> result = validator.validate({"id": 1, "content": "Hello"}, result)

    Example with Pydantic:
        >>> from pydantic import BaseModel
        >>> class Document(BaseModel):
        ...     id: int
        ...     content: str
        >>> validator = SchemaValidator(model=Document)
        >>> result = validator.validate({"id": 1, "content": "Hello"}, result)
    """

    name = "schema"

    def __init__(
        self,
        schema: dict[str, Any] | None = None,
        model: Type[BaseModel] | None = None,
        config: SchemaValidatorConfig | dict[str, Any] | None = None,
    ):
        """Initialize the schema validator.

        Args:
            schema: JSON Schema dict for validation
            model: Pydantic model class for validation
            config: Validator configuration

        Raises:
            ValueError: If neither schema nor model is provided
        """
        if config is None:
            super().__init__(SchemaValidatorConfig())
        elif isinstance(config, dict):
            super().__init__(SchemaValidatorConfig(**config))
        else:
            super().__init__(config)

        if schema is None and model is None:
            raise ValueError("Either 'schema' or 'model' must be provided")

        self.schema = schema
        self.model = model
        self._json_schema_validator: jsonschema.Draft7Validator | None = None

        # Compare against None rather than truthiness: the empty schema ``{}``
        # is a valid schema (it accepts any instance) and must still get a
        # validator so that validate() populates ``result.data``.
        if schema is not None:
            self._json_schema_validator = jsonschema.Draft7Validator(schema)

    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Validate data against the schema.

        The Pydantic model takes precedence when both a model and a JSON
        Schema were configured.

        Args:
            data: The data to validate
            result: The ValidationResult to update

        Returns:
            Updated ValidationResult
        """
        if self.model is not None:
            return self._validate_pydantic(data, result)
        if self.schema is not None:
            return self._validate_json_schema(data, result)
        return result

    def _validate_pydantic(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Validate using Pydantic model.

        On success ``result.data`` is replaced by the dumped model instance.
        On failure one FAILED message is added per Pydantic error and the
        result status is set to FAILED.

        Args:
            data: The data to validate
            result: The ValidationResult to update

        Returns:
            Updated ValidationResult
        """
        if self.model is None:
            return result

        try:
            # Validate and get the model instance
            validated = self.model.model_validate(data)
        except ValidationError as exc:
            for error in exc.errors():
                # "loc" is a tuple of keys/indices; join it as a dotted path.
                field_path = ".".join(str(loc) for loc in error["loc"])
                result.add_message(
                    code=f"SCHEMA_{error['type'].upper()}",
                    message=error["msg"],
                    severity=ValidationStatus.FAILED,
                    validator=self.name,
                    field=field_path,
                    details={"error_type": error["type"], "input": error.get("input")},
                )
            result.status = ValidationStatus.FAILED
        else:
            result.data = validated.model_dump()
            result.metadata["validated_model"] = self.model.__name__

        return result

    def _validate_json_schema(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Validate using JSON Schema.

        On success ``result.data`` is set to ``data``. On failure one FAILED
        message is added per schema error and the result status is set to
        FAILED.

        Args:
            data: The data to validate
            result: The ValidationResult to update

        Returns:
            Updated ValidationResult
        """
        if self._json_schema_validator is None:
            return result

        errors = list(self._json_schema_validator.iter_errors(data))

        if not errors:
            result.data = data
            return result

        for error in errors:
            # Build field path from error path; errors at the document root
            # have an empty path and are reported with field=None.
            field_path = (
                ".".join(str(p) for p in error.absolute_path) if error.absolute_path else None
            )

            result.add_message(
                code=f"SCHEMA_{error.validator.upper()}",
                message=error.message,
                severity=ValidationStatus.FAILED,
                validator=self.name,
                field=field_path,
                details={
                    "validator": error.validator,
                    "validator_value": str(error.validator_value)[:100],  # Truncate long values
                    "schema_path": list(error.schema_path),
                },
            )

        result.status = ValidationStatus.FAILED
        return result

    def __repr__(self) -> str:
        if self.model:
            return f"SchemaValidator(model={self.model.__name__})"
        # Report presence, not truthiness, so an empty schema shows as True.
        return f"SchemaValidator(schema={self.schema is not None})"