daytashield-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daytashield/__init__.py +55 -0
- daytashield/cli/__init__.py +5 -0
- daytashield/cli/main.py +541 -0
- daytashield/core/__init__.py +15 -0
- daytashield/core/audit.py +275 -0
- daytashield/core/pipeline.py +240 -0
- daytashield/core/result.py +185 -0
- daytashield/core/router.py +217 -0
- daytashield/integrations/__init__.py +7 -0
- daytashield/integrations/langchain.py +391 -0
- daytashield/processors/__init__.py +13 -0
- daytashield/processors/base.py +182 -0
- daytashield/processors/csv.py +269 -0
- daytashield/processors/json.py +260 -0
- daytashield/processors/pdf.py +232 -0
- daytashield/rules/__init__.py +14 -0
- daytashield/rules/base.py +67 -0
- daytashield/rules/gdpr.py +348 -0
- daytashield/rules/hipaa.py +229 -0
- daytashield/rules/pii.py +208 -0
- daytashield/validators/__init__.py +15 -0
- daytashield/validators/base.py +103 -0
- daytashield/validators/compliance.py +222 -0
- daytashield/validators/freshness.py +337 -0
- daytashield/validators/schema.py +176 -0
- daytashield/validators/semantic.py +256 -0
- daytashield-0.1.1.dist-info/METADATA +316 -0
- daytashield-0.1.1.dist-info/RECORD +31 -0
- daytashield-0.1.1.dist-info/WHEEL +4 -0
- daytashield-0.1.1.dist-info/entry_points.txt +2 -0
- daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0
daytashield/core/audit.py
@@ -0,0 +1,275 @@

```python
"""Immutable audit trail for validation operations."""

from __future__ import annotations

import gzip
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterator
from uuid import UUID

import orjson
from pydantic import BaseModel, Field

from daytashield.core.result import ValidationResult, ValidationStatus


class AuditEntry(BaseModel):
    """A single audit log entry."""

    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="When the entry was created",
    )
    result_id: UUID = Field(..., description="ID of the validation result")
    status: ValidationStatus = Field(..., description="Validation status")
    validators_run: list[str] = Field(default_factory=list, description="Validators executed")
    message_count: int = Field(0, description="Number of validation messages")
    error_count: int = Field(0, description="Number of errors")
    warning_count: int = Field(0, description="Number of warnings")
    duration_ms: float | None = Field(None, description="Validation duration")
    source_id: str | None = Field(None, description="Source identifier")
    source_path: str | None = Field(None, description="Source file path")
    checksum: str | None = Field(None, description="Data checksum")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    @classmethod
    def from_result(cls, result: ValidationResult, metadata: dict[str, Any] | None = None) -> AuditEntry:
        """Create an audit entry from a validation result.

        Args:
            result: The validation result to log
            metadata: Additional metadata to include

        Returns:
            New AuditEntry
        """
        return cls(
            result_id=result.id,
            status=result.status,
            validators_run=result.validators_run,
            message_count=len(result.messages),
            error_count=len(result.errors),
            warning_count=len(result.warnings),
            duration_ms=result.duration_ms,
            source_id=result.provenance.source_id if result.provenance else None,
            source_path=result.provenance.source_path if result.provenance else None,
            checksum=result.provenance.checksum if result.provenance else None,
            metadata=metadata or {},
        )

    def to_jsonl(self) -> bytes:
        """Serialize to JSON Lines format (single line)."""
        return orjson.dumps(self.model_dump(mode="json")) + b"\n"


class AuditTrailConfig(BaseModel):
    """Configuration for the audit trail."""

    path: Path = Field(
        default_factory=lambda: Path("./audit.jsonl"),
        description="Path to the audit log file",
    )
    compress: bool = Field(False, description="Use gzip compression")
    max_size_bytes: int = Field(100 * 1024 * 1024, description="Max file size before rotation")
    include_data: bool = Field(False, description="Include actual data in audit (careful!)")
    buffer_size: int = Field(100, description="Number of entries to buffer before flush")


class AuditTrail:
    """Immutable audit trail using JSON Lines format.

    The audit trail provides an append-only log of all validation operations.
    It uses JSON Lines format for easy querying and analysis.

    Example:
        >>> audit = AuditTrail(path="./validation_audit.jsonl")
        >>> result = pipeline.validate(data)
        >>> audit.log(result)
        >>> # Later, query the audit trail
        >>> for entry in audit.query(status=ValidationStatus.FAILED):
        ...     print(f"Failed: {entry.source_id} at {entry.timestamp}")

    Features:
        - Append-only (immutable)
        - JSON Lines format (one JSON object per line)
        - Optional gzip compression
        - File rotation support
        - Query by status, date range, source
    """

    def __init__(self, config: AuditTrailConfig | dict[str, Any] | Path | str | None = None):
        """Initialize the audit trail.

        Args:
            config: Configuration (AuditTrailConfig, dict, or path string)
        """
        if config is None:
            self.config = AuditTrailConfig()
        elif isinstance(config, (str, Path)):
            self.config = AuditTrailConfig(path=Path(config))
        elif isinstance(config, dict):
            self.config = AuditTrailConfig(**config)
        else:
            self.config = config

        self._buffer: list[AuditEntry] = []
        self._ensure_path()

    def _ensure_path(self) -> None:
        """Ensure the audit file directory exists."""
        self.config.path.parent.mkdir(parents=True, exist_ok=True)

    def _get_file_path(self) -> Path:
        """Get the current audit file path."""
        if self.config.compress:
            return self.config.path.with_suffix(".jsonl.gz")
        return self.config.path

    def log(self, result: ValidationResult, metadata: dict[str, Any] | None = None) -> AuditEntry:
        """Log a validation result to the audit trail.

        Args:
            result: The validation result to log
            metadata: Additional metadata to include

        Returns:
            The created AuditEntry
        """
        entry = AuditEntry.from_result(result, metadata)
        self._buffer.append(entry)

        if len(self._buffer) >= self.config.buffer_size:
            self.flush()

        return entry

    def flush(self) -> int:
        """Flush buffered entries to disk.

        Returns:
            Number of entries flushed
        """
        if not self._buffer:
            return 0

        count = len(self._buffer)
        data = b"".join(entry.to_jsonl() for entry in self._buffer)

        file_path = self._get_file_path()

        if self.config.compress:
            with gzip.open(file_path, "ab") as f:
                f.write(data)
        else:
            with open(file_path, "ab") as f:
                f.write(data)

        self._buffer.clear()
        return count

    def log_batch(self, results: list[ValidationResult]) -> list[AuditEntry]:
        """Log multiple validation results.

        Args:
            results: List of validation results

        Returns:
            List of created AuditEntries
        """
        entries = [self.log(result) for result in results]
        self.flush()
        return entries

    def query(
        self,
        status: ValidationStatus | None = None,
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        source_id: str | None = None,
        limit: int | None = None,
    ) -> Iterator[AuditEntry]:
        """Query the audit trail.

        Args:
            status: Filter by validation status
            start_time: Filter entries after this time
            end_time: Filter entries before this time
            source_id: Filter by source identifier
            limit: Maximum number of entries to return

        Yields:
            Matching AuditEntry objects
        """
        self.flush()  # Ensure all entries are on disk

        file_path = self._get_file_path()
        if not file_path.exists():
            return

        count = 0
        opener = gzip.open if self.config.compress else open

        with opener(file_path, "rb") as f:  # type: ignore[operator]
            for line in f:
                if not line.strip():
                    continue

                data = orjson.loads(line)
                entry = AuditEntry(**data)

                # Apply filters
                if status is not None and entry.status != status:
                    continue
                if start_time is not None and entry.timestamp < start_time:
                    continue
                if end_time is not None and entry.timestamp > end_time:
                    continue
                if source_id is not None and entry.source_id != source_id:
                    continue

                yield entry
                count += 1

                if limit is not None and count >= limit:
                    return

    def stats(self) -> dict[str, Any]:
        """Get statistics from the audit trail.

        Returns:
            Dict with count by status, total count, etc.
        """
        self.flush()

        stats: dict[str, Any] = {
            "total": 0,
            "by_status": {status.value: 0 for status in ValidationStatus},
            "avg_duration_ms": 0.0,
            "total_errors": 0,
            "total_warnings": 0,
        }

        durations: list[float] = []

        for entry in self.query():
            stats["total"] += 1
            stats["by_status"][entry.status.value] += 1
            stats["total_errors"] += entry.error_count
            stats["total_warnings"] += entry.warning_count
            if entry.duration_ms is not None:
                durations.append(entry.duration_ms)

        if durations:
            stats["avg_duration_ms"] = sum(durations) / len(durations)

        return stats

    def __enter__(self) -> AuditTrail:
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.flush()

    def __repr__(self) -> str:
        return f"AuditTrail(path={self.config.path})"
```
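For orientation, here is a minimal usage sketch of the module above, assuming the package and its dependencies (pydantic, orjson) are installed. It uses only the API shown in this diff (`AuditTrail`, `AuditTrailConfig`, `query`, `stats`) plus `create_result` and `ValidationStatus` from `daytashield/core/result.py` further down; the file path and metadata values are illustrative, not part of the package.

```python
from daytashield.core.audit import AuditTrail, AuditTrailConfig
from daytashield.core.result import ValidationStatus, create_result

# Context-manager usage guarantees a final flush() on exit.
with AuditTrail(AuditTrailConfig(path="./validation_audit.jsonl", buffer_size=10)) as audit:
    passed = create_result(status=ValidationStatus.PASSED, data={"id": 1})
    failed = create_result(status=ValidationStatus.FAILED, data={"id": 2})
    failed.add_message(
        code="SCHEMA_MISMATCH",
        message="missing required field 'email'",
        severity=ValidationStatus.FAILED,
        validator="schema",
    )
    audit.log(passed)
    audit.log(failed, metadata={"batch": "2024-01-01"})

# Entries are stored one JSON object per line; query() re-reads them from disk.
audit = AuditTrail("./validation_audit.jsonl")
for entry in audit.query(status=ValidationStatus.FAILED):
    print(entry.result_id, entry.error_count, entry.timestamp)

print(audit.stats()["by_status"])
```

Note that `flush()` is triggered either by the `buffer_size` threshold, by `query()`/`stats()`, or by leaving the context manager, so readers always see a complete append-only log.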
daytashield/core/pipeline.py
@@ -0,0 +1,240 @@

```python
"""Validation pipeline orchestrator."""

from __future__ import annotations

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO

from pydantic import BaseModel, Field

from daytashield.core.result import ValidationResult, ValidationStatus, create_result

if TYPE_CHECKING:
    from daytashield.processors.base import BaseProcessor
    from daytashield.validators.base import BaseValidator


class PipelineConfig(BaseModel):
    """Configuration for the validation pipeline."""

    fail_fast: bool = Field(False, description="Stop on first validation failure")
    parallel: bool = Field(False, description="Run validators in parallel (async)")
    include_original_data: bool = Field(True, description="Include original data in result")
    auto_detect_processor: bool = Field(True, description="Auto-detect processor by file extension")

    model_config = {"extra": "allow"}


class ValidationPipeline:
    """Orchestrates validation across multiple validators.

    The pipeline chains validators together, passing data and results through
    each validator in sequence. It handles processor selection, result
    aggregation, and provides hooks for auditing.

    Example:
        >>> from daytashield import ValidationPipeline, SchemaValidator, FreshnessValidator
        >>> pipeline = ValidationPipeline([
        ...     SchemaValidator(schema={"type": "object"}),
        ...     FreshnessValidator(max_age="7d"),
        ... ])
        >>> result = pipeline.validate({"id": 1, "timestamp": "2024-01-01"})
        >>> print(result.status)
        ValidationStatus.PASSED

    Attributes:
        validators: List of validators to run
        processors: Dict mapping extensions to processors
        config: Pipeline configuration
    """

    def __init__(
        self,
        validators: list[BaseValidator] | None = None,
        processors: dict[str, BaseProcessor] | None = None,
        config: PipelineConfig | dict[str, Any] | None = None,
    ):
        """Initialize the validation pipeline.

        Args:
            validators: List of validators to apply (in order)
            processors: Dict mapping file extensions to processors
            config: Pipeline configuration
        """
        self.validators: list[BaseValidator] = validators or []
        self.processors: dict[str, BaseProcessor] = processors or {}

        if config is None:
            self.config = PipelineConfig()
        elif isinstance(config, dict):
            self.config = PipelineConfig(**config)
        else:
            self.config = config

    def add_validator(self, validator: BaseValidator) -> ValidationPipeline:
        """Add a validator to the pipeline.

        Args:
            validator: The validator to add

        Returns:
            Self for method chaining
        """
        self.validators.append(validator)
        return self

    def add_processor(self, extension: str, processor: BaseProcessor) -> ValidationPipeline:
        """Register a processor for a file extension.

        Args:
            extension: File extension (e.g., ".pdf")
            processor: The processor to use

        Returns:
            Self for method chaining
        """
        self.processors[extension.lower()] = processor
        return self

    def validate(
        self,
        data: Any,
        source: str | Path | BinaryIO | bytes | None = None,
        result: ValidationResult | None = None,
    ) -> ValidationResult:
        """Run all validators on the data.

        This is the main entry point for validation. It:
        1. Creates or uses an existing result
        2. Processes the source if provided (file → structured data)
        3. Runs each validator in sequence
        4. Returns the aggregated result

        Args:
            data: The data to validate (dict, list, or ProcessedData)
            source: Optional original source (file path, bytes, etc.)
            result: Optional existing result to continue

        Returns:
            ValidationResult with all validation findings
        """
        start_time = time.perf_counter()

        # Create or use existing result
        if result is None:
            result = create_result(status=ValidationStatus.PASSED, data=data)
            if self.config.include_original_data:
                result.original_data = data

        # Process source if provided
        if source is not None:
            result = self._process_source(source, result)
            if result.failed:
                return result.complete()
            # Use processed data for validation
            if result.data is not None:
                data = result.data

        # Run validators
        for validator in self.validators:
            if not validator.should_run(data, result):
                continue

            try:
                result = validator.validate(data, result)
                result.validators_run.append(validator.name)

                # Check for fail-fast
                if self.config.fail_fast and result.failed:
                    break

            except Exception as e:
                result.add_message(
                    code="PIPELINE_ERROR",
                    message=f"Validator {validator.name} raised an exception: {e}",
                    severity=ValidationStatus.ERROR,
                    validator="pipeline",
                    details={"exception": str(e), "validator": validator.name},
                )
                result.status = ValidationStatus.ERROR
                if self.config.fail_fast:
                    break

        # Calculate duration
        end_time = time.perf_counter()
        result.duration_ms = (end_time - start_time) * 1000
        result.completed_at = datetime.now(timezone.utc)

        return result

    def validate_file(self, path: str | Path) -> ValidationResult:
        """Validate a file, auto-detecting the processor.

        Convenience method that reads a file, selects the appropriate
        processor based on extension, and runs validation.

        Args:
            path: Path to the file to validate

        Returns:
            ValidationResult with all validation findings
        """
        path = Path(path) if isinstance(path, str) else path

        if not path.exists():
            result = create_result(status=ValidationStatus.ERROR)
            result.add_message(
                code="FILE_NOT_FOUND",
                message=f"File not found: {path}",
                severity=ValidationStatus.ERROR,
                validator="pipeline",
            )
            return result.complete()

        return self.validate(data=None, source=path)

    def _process_source(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult
    ) -> ValidationResult:
        """Process a source using the appropriate processor.

        Args:
            source: The data source
            result: The current result

        Returns:
            Updated result with processed data
        """
        if not self.config.auto_detect_processor:
            return result

        # Determine file extension
        extension: str | None = None
        if isinstance(source, (str, Path)):
            extension = Path(source).suffix.lower()
        elif hasattr(source, "name") and source.name:
            extension = Path(source.name).suffix.lower()

        if extension and extension in self.processors:
            processor = self.processors[extension]
            try:
                result = processor.process(source, result)
                if result.provenance:
                    result.provenance.processor_chain.append(processor.name)
            except Exception as e:
                result.add_message(
                    code="PROCESSOR_ERROR",
                    message=f"Processor {processor.name} failed: {e}",
                    severity=ValidationStatus.ERROR,
                    validator="pipeline",
                    details={"exception": str(e), "processor": processor.name},
                )
                result.status = ValidationStatus.ERROR

        return result

    def __repr__(self) -> str:
        validator_names = [v.name for v in self.validators]
        return f"ValidationPipeline(validators={validator_names})"
```
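A small sketch of how the pipeline above can be driven. `BaseValidator` lives in daytashield/validators/base.py and is not part of this hunk, so the sketch uses a hypothetical stand-in class that only provides the three members the pipeline actually calls (`name`, `should_run`, `validate`); with the real package you would subclass `BaseValidator` or use the bundled validators (SchemaValidator, FreshnessValidator, etc.).

```python
from daytashield.core.pipeline import ValidationPipeline
from daytashield.core.result import ValidationResult, ValidationStatus


class RequiredFieldsCheck:
    """Hypothetical stand-in exposing the duck-typed interface the pipeline
    calls: .name, .should_run(data, result), .validate(data, result)."""

    name = "required_fields"

    def __init__(self, required: list[str]):
        self.required = required

    def should_run(self, data, result: ValidationResult) -> bool:
        # Only run against dict-like records.
        return isinstance(data, dict)

    def validate(self, data, result: ValidationResult) -> ValidationResult:
        for field in self.required:
            if field not in data:
                result.add_message(
                    code="MISSING_FIELD",
                    message=f"required field '{field}' is missing",
                    severity=ValidationStatus.FAILED,
                    validator=self.name,
                    field=field,
                )
                result.status = ValidationStatus.FAILED
        return result


pipeline = ValidationPipeline(
    validators=[RequiredFieldsCheck(["id", "email"])],
    config={"fail_fast": True},  # stop at the first failing validator
)

result = pipeline.validate({"id": 1})
print(result.status)          # e.g. ValidationStatus.FAILED
print(result.validators_run)  # ['required_fields']
for message in result.errors:
    print(message)
```

Because `validate()` only calls `should_run`, `validate`, and `name` on each validator, any object with that shape works at runtime; `fail_fast` short-circuits the loop as soon as `result.failed` becomes true.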
daytashield/core/result.py
@@ -0,0 +1,185 @@

```python
"""Validation result types and data structures."""

from __future__ import annotations

from datetime import datetime, timezone
from enum import Enum
from typing import Any
from uuid import UUID, uuid4

from pydantic import BaseModel, Field


class ValidationStatus(str, Enum):
    """Status of a validation operation."""

    PASSED = "passed"
    WARNING = "warning"
    FAILED = "failed"
    SKIPPED = "skipped"
    ERROR = "error"


class ValidationMessage(BaseModel):
    """A single validation message with context."""

    code: str = Field(..., description="Machine-readable error/warning code")
    message: str = Field(..., description="Human-readable description")
    severity: ValidationStatus = Field(..., description="Severity level")
    field: str | None = Field(None, description="Field path that triggered the message")
    validator: str = Field(..., description="Name of the validator that produced this message")
    details: dict[str, Any] = Field(default_factory=dict, description="Additional context")

    def __str__(self) -> str:
        field_str = f" [{self.field}]" if self.field else ""
        return f"[{self.severity.value.upper()}] {self.validator}{field_str}: {self.message}"


class Provenance(BaseModel):
    """Tracks the origin and processing history of data."""

    source_id: str = Field(..., description="Unique identifier for the data source")
    source_type: str = Field(..., description="Type of source (file, api, stream)")
    source_path: str | None = Field(None, description="Path or URL of the source")
    checksum: str | None = Field(None, description="SHA-256 hash of the original data")
    processed_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="When the data was processed",
    )
    processor_chain: list[str] = Field(
        default_factory=list, description="List of processors applied"
    )
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional source metadata")


class ValidationResult(BaseModel):
    """Complete result of a validation operation."""

    id: UUID = Field(default_factory=uuid4, description="Unique result identifier")
    status: ValidationStatus = Field(..., description="Overall validation status")
    messages: list[ValidationMessage] = Field(
        default_factory=list, description="All validation messages"
    )
    data: Any = Field(None, description="The validated/transformed data")
    original_data: Any = Field(None, description="The original input data")
    provenance: Provenance | None = Field(None, description="Data provenance information")
    validators_run: list[str] = Field(
        default_factory=list, description="Names of validators that were executed"
    )
    started_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        description="When validation started",
    )
    completed_at: datetime | None = Field(None, description="When validation completed")
    duration_ms: float | None = Field(None, description="Total validation time in milliseconds")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional result metadata")

    model_config = {"arbitrary_types_allowed": True}

    @property
    def passed(self) -> bool:
        """Check if validation passed (no failures or errors)."""
        return self.status in (ValidationStatus.PASSED, ValidationStatus.WARNING)

    @property
    def failed(self) -> bool:
        """Check if validation failed."""
        return self.status in (ValidationStatus.FAILED, ValidationStatus.ERROR)

    @property
    def errors(self) -> list[ValidationMessage]:
        """Get all error-level messages."""
        return [m for m in self.messages if m.severity == ValidationStatus.FAILED]

    @property
    def warnings(self) -> list[ValidationMessage]:
        """Get all warning-level messages."""
        return [m for m in self.messages if m.severity == ValidationStatus.WARNING]

    def add_message(
        self,
        code: str,
        message: str,
        severity: ValidationStatus,
        validator: str,
        field: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """Add a validation message to the result."""
        self.messages.append(
            ValidationMessage(
                code=code,
                message=message,
                severity=severity,
                field=field,
                validator=validator,
                details=details or {},
            )
        )

    def merge(self, other: ValidationResult) -> ValidationResult:
        """Merge another result into this one, combining messages and updating status."""
        # Combine messages
        self.messages.extend(other.messages)

        # Update validators run
        self.validators_run.extend(other.validators_run)

        # Update status to the most severe
        status_priority = {
            ValidationStatus.ERROR: 4,
            ValidationStatus.FAILED: 3,
            ValidationStatus.WARNING: 2,
            ValidationStatus.PASSED: 1,
            ValidationStatus.SKIPPED: 0,
        }

        if status_priority[other.status] > status_priority[self.status]:
            self.status = other.status

        # Use the latest data
        if other.data is not None:
            self.data = other.data

        # Merge metadata
        self.metadata.update(other.metadata)

        return self

    def complete(self) -> ValidationResult:
        """Mark the validation as complete and calculate duration."""
        self.completed_at = datetime.now(timezone.utc)
        if self.started_at:
            delta = self.completed_at - self.started_at
            self.duration_ms = delta.total_seconds() * 1000
        return self

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary suitable for JSON serialization."""
        return self.model_dump(mode="json")

    def __str__(self) -> str:
        status_str = self.status.value.upper()
        msg_count = len(self.messages)
        duration_str = f" ({self.duration_ms:.1f}ms)" if self.duration_ms else ""
        return f"ValidationResult[{status_str}] - {msg_count} message(s){duration_str}"

    def __repr__(self) -> str:
        return (
            f"ValidationResult(id={self.id}, status={self.status.value}, "
            f"messages={len(self.messages)}, validators={self.validators_run})"
        )


def create_result(
    status: ValidationStatus = ValidationStatus.PASSED,
    data: Any = None,
    provenance: Provenance | None = None,
) -> ValidationResult:
    """Factory function to create a new validation result."""
    return ValidationResult(
        status=status,
        data=data,
        original_data=data,
        provenance=provenance,
    )
```
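Finally, a short sketch exercising the result API above in isolation: creating a result with provenance, merging in a second result, and completing it. All names come from this file; the field values are illustrative.

```python
from daytashield.core.result import Provenance, ValidationStatus, create_result

# Result for a processed record, with provenance attached.
result = create_result(
    status=ValidationStatus.PASSED,
    data={"id": 1, "email": "a@example.com"},
    provenance=Provenance(
        source_id="orders-2024-01",
        source_type="file",
        source_path="./orders.csv",
    ),
)

# A second result contributes a warning; merge() keeps the most severe status.
other = create_result(status=ValidationStatus.WARNING)
other.add_message(
    code="STALE_DATA",
    message="timestamp older than 7 days",
    severity=ValidationStatus.WARNING,
    validator="freshness",
    field="timestamp",
)
result.merge(other).complete()

print(result.passed)               # True: WARNING still counts as passed
print(len(result.warnings))        # 1
print(result.to_dict()["status"])  # "warning"
```

The severity ordering in `merge()` (ERROR > FAILED > WARNING > PASSED > SKIPPED) is what lets a pipeline accumulate results from many validators while the overall status only ever escalates.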