daytashield-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daytashield/__init__.py +55 -0
- daytashield/cli/__init__.py +5 -0
- daytashield/cli/main.py +541 -0
- daytashield/core/__init__.py +15 -0
- daytashield/core/audit.py +275 -0
- daytashield/core/pipeline.py +240 -0
- daytashield/core/result.py +185 -0
- daytashield/core/router.py +217 -0
- daytashield/integrations/__init__.py +7 -0
- daytashield/integrations/langchain.py +391 -0
- daytashield/processors/__init__.py +13 -0
- daytashield/processors/base.py +182 -0
- daytashield/processors/csv.py +269 -0
- daytashield/processors/json.py +260 -0
- daytashield/processors/pdf.py +232 -0
- daytashield/rules/__init__.py +14 -0
- daytashield/rules/base.py +67 -0
- daytashield/rules/gdpr.py +348 -0
- daytashield/rules/hipaa.py +229 -0
- daytashield/rules/pii.py +208 -0
- daytashield/validators/__init__.py +15 -0
- daytashield/validators/base.py +103 -0
- daytashield/validators/compliance.py +222 -0
- daytashield/validators/freshness.py +337 -0
- daytashield/validators/schema.py +176 -0
- daytashield/validators/semantic.py +256 -0
- daytashield-0.1.1.dist-info/METADATA +316 -0
- daytashield-0.1.1.dist-info/RECORD +31 -0
- daytashield-0.1.1.dist-info/WHEEL +4 -0
- daytashield-0.1.1.dist-info/entry_points.txt +2 -0
- daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0
daytashield/processors/base.py
@@ -0,0 +1,182 @@
"""Base processor abstract class."""

from __future__ import annotations

import hashlib
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, BinaryIO

from pydantic import BaseModel, Field

from daytashield.core.result import Provenance, ValidationResult, ValidationStatus, create_result


class ProcessorConfig(BaseModel):
    """Base configuration for all processors."""

    enabled: bool = Field(True, description="Whether this processor is enabled")
    extract_metadata: bool = Field(True, description="Extract file/data metadata")
    compute_checksum: bool = Field(True, description="Compute SHA-256 checksum of input")
    max_size_bytes: int | None = Field(None, description="Maximum input size in bytes")

    model_config = {"extra": "allow"}


class ProcessedData(BaseModel):
    """Container for processed data with metadata."""

    content: Any = Field(..., description="The extracted/processed content")
    content_type: str = Field(..., description="Type of content (text, records, structured)")
    source_type: str = Field(..., description="Original source type (pdf, csv, json)")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
    page_count: int | None = Field(None, description="Number of pages (for documents)")
    record_count: int | None = Field(None, description="Number of records (for tabular data)")
    raw_size_bytes: int | None = Field(None, description="Size of raw input")

    model_config = {"arbitrary_types_allowed": True}


class BaseProcessor(ABC):
    """Abstract base class for all DaytaShield processors.

    Processors are responsible for extracting content and metadata from
    various file formats (PDF, CSV, JSON, etc.) and preparing them for
    validation.

    Example:
        >>> class MyProcessor(BaseProcessor):
        ...     name = "my_processor"
        ...     supported_extensions = [".xyz"]
        ...
        ...     def process(self, source, result):
        ...         content = self._extract(source)
        ...         result.data = ProcessedData(
        ...             content=content,
        ...             content_type="text",
        ...             source_type="xyz",
        ...         )
        ...         return result
    """

    name: str = "base_processor"
    supported_extensions: list[str] = []
    supported_mime_types: list[str] = []

    def __init__(self, config: ProcessorConfig | dict[str, Any] | None = None):
        """Initialize the processor with optional configuration.

        Args:
            config: Processor configuration, either as ProcessorConfig or dict.
        """
        if config is None:
            self.config = ProcessorConfig()
        elif isinstance(config, dict):
            self.config = ProcessorConfig(**config)
        else:
            self.config = config

    @abstractmethod
    def process(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
    ) -> ValidationResult:
        """Process the source and extract content.

        This is the main method that subclasses must implement. It should:
        1. Read/parse the source data
        2. Extract content and metadata
        3. Create a ProcessedData object and set it as result.data
        4. Return the ValidationResult

        Args:
            source: The data source (file path, file object, or bytes)
            result: Optional existing ValidationResult to update

        Returns:
            ValidationResult with ProcessedData in result.data
        """
        pass

    def supports(self, source: str | Path) -> bool:
        """Check if this processor supports the given source.

        Args:
            source: File path to check

        Returns:
            True if this processor can handle the source
        """
        path = Path(source) if isinstance(source, str) else source
        return path.suffix.lower() in self.supported_extensions

    def _create_result(
        self, source: str | Path | BinaryIO | bytes
    ) -> tuple[ValidationResult, Provenance]:
        """Create a new result with provenance information.

        Args:
            source: The data source

        Returns:
            Tuple of (ValidationResult, Provenance)
        """
        source_path: str | None = None
        source_type = "bytes"
        source_id = ""

        if isinstance(source, (str, Path)):
            path = Path(source)
            source_path = str(path.absolute())
            source_type = "file"
            source_id = path.name
        elif hasattr(source, "name"):
            source_path = getattr(source, "name", None)
            source_type = "stream"
            source_id = Path(source_path).name if source_path else "stream"
        else:
            source_id = "bytes"

        provenance = Provenance(
            source_id=source_id,
            source_type=source_type,
            source_path=source_path,
            processor_chain=[self.name],
        )

        result = create_result(
            status=ValidationStatus.PASSED,
            provenance=provenance,
        )

        return result, provenance

    def _compute_checksum(self, data: bytes) -> str:
        """Compute SHA-256 checksum of data.

        Args:
            data: Raw bytes to hash

        Returns:
            Hex-encoded SHA-256 hash
        """
        return hashlib.sha256(data).hexdigest()

    def _read_source(self, source: str | Path | BinaryIO | bytes) -> bytes:
        """Read raw bytes from the source.

        Args:
            source: The data source

        Returns:
            Raw bytes from the source
        """
        if isinstance(source, bytes):
            return source
        elif isinstance(source, (str, Path)):
            return Path(source).read_bytes()
        else:
            # File-like object
            return source.read()

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name!r}, extensions={self.supported_extensions})"
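For orientation, a minimal sketch (not part of the published wheel) of how a concrete processor might build on the BaseProcessor helpers shown above, following the pattern in the class docstring. The TextProcessor name, the .md/.rst extensions, and the UTF-8 decoding choice are illustrative assumptions, not daytashield API.

# Illustrative sketch only -- not included in the daytashield 0.1.1 wheel.
# Mirrors the helper usage seen in base.py; names below are hypothetical.
from __future__ import annotations

from pathlib import Path
from typing import Any, BinaryIO

from daytashield.core.result import ValidationResult
from daytashield.processors.base import BaseProcessor, ProcessedData


class TextProcessor(BaseProcessor):
    """Hypothetical processor that treats the input as UTF-8 text."""

    name = "text"
    supported_extensions = [".md", ".rst"]

    def process(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
    ) -> ValidationResult:
        # Reuse the base-class helpers for provenance and checksums
        if result is None:
            result, provenance = self._create_result(source)
        else:
            provenance = result.provenance

        raw = self._read_source(source)
        if self.config.compute_checksum and provenance:
            provenance.checksum = self._compute_checksum(raw)

        result.data = ProcessedData(
            content=raw.decode("utf-8", errors="replace"),
            content_type="text",
            source_type="text",
            raw_size_bytes=len(raw),
        )
        return result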
daytashield/processors/csv.py
@@ -0,0 +1,269 @@
"""CSV file processor."""

from __future__ import annotations

from pathlib import Path
from typing import Any, BinaryIO

from pydantic import Field

from daytashield.core.result import ValidationResult, ValidationStatus
from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig


class CSVProcessorConfig(ProcessorConfig):
    """Configuration for CSV processing."""

    delimiter: str = Field(",", description="Field delimiter")
    encoding: str = Field("utf-8", description="File encoding")
    has_header: bool = Field(True, description="First row is header")
    infer_types: bool = Field(True, description="Infer column data types")
    max_rows: int | None = Field(None, description="Maximum rows to process")
    skip_rows: int = Field(0, description="Rows to skip at start")
    null_values: list[str] = Field(
        default_factory=lambda: ["", "NA", "N/A", "null", "NULL", "None", "nan", "NaN"],
        description="Values to treat as null",
    )


class CSVProcessor(BaseProcessor):
    """Processes CSV files to extract structured data.

    Uses pandas for robust CSV parsing with:
    - Automatic type inference
    - Missing value detection
    - Schema extraction
    - Encoding detection

    Example:
        >>> processor = CSVProcessor()
        >>> result = processor.process("data.csv")
        >>> records = result.data.content  # List of dicts
        >>> schema = result.data.metadata["schema"]  # Inferred schema
    """

    name = "csv"
    supported_extensions = [".csv", ".tsv", ".txt"]
    supported_mime_types = ["text/csv", "text/tab-separated-values"]

    def __init__(self, config: CSVProcessorConfig | dict[str, Any] | None = None):
        """Initialize the CSV processor.

        Args:
            config: Processor configuration
        """
        if config is None:
            super().__init__(CSVProcessorConfig())
        elif isinstance(config, dict):
            super().__init__(CSVProcessorConfig(**config))
        else:
            super().__init__(config)

    def process(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
    ) -> ValidationResult:
        """Process a CSV file and extract records.

        Args:
            source: CSV file path, file object, or bytes
            result: Optional existing ValidationResult

        Returns:
            ValidationResult with ProcessedData containing records
        """
        # Create result if not provided
        if result is None:
            result, provenance = self._create_result(source)
        else:
            provenance = result.provenance

        config = self.config
        if not isinstance(config, CSVProcessorConfig):
            config = CSVProcessorConfig()

        try:
            import pandas as pd
        except ImportError:
            result.add_message(
                code="CSV_NO_PANDAS",
                message="pandas package not installed. Install with: pip install pandas",
                severity=ValidationStatus.ERROR,
                validator=self.name,
            )
            result.status = ValidationStatus.ERROR
            return result

        try:
            # Read raw bytes for checksum
            raw_bytes = self._read_source(source)

            # Compute checksum if configured
            if self.config.compute_checksum and provenance:
                provenance.checksum = self._compute_checksum(raw_bytes)

            # Determine delimiter for TSV
            delimiter = config.delimiter
            if isinstance(source, (str, Path)):
                if Path(source).suffix.lower() == ".tsv":
                    delimiter = "\t"

            # Read CSV with pandas
            import io

            df = pd.read_csv(
                io.BytesIO(raw_bytes),
                delimiter=delimiter,
                encoding=config.encoding,
                header=0 if config.has_header else None,
                nrows=config.max_rows,
                skiprows=config.skip_rows if config.skip_rows > 0 else None,
                na_values=config.null_values,
                low_memory=False,
            )

            # Extract schema information
            schema = self._infer_schema(df) if config.infer_types else {}

            # Get quality metrics
            quality_metrics = self._compute_quality_metrics(df)

            # Convert to records
            records = df.to_dict(orient="records")

            # Create processed data
            processed = ProcessedData(
                content=records,
                content_type="records",
                source_type="csv",
                metadata={
                    "schema": schema,
                    "columns": list(df.columns),
                    "quality": quality_metrics,
                },
                record_count=len(records),
                raw_size_bytes=len(raw_bytes),
            )

            result.data = processed

            # Add warnings for data quality issues
            if quality_metrics["null_percentage"] > 20:
                result.add_message(
                    code="CSV_HIGH_NULL_RATE",
                    message=f"High null rate: {quality_metrics['null_percentage']:.1f}% of values are null",
                    severity=ValidationStatus.WARNING,
                    validator=self.name,
                )
                if result.status == ValidationStatus.PASSED:
                    result.status = ValidationStatus.WARNING

            if quality_metrics["duplicate_rows"] > 0:
                result.add_message(
                    code="CSV_DUPLICATE_ROWS",
                    message=f"Found {quality_metrics['duplicate_rows']} duplicate rows",
                    severity=ValidationStatus.WARNING,
                    validator=self.name,
                )
                if result.status == ValidationStatus.PASSED:
                    result.status = ValidationStatus.WARNING

        except pd.errors.EmptyDataError:
            result.add_message(
                code="CSV_EMPTY",
                message="CSV file is empty",
                severity=ValidationStatus.ERROR,
                validator=self.name,
            )
            result.status = ValidationStatus.ERROR

        except pd.errors.ParserError as e:
            result.add_message(
                code="CSV_PARSE_ERROR",
                message=f"Failed to parse CSV: {e}",
                severity=ValidationStatus.ERROR,
                validator=self.name,
            )
            result.status = ValidationStatus.ERROR

        except Exception as e:
            result.add_message(
                code="CSV_PROCESSING_ERROR",
                message=f"Failed to process CSV: {e}",
                severity=ValidationStatus.ERROR,
                validator=self.name,
                details={"error": str(e)},
            )
            result.status = ValidationStatus.ERROR

        return result

    def _infer_schema(self, df: Any) -> dict[str, Any]:
        """Infer schema from DataFrame.

        Args:
            df: pandas DataFrame

        Returns:
            Schema dict with column types
        """
        schema: dict[str, Any] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {},
            },
        }

        type_mapping = {
            "int64": "integer",
            "int32": "integer",
            "float64": "number",
            "float32": "number",
            "bool": "boolean",
            "object": "string",
            "datetime64[ns]": "string",
            "category": "string",
        }

        for col in df.columns:
            dtype_str = str(df[col].dtype)
            json_type = type_mapping.get(dtype_str, "string")

            col_schema: dict[str, Any] = {"type": json_type}

            # Add nullable if column has nulls
            if df[col].isna().any():
                col_schema["nullable"] = True

            # Add enum for low-cardinality string columns
            if json_type == "string":
                unique_count = df[col].nunique()
                if unique_count <= 10 and unique_count > 0:
                    unique_values = df[col].dropna().unique().tolist()
                    col_schema["enum"] = [str(v) for v in unique_values]

            schema["items"]["properties"][str(col)] = col_schema

        return schema

    def _compute_quality_metrics(self, df: Any) -> dict[str, Any]:
        """Compute data quality metrics.

        Args:
            df: pandas DataFrame

        Returns:
            Dict of quality metrics
        """
        total_cells = df.size
        null_cells = df.isna().sum().sum()

        return {
            "row_count": len(df),
            "column_count": len(df.columns),
            "total_cells": total_cells,
            "null_cells": int(null_cells),
            "null_percentage": (null_cells / total_cells * 100) if total_cells > 0 else 0,
            "duplicate_rows": int(df.duplicated().sum()),
            "memory_usage_bytes": int(df.memory_usage(deep=True).sum()),
        }
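For context, a short usage sketch (not part of the wheel) of driving CSVProcessor directly, based on the class docstring and the metadata keys populated above. The "data.csv" path and the chosen config values are placeholders, and pandas must be installed.

# Illustrative usage sketch only -- not included in the daytashield 0.1.1 wheel.
# Requires pandas; "data.csv" is a placeholder path.
from daytashield.processors.csv import CSVProcessor, CSVProcessorConfig

processor = CSVProcessor(CSVProcessorConfig(delimiter=",", max_rows=10_000))
result = processor.process("data.csv")

print(result.status)                        # PASSED, WARNING, or ERROR
if result.data is not None:
    print(result.data.record_count)         # number of parsed rows
    print(result.data.metadata["quality"])  # null rate, duplicate rows, memory use
    print(result.data.metadata["schema"])   # inferred JSON-Schema-like structure
print(result.provenance.checksum)           # SHA-256 of the raw input bytes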