daytashield-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
+ """Base processor abstract class."""
+
+ from __future__ import annotations
+
+ import hashlib
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Any, BinaryIO
+
+ from pydantic import BaseModel, Field
+
+ from daytashield.core.result import Provenance, ValidationResult, ValidationStatus, create_result
+
+
+ class ProcessorConfig(BaseModel):
+     """Base configuration for all processors."""
+
+     enabled: bool = Field(True, description="Whether this processor is enabled")
+     extract_metadata: bool = Field(True, description="Extract file/data metadata")
+     compute_checksum: bool = Field(True, description="Compute SHA-256 checksum of input")
+     max_size_bytes: int | None = Field(None, description="Maximum input size in bytes")
+
+     model_config = {"extra": "allow"}
+
+
+ class ProcessedData(BaseModel):
+     """Container for processed data with metadata."""
+
+     content: Any = Field(..., description="The extracted/processed content")
+     content_type: str = Field(..., description="Type of content (text, records, structured)")
+     source_type: str = Field(..., description="Original source type (pdf, csv, json)")
+     metadata: dict[str, Any] = Field(default_factory=dict, description="Extracted metadata")
+     page_count: int | None = Field(None, description="Number of pages (for documents)")
+     record_count: int | None = Field(None, description="Number of records (for tabular data)")
+     raw_size_bytes: int | None = Field(None, description="Size of raw input")
+
+     model_config = {"arbitrary_types_allowed": True}
+
+
+ class BaseProcessor(ABC):
+     """Abstract base class for all DaytaShield processors.
+
+     Processors are responsible for extracting content and metadata from
+     various file formats (PDF, CSV, JSON, etc.) and preparing them for
+     validation.
+
+     Example:
+         >>> class MyProcessor(BaseProcessor):
+         ...     name = "my_processor"
+         ...     supported_extensions = [".xyz"]
+         ...
+         ...     def process(self, source, result):
+         ...         content = self._extract(source)
+         ...         result.data = ProcessedData(
+         ...             content=content,
+         ...             content_type="text",
+         ...             source_type="xyz",
+         ...         )
+         ...         return result
+     """
+
+     name: str = "base_processor"
+     supported_extensions: list[str] = []
+     supported_mime_types: list[str] = []
+
+     def __init__(self, config: ProcessorConfig | dict[str, Any] | None = None):
+         """Initialize the processor with optional configuration.
+
+         Args:
+             config: Processor configuration, either as ProcessorConfig or dict.
+         """
+         if config is None:
+             self.config = ProcessorConfig()
+         elif isinstance(config, dict):
+             self.config = ProcessorConfig(**config)
+         else:
+             self.config = config
+
+     @abstractmethod
+     def process(
+         self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
+     ) -> ValidationResult:
+         """Process the source and extract content.
+
+         This is the main method that subclasses must implement. It should:
+         1. Read/parse the source data
+         2. Extract content and metadata
+         3. Create a ProcessedData object and set it as result.data
+         4. Return the ValidationResult
+
+         Args:
+             source: The data source (file path, file object, or bytes)
+             result: Optional existing ValidationResult to update
+
+         Returns:
+             ValidationResult with ProcessedData in result.data
+         """
+         pass
+
+     def supports(self, source: str | Path) -> bool:
+         """Check if this processor supports the given source.
+
+         Args:
+             source: File path to check
+
+         Returns:
+             True if this processor can handle the source
+         """
+         path = Path(source) if isinstance(source, str) else source
+         return path.suffix.lower() in self.supported_extensions
+
+     def _create_result(
+         self, source: str | Path | BinaryIO | bytes
+     ) -> tuple[ValidationResult, Provenance]:
+         """Create a new result with provenance information.
+
+         Args:
+             source: The data source
+
+         Returns:
+             Tuple of (ValidationResult, Provenance)
+         """
+         source_path: str | None = None
+         source_type = "bytes"
+         source_id = ""
+
+         if isinstance(source, (str, Path)):
+             path = Path(source)
+             source_path = str(path.absolute())
+             source_type = "file"
+             source_id = path.name
+         elif hasattr(source, "name"):
+             source_path = getattr(source, "name", None)
+             source_type = "stream"
+             source_id = Path(source_path).name if source_path else "stream"
+         else:
+             source_id = "bytes"
+
+         provenance = Provenance(
+             source_id=source_id,
+             source_type=source_type,
+             source_path=source_path,
+             processor_chain=[self.name],
+         )
+
+         result = create_result(
+             status=ValidationStatus.PASSED,
+             provenance=provenance,
+         )
+
+         return result, provenance
+
+     def _compute_checksum(self, data: bytes) -> str:
+         """Compute SHA-256 checksum of data.
+
+         Args:
+             data: Raw bytes to hash
+
+         Returns:
+             Hex-encoded SHA-256 hash
+         """
+         return hashlib.sha256(data).hexdigest()
+
+     def _read_source(self, source: str | Path | BinaryIO | bytes) -> bytes:
+         """Read raw bytes from the source.
+
+         Args:
+             source: The data source
+
+         Returns:
+             Raw bytes from the source
+         """
+         if isinstance(source, bytes):
+             return source
+         elif isinstance(source, (str, Path)):
+             return Path(source).read_bytes()
+         else:
+             # File-like object
+             return source.read()
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(name={self.name!r}, extensions={self.supported_extensions})"
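For orientation, a concrete processor built on this base class only has to implement process() and can lean on the _create_result, _read_source, and _compute_checksum helpers; a minimal sketch (the TextProcessor name and the .txt/.md handling are illustrative and not part of the package):

    # Illustrative sketch only: TextProcessor is not shipped in daytashield 0.1.1;
    # it assumes the BaseProcessor, ProcessedData, and helpers shown above.
    from daytashield.processors.base import BaseProcessor, ProcessedData


    class TextProcessor(BaseProcessor):
        name = "text"
        supported_extensions = [".txt", ".md"]

        def process(self, source, result=None):
            if result is None:
                result, provenance = self._create_result(source)
            else:
                provenance = result.provenance

            raw_bytes = self._read_source(source)  # path, stream, or bytes -> raw bytes
            if self.config.compute_checksum and provenance:
                provenance.checksum = self._compute_checksum(raw_bytes)  # hex SHA-256

            result.data = ProcessedData(
                content=raw_bytes.decode("utf-8", errors="replace"),
                content_type="text",
                source_type="txt",
                raw_size_bytes=len(raw_bytes),
            )
            return result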
@@ -0,0 +1,269 @@
+ """CSV file processor."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, BinaryIO
+
+ from pydantic import Field
+
+ from daytashield.core.result import ValidationResult, ValidationStatus
+ from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig
+
+
+ class CSVProcessorConfig(ProcessorConfig):
+     """Configuration for CSV processing."""
+
+     delimiter: str = Field(",", description="Field delimiter")
+     encoding: str = Field("utf-8", description="File encoding")
+     has_header: bool = Field(True, description="First row is header")
+     infer_types: bool = Field(True, description="Infer column data types")
+     max_rows: int | None = Field(None, description="Maximum rows to process")
+     skip_rows: int = Field(0, description="Rows to skip at start")
+     null_values: list[str] = Field(
+         default_factory=lambda: ["", "NA", "N/A", "null", "NULL", "None", "nan", "NaN"],
+         description="Values to treat as null",
+     )
+
+
+ class CSVProcessor(BaseProcessor):
+     """Processes CSV files to extract structured data.
+
+     Uses pandas for robust CSV parsing with:
+     - Automatic type inference
+     - Missing value detection
+     - Schema extraction
+     - Encoding detection
+
+     Example:
+         >>> processor = CSVProcessor()
+         >>> result = processor.process("data.csv")
+         >>> records = result.data.content  # List of dicts
+         >>> schema = result.data.metadata["schema"]  # Inferred schema
+     """
+
+     name = "csv"
+     supported_extensions = [".csv", ".tsv", ".txt"]
+     supported_mime_types = ["text/csv", "text/tab-separated-values"]
+
+     def __init__(self, config: CSVProcessorConfig | dict[str, Any] | None = None):
+         """Initialize the CSV processor.
+
+         Args:
+             config: Processor configuration
+         """
+         if config is None:
+             super().__init__(CSVProcessorConfig())
+         elif isinstance(config, dict):
+             super().__init__(CSVProcessorConfig(**config))
+         else:
+             super().__init__(config)
+
+     def process(
+         self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
+     ) -> ValidationResult:
+         """Process a CSV file and extract records.
+
+         Args:
+             source: CSV file path, file object, or bytes
+             result: Optional existing ValidationResult
+
+         Returns:
+             ValidationResult with ProcessedData containing records
+         """
+         # Create result if not provided
+         if result is None:
+             result, provenance = self._create_result(source)
+         else:
+             provenance = result.provenance
+
+         config = self.config
+         if not isinstance(config, CSVProcessorConfig):
+             config = CSVProcessorConfig()
+
+         try:
+             import pandas as pd
+         except ImportError:
+             result.add_message(
+                 code="CSV_NO_PANDAS",
+                 message="pandas package not installed. Install with: pip install pandas",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+             )
+             result.status = ValidationStatus.ERROR
+             return result
+
+         try:
+             # Read raw bytes for checksum
+             raw_bytes = self._read_source(source)
+
+             # Compute checksum if configured
+             if self.config.compute_checksum and provenance:
+                 provenance.checksum = self._compute_checksum(raw_bytes)
+
+             # Determine delimiter for TSV
+             delimiter = config.delimiter
+             if isinstance(source, (str, Path)):
+                 if Path(source).suffix.lower() == ".tsv":
+                     delimiter = "\t"
+
+             # Read CSV with pandas
+             import io
+
+             df = pd.read_csv(
+                 io.BytesIO(raw_bytes),
+                 delimiter=delimiter,
+                 encoding=config.encoding,
+                 header=0 if config.has_header else None,
+                 nrows=config.max_rows,
+                 skiprows=config.skip_rows if config.skip_rows > 0 else None,
+                 na_values=config.null_values,
+                 low_memory=False,
+             )
+
+             # Extract schema information
+             schema = self._infer_schema(df) if config.infer_types else {}
+
+             # Get quality metrics
+             quality_metrics = self._compute_quality_metrics(df)
+
+             # Convert to records
+             records = df.to_dict(orient="records")
+
+             # Create processed data
+             processed = ProcessedData(
+                 content=records,
+                 content_type="records",
+                 source_type="csv",
+                 metadata={
+                     "schema": schema,
+                     "columns": list(df.columns),
+                     "quality": quality_metrics,
+                 },
+                 record_count=len(records),
+                 raw_size_bytes=len(raw_bytes),
+             )
+
+             result.data = processed
+
+             # Add warnings for data quality issues
+             if quality_metrics["null_percentage"] > 20:
+                 result.add_message(
+                     code="CSV_HIGH_NULL_RATE",
+                     message=f"High null rate: {quality_metrics['null_percentage']:.1f}% of values are null",
+                     severity=ValidationStatus.WARNING,
+                     validator=self.name,
+                 )
+                 if result.status == ValidationStatus.PASSED:
+                     result.status = ValidationStatus.WARNING
+
+             if quality_metrics["duplicate_rows"] > 0:
+                 result.add_message(
+                     code="CSV_DUPLICATE_ROWS",
+                     message=f"Found {quality_metrics['duplicate_rows']} duplicate rows",
+                     severity=ValidationStatus.WARNING,
+                     validator=self.name,
+                 )
+                 if result.status == ValidationStatus.PASSED:
+                     result.status = ValidationStatus.WARNING
+
+         except pd.errors.EmptyDataError:
+             result.add_message(
+                 code="CSV_EMPTY",
+                 message="CSV file is empty",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+             )
+             result.status = ValidationStatus.ERROR
+
+         except pd.errors.ParserError as e:
+             result.add_message(
+                 code="CSV_PARSE_ERROR",
+                 message=f"Failed to parse CSV: {e}",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+             )
+             result.status = ValidationStatus.ERROR
+
+         except Exception as e:
+             result.add_message(
+                 code="CSV_PROCESSING_ERROR",
+                 message=f"Failed to process CSV: {e}",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+                 details={"error": str(e)},
+             )
+             result.status = ValidationStatus.ERROR
+
+         return result
+
+     def _infer_schema(self, df: Any) -> dict[str, Any]:
+         """Infer schema from DataFrame.
+
+         Args:
+             df: pandas DataFrame
+
+         Returns:
+             Schema dict with column types
+         """
+         schema: dict[str, Any] = {
+             "type": "array",
+             "items": {
+                 "type": "object",
+                 "properties": {},
+             },
+         }
+
+         type_mapping = {
+             "int64": "integer",
+             "int32": "integer",
+             "float64": "number",
+             "float32": "number",
+             "bool": "boolean",
+             "object": "string",
+             "datetime64[ns]": "string",
+             "category": "string",
+         }
+
+         for col in df.columns:
+             dtype_str = str(df[col].dtype)
+             json_type = type_mapping.get(dtype_str, "string")
+
+             col_schema: dict[str, Any] = {"type": json_type}
+
+             # Add nullable if column has nulls
+             if df[col].isna().any():
+                 col_schema["nullable"] = True
+
+             # Add enum for low-cardinality string columns
+             if json_type == "string":
+                 unique_count = df[col].nunique()
+                 if unique_count <= 10 and unique_count > 0:
+                     unique_values = df[col].dropna().unique().tolist()
+                     col_schema["enum"] = [str(v) for v in unique_values]
+
+             schema["items"]["properties"][str(col)] = col_schema
+
+         return schema
+
+     def _compute_quality_metrics(self, df: Any) -> dict[str, Any]:
+         """Compute data quality metrics.
+
+         Args:
+             df: pandas DataFrame
+
+         Returns:
+             Dict of quality metrics
+         """
+         total_cells = df.size
+         null_cells = df.isna().sum().sum()
+
+         return {
+             "row_count": len(df),
+             "column_count": len(df.columns),
+             "total_cells": total_cells,
+             "null_cells": int(null_cells),
+             "null_percentage": (null_cells / total_cells * 100) if total_cells > 0 else 0,
+             "duplicate_rows": int(df.duplicated().sum()),
+             "memory_usage_bytes": int(df.memory_usage(deep=True).sum()),
+         }
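For orientation, CSVProcessor accepts either a CSVProcessorConfig or a plain dict, and exposes the inferred schema and quality metrics through result.data.metadata; a minimal usage sketch (the file name and the daytashield.processors.csv module path are assumptions, not confirmed by this diff):

    # Illustrative usage sketch; assumes pandas is installed and that "sales.csv"
    # (a made-up file name) exists; the csv module path is inferred, not confirmed.
    from daytashield.core.result import ValidationStatus
    from daytashield.processors.csv import CSVProcessor

    processor = CSVProcessor({"delimiter": "|", "max_rows": 10_000})  # dict is coerced to CSVProcessorConfig
    result = processor.process("sales.csv")

    if result.status != ValidationStatus.ERROR:
        records = result.data.content                          # list of row dicts
        print(result.data.record_count, "rows")
        print(result.data.metadata["schema"])                  # JSON-Schema-style column types
        print(result.data.metadata["quality"]["null_percentage"])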