daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
+ """JSON file processor."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, BinaryIO
+
+ from pydantic import Field
+
+ from daytashield.core.result import ValidationResult, ValidationStatus
+ from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig
+
+
+ class JSONProcessorConfig(ProcessorConfig):
+     """Configuration for JSON processing."""
+
+     encoding: str = Field("utf-8", description="File encoding")
+     allow_comments: bool = Field(False, description="Allow JSON5-style comments")
+     max_depth: int = Field(100, description="Maximum nesting depth")
+     flatten: bool = Field(False, description="Flatten nested structures")
+     flatten_separator: str = Field(".", description="Separator for flattened keys")
+
+
+ class JSONProcessor(BaseProcessor):
+     """Processes JSON files to extract structured data.
+
+     Uses orjson for fast JSON parsing when available (falling back to the
+     standard library json module) with:
+     - JSON Lines (.jsonl/.ndjson) support
+     - Nested structure analysis
+     - Optional flattening of nested keys
+
+     Example:
+         >>> processor = JSONProcessor()
+         >>> result = processor.process("data.json")
+         >>> data = result.data.content  # Parsed JSON
+         >>> structure = result.data.metadata["structure"]  # Structure analysis
+     """
+
+     name = "json"
+     supported_extensions = [".json", ".jsonl", ".ndjson"]
+     supported_mime_types = ["application/json", "application/x-ndjson"]
+
+     def __init__(self, config: JSONProcessorConfig | dict[str, Any] | None = None):
+         """Initialize the JSON processor.
+
+         Args:
+             config: Processor configuration
+         """
+         if config is None:
+             super().__init__(JSONProcessorConfig())
+         elif isinstance(config, dict):
+             super().__init__(JSONProcessorConfig(**config))
+         else:
+             super().__init__(config)
+
+     def process(
+         self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
+     ) -> ValidationResult:
+         """Process a JSON file and extract content.
+
+         Args:
+             source: JSON file path, file object, or bytes
+             result: Optional existing ValidationResult
+
+         Returns:
+             ValidationResult with ProcessedData containing parsed JSON
+         """
+         # Create result if not provided
+         if result is None:
+             result, provenance = self._create_result(source)
+         else:
+             provenance = result.provenance
+
+         config = self.config
+         if not isinstance(config, JSONProcessorConfig):
+             config = JSONProcessorConfig()
+
+         try:
+             import orjson
+         except ImportError:
+             # Fall back to standard json
+             import json as orjson  # type: ignore[no-redef]
+
+         try:
+             # Read raw bytes
+             raw_bytes = self._read_source(source)
+
+             # Compute checksum if configured
+             if self.config.compute_checksum and provenance:
+                 provenance.checksum = self._compute_checksum(raw_bytes)
+
+             # Determine if JSONL format
+             is_jsonl = False
+             if isinstance(source, (str, Path)):
+                 ext = Path(source).suffix.lower()
+                 is_jsonl = ext in (".jsonl", ".ndjson")
+
+             # Parse JSON
+             if is_jsonl:
+                 content = self._parse_jsonl(raw_bytes, config.encoding)
+                 content_type = "records"
+                 record_count = len(content)
+             else:
+                 content = orjson.loads(raw_bytes)
+                 content_type = "array" if isinstance(content, list) else "object"
+                 record_count = len(content) if isinstance(content, list) else None
+
+             # Analyze structure
+             structure_info = self._analyze_structure(content, config.max_depth)
+
+             # Optionally flatten
+             if config.flatten and isinstance(content, dict):
+                 content = self._flatten(content, config.flatten_separator)
+
+             # Create processed data
+             processed = ProcessedData(
+                 content=content,
+                 content_type=content_type,
+                 source_type="json",
+                 metadata={
+                     "structure": structure_info,
+                     "is_jsonl": is_jsonl,
+                 },
+                 record_count=record_count,
+                 raw_size_bytes=len(raw_bytes),
+             )
+
+             result.data = processed
+
+             # Add warnings for potential issues
+             if structure_info["max_depth"] > 10:
+                 result.add_message(
+                     code="JSON_DEEP_NESTING",
+                     message=f"Deep nesting detected: {structure_info['max_depth']} levels",
+                     severity=ValidationStatus.WARNING,
+                     validator=self.name,
+                 )
+                 if result.status == ValidationStatus.PASSED:
+                     result.status = ValidationStatus.WARNING
+
+         except orjson.JSONDecodeError as e:  # type: ignore[union-attr]
+             result.add_message(
+                 code="JSON_PARSE_ERROR",
+                 message=f"Invalid JSON: {e}",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+             )
+             result.status = ValidationStatus.ERROR
+
+         except Exception as e:
+             result.add_message(
+                 code="JSON_PROCESSING_ERROR",
+                 message=f"Failed to process JSON: {e}",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+                 details={"error": str(e)},
+             )
+             result.status = ValidationStatus.ERROR
+
+         return result
+
+     def _parse_jsonl(self, raw_bytes: bytes, encoding: str) -> list[Any]:
+         """Parse JSON Lines format.
+
+         Args:
+             raw_bytes: Raw file bytes
+             encoding: Text encoding
+
+         Returns:
+             List of parsed JSON objects
+         """
+         try:
+             import orjson
+
+             records = []
+             for line in raw_bytes.decode(encoding).splitlines():
+                 line = line.strip()
+                 if line:  # Skip empty lines
+                     records.append(orjson.loads(line))
+             return records
+         except ImportError:
+             import json
+
+             records = []
+             for line in raw_bytes.decode(encoding).splitlines():
+                 line = line.strip()
+                 if line:
+                     records.append(json.loads(line))
+             return records
+
+     def _analyze_structure(self, data: Any, max_depth: int) -> dict[str, Any]:
+         """Analyze JSON structure.
+
+         Args:
+             data: Parsed JSON data
+             max_depth: Maximum depth to analyze
+
+         Returns:
+             Structure analysis dict
+         """
+         info: dict[str, Any] = {
+             "type": type(data).__name__,
+             "max_depth": 0,
+             "total_keys": 0,
+             "array_lengths": [],
+         }
+
+         def analyze(obj: Any, depth: int) -> None:
+             if depth > max_depth:
+                 return
+
+             info["max_depth"] = max(info["max_depth"], depth)
+
+             if isinstance(obj, dict):
+                 info["total_keys"] += len(obj)
+                 for value in obj.values():
+                     analyze(value, depth + 1)
+             elif isinstance(obj, list):
+                 info["array_lengths"].append(len(obj))
+                 for item in obj:
+                     analyze(item, depth + 1)
+
+         analyze(data, 0)
+
+         # Summarize array lengths
+         if info["array_lengths"]:
+             info["min_array_length"] = min(info["array_lengths"])
+             info["max_array_length"] = max(info["array_lengths"])
+             info["array_count"] = len(info["array_lengths"])
+             del info["array_lengths"]
+
+         return info
+
+     def _flatten(self, data: dict[str, Any], separator: str) -> dict[str, Any]:
+         """Flatten a nested dictionary.
+
+         Args:
+             data: Nested dictionary
+             separator: Key separator
+
+         Returns:
+             Flattened dictionary
+         """
+         result: dict[str, Any] = {}
+
+         def flatten_recursive(obj: Any, prefix: str) -> None:
+             if isinstance(obj, dict):
+                 for key, value in obj.items():
+                     new_key = f"{prefix}{separator}{key}" if prefix else key
+                     flatten_recursive(value, new_key)
+             elif isinstance(obj, list):
+                 for i, item in enumerate(obj):
+                     new_key = f"{prefix}[{i}]"
+                     flatten_recursive(item, new_key)
+             else:
+                 result[prefix] = obj
+
+         flatten_recursive(data, "")
+         return result
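To illustrate how the pieces above fit together, here is a minimal usage sketch. It relies only on names visible in this diff (JSONProcessor, JSONProcessorConfig, result.data.content, the "structure" and "is_jsonl" metadata keys); the import path daytashield.processors.json is an assumption, since the diff does not show module paths.

# Hypothetical usage sketch; module path assumed, API names taken from the diff above.
from daytashield.processors.json import JSONProcessor, JSONProcessorConfig

# Flatten nested objects into dotted keys, e.g. {"user": {"id": 1}} -> {"user.id": 1}
config = JSONProcessorConfig(flatten=True, flatten_separator=".")
processor = JSONProcessor(config)

result = processor.process("data.json")
print(result.status)                      # overall ValidationStatus
print(result.data.content)                # parsed (and possibly flattened) JSON
print(result.data.metadata["structure"])  # depth/key/array statistics
print(result.data.metadata["is_jsonl"])   # True for .jsonl/.ndjson sources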
@@ -0,0 +1,232 @@
+ """PDF document processor."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, BinaryIO
+
+ from pydantic import Field
+
+ from daytashield.core.result import ValidationResult, ValidationStatus
+ from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig
+
+
+ class PDFProcessorConfig(ProcessorConfig):
+     """Configuration for PDF processing."""
+
+     extract_images: bool = Field(False, description="Extract embedded images")
+     extract_tables: bool = Field(True, description="Extract tables from PDF")
+     ocr_fallback: bool = Field(False, description="Use OCR if text extraction fails")
+     max_pages: int | None = Field(None, description="Maximum pages to process")
+     password: str | None = Field(None, description="PDF password if encrypted")
+
+
+ class PDFProcessor(BaseProcessor):
+     """Processes PDF documents to extract text and metadata.
+
+     Uses pdfplumber for reliable text extraction with layout preservation.
+     Supports:
+     - Text extraction with layout
+     - Metadata extraction (author, title, dates)
+     - Table extraction
+     - Page-by-page processing
+     - Optional OCR fallback
+
+     Example:
+         >>> processor = PDFProcessor()
+         >>> result = processor.process("invoice.pdf")
+         >>> print(result.data.content)  # Extracted text
+         >>> print(result.data.metadata)  # PDF metadata
+     """
+
+     name = "pdf"
+     supported_extensions = [".pdf"]
+     supported_mime_types = ["application/pdf"]
+
+     def __init__(self, config: PDFProcessorConfig | dict[str, Any] | None = None):
+         """Initialize the PDF processor.
+
+         Args:
+             config: Processor configuration
+         """
+         if config is None:
+             super().__init__(PDFProcessorConfig())
+         elif isinstance(config, dict):
+             super().__init__(PDFProcessorConfig(**config))
+         else:
+             super().__init__(config)
+
+     def process(
+         self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
+     ) -> ValidationResult:
+         """Process a PDF file and extract content.
+
+         Args:
+             source: PDF file path, file object, or bytes
+             result: Optional existing ValidationResult
+
+         Returns:
+             ValidationResult with ProcessedData containing extracted text
+         """
+         # Create result if not provided
+         if result is None:
+             result, provenance = self._create_result(source)
+         else:
+             provenance = result.provenance
+
+         config = self.config
+         if not isinstance(config, PDFProcessorConfig):
+             config = PDFProcessorConfig()
+
+         try:
+             import pdfplumber
+         except ImportError:
+             result.add_message(
+                 code="PDF_NO_PDFPLUMBER",
+                 message="pdfplumber package not installed. Install with: pip install pdfplumber",
+                 severity=ValidationStatus.ERROR,
+                 validator=self.name,
+             )
+             result.status = ValidationStatus.ERROR
+             return result
+
+         try:
+             # Read the source
+             raw_bytes = self._read_source(source)
+
+             # Compute checksum if configured
+             if self.config.compute_checksum and provenance:
+                 provenance.checksum = self._compute_checksum(raw_bytes)
+
+             # Open PDF
+             pdf_file: Any
+             if isinstance(source, (str, Path)):
+                 pdf_file = pdfplumber.open(source, password=config.password)
+             else:
+                 import io
+                 pdf_file = pdfplumber.open(io.BytesIO(raw_bytes), password=config.password)
+
+             with pdf_file as pdf:
+                 # Extract metadata
+                 metadata = self._extract_metadata(pdf)
+
+                 # Extract text from pages
+                 pages_text: list[str] = []
+                 tables: list[list[list[str]]] = []
+                 page_count = len(pdf.pages)
+
+                 max_pages = config.max_pages or page_count
+                 for i, page in enumerate(pdf.pages[:max_pages]):
+                     # Extract text
+                     page_text = page.extract_text() or ""
+                     pages_text.append(page_text)
+
+                     # Extract tables if configured
+                     if config.extract_tables:
+                         page_tables = page.extract_tables() or []
+                         tables.extend(page_tables)
+
+             # Combine all text
+             full_text = "\n\n".join(pages_text)
+
+             # Check if text extraction worked
+             if not full_text.strip() and config.ocr_fallback:
+                 full_text = self._ocr_fallback(raw_bytes)
+                 metadata["ocr_used"] = True
+
+             # Create processed data
+             processed = ProcessedData(
+                 content=full_text,
+                 content_type="text",
+                 source_type="pdf",
+                 metadata=metadata,
+                 page_count=page_count,
+                 raw_size_bytes=len(raw_bytes),
+             )
+
+             # Add tables to metadata if extracted
+             if tables:
+                 processed.metadata["tables"] = tables
+                 processed.metadata["table_count"] = len(tables)
+
+             result.data = processed
+
+ # Add info message about extraction
155
+ result.metadata["pdf_pages"] = page_count
156
+ result.metadata["pdf_text_length"] = len(full_text)
157
+
158
+ except Exception as e:
159
+ result.add_message(
160
+ code="PDF_PROCESSING_ERROR",
161
+ message=f"Failed to process PDF: {e}",
162
+ severity=ValidationStatus.ERROR,
163
+ validator=self.name,
164
+ details={"error": str(e)},
165
+ )
166
+ result.status = ValidationStatus.ERROR
167
+
168
+ return result
169
+
170
+ def _extract_metadata(self, pdf: Any) -> dict[str, Any]:
171
+ """Extract metadata from PDF.
172
+
173
+ Args:
174
+ pdf: pdfplumber PDF object
175
+
176
+ Returns:
177
+ Dict of metadata
178
+ """
179
+ metadata: dict[str, Any] = {}
180
+
181
+ if hasattr(pdf, "metadata") and pdf.metadata:
182
+ pdf_meta = pdf.metadata
183
+ # Common PDF metadata fields
184
+ field_mapping = {
185
+ "Title": "title",
186
+ "Author": "author",
187
+ "Subject": "subject",
188
+ "Creator": "creator",
189
+ "Producer": "producer",
190
+ "CreationDate": "created_at",
191
+ "ModDate": "modified_at",
192
+ "Keywords": "keywords",
193
+ }
194
+
195
+ for pdf_key, our_key in field_mapping.items():
196
+ if pdf_key in pdf_meta and pdf_meta[pdf_key]:
197
+ metadata[our_key] = pdf_meta[pdf_key]
198
+
199
+ metadata["page_count"] = len(pdf.pages)
200
+
201
+ return metadata
202
+
203
+ def _ocr_fallback(self, pdf_bytes: bytes) -> str:
204
+ """Attempt OCR on PDF pages.
205
+
206
+ Args:
207
+ pdf_bytes: Raw PDF bytes
208
+
209
+ Returns:
210
+ Extracted text via OCR
211
+ """
212
+ try:
213
+ import io
214
+
215
+ from pdf2image import convert_from_bytes
216
+ import pytesseract
217
+
218
+ # Convert PDF to images
219
+ images = convert_from_bytes(pdf_bytes)
220
+
221
+ # OCR each page
222
+ texts = []
223
+ for img in images:
224
+ text = pytesseract.image_to_string(img)
225
+ texts.append(text)
226
+
227
+ return "\n\n".join(texts)
228
+
229
+ except ImportError:
230
+ return "[OCR unavailable - install pytesseract and pdf2image]"
231
+ except Exception:
232
+ return "[OCR failed]"
@@ -0,0 +1,14 @@
+ """DaytaShield compliance rule packs."""
+
+ from daytashield.rules.base import ComplianceRule, ComplianceViolation
+ from daytashield.rules.gdpr import GDPRRules
+ from daytashield.rules.hipaa import HIPAARules
+ from daytashield.rules.pii import PIIDetector
+
+ __all__ = [
+     "ComplianceRule",
+     "ComplianceViolation",
+     "HIPAARules",
+     "GDPRRules",
+     "PIIDetector",
+ ]
@@ -0,0 +1,67 @@
+ """Base compliance rule abstract class."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class ComplianceViolation(BaseModel):
+     """A compliance violation detected by a rule."""
+
+     code: str = Field(..., description="Machine-readable violation code")
+     message: str = Field(..., description="Human-readable description")
+     severity: str = Field("error", description="Severity: error, warning, info")
+     category: str = Field(..., description="Violation category (e.g., PHI, PII)")
+     field: str | None = Field(None, description="Field path where violation was found")
+     matched_value: str | None = Field(None, description="The value that matched (redacted)")
+     recommendation: str | None = Field(None, description="How to fix the violation")
+
+
+ class ComplianceRule(ABC):
+     """Abstract base class for compliance rules.
+
+     Compliance rules detect specific types of violations in data,
+     such as exposed PII, missing consent, or regulatory violations.
+
+     Example:
+         >>> class MyRule(ComplianceRule):
+         ...     name = "my_rule"
+         ...     description = "Checks for custom violations"
+         ...
+         ...     def check(self, data, text_content):
+         ...         violations = []
+         ...         for field, text in text_content:
+         ...             if "secret" in text.lower():
+         ...                 violations.append(ComplianceViolation(
+         ...                     code="SECRET_EXPOSED",
+         ...                     message="Secret value detected",
+         ...                     category="security",
+         ...                     field=field,
+         ...                 ))
+         ...         return violations
+     """
+
+     name: str = "base_rule"
+     description: str = "Base compliance rule"
+     enabled: bool = True
+
+     @abstractmethod
+     def check(
+         self, data: Any, text_content: list[tuple[str, str]]
+     ) -> list[ComplianceViolation]:
+         """Check data for compliance violations.
+
+         Args:
+             data: The original data structure
+             text_content: List of (field_path, text_value) tuples
+
+         Returns:
+             List of ComplianceViolation objects
+         """
+         pass
+
+     def __repr__(self) -> str:
+         return f"{self.__class__.__name__}(name={self.name!r})"